In [15]:
import os.path

import pandas as pd

gene_maps_dir = "data/gene_maps"

# Locations of the two gene-ID mapping tables compared in this notebook.
wbps_map = os.path.join(gene_maps_dir, "previous_to_current_id.tsv")
lo_map = os.path.join(gene_maps_dir, "gene_map.few_more.cc_uniq.tsv")

# The tables are read without a header row; the first column ("v18") becomes
# the index, leaving "v19" and "qual" as data columns.
cols = ("v18", "v19", "qual")

# Tag every row with the table it came from so the origin survives the concat.
wbps_df = pd.read_csv(wbps_map, sep="\t", names=cols, index_col=0).assign(
    source="wbps"
)
lo_df = pd.read_csv(lo_map, sep="\t", names=cols, index_col=0).assign(
    source="lo"
)

# Both tables stacked (Liftoff rows first) and then ordered by index label.
all_df = pd.concat([lo_df, wbps_df]).sort_index()


In [3]:
def split_shared_and_unique(left_df, right_df):
    """Split the rows of ``left_df`` by whether ``right_df`` has the same mapping.

    A row of ``left_df`` is *shared* when ``right_df`` contains at least one
    row with the same index label and the same ``v19`` value; otherwise it is
    *unique* to ``left_df``.

    Parameters
    ----------
    left_df : pandas.DataFrame
        Frame whose rows are classified. Must have a ``v19`` column.
    right_df : pandas.DataFrame
        Frame to compare against. Must have a ``v19`` column. Index labels
        may repeat in either frame.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(shared_df, uniq_df)``. Each holds rows of ``left_df`` in their
        original order with their original column dtypes. Rows that have a
        missing value in any column are dropped from both frames.
    """
    # Pair every index label with its v19 value so that a complete
    # (index, v19) mapping can be tested for membership in a single vectorized
    # call, instead of re-filtering right_df once per row of left_df.
    left_pairs = pd.MultiIndex.from_arrays([left_df.index, left_df["v19"]])
    right_pairs = pd.MultiIndex.from_arrays([right_df.index, right_df["v19"]])
    is_shared = left_pairs.isin(right_pairs)

    # dropna() is retained on purpose: incomplete rows have always been
    # excluded from both results, and downstream counts rely on that.
    return left_df[is_shared].dropna(), left_df[~is_shared].dropna()

In [9]:
# WBPS mappings that the Liftoff table reproduces exactly (same index label,
# same v19 value); the WBPS-only remainder is discarded here.
shared_df, _ = split_shared_and_unique(left_df=wbps_df, right_df=lo_df)
print(f"mappings shared: {len(shared_df)}")

mappings shared: 8345


Determine unique mappings from forward and reverse Liftoff runs

In [12]:
# Gene maps written by the two individual Liftoff runs: "gene_map_1" is
# loaded as the forward run and "gene_map_0" as the reverse run.
lo_for_map = os.path.join(gene_maps_dir, "gene_map_1.cc.tsv")
lo_rev_map = os.path.join(gene_maps_dir, "gene_map_0.cc.tsv")

lo_for_df = pd.read_csv(lo_for_map, sep="\t", names=cols, index_col=0)
lo_rev_df = pd.read_csv(lo_rev_map, sep="\t", names=cols, index_col=0)

# Keep only the second element of each split: the rows one run has that the
# other run does not.
_, uniq_rev = split_shared_and_unique(left_df=lo_rev_df, right_df=lo_for_df)
_, uniq_for = split_shared_and_unique(left_df=lo_for_df, right_df=lo_rev_df)


In [13]:
# For each Liftoff direction, report how many of its mappings the other
# direction lacks, and how many of those have an index label (the first
# column of the map file) that does not occur in the WBPS table's index.
print(f"Unique mappings from forward Liftoff run (18 -> 19): {len(uniq_for)}")
print(f"...not in WBPS mapping: {len(uniq_for[~uniq_for.index.isin(wbps_df.index)])}")
print(f"Unique mappings from reverse Liftoff run (19 -> 18): {len(uniq_rev)}")
print(f"...not in WBPS mapping: {len(uniq_rev[~uniq_rev.index.isin(wbps_df.index)])}")

Unique mappings from forward Liftoff run (18 -> 19): 70
...not in WBPS mapping: 56
Unique mappings from reverse Liftoff run (19 -> 18): 76
...not in WBPS mapping: 53


Determine how many v19 genes are missed by both mappings

In [14]:
import gffutils
import os.path

v19_path = "data/from_WBPS/strongyloides_stercoralis.PRJNA930454.WBPS19.annotations.gff3"
v19_db_path = "v19.db"

# Reuse the on-disk gffutils database when it already exists; otherwise build
# it from the GFF3 annotation file.
if not os.path.exists(v19_db_path):
    db = gffutils.create_db(v19_path, v19_db_path, merge_strategy="create_unique")
else:
    db = gffutils.FeatureDB(v19_db_path)

# Collect the mapped v19 IDs once, so each gene below costs a set lookup
# rather than a boolean scan over an entire v19 column.
lo_v19_ids = set(lo_df["v19"])
wbps_v19_ids = set(wbps_df["v19"])

# v19 gene IDs that never appear as a mapping target in each table.
missing_v19_lo = set()
missing_v19_wbps = set()
for g in db.all_features(featuretype="gene"):
    # The feature ID is split on ":" and the second piece is used as the gene
    # ID (presumably stripping a "gene:" style prefix -- confirm against the GFF3).
    g_id = g.id.split(":")[1]
    if g_id not in lo_v19_ids:
        missing_v19_lo.add(g_id)
    if g_id not in wbps_v19_ids:
        missing_v19_wbps.add(g_id)




In [21]:
# Summarise how many WBPS19 genes each mapping table fails to reach.
# Note: the total is obtained by iterating over every gene feature in the
# database a second time and counting the resulting list.
print(f"Total WBPS19 genes: {len(list(db.all_features(featuretype='gene')))}")
print(f"Missing from Liftoff mappings: {len(missing_v19_lo)}")
print(f"Missing from WBPS mappings: {len(missing_v19_wbps)}")
print(f"Missing from both: {len(missing_v19_lo.intersection(missing_v19_wbps))}")

Total WBPS19 genes: 12061
Missing from Liftoff mappings: 3141
Missing from WBPS mappings: 2279
Missing from both: 1953


In [23]:
# Write the v19 gene IDs missed by both mapping tables, sorted, one per line.
missing_ids_path = os.path.join(gene_maps_dir, "missing_v19_ids.txt")
with open(missing_ids_path, "w") as f:
    for g_id in sorted(missing_v19_lo & missing_v19_wbps):
        f.write(f"{g_id}\n")