In [None]:
import pandas as pd
import seaborn as sns

In [None]:
# Genome assembly accession (GCF_/GCA_ identifier) -> species name.
# The keys double as the per-genome directory names read by
# parse_repeats_tables; the values become the "species" column used to
# facet the plots. Insertion order determines the order genomes are loaded.
accession_translator = {
    "GCF_009017415.1": "Aspergillus flavus",
    "GCF_000143535.2": "Botrytis cinerea",
    "GCF_033473495.1": "Cercospora beticola",
    "GCF_000835755.1": "Cryptococcus tetragattii",
    "GCF_000240135.3": "Fusarium graminearum",
    "GCF_021901695.1": "Puccinia striiformis",
    "GCF_026210795.1": "Rhizophagus irregularis",
    "GCF_000146045.2": "Saccharomyces cerevisiae",
    "GCA_964035595.1": "Somion occarium",
}

In [None]:
def parse_repeats_tables(
    accession,
    base_dir="/home/lisvad/mnt/nisin/geneml/benchmarking_dataset",
    species_names=None,
):
    """Build a long-format table of repeat lengths for one genome.

    Two files are read from ``{base_dir}/{accession}``:

    * ``repeats.tsv`` -- column 0 holds one value per repeat (all repeats).
    * ``repeats_in_cdses.tsv`` -- tab-separated; column 3 holds one value per
      repeat that lies within a CDS.

    Repeats outside genes are not listed explicitly. They are reconstructed as
    the multiset difference "all repeats minus CDS repeats", i.e. for every
    distinct length the CDS count is subtracted from the overall count
    (negative results are clipped to zero).

    Parameters
    ----------
    accession : str
        Name of the genome directory below ``base_dir``; also the key used to
        look up the species name.
    base_dir : str, optional
        Root directory containing one sub-directory per accession. Defaults to
        the location originally hard-coded in this notebook.
    species_names : mapping, optional
        Accession -> species name. Defaults to the notebook-level
        ``accession_translator``.

    Returns
    -------
    pandas.DataFrame
        Columns ``length``, ``location`` ("within gene" / "outside gene"),
        ``genome`` (the accession) and ``species``.

    Raises
    ------
    KeyError
        If ``accession`` is missing from the species mapping.
    """
    if species_names is None:
        # Resolved at call time so the notebook-level mapping is only required
        # when the caller does not supply one.
        species_names = accession_translator

    base_path = f"{base_dir}/{accession}"

    # NOTE(review): repeats.tsv is parsed with the default comma separator even
    # though the extension is .tsv; that is only equivalent when the file has a
    # single column -- confirm against the real files.
    all_repeats = pd.read_csv(base_path + "/repeats.tsv", header=None)[0]
    cds_repeats = pd.read_csv(
        base_path + "/repeats_in_cdses.tsv", delimiter="\t", header=None
    )[3]

    # Per-length counts of repeats that are NOT inside a CDS. fill_value=0
    # keeps lengths that occur in only one of the two tables; clip() guards
    # against a length being reported more often in the CDS table.
    diff = (
        all_repeats.value_counts()
        .subtract(cds_repeats.value_counts(), fill_value=0)
        .clip(lower=0)
        .astype(int)
    )
    # Expand the counts back into one entry per repeat.
    no_cds_repeats = diff.index.repeat(diff).to_list()

    repeats_df = pd.concat(
        [
            pd.DataFrame({"length": cds_repeats, "location": "within gene"}),
            pd.DataFrame({"length": no_cds_repeats, "location": "outside gene"}),
        ],
        ignore_index=True,
    )
    repeats_df["genome"] = accession
    repeats_df["species"] = species_names[accession]
    return repeats_df

In [None]:
# Parse every genome listed in accession_translator (in dictionary order) and
# stack the per-genome tables into one long-format frame with a fresh index.
dfs = [parse_repeats_tables(acc) for acc in accession_translator]
data = pd.concat(dfs, ignore_index=True)
data

In [None]:
# Distribution of repeat lengths on a log-scaled x axis, one panel per species
# (three panels per row).
sns.displot(
    data=data,
    x="length",
    col="species",
    col_wrap=3,
    log_scale=True,
)

In [None]:
# Empirical CDF of repeat lengths per species, split by whether the repeat is
# within or outside a gene (log-scaled x axis, three panels per row).
g = sns.displot(
    data=data,
    x="length",
    hue="location",
    col="species",
    col_wrap=3,
    kind="ecdf",
    log_scale=True,
)
# Vertical reference lines at lengths 200 and 40 on every panel.
# NOTE(review): the meaning of these two values is not documented here -- confirm.
for ax in g.axes.flat:
    for threshold in (200, 40):
        ax.axvline(x=threshold)