In [1]:
import pandas as pd
import plotly.express as px
from glob import glob
from pathlib import Path

In [2]:
data_folder = "data"

In [9]:
def get_kmer_df(file_suffix, column_name):
    kmer_files = [Path(path) for path in glob(f"{data_folder}/*/{file_suffix}")]
    kmer_dfs = []
    for kmer_file in kmer_files:
        sample_name = kmer_file.parent.name
        kmer_df = pd.read_csv(kmer_file, sep="\t", header=None, names=[column_name, "count"])
        kmer_df["sample"] = sample_name
        kmer_dfs.append(kmer_df)
    all_kmer_dfs = pd.concat(kmer_dfs, ignore_index=True)
    return all_kmer_dfs

In [10]:
all_kmer_probs = get_kmer_df("kmer_probs.txt", "probs")
all_kmer_probs

Unnamed: 0,probs,count,sample
0,-200,26,Escherichia_coli_MSB1_4I.100x.random.illumina
1,-199,0,Escherichia_coli_MSB1_4I.100x.random.illumina
2,-198,0,Escherichia_coli_MSB1_4I.100x.random.illumina
3,-197,0,Escherichia_coli_MSB1_4I.100x.random.illumina
4,-196,0,Escherichia_coli_MSB1_4I.100x.random.illumina
...,...,...,...
3995,-5,1076928,Escherichia_coli_MINF_8D.100x.random.illumina
3996,-4,0,Escherichia_coli_MINF_8D.100x.random.illumina
3997,-3,0,Escherichia_coli_MINF_8D.100x.random.illumina
3998,-2,0,Escherichia_coli_MINF_8D.100x.random.illumina


In [11]:
fig_probs = px.line(all_kmer_probs_dfs, x="prob", y="count", color='sample')
fig_probs.update_layout(
    yaxis_title="Count",
    xaxis_title="Likelihood",
)
fig_probs

In [13]:
all_kmer_coverages = get_kmer_df("kmer_covgs.txt", "coverage")
all_kmer_coverages

Unnamed: 0,coverage,count,sample
0,0,2627885,Escherichia_coli_MSB1_4I.100x.random.illumina
1,1,114644,Escherichia_coli_MSB1_4I.100x.random.illumina
2,2,14062,Escherichia_coli_MSB1_4I.100x.random.illumina
3,3,2736,Escherichia_coli_MSB1_4I.100x.random.illumina
4,4,1683,Escherichia_coli_MSB1_4I.100x.random.illumina
...,...,...,...
19995,995,0,Escherichia_coli_MINF_8D.100x.random.illumina
19996,996,0,Escherichia_coli_MINF_8D.100x.random.illumina
19997,997,2,Escherichia_coli_MINF_8D.100x.random.illumina
19998,998,1,Escherichia_coli_MINF_8D.100x.random.illumina


In [15]:
fig_covgs = px.line(all_kmer_coverages, x="coverage", y="count", color='sample')
fig_covgs.update_layout(
    yaxis_title="Count",
    xaxis_title="Kmer Coverage",
)
fig_covgs

In [16]:
import chart_studio.plotly as chart_studio
chart_studio.plot(fig_covgs,
                  filename = 'kmer_covgs',
                  auto_open=True)


'https://plotly.com/~leandro.ishi.lima/49/'

In [18]:
all_kmer_probs.to_csv("kmer_probs.csv", index=False)
all_kmer_coverages.to_csv("kmer_covgs.csv", index=False)