In [8]:
import glob
import plotly.express as px
import pandas as pd


In [9]:
max_depth = 32      # number of lineage levels tracked per taxon key
run_trimmed = True  # when True, later cells collapse the data to a single level


def parse_kreport(lines, max_depth=32):
    """Parse the rows of one kreport file into a {lineage_key: count} dict.

    Each row is tab-delimited; its last column is the taxon name, indented by
    two spaces per level. The lineage key is the tab-joined list of ancestor
    names down to the current taxon, padded with '-' to ``max_depth`` entries.
    The stored value is the row's second column with spaces removed, kept as a
    string (in the standard Kraken2 report layout that column is the number of
    reads covered by the clade -- confirm if a custom report format is used).

    Rows with an empty name (e.g. the trailing blank line) are skipped. Rows
    nested deeper than ``max_depth`` are reported and skipped, so they can
    never be filed under -- and overwrite -- their parent's key.
    """
    parsed = {}
    lineage = ['-'] * max_depth

    for line in lines:
        fields = line.split("\t")
        name_parts = fields[-1].split("  ")  # one leading '' per indent level
        name = name_parts[-1]

        if name == '':
            continue

        depth = len(name_parts) - 1
        if depth >= max_depth:
            # deeper than the lineage buffer: report it and leave the buffer alone
            print("skipping taxon deeper than max_depth:", name_parts)
            continue

        lineage[depth] = name
        # anything below the current level belongs to a previous branch
        lineage[depth + 1:] = ['-'] * (max_depth - depth - 1)

        parsed['\t'.join(lineage)] = fields[1].replace(" ", "")

    return parsed


kraken_data = {}

files = sorted(glob.glob("outputs/*/kraken2/*.kreport"))

# work through all of the kraken reports; the sample name is the directory
# two levels above the report file (outputs/<sample>/kraken2/<file>.kreport)
for fname in files:
    with open(fname, 'r') as f:
        report_lines = f.read().split("\n")  # split data into separate rows

    kraken_data[fname.split("/")[-3]] = parse_kreport(report_lines, max_depth)
 

In [10]:
# Trim every sample down to one taxonomic level for visualization purposes.
# Samples whose name contains "_dm" are passed through untouched.
# The cell overwrites kraken_data, so it is written to be safe to re-run:
# keys that were already reduced to a bare taxon name are kept as they are.
if run_trimmed:
    target_level = 5

    kraken_tmp = {}

    for sample, counts in kraken_data.items():
        if "_dm" in sample:
            kraken_tmp[sample] = counts
            continue

        trimmed = {}
        for lineage, value in counts.items():
            levels = lineage.split("\t")

            if len(levels) == 1:
                # no tab in the key: already trimmed by an earlier run of this cell
                trimmed[lineage] = value
            elif levels[target_level] != "-" and levels[target_level+1] == "-":
                # keep only taxa that sit exactly at target_level, keyed by
                # their own name instead of the full lineage
                trimmed[levels[target_level]] = value

        kraken_tmp[sample] = trimmed

    kraken_data = kraken_tmp
    

In [11]:
# Collect every distinct taxon key seen in any sample and put the rows in a
# reproducible order. Each key is split back into its list of levels; sorting
# those lists compares them level by level from the left, which is the same
# order the previous right-to-left sequence of stable sorts produced. Sorting
# in trimmed mode as well keeps the row order (and therefore the stacking
# order in the plot) from depending on set iteration order.
keylist = sorted(
    key.split('\t')
    for key in {key for counts in kraken_data.values() for key in counts}
)
    

In [12]:
# Write one row per taxon key: max_depth taxonomy columns followed by one
# count column per sample. The header leaves the taxonomy columns blank.
# NOTE: fields are joined with plain commas and are not quoted.
with open('kraken2.csv', 'w') as f:
    kraken_keys = list(kraken_data.keys())
    f.write(",".join(['' for i in range(0, max_depth)]+kraken_keys)+"\n")

    for kl in keylist: # work through all of sorted rows
        tmp_key = '\t'.join(kl)

        # pad short (trimmed) keys with the '-' placeholder so every row has
        # the same number of taxonomy columns as the header
        tmp_row = list(kl) + ['-'] * (max_depth - len(kl))

        for s in kraken_keys:
            # samples that never saw this taxon get an explicit NaN marker
            tmp_row.append(kraken_data[s].get(tmp_key, 'NaN'))

        f.write(",".join([str(s) for s in tmp_row])+"\n")
        

In [13]:
# Reshape kraken2.csv into long form (one record per organism/sample pair)
# so it can be drawn as stacked bars.
data = {
    "Organism": [],
    "Read Count": [],
    "Sample": []
}
samples = []
min_reads = 0  # a row is kept if at least one sample reaches this count
total_others = []

with open('kraken2.csv', 'r') as f:
    header, *body = f.read().split("\n")

    # sample columns are the non-empty header cells
    samples = ["Sample "+name for name in header.split(",") if name != ""]
    total_others.extend(0 for _ in samples)
    n_samples = len(samples)

    for line in body:
        fields = line.split(",")
        if len(fields) <= 1: # dealing with the last row
            continue

        # taxonomy columns are everything before the trailing sample columns
        organism = "|".join([x for x in fields[:-n_samples] if x != "-"])
        counts = [fields[-n_samples+idx] for idx in range(n_samples)]

        # evaluate every cell (no short-circuit) before deciding on the row
        passes = [c != "NaN" and int(c) >= min_reads for c in counts]

        if any(passes):
            data["Organism"].extend([organism] * n_samples)
            data["Read Count"].extend(counts)
            data["Sample"].extend(samples)
        else:
            # rows below the threshold are pooled into a per-sample "other" total
            for idx, c in enumerate(counts):
                if c != "NaN":
                    total_others[idx] += int(c)

data["Organism"].extend(["other"] * len(samples))
data["Read Count"].extend(total_others)
data["Sample"].extend(samples)

df = pd.DataFrame(data)
df["Read Count"] = df["Read Count"].astype(float)


In [14]:
# one bar per sample, coloured by organism; the legend is hidden
fig = px.bar(
    df,
    x="Sample",
    y="Read Count",
    color='Organism',
).update_layout(showlegend=False)
fig.write_html("kraken2.html")
fig.show()
