In [94]:
import glob
import plotly.express as px
import pandas as pd


In [111]:
dm_data = {} # get rid if the uni of indiana tags since we can't really tell what that is

for fname in glob.glob("data/*.profile.txt"):
    with open(fname, "r") as f:
        dm_data[fname.split("/")[1].split(".")[0]] = {row.split("\t")[0]: row.split("\t")[1] for row in f.read().split("\n") if len(row.split("\t"))>1}
        

In [112]:
dm_data


{'104': {'Prevotella': '8647836',
  'Bacteroides': '1905763',
  'Faecalibacterium': '744079',
  'Ruminococcus': '716317',
  'Lachnobacterium': '689068',
  'Clostridium': '498446',
  'Blautia': '466477',
  'Alistipes': '355561',
  'Dorea': '304352',
  'Megasphaera': '258433',
  'Sutterella': '220632',
  'Dialister': '213583',
  'Bifidobacterium': '168242',
  'Lactobacillus': '135463',
  'Roseburia': '116765',
  'Parabacteroides': '114986',
  'Oscillibacter': '80455',
  'Solobacterium': '69708',
  'Paraprevotella': '67836',
  'Eubacterium': '59554',
  'Collinsella': '53729',
  'Lachnospira': '48502',
  'Barnesiella': '36181',
  'Coprococcus': '31498',
  'Odoribacter': '29057',
  'Desulfovibrio': '28572',
  'Slackia': '24482',
  'Butyricicoccus': '23468',
  'Xylanibacter': '22488',
  'Butyrivibrio': '20020',
  'Butyricimonas': '19042',
  'Phascolarctobacterium': '17954',
  'Acidaminococcus': '12870',
  'Catenibacterium': '9267',
  'Eggerthella': '5934',
  'Streptococcus': '5128',
  'Halom

In [113]:
max_depth = 21
run_trimmed = True

kraken_data = {}

# work through all of the kraken stuff
for fname in glob.glob("outputs/*/kraken2/*.kreport"):
    with open(fname, 'r') as f:
        data = f.read().split("\n") # split data into separate rows
        parsed_data = {}
        
        curr_tag = ['-' for i in range(0, max_depth)]
        for i in data:
            curr_row = i.split("\t") # the data within rows is tab-delimited

            # create a tag structure
            curr_n = curr_row[-1].split("  ")
    
            if curr_n[-1] != '': # skip rows w-out taxon names (mostly to deal w empty entries)
                try:
                    curr_tag[len(curr_n)-1] = curr_n[-1]
                except:
                    print(curr_n) # just a contingency to see when we end up deeper than expected
                    
                for j in range(len(curr_n), max_depth):
                    curr_tag[j] = '-'
                    
                # extract the percentage of aligned reads
                #parsed_data['\t'.join(curr_tag)] = curr_row[0].replace(" ", "")
                parsed_data['\t'.join(curr_tag)] = curr_row[1].replace(" ", "")
                #parsed_data['\t'.join(curr_tag)] = curr_row[2].replace(" ", "")
                
        kraken_data[fname.split("/")[-3].split("_")[2]] = dict(parsed_data)
        if fname.split("/")[-3].split("_")[2] in dm_data:
            kraken_data[fname.split("/")[-3].split("_")[2]+"_dm"] = dm_data[fname.split("/")[-3].split("_")[2]]
 

In [116]:
genus_map = {}

for k, v in kraken_data.items():
    for s, _ in v.items():
        for t in s.split("\t")[5:]:
            genus_map[t] = s.split("\t")[5]

# convert genus to phylum
dm_phyl_data = {} # get rid if the uni of indiana tags since we can't really tell what that is

for s, v in dm_data.items():
    dm_phyl_data[s] = {}
    
    for g, c in v.items():
        if g in genus_map:
            if genus_map[g] not in dm_phyl_data[s]:
                dm_phyl_data[s][genus_map[g]] = 0
            dm_phyl_data[s][genus_map[g]] += int(c)

for k, v in dm_phyl_data.items():
    print(k)
    kraken_data[str(k)+"_dm"] = v

104
1
100
4
14
97


In [118]:
genus_map


{'-': 'Fungi incertae sedis',
 'Bacteroidetes': 'Bacteroidetes',
 'Bacteroidia': 'Bacteroidetes',
 'Bacteroidales': 'Bacteroidetes',
 'Bacteroidaceae': 'Bacteroidetes',
 'Bacteroides': 'Bacteroidetes',
 'Bacteroides caccae': 'Bacteroidetes',
 'Bacteroides caccae ATCC 43185': 'Bacteroidetes',
 'Bacteroides coprocola': 'Bacteroidetes',
 'Bacteroides cellulosilyticus': 'Bacteroidetes',
 'Bacteroides cellulosilyticus WH2': 'Bacteroidetes',
 'Bacteroides cellulosilyticus CL02T12C19': 'Bacteroidetes',
 'Bacteroides cellulosilyticus DSM 14838': 'Bacteroidetes',
 'Bacteroides plebeius': 'Bacteroidetes',
 'Bacteroides vulgatus': 'Bacteroidetes',
 'Bacteroides vulgatus PC510': 'Bacteroidetes',
 'Bacteroides vulgatus dnLKV7': 'Bacteroidetes',
 'Bacteroides vulgatus CL09T03C04': 'Bacteroidetes',
 'Bacteroides vulgatus str. 3975 RP4': 'Bacteroidetes',
 'Bacteroides vulgatus str. 3775 SR(B) 19': 'Bacteroidetes',
 'Bacteroides vulgatus str. 3775 SL(B) 10 (iv)': 'Bacteroidetes',
 'Bacteroides coprophi

In [119]:
# trim down to a specific level for visualization purposes
if run_trimmed:
    target_level = 5

    kraken_tmp = {}

    for s, sv in kraken_data.items():
        if "_dm" in s:
            kraken_tmp[s] = kraken_data[s]
        else:
            kraken_tmp[s] = {}

            for k, v in sv.items():
                tmp_k = k.split("\t")

                if tmp_k[target_level] != "-" and tmp_k[target_level+1] == "-":
                    kraken_tmp[s][tmp_k[target_level]] = v

    kraken_data = kraken_tmp
    

In [120]:
# now to organize in a way that keeps a logical order intact
keylist = []

for k, e in kraken_data.items():
    for s, l in e.items():
        keylist.append(s) # just joining it up to make filtering duplicates out easier
        
keylist = [x.split('\t') for x in list(set(keylist))]
     
if not run_trimmed:
    for i in range(1, max_depth+1):
        keylist = sorted(keylist, key=lambda x: x[-i]) # need to sort it starting from the right col to keep it from re-sorting everything
    

In [121]:
with open('kraken2.csv', 'w') as f:
    kraken_keys = list(kraken_data.keys())
    f.write(",".join(['' for i in range(0, max_depth)]+kraken_keys)+"\n")

    for kl in keylist: # work through all of sorted rows 
        tmp_row = list(kl)
        tmp_key = '\t'.join(kl)
        
        for s in kraken_keys:
            if tmp_key in kraken_data[s]:
                tmp_row.append(kraken_data[s][tmp_key])
            else:
                tmp_row.append('NaN')
    
        f.write(",".join([str(s) for s in tmp_row])+"\n")
        

In [122]:
kraken_data

{'4': {'Bacteroidetes': '8841650',
  'Chlorobi': '50',
  'Balneolaeota': '10',
  'Rhodothermaeota': '8',
  'Ignavibacteriae': '6',
  'Fibrobacteria': '92',
  'Gemmatimonadetes': '7',
  'Candidatus Thermokryptus': '2',
  'Candidatus Kryptobacter': '2',
  'Candidatus Chrysopegis': '1',
  'Candidatus Kryptonium': '1',
  'Candidatus Marinimicrobia bacterium': '4',
  'Clostridia': '5675595',
  'Erysipelotrichia': '352325',
  'Negativicutes': '95003',
  'unclassified Firmicutes sensu stricto': '65380',
  'Bacilli': '42658',
  'Tissierellia': '1636',
  'Limnochordia': '4',
  'Actinobacteria': '842496',
  'Coriobacteriia': '327116',
  'Acidimicrobiia': '29',
  'Thermoleophilia': '15',
  'Rubrobacteria': '6',
  'Nitriliruptoria': '4',
  'unclassified Actinobacteria': '4',
  'Candidatus Melainabacteria': '1808',
  'Cyanobacteria': '314',
  'Candidatus Margulisbacteria': '3',
  'Mollicutes': '1832',
  'unclassified Tenericutes': '33',
  'Candidatus Izimaplasma': '8',
  'Deinococci': '206',
  'Ana

In [123]:
# data as stacked bar charts
data = {
    "Organism": [],
    "Read Count": [],
    "Sample": []
}
samples = []
min_reads = 0
total_others = []

with open('kraken2.csv', 'r') as f:
    raw_data = f.read().split("\n")
    
    samples = ["Sample "+x for x in raw_data[0].split(",") if x != ""]
    for s in samples:
        total_others.append(0)
    
    for r in raw_data[1:]:
        do_add = False
        tmp_new = [0 for s in samples]
        
        tmp_r = r.split(",")
        if len(tmp_r) > 1: # dealing with the last row
            org = "|".join([x for x in tmp_r[:-len(samples)] if x != "-"])

            for i in range(0, len(samples)):
                tmp_new[i] = tmp_r[-len(samples)+i]
                
                if tmp_new[i] != "NaN" and int(tmp_new[i]) >= min_reads:
                    do_add = True
                
            if do_add:
                for i in range(0, len(samples)):
                    data["Organism"].append(org)
                    data["Read Count"].append(tmp_new[i])
                    data["Sample"].append(samples[i])
            else:
                for i in range(0, len(samples)):
                    if tmp_new[i] != "NaN":
                        total_others[i] += int(tmp_new[i])
    
for i in range(0, len(samples)):
    data["Organism"].append("other")
    data["Read Count"].append(total_others[i])
    data["Sample"].append(samples[i])
    
df = pd.DataFrame.from_dict(data)
df["Read Count"] = df["Read Count"].astype(float)


In [124]:
fig = px.bar(df, x="Sample", y="Read Count", color='Organism')
fig.update_layout(showlegend=False)
fig.write_html("kraken2.html")
fig.show()
