In [41]:
import csv
import glob


In [46]:
max_depth = 21      # number of lineage columns reserved for every kraken tag
run_trimmed = True  # when True a later cell reduces the table to a single depth

kraken_data = {}    # sample id -> {tab-joined lineage tag: value}


def parse_kreport_rows(rows, depth_limit):
    """Turn the rows of one kraken2 report into a ``{lineage tag: value}`` dict.

    Each row is tab-delimited; the last field holds the taxon name, indented
    by two spaces per nesting level.  The running lineage is kept in a list of
    ``depth_limit`` slots ('-' marks an unused slot) and joined with tabs to
    form the key.  The stored value is field 1 with spaces removed.
    NOTE(review): in the stock kraken2 report layout field 1 is the
    clade-level fragment count (field 0 is the percentage, field 2 the
    directly-assigned count) -- confirm this is the column wanted.

    Rows without a taxon name (e.g. the empty string after the final newline)
    are skipped.  Rows nested at ``depth_limit`` or deeper are printed and
    skipped: they do not fit in the lineage, and storing them would overwrite
    the value of their parent.

    Parameters:
        rows: iterable of report lines without trailing newlines.
        depth_limit: number of lineage slots in every key.

    Returns:
        dict mapping '\\t'-joined lineage tags to the value string.
    """
    parsed = {}
    lineage = ['-'] * depth_limit

    for row in rows:
        fields = row.split("\t")  # the data within rows is tab-delimited

        # two spaces of indentation per level -> one leading '' per level
        name_parts = fields[-1].split("  ")
        taxon = name_parts[-1]

        if taxon == '':  # skip rows w-out taxon names (mostly to deal w empty entries)
            continue

        depth = len(name_parts) - 1
        if depth >= depth_limit:
            print(name_parts)  # just a contingency to see when we end up deeper than expected
            continue

        lineage[depth] = taxon
        # everything below the current level belongs to a previous branch
        for deeper in range(depth + 1, depth_limit):
            lineage[deeper] = '-'

        parsed['\t'.join(lineage)] = fields[1].replace(" ", "")

    return parsed


# work through all of the kraken stuff
for fname in glob.glob("outputs/*/kraken2/*.kreport"):
    with open(fname, 'r') as f:
        parsed_data = parse_kreport_rows(f.read().split("\n"), max_depth)

    # the sample id is the third '_'-separated token of the directory two
    # levels above the report file
    kraken_data[fname.split("/")[-3].split("_")[2]] = parsed_data
 

In [47]:
# Optional reduction for plotting: keep only the entries whose lineage ends
# exactly at `target_level` (that slot is filled and the next one is empty)
# and key them by the name stored in that slot instead of the full tag.
# Note that this rebinds `kraken_data`, so the full table is gone afterwards.
if run_trimmed:
    target_level = 5
    kraken_tmp = {}

    for sample, taxa in kraken_data.items():
        at_level = {}

        for tag, value in taxa.items():
            levels = tag.split("\t")

            # skip lineages that stop above, or continue below, the target level
            if levels[target_level] == "-" or levels[target_level + 1] != "-":
                continue

            at_level[levels[target_level]] = value

        kraken_tmp[sample] = at_level

    kraken_data = kraken_tmp

In [48]:
# Build the row order for the kraken table: every key seen in any sample,
# de-duplicated, split into its tab-separated columns and sorted.
#
# Comparing the column lists directly orders rows by the first column, then
# the second, and so on.  That is the same order the former chain of stable
# sorts (right-most column first) produced, and it also works for the
# one-column keys of the trimmed table, which used to be left in arbitrary
# set order and therefore changed from one kernel run to the next.
keylist = sorted(
    tag.split('\t')
    for tag in {tag for taxa in kraken_data.values() for tag in taxa}
)
    

In [49]:
# Write the kraken table: one row per key in `keylist`, label columns first,
# then one value column per sample ('NaN' where a sample lacks the key).
with open('kraken2.csv', 'w') as f:
    kraken_keys = list(kraken_data.keys())

    # The header needs as many blank cells as the rows have label columns.
    # That is max_depth for the full table but fewer once the table has been
    # trimmed, so measure the rows instead of assuming max_depth (which
    # pushed the sample names out of line with their values).
    n_label_cols = max((len(kl) for kl in keylist), default=max_depth)
    f.write(",".join([''] * n_label_cols + kraken_keys) + "\n")

    for kl in keylist:  # work through all of sorted rows
        tmp_key = '\t'.join(kl)  # rebuild the dict key from its columns
        tmp_row = list(kl) + [kraken_data[s].get(tmp_key, 'NaN') for s in kraken_keys]

        f.write(",".join(tmp_row) + "\n")
        

In [23]:
groot_data = {}  # sample id -> {first column of a report row: remaining columns}

# Read every groot report.  Rows are tab-delimited; rows whose first column is
# empty (such as the empty string after the final newline) are ignored.
for fname in glob.glob("outputs/*/groot/*/*.txt"):
    with open(fname, 'r') as f:
        rows = [line.split("\t") for line in f.read().split("\n")]

    tmp_data = {cols[0]: cols[1:] for cols in rows if cols[0] != ''}

    # the sample id is the third '_'-separated token of the directory three
    # levels above the report file
    groot_data[fname.split("/")[-4].split("_")[2]] = tmp_data
        

In [24]:
# Row order for the groot table: every row name seen in any sample, without
# duplicates, in plain string order.
keylist = sorted({name for rows in groot_data.values() for name in rows})


In [25]:
# one output column per sample and affix; the i-th affix labels the i-th
# value stored for a row in groot_data
affixes = ["_read_count", "_gene_length", "_coverage"]

with open('groot.csv', 'w') as f:
    groot_keys = list(groot_data.keys())

    # header: a blank cell above the row names, then sample+affix per column
    header = [''] + [sample + affix for sample in groot_keys for affix in affixes]
    f.write(",".join(header) + "\n")

    for kl in keylist:  # work through all of sorted rows
        tmp_row = [kl]

        for s in groot_keys:
            sample_rows = groot_data[s]
            for idx in range(len(affixes)):
                # 'NaN' fills every column of a sample that lacks this row
                tmp_row.append(sample_rows[kl][idx] if kl in sample_rows else 'NaN')

        f.write(",".join(tmp_row) + "\n")
        

In [26]:
metaphlan_data = {}  # sample id -> {first column of a profile row: third column}


def parse_metaphlan_rows(rows):
    """Extract ``{first column: third column}`` from the rows of one profile.

    Rows are tab-delimited.  Header/comment rows are recognised by their
    leading '#' rather than by position, so profiles with more or fewer than
    four header lines are read correctly.  Rows with an empty first column
    (e.g. the empty string after the final newline) are skipped.
    NOTE(review): presumably column 0 is the '|'-joined clade name and
    column 2 the relative abundance -- confirm against the MetaPhlAn version
    that produced the files.

    Parameters:
        rows: iterable of profile lines without trailing newlines.

    Returns:
        dict mapping the first column to the third column of every data row.

    Raises:
        IndexError: if a data row has fewer than three columns.
    """
    abundances = {}

    for row in rows:
        if row.startswith("#"):  # header / comment line
            continue

        cols = row.split("\t")
        if cols[0] != '':
            abundances[cols[0]] = cols[2]

    return abundances


# work through all of the metaphlan profiles
for fname in glob.glob("outputs/*/metaphlan/*.txt"):
    with open(fname, 'r') as f:
        tmp_data = parse_metaphlan_rows(f.read().split("\n"))

    # the sample id is the third '_'-separated token of the directory two
    # levels above the profile file
    metaphlan_data[fname.split("/")[-3].split("_")[2]] = tmp_data
        

In [27]:
# Row order for the metaphlan table.
# Every key seen in any sample is split on '|' into its levels ...
keylist = [
    clade.split('|')
    for clade in {clade for taxa in metaphlan_data.values() for clade in taxa}
]

# ... padded on the right with '-' so that all rows have the same number of
# columns (max_padding is the length of the longest row) ...
max_padding = max((len(levels) for levels in keylist), default=0)

# ... and sorted column by column from the left.  Comparing the padded lists
# directly gives the same order as stable-sorting on each column in turn,
# starting from the right-most one.
keylist = sorted(
    levels + ["-"] * (max_padding - len(levels))
    for levels in keylist
)
    

In [28]:
# Write the metaphlan table: the padded level columns first, then one value
# column per sample ('NaN' where a sample lacks the row).
with open('metaphlan.csv', 'w') as f:
    metaphlan_keys = list(metaphlan_data.keys())

    # blank header cells above the level columns, then the sample names
    f.write(",".join([''] * max_padding + metaphlan_keys) + "\n")

    for kl in keylist:  # work through all of sorted rows
        # drop the '-' padding again to recover the original dict key
        tmp_key = '|'.join(part for part in kl if part != "-")
        values = [metaphlan_data[s].get(tmp_key, 'NaN') for s in metaphlan_keys]

        f.write(",".join(list(kl) + values) + "\n")
        

In [29]:
references = {}  # accession -> concatenated "DR   GO;" lines of its entry


def _store_uniprot_entry(index, accessions, go_lines):
    """Record one finished entry in ``index`` under all of its accessions."""
    if not accessions:
        return

    go_text = "".join(go_lines)
    # the first accession identifies the entry, so it always wins ...
    index[accessions[0]] = go_text
    # ... while the remaining ones must not displace an entry stored earlier
    for secondary in accessions[1:]:
        index.setdefault(secondary, go_text)


def index_uniprot_go(lines):
    """Map accessions to the GO cross-reference lines of their entry.

    The input is scanned line by line.  Lines starting with "AC" list the
    accessions of an entry separated by ';'; a run of consecutive "AC" lines
    belongs to one entry.  Lines starting with "DR   GO;" are collected
    verbatim and concatenated without a separator.  An entry with no such
    line maps to the empty string.

    Every accession of an entry is indexed, not only the last one on the
    last "AC" line, so lookups by the first (primary) accession succeed for
    entries that carry several.  GO lines seen before any "AC" line are
    ignored.

    Parameters:
        lines: iterable of lines, with or without trailing newlines (an
            open text file works and avoids loading the file in one piece).

    Returns:
        dict mapping accession strings to the concatenated GO lines.
    """
    index = {}
    accessions = []      # accessions of the entry being read, first one first
    go_lines = []        # "DR   GO;" lines of the entry being read
    prev_was_ac = False  # True while inside a run of consecutive AC lines

    for raw in lines:
        line = raw.rstrip("\n")

        if line[:2] == "AC":
            if not prev_was_ac:
                # first AC line of a new entry: file the previous entry away
                _store_uniprot_entry(index, accessions, go_lines)
                accessions = []
                go_lines = []

            accessions.extend(
                acc.strip() for acc in line[2:].split(";") if acc.strip()
            )
            prev_was_ac = True
            continue

        prev_was_ac = False
        if line[:8] == "DR   GO;":
            go_lines.append(line)

    _store_uniprot_entry(index, accessions, go_lines)  # last entry in the input
    return index


with open("uniprot_sprot.dat", "r") as f:
    # iterate over the file object so the flat file is streamed
    references.update(index_uniprot_go(f))


In [30]:
draw_mode = True   # read by the cell that writes genefamilies.csv
genefamilies = {}  # sample id -> {id suffix: second column of the row}

# work through all of the humann2 stuff
for fname in glob.glob("outputs/*/humann2/*_genefamilies.tsv"):
    with open(fname, "r") as f:
        rows = f.read().split("\n")  # split data into separate rows

    tmp_data = {}

    for row in rows[1:]:  # the first row is skipped
        cols = row.split("\t")
        family = cols[0]

        # ignore blank rows, the '|' rows (the subdivision into organisms)
        # and the two catch-all rows
        if family == '' or "|" in family or family in ("UNMAPPED", "UniRef90_unknown"):
            continue

        # keep only what follows the first '_' of the id (up to the next '_')
        tmp_data[family.split("_")[1]] = cols[1]

    # the sample id is the third '_'-separated token of the directory two
    # levels above the table
    genefamilies[fname.split("/")[-3].split("_")[2]] = tmp_data


In [31]:
# Row order for the gene family table.
# Collect every key seen in any sample once and split it on '|'.
keylist = [
    family.split('|')
    for family in {family for rows in genefamilies.values() for family in rows}
]

# Pad all rows with '-' to the length of the longest one (max_padding).
max_padding = max((len(parts) for parts in keylist), default=0)

# Sorting the padded lists directly orders them column by column from the
# left, which matches stable-sorting on every column from the right-most one.
keylist = sorted(
    parts + ["-"] * (max_padding - len(parts))
    for parts in keylist
)
    

In [32]:
# Write the gene family table: key columns, one value column per sample
# ('NaN' where a sample lacks the row) and a final "Function" column holding
# the text stored for the key in `references`.
#
# The rows go through csv.writer so that a field containing a comma or a
# quote (the raw annotation text in the last column may) is quoted instead
# of spilling into extra columns.  Fields without such characters are
# written exactly as before.
with open('genefamilies.csv', 'w', newline='') as f:
    writer = csv.writer(f, lineterminator="\n")

    genefamilies_keys = list(genefamilies.keys())
    writer.writerow([''] * max_padding + genefamilies_keys + ["Function"])

    for kl in keylist:  # work through all of sorted rows
        # drop the '-' padding again to recover the original dict key
        tmp_key = '|'.join(p for p in kl if p != "-")

        tmp_row = list(kl) + [genefamilies[s].get(tmp_key, 'NaN') for s in genefamilies_keys]

        function = references.get(tmp_key)
        if function is None:
            if draw_mode:
                continue  # draw mode only keeps rows that have a reference
            function = "unknown"

        tmp_row.append(function)
        writer.writerow(tmp_row)
        