In [1]:
import csv
import glob


In [146]:
max_depth = 21  # number of lineage columns (indentation levels) kept per taxon

kraken_data = {}


def parse_kreport_rows(rows, depth_limit):
    """Map every taxon in a Kraken report to the value in its third column.

    Each row is tab-delimited and its last field is the taxon name, indented
    by two spaces per level. The indentation is used to rebuild the full
    lineage of the row; the lineage, joined with tab characters and padded
    with '-' up to ``depth_limit`` entries, is the dictionary key.

    Parameters
    ----------
    rows : iterable of str
        Report lines without their trailing newline.
    depth_limit : int
        Number of lineage levels to keep.

    Returns
    -------
    dict
        ``{tab-joined lineage: third column with spaces removed}``.

    Rows without a taxon name (for example the empty string left after the
    final newline) are ignored. Rows indented deeper than ``depth_limit`` are
    printed and skipped, so their value can no longer be stored under the
    lineage of the preceding row.
    """
    lineage = ['-'] * depth_limit
    parsed = {}

    for row in rows:
        fields = row.split("\t")  # the data within rows is tab-delimited

        # splitting the name on double spaces yields one empty string per
        # indentation level, followed by the name itself
        name_parts = fields[-1].split("  ")
        name = name_parts[-1]
        if name == '':
            continue

        depth = len(name_parts) - 1
        if depth >= depth_limit:
            # deeper than the lineage can hold: report it and skip the row
            print(name_parts)
            continue

        lineage[depth] = name
        for deeper in range(depth + 1, depth_limit):
            lineage[deeper] = '-'  # forget the children of the previous branch

        # third column (index 2) of the row, spaces stripped.
        # NOTE(review): in the standard Kraken 2 report layout this column is
        # the number of reads assigned directly to the taxon, not a
        # percentage - confirm this is the intended column.
        parsed['\t'.join(lineage)] = fields[2].replace(" ", "")

    return parsed


# work through all of the kraken stuff
for fname in glob.glob("kraken_outputs/outputs/*/kraken2/*.kreport"):
    with open(fname, 'r') as f:
        rows = f.read().split("\n")  # split data into separate rows

    # the sample id is the third '_'-separated token of the directory that
    # sits two levels above the report file
    sample_id = fname.split("/")[-3].split("_")[2]
    kraken_data[sample_id] = parse_kreport_rows(rows, max_depth)
 

In [147]:
# Row order for the combined table: the union of the lineage keys of every
# sample, each split back into its list of per-level names.
unique_lineages = set()
for per_sample in kraken_data.values():
    unique_lineages.update(per_sample)  # updating from a dict adds its keys

# Lists compare element by element from the left, so one sort orders the rows
# by the first lineage column, then the second, and so on. This is the same
# order a stable sort pass per column (rightmost column first) produces when
# every row has the same number of columns.
keylist = sorted(key.split('\t') for key in unique_lineages)
    

In [148]:
# Write the combined table: one row per lineage in keylist, the lineage levels
# in the first max_depth columns, then one value column per sample.
with open('kraken2.csv', 'w', newline='') as f:
    # csv.writer quotes any field that contains a comma, a quote or a line
    # break, so such a taxon name stays in one column instead of shifting the
    # rest of the row; lineterminator keeps the plain "\n" row endings.
    writer = csv.writer(f, lineterminator="\n")

    kraken_keys = list(kraken_data.keys())

    # header: blank cells above the lineage columns, then the sample ids
    writer.writerow([''] * max_depth + kraken_keys)

    for kl in keylist:  # work through all of sorted rows
        tmp_key = '\t'.join(kl)  # same key format as the per-sample dicts

        # samples that never reported this lineage get 'NaN'
        writer.writerow(list(kl) + [kraken_data[s].get(tmp_key, 'NaN') for s in kraken_keys])
        

In [158]:
groot_data = {}

# collect one {first column: remaining columns} table per sample
for report_path in glob.glob("kraken_outputs/outputs/*/groot/*/*.txt"):
    with open(report_path, 'r') as handle:
        # rows are newline-separated, fields within a row are tab-separated
        rows = [line.split("\t") for line in handle.read().split("\n")]

    # rows whose first field is empty (e.g. after the final newline) are dropped
    per_entry = {fields[0]: fields[1:] for fields in rows if fields[0] != ''}

    # the sample id is the third '_'-separated token of the directory that
    # sits three levels above the report file
    sample_id = report_path.split("/")[-4].split("_")[2]
    groot_data[sample_id] = per_entry
        

In [168]:
# Row order for the combined groot table: every first-column identifier seen
# in any sample, de-duplicated and sorted.
keylist = sorted({row_id for per_sample in groot_data.values() for row_id in per_sample})


In [169]:
# column-name suffixes, one per value column taken from each sample's rows
affixes = ["_read_count", "_gene_length", "_coverage"]

# Write the combined groot table: one row per identifier in keylist and
# len(affixes) value columns per sample.
with open('groot.csv', 'w', newline='') as f:
    # csv.writer quotes any field that contains a comma, a quote or a line
    # break, so such an identifier cannot shift the columns of its row;
    # lineterminator keeps the plain "\n" row endings.
    writer = csv.writer(f, lineterminator="\n")

    groot_keys = list(groot_data.keys())

    # header: blank corner cell, then <sample><affix> for every combination
    writer.writerow([''] + [x + a for x in groot_keys for a in affixes])

    for kl in keylist:  # work through all of sorted rows
        tmp_row = [kl]

        for s in groot_keys:
            # an identifier missing from this sample yields no values at all
            values = groot_data[s].get(kl, [])[:len(affixes)]
            # absent identifiers and rows with too few columns are filled
            # with 'NaN' so every sample contributes exactly len(affixes) cells
            tmp_row.extend(values + ['NaN'] * (len(affixes) - len(values)))

        writer.writerow(tmp_row)
        

In [187]:
metaphlan_data = {}

# collect one {first column: third column} table per sample from the
# metaphlan output files
for profile_path in glob.glob("humann_outputs/outputs/*/metaphlan/*.txt"):
    with open(profile_path, 'r') as handle:
        lines = handle.read().split("\n")  # split data into separate rows

    per_clade = {}
    # the first four lines are skipped - presumably header lines; confirm
    # against the actual files
    for line in lines[4:]:
        fields = line.split("\t")
        if fields[0] == '':
            continue  # nothing to key the row on (e.g. the trailing empty line)
        per_clade[fields[0]] = fields[2]

    # the sample id is the third '_'-separated token of the directory that
    # sits two levels above the profile file
    sample_id = profile_path.split("/")[-3].split("_")[2]
    metaphlan_data[sample_id] = per_clade
        

In [191]:
# Row order for the combined metaphlan table: the union of the keys of every
# sample, each split on '|' into its list of levels.
unique_clades = set()
for per_sample in metaphlan_data.values():
    unique_clades.update(per_sample)  # updating from a dict adds its keys

keylist = [clade.split('|') for clade in unique_clades]

# the longest lineage decides how many name columns the table gets
# (0 when there is no data at all)
max_padding = max((len(x) for x in keylist), default=0)

# pad shorter lineages with '-' so that every row has max_padding columns
for x in keylist:
    x.extend(["-"] * (max_padding - len(x)))

# Lists compare element by element from the left, so one sort orders the
# equal-length rows by the first column, then the second, and so on - the same
# order as a stable sort pass per column starting from the rightmost one.
keylist.sort()
    

In [198]:
# Write the combined metaphlan table: one row per padded lineage in keylist,
# the lineage levels in the first max_padding columns, then one value column
# per sample.
with open('metaphlan.csv', 'w', newline='') as f:
    # csv.writer quotes any field that contains a comma, a quote or a line
    # break, so such a name cannot shift the columns of its row;
    # lineterminator keeps the plain "\n" row endings.
    writer = csv.writer(f, lineterminator="\n")

    metaphlan_keys = list(metaphlan_data.keys())

    # header: blank cells above the lineage columns, then the sample ids
    writer.writerow([''] * max_padding + metaphlan_keys)

    for kl in keylist:  # work through all of sorted rows
        # dropping the '-' padding restores the '|'-joined key used in the
        # per-sample dicts
        tmp_key = '|'.join([p for p in kl if p != "-"])

        # samples that never reported this lineage get 'NaN'
        writer.writerow(list(kl) + [metaphlan_data[s].get(tmp_key, 'NaN') for s in metaphlan_keys])
        

In [12]:
# list every humann2 .txt output found under the per-sample output directories
humann2_paths = glob.glob("humann_outputs/outputs/*/humann2/*.txt")
print(humann2_paths)


['humann_outputs/outputs/output_dir_104_202005241355/humann2/mpa2_table-v2.7.7.txt', 'humann_outputs/outputs/output_dir_14_202005241353/humann2/mpa2_table-v2.7.7.txt', 'humann_outputs/outputs/output_dir_97_202005250336/humann2/mpa2_table-v2.7.7.txt', 'humann_outputs/outputs/output_dir_2_202005241356/humann2/mpa2_table-v2.7.7.txt', 'humann_outputs/outputs/output_dir_15_202005250037/humann2/mpa2_table-v2.7.7.txt', 'humann_outputs/outputs/output_dir_102_202005241234/humann2/mpa2_table-v2.7.7.txt', 'humann_outputs/outputs/output_dir_13_202005241353/humann2/mpa2_table-v2.7.7.txt', 'humann_outputs/outputs/output_dir_98_202005241519/humann2/mpa2_table-v2.7.7.txt', 'humann_outputs/outputs/output_dir_4_202005250237/humann2/mpa2_table-v2.7.7.txt', 'humann_outputs/outputs/output_dir_3_202005241358/humann2/mpa2_table-v2.7.7.txt', 'humann_outputs/outputs/output_dir_101_202005241348/humann2/mpa2_table-v2.7.7.txt', 'humann_outputs/outputs/output_dir_99_202005241519/humann2/mpa2_table-v2.7.7.txt', 'hu