In [2]:
from glob import glob
import pandas as pd
from IPython.display import display
from tqdm import tqdm
import yaml
import os

In [3]:
def get_config(path="config.yaml"):
    '''Load the notebook configuration from a YAML file.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the YAML file. Defaults to "config.yaml", which is
        resolved against the current working directory.

    Returns
    -------
    object
        Whatever ``yaml.safe_load`` produces for the document. The rest of
        this notebook indexes the result by string key.

    Raises
    ------
    FileNotFoundError
        If ``path`` does not exist.
    '''
    # Pin the encoding so parsing does not depend on the platform's locale
    # default (which is not UTF-8 everywhere).
    with open(path, "r", encoding="utf-8") as con:
        return yaml.safe_load(con)

config = get_config()

# Sorted so the file order is deterministic: merge_blast_files assigns the
# subject and is_vegan labels purely by position in this list.
filenames = sorted(glob(os.path.abspath(config["complete_blast_output"])))
if not filenames:
    # Fail here with a clear message instead of later inside pd.concat.
    raise FileNotFoundError(
        "No BLAST output files match {!r}".format(config["complete_blast_output"])
    )

# The files are read with header=None, so the column names are supplied here.
colnames = ["Read ID", "Scientific Name", "Tax ID", "e-value", "bitscore", "length", "qcov"]
blast_files = [pd.read_csv(file, sep="\t", header=None, names=colnames) for file in filenames]

In [32]:
def merge_blast_files(blast_files, output_file, max_evalue=10e-10):
    '''Merge per-sample BLAST tables into one table of best hits per read.

    For every input table the "Read ID" values are converted to integers and
    only the row with the lowest e-value per read is kept. Each table is then
    labelled by its position in ``blast_files``: positions 0-4 become subjects
    A-E with ``is_vegan=True``, positions 5-9 become subjects A-E with
    ``is_vegan=False``. The labelled tables are concatenated, indexed by
    ``(subject, is_vegan)``, filtered on e-value and written to
    ``output_file`` as a tab-separated file.

    The input dataframes are left untouched, so the function can be called
    repeatedly on the same list (e.g. when the notebook cell is re-run).

    Parameters
    ----------
    blast_files : list of pandas.DataFrame
        At most ten tables, each with a string column "Read ID" and a numeric
        column "e-value".
    output_file : str or path-like
        Destination handed to ``DataFrame.to_csv``.
    max_evalue : float, optional
        Exclusive upper bound on the e-value of the hits that are kept.
        NOTE(review): the default ``10e-10`` equals ``1e-9``; pass ``1e-10``
        explicitly if that was the intended cut-off.

    Returns
    -------
    pandas.DataFrame
        The merged, filtered table that was written to ``output_file``.

    Raises
    ------
    ValueError
        If ``blast_files`` is empty.
    IndexError
        If ``blast_files`` holds more than ten tables.
    '''
    subjects = ["A", "B", "C", "D", "E"]

    if len(blast_files) == 0:
        raise ValueError("blast_files is empty; nothing to merge")

    best_hits = []
    # Wrapping the list itself (not the enumerate object) lets tqdm show a total.
    for i, blast_file in enumerate(tqdm(blast_files)):
        is_vegan = i < len(subjects)
        # Non-vegan tables restart at subject "A"; an eleventh table falls off
        # the end of `subjects` and raises IndexError.
        subject = subjects[i if is_vegan else i - len(subjects)]

        # NOTE: str.strip treats its argument as a *set of characters* to trim
        # from both ends, not as a literal substring. That leaves the digits
        # intact because no digit is part of the set.
        read_ids = blast_file["Read ID"].str.strip("_read_number").astype(int)

        # Index label of the lowest e-value row for every read. Grouping by the
        # parsed ids avoids writing them back into the caller's dataframe.
        best_rows = blast_file["e-value"].groupby(read_ids).idxmin()

        # Explicit copy: the new columns are set on an independent frame, not
        # on a selection of the input.
        best = blast_file.loc[best_rows].copy()
        best["Read ID"] = read_ids.loc[best_rows].values
        best["subject"] = subject
        best["is_vegan"] = is_vegan
        best_hits.append(best)

    merged_blast = pd.concat(best_hits)
    merged_blast = merged_blast.set_index(["subject", "is_vegan"])
    merged_blast = merged_blast[merged_blast["e-value"] < max_evalue]
    merged_blast.to_csv(output_file, sep="\t")

    return merged_blast

# Build the best-hit-per-read table for all samples and write it to the path
# configured under "molded_merged_blast_output".
merged_blast = merge_blast_files(
    blast_files=blast_files,
    output_file=config["molded_merged_blast_output"],
)


5it [00:52, 10.56s/it]
5it [00:56, 11.32s/it]
