In [1]:
import pandas as pd
import re
from glob import glob
import requests
from bs4 import BeautifulSoup
import itertools

In [2]:
# Regexes that recover metadata from the on-disk layout:
#   folder name -> BLAST program (blastn / blastp) and the database searched
#   file name   -> cluster identifier (digits, optionally followed by a or b)
blast_folder_regex = re.compile(r"(blast[np])_vs_([a-zA-Z_]+)")
cluster_id_regex = re.compile(r"cluster([0-9]+[ab]?)_")

# Column names applied to the header-less, tab-separated result files.
blast_cols = ["query_id","subject_id","pct_id","ali_len","mism",
              "gap_open","q_start","q_end","s_start","s_end",
              "e_value","bitscore","q_len","s_len","s_gi",
             "s_taxids","s_scinames","s_names","q_cov","s_description"
             ]

# One DataFrame per results file, each tagged with its cluster, tool and db.
blast_hits = []
# sorted(): glob returns paths in filesystem order, which differs between
# machines; a fixed order keeps the row order (and any later tie-breaking
# between equal-bitscore hits) reproducible.
for folder in sorted(glob("blast_160427/blast?_vs_*")):
    folder_match = blast_folder_regex.search(folder)
    if folder_match is None:
        # The glob is looser than the regex (e.g. it accepts any character
        # after "blast"), so fail with the offending path instead of an
        # AttributeError on None.
        raise ValueError("Cannot parse BLAST tool/database from folder: %r" % folder)
    tool_id, db_id = folder_match.groups()

    for blast_filename in sorted(glob(folder + "/*.tsv")):
        cluster_match = cluster_id_regex.search(blast_filename)
        if cluster_match is None:
            raise ValueError("Cannot parse cluster id from file name: %r" % blast_filename)

        try:
            file_hits = pd.read_csv(blast_filename, sep="\t", header=None, names=blast_cols)
        except pd.errors.EmptyDataError:
            # A results file with no rows (e.g. a search that produced no
            # hits) contributes nothing; skip it rather than abort the load.
            continue

        file_hits["cluster"] = cluster_match.group(1)
        file_hits["tool"] = tool_id
        file_hits["db"] = db_id
        blast_hits.append(file_hits)

In [3]:
# Stack the per-file hit tables into a single frame.
# pd.concat replaces the loop of DataFrame.append calls: append was removed
# in pandas 2.0, and appending one frame at a time re-copies the growing
# result on every iteration.
if not blast_hits:
    raise ValueError("No BLAST result tables were loaded; nothing to combine")
# Row labels are kept exactly as loaded (as append did), so the same label
# can occur once per input file.
all_blast_hits = pd.concat(blast_hits)
all_blast_hits.head()

Unnamed: 0,query_id,subject_id,pct_id,ali_len,mism,gap_open,q_start,q_end,s_start,s_end,...,s_len,s_gi,s_taxids,s_scinames,s_names,q_cov,s_description,cluster,tool,db
0,GB3LKKR01DOS1W,gi|935719420|emb|CEAZ01012945.1|,86.4,228,27,4,46,271,20508,20283,...,144341,935719420,749906,,,83,gut metagenome genome assembly P6C7-k21-2014-0...,1073,blastn,env_nt
1,GB3LKKR01DOS1W,gi|935719420|emb|CEAZ01012945.1|,86.4,228,27,4,46,271,107376,107151,...,144341,935719420,749906,,,83,gut metagenome genome assembly P6C7-k21-2014-0...,1073,blastn,env_nt
2,GB3LKKR01DOS1W,gi|935454047|emb|CEAX01018485.1|,86.4,228,27,4,46,271,12216,12441,...,28297,935454047,749906,,,83,gut metagenome genome assembly P6C90-k21-2014-...,1073,blastn,env_nt
3,GB3LKKR01DOS1W,gi|935344953|emb|CEBY01021626.1|,86.4,228,27,4,46,271,74375,74150,...,86997,935344953,749906,,,83,gut metagenome genome assembly P6C0-k21-2014-0...,1073,blastn,env_nt
4,GB3LKKR01DOS1W,gi|935324036|emb|CEBY01034087.1|,86.4,228,27,4,46,271,12333,12558,...,42116,935324036,749906,,,83,gut metagenome genome assembly P6C0-k21-2014-0...,1073,blastn,env_nt


In [4]:
# Thresholds for a hit to count as reliable.
min_q_cov = 80        # q_cov must be strictly greater than this
max_e_value = 0.001   # e_value must be strictly smaller than this

# Columns kept in the final report, in display order.
report_cols = ["cluster","db","tool","query_id","subject_id","pct_id","q_cov","q_len",
               "bitscore","e_value","s_description"]

confident_hits = all_blast_hits[ (all_blast_hits.q_cov > min_q_cov) & (all_blast_hits.e_value < max_e_value) ]

# Best (highest-bitscore) hit for every cluster/database pair.
# The row is picked by position, not by label: `.ix` was removed in
# pandas 1.0, and because row labels repeat across input files a label lookup
# of the idxmax result could return several rows instead of one.
# Resetting the index first makes idxmax return a position while keeping its
# semantics (first occurrence of the maximum, NaN skipped).
best_rows = [
    hits.iloc[hits.bitscore.reset_index(drop=True).idxmax()]
    for _, hits in confident_hits.groupby(["cluster","db"])
]
# reindex (rather than [...]) also yields a well-formed empty frame when no
# hit passed the filter.
reliable_fam_hits = pd.DataFrame(best_rows).reindex(columns=report_cols)

# Order by cluster, then strongest hit first within each cluster. A single
# sort replaces the per-cluster concat, which failed on an empty result.
sorted_fam_hits = reliable_fam_hits.sort_values(by=["cluster","bitscore"], ascending=[True, False])
sorted_fam_hits.head()

Unnamed: 0,cluster,db,tool,query_id,subject_id,pct_id,q_cov,q_len,bitscore,e_value,s_description
33,1073,env_nt,blastn,GB3LKKR01A150I,gi|936108378|emb|CEAB01076172.1|,98.18,99,276,479,1e-131,gut metagenome genome assembly P2E0-k21-2014-0...
3,1073,metahit_cds,blastn,contig07331,GL0080651_MH0011_[Complete]_[mRNA]_locus=scaff...,95.971,96,285,444,1.0099999999999999e-122,GL0080651_MH0011_[Complete]_[mRNA]_locus=scaff...
30,1073,metahit_pep,blastp,contig07331,GL0080651_MH0011_[Complete]_[mRNA]_locus=scaff...,97.727,96,92,159,3.2399999999999996e-48,GL0080651_MH0011_[Complete]_[mRNA]_locus=scaff...
0,113b,metahit_cds,blastn,GB3LKKR01C5IFF,GL0006538_MH0023_[Complete]_[mRNA]_locus=C1324...,98.913,100,92,165,2.2400000000000003e-39,GL0006538_MH0023_[Complete]_[mRNA]_locus=C1324...
5,113b,env_nt,blastn,GB3LKKR02F10X4,gi|557595202|gb|AVOA01007617.1|,87.36,89,98,100,3.0000000000000002e-18,"Gut metagenome Scaffold6187_1, whole genome sh..."


In [5]:
# Persist the ranked best hits; row labels are dropped from the file.
sorted_fam_hits.to_csv(
    path_or_buf="filtered_blast_hits.csv",
    index=False,
)