In [232]:
import pandas as pd
from Bio import SeqIO

In [306]:
# Build `df`: one row per BLAST hit, combined across all assembly tools.
# Each tool's results are read from "<tool>_blast_results_wspecie_names.csv"
# (';'-separated, path relative to the working directory).
tool_list = ['VORF', 'IVA']
file_ext = '_blast_results_wspecie_names.csv'


def _strip_if_str(value):
    """Return ``value`` without surrounding whitespace if it is a string, else unchanged."""
    return value.strip() if isinstance(value, str) else value


blast_tables = []

for tool in tool_list:
    tb = pd.read_csv(tool + file_ext, sep=";")
    blast_tables.append(tb)

# NOTE: the per-tool row indices are kept as read, so index labels repeat
# across tools in the combined frame.
df = pd.concat(blast_tables) \
    .rename(columns={"V1": "contig", "V8": "Acc. Length", "sample": "sampleID"})

# Reference to the renamed frame before the remaining V<n> columns are dropped.
old_df = df

# Raw string: '\d' inside a plain literal is an invalid escape sequence.
df = df.drop(list(df.filter(regex=r'V\d+')), axis=1)

# Split the "specie" text of the form "<gene> [<species>]..." into two columns.
gene_species = df.specie.str.extract(r'(?P<gene>.*) \[(?P<species>.*)\].*')

# Column-wise Series.map replaces the deprecated DataFrame.applymap and
# strips whitespace from every string cell.
df = pd.concat([df, gene_species], axis=1) \
    .drop(['specie'], axis=1) \
    .apply(lambda column: column.map(_strip_if_str)) \
    .replace('C167', 'CV167')


# Expected virus per sample, derived from the shape of the sample ID.
df['virus'] = None
df.loc[df.sampleID.str.contains("ERR"), "virus"] = "Human immunodeficiency virus 1"
df.loc[df.sampleID.str.isnumeric(), "virus"] = "Lassa mammarenavirus"
df.loc[df.sampleID.str.contains("CV"), "virus"] = "Severe acute respiratory syndrome coronavirus 2"

# Remove the literal "contig." prefix and its zero padding from string contig
# names. str.lstrip("contig.0") would strip a *set of characters* instead of a
# prefix (e.g. turning a bare "0" into ""). Non-string entries come back as
# NaN from the .str accessor and are restored by fillna.
df.contig = df.contig.str.replace(r'^contig\.0*', '', regex=True).fillna(df.contig)

df.columns = df.columns.str.lstrip()



Unnamed: 0,contig,Acc. Length,sampleID,tool,gene,species,virus
70,2,39,ERR3953710,VORF,"gag protein, partial",Human immunodeficiency virus 1,Human immunodeficiency virus 1
0,1,6766,934,IVA,polymerase,Lassa mammarenavirus,Lassa mammarenavirus
1,2,1782,934,IVA,nucleoprotein,Lassa mammarenavirus,Lassa mammarenavirus
2,1,4993,1880,IVA,polymerase,Lassa mammarenavirus,Lassa mammarenavirus
3,2,2238,1880,IVA,polymerase,Lassa mammarenavirus,Lassa mammarenavirus
4,3,1166,1880,IVA,"glycoprotein precursor, partial",Lassa mammarenavirus,Lassa mammarenavirus
5,4,1177,1880,IVA,nucleoprotein,Lassa mammarenavirus,Lassa mammarenavirus
6,5,918,1880,IVA,polymerase,Lassa mammarenavirus,Lassa mammarenavirus
7,6,750,1880,IVA,"glycoprotein precursor, partial",Lassa mammarenavirus,Lassa mammarenavirus
8,7,714,1880,IVA,nucleoprotein,Lassa mammarenavirus,Lassa mammarenavirus


In [270]:
from Bio import SeqIO

# Read every assembly FASTA (one per tool and sample), record each contig's
# length in `contigs`, and copy that length onto the matching BLAST rows of `df`.
df["Length"] = None
sample_IDs = df.sampleID.unique()


def _short_contig_name(name):
    """Drop a leading "contig." prefix and its zero padding from ``name``.

    Names without that prefix are returned unchanged. (str.lstrip('contig.0')
    strips a set of characters rather than a prefix, so it would also eat
    leading letters/zeros of unrelated names.)
    """
    prefix = "contig."
    if name.startswith(prefix):
        return name[len(prefix):].lstrip("0")
    return name


# Collected as plain rows and turned into a DataFrame once, instead of
# enlarging a DataFrame one row at a time.
contig_records = []

for tool in tool_list:
    for s in sample_IDs:
        sid = (s + "_VORF") if tool == "VORF" else (s + "_contigs")
        filename = 'Data/%s_assemblies/%s.fasta' % (tool, sid)
        # Context manager so the FASTA handle is closed after parsing.
        with open(filename) as handle:
            for fasta in SeqIO.parse(handle, 'fasta'):
                name = fasta.id
                length = len(fasta.seq)
                contig_records.append([s, tool, name, length])

                # VORF rows are matched by integer contig id, the other tool
                # by the shortened contig name.
                contig_key = int(name) if tool == "VORF" else _short_contig_name(name)
                # The tool condition keeps a contig of one tool from writing
                # its length onto a same-named contig of the other tool.
                row_mask = (df.contig == contig_key) & (df.sampleID == s) & (df.tool == tool)
                df.loc[row_mask, ["Length"]] = length

# 'length' is cast to float64 so aggregates and the written CSV keep the float
# formatting seen in the recorded outputs (e.g. sums such as 183514.0).
contigs = pd.DataFrame(
    contig_records, columns=['sampleID', 'tool', 'contig', 'length']
).astype({'length': 'float64'})

# Short virus label per sample, derived from the shape of the sample ID.
contigs['virus'] = None
contigs.loc[contigs.sampleID.str.contains("ERR"), "virus"] = "HIV"
contigs.loc[contigs.sampleID.str.isnumeric(), "virus"] = "Lassa"
contigs.loc[contigs.sampleID.str.contains("CV"), "virus"] = "SARS-CoV-2"

print(df.head())



df.to_csv("blast_dataset.csv", sep=";")
contigs.to_csv("contigs.csv", sep=";")

  contig  Acc. Length    sampleID  tool                    gene  \
0      0          491  ERR3953716  VORF    gag protein, partial   
1      4           32  ERR3953716  VORF    gag protein, partial   
2      2          569        1880  VORF           nucleoprotein   
3      5          490        1880  VORF  glycoprotein precursor   
4     14         2032         540  VORF               L protein   

                          species Length  
0  Human immunodeficiency virus 1    491  
1  Human immunodeficiency virus 1     37  
2            Lassa mammarenavirus    569  
3            Lassa mammarenavirus    490  
4            Lassa mammarenavirus   2048  


In [293]:
from IPython.display import display
# Contig statistics per tool: number of contigs, mean contig length and total
# contig length -- first over all samples, then broken down by virus.
length_summary = {'contig': ['count'], 'length': ['mean', 'sum']}

for group_keys in (['tool'], ['tool', 'virus']):
    display(contigs.groupby(group_keys).agg(length_summary))




# average # of contigs identified by each tool in each sample

# average length of contigs identified by each tool

# average length of contigs identified by each tool in each sample

# total length of all segments identified by each tool

# correlation between # of reads and segments found

# number of base pairs aligned to parent genome

# base pairs aligned to bacterial genomes

# base pairs aligned to nothing

# proteins identified by each tool in each sample




Unnamed: 0_level_0,contig,length,length
Unnamed: 0_level_1,count,mean,sum
tool,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
IVA,97,1891.896907,183514.0
VORF,223,77.394619,17259.0


Unnamed: 0_level_0,Unnamed: 1_level_0,contig,length,length
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,sum
tool,virus,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
IVA,HIV,11,1152.272727,12675.0
IVA,Lassa,13,2640.923077,34332.0
IVA,SARS-CoV-2,73,1869.958904,136507.0
VORF,HIV,25,84.4,2110.0
VORF,Lassa,80,110.325,8826.0
VORF,SARS-CoV-2,118,53.584746,6323.0


In [327]:
# BLAST hits whose species equals the virus expected for the sample:
# one count table per tool, then the number of such hits per tool.
species_matches = df[df.virus == df.species]

for assembler in ("VORF", "IVA"):
    display(
        species_matches[species_matches.tool == assembler]
        .groupby(['virus', 'species'])
        .agg({'species': 'count'})
    )

display(species_matches.groupby(['tool']).agg({'tool': 'count'}))




Unnamed: 0_level_0,Unnamed: 1_level_0,species
virus,species,Unnamed: 2_level_1
Human immunodeficiency virus 1,Human immunodeficiency virus 1,8
Lassa mammarenavirus,Lassa mammarenavirus,7


Unnamed: 0_level_0,Unnamed: 1_level_0,species
virus,species,Unnamed: 2_level_1
Human immunodeficiency virus 1,Human immunodeficiency virus 1,11
Lassa mammarenavirus,Lassa mammarenavirus,13
Severe acute respiratory syndrome coronavirus 2,Severe acute respiratory syndrome coronavirus 2,36


Unnamed: 0_level_0,tool
tool,Unnamed: 1_level_1
IVA,60
VORF,15


In [329]:
# VORF hits whose BLAST species differs from the virus expected for the sample,
# counted per expected virus. The original note calls these "contigs aligned to
# bacterial genomes"; the filter itself only tests for a mismatch (rows with a
# missing species also count) and does not check that the hit is bacterial.
vorf_mismatches = df[(df.tool == "VORF") & (df.virus != df.species)]

display(vorf_mismatches.groupby(['virus']).agg({'virus': 'count'}))

Unnamed: 0_level_0,virus
virus,Unnamed: 1_level_1
Lassa mammarenavirus,1
Severe acute respiratory syndrome coronavirus 2,55
