## taxonomy classification

"Taxonomy lineage information was obtained for each BLAST hit by matching the accession number with the taxonomy database, which was subsequently used to identify reads of virus origin."

In [1]:
# Print the current working directory. Every path used below is relative
# ("working_files/..."), so the notebook must be run from the cloned GitHub
# repo that contains that folder.
!pwd

/Users/lilliang/Downloads


In [4]:
import pandas as pd

# Each taxid file is read as a single headerless column named "taxid".
# The ids are loaded as strings (dtype=str) rather than parsed as numbers.
taxid_read_opts = {"header": None, "names": ["taxid"], "dtype": str}

w1_taxids = pd.read_csv("working_files/w1_taxids.txt", **taxid_read_opts)
print(w1_taxids.head())

w2_taxids = pd.read_csv("working_files/w2_taxids.txt", **taxid_read_opts)
print(w2_taxids.head())

    taxid
0  104664
1   10682
2   10843
3   10845
4   10849
     taxid
0    10682
1    10843
2    10845
3    10849
4  1159907


In [5]:
# load names.dmp
def load_names(file_path):
    """Build a taxid -> scientific-name lookup from a "|"-delimited names file.

    Every line is split on "|" and each field is stripped of surrounding
    whitespace. A line contributes an entry only when it has at least four
    fields and the fourth field is exactly "scientific name"; all other
    lines are ignored.

    Parameters
    ----------
    file_path : path to the names file (e.g. names.dmp).

    Returns
    -------
    dict mapping the first field (taxid, as a string) to the second field
    (name). If a taxid occurs on several qualifying lines, the last one wins.
    """
    scientific_names = {}
    with open(file_path) as handle:
        for record in handle:
            fields = [field.strip() for field in record.split("|")]
            # Skip short lines and non-"scientific name" entries.
            if len(fields) < 4 or fields[3] != "scientific name":
                continue
            scientific_names[fields[0]] = fields[1]
    return scientific_names

# load nodes.dmp
def load_nodes(file_path):
    """Build a child-taxid -> parent-taxid lookup from a nodes file.

    Each line is stripped and split on the "\\t|\\t" field separator; the
    first field is taken as the child taxid and the second as its parent.

    Lines that do not yield at least two fields (for example a blank line
    at the end of the file) are skipped instead of raising IndexError,
    matching the field-count guard used by load_names.

    Parameters
    ----------
    file_path : path to the nodes file (e.g. nodes.dmp).

    Returns
    -------
    dict mapping child taxid (string) to parent taxid (string). If a child
    occurs on several lines, the last one wins.
    """
    child_to_parent = {}
    with open(file_path) as f:
        for line in f:
            parts = line.strip().split("\t|\t")
            # Blank or malformed lines have no parent field; ignore them.
            if len(parts) < 2:
                continue
            child = parts[0].strip()
            parent = parts[1].strip()
            child_to_parent[child] = parent
    return child_to_parent

# Locations of the taxonomy dump files, relative to the working directory.
names_dmp_path = "working_files/taxonomy_db/names.dmp"
nodes_dmp_path = "working_files/taxonomy_db/nodes.dmp"

# names: taxid -> scientific name; parents: child taxid -> parent taxid.
names = load_names(names_dmp_path)
parents = load_nodes(nodes_dmp_path)

In [6]:
def get_lineage(taxid, names_dict, parents_dict):
    """Return the lineage of ``taxid`` as a " > "-joined string, root first.

    Starting at ``taxid``, the function repeatedly looks up the parent in
    ``parents_dict`` and records each visited node's name from
    ``names_dict``. The walk ends when it reaches taxid "1" (which is not
    itself recorded) or a taxid that was already visited, so a cyclic
    parent table cannot loop forever.

    Parameters
    ----------
    taxid : starting taxid; must use the same key type as the two dicts.
    names_dict : taxid -> name; a missing taxid is reported as "N/A".
    parents_dict : child taxid -> parent taxid; a missing taxid is treated
        as having parent "1", which ends the walk.

    Returns
    -------
    str of names ordered from the topmost ancestor down to ``taxid``;
    the empty string when ``taxid`` is "1".
    """
    seen = set()
    path = []
    current = taxid

    while True:
        if current in seen or current == "1":
            break
        seen.add(current)
        path.append(names_dict.get(current, "N/A"))
        current = parents_dict.get(current, "1")

    # Names were collected leaf -> root; flip to root -> leaf for display.
    path.reverse()
    return " > ".join(path)

In [7]:
# Resolve every taxid in the w1 table to its full lineage string and store
# it in a new "lineage" column.
w1_taxids["lineage"] = w1_taxids["taxid"].map(
    lambda taxid: get_lineage(taxid, names_dict=names, parents_dict=parents)
)
w1_taxids.head()

Unnamed: 0,taxid,lineage
0,104664,Viruses > Riboviria > Orthornavirae > Kitrinov...
1,10682,Viruses > Duplodnaviria > Heunggongvirae > Uro...
2,10843,Viruses > Monodnaviria > Sangervirae > Phixvir...
3,10845,Viruses > Monodnaviria > Sangervirae > Phixvir...
4,10849,Viruses > Monodnaviria > Sangervirae > Phixvir...


In [8]:
# Resolve every taxid in the w2 table to its full lineage string and store
# it in a new "lineage" column.
w2_taxids["lineage"] = w2_taxids["taxid"].map(
    lambda taxid: get_lineage(taxid, names_dict=names, parents_dict=parents)
)
w2_taxids.head()

Unnamed: 0,taxid,lineage
0,10682,Viruses > Duplodnaviria > Heunggongvirae > Uro...
1,10843,Viruses > Monodnaviria > Sangervirae > Phixvir...
2,10845,Viruses > Monodnaviria > Sangervirae > Phixvir...
3,10849,Viruses > Monodnaviria > Sangervirae > Phixvir...
4,1159907,Viruses > Riboviria > Orthornavirae > Pisuviri...


In [9]:
# Write the taxid/lineage tables as tab-separated files without the index.
for lineage_table, lineage_path in (
    (w1_taxids, "working_files/w1_taxid_lineage_output.txt"),
    (w2_taxids, "working_files/w2_taxid_lineage_output.txt"),
):
    lineage_table.to_csv(lineage_path, sep="\t", index=False)

In [10]:
# Goal of the following cells: join the blastn hits with the lineage table
# via the subject accession.

# Column names for BLAST tabular output format 6
# (https://www.metagenomics.wiki/tools/blast/blastn-output-format-6).
blast_outfmt6_columns = [
    "qseqid", "sseqid", "pident", "length", "mismatch", "gapopen",
    "qstart", "qend", "sstart", "send", "evalue", "bitscore",
]

# The hit files are read as headerless, tab-separated tables.
w1_blast_df = pd.read_csv("working_files/wuhan1_blastn_hits.txt", sep="\t", header=None)
w2_blast_df = pd.read_csv("working_files/wuhan2_blastn_hits.txt", sep="\t", header=None)

# Attach the column names; each frame gets its own copy of the list.
w1_blast_df.columns = list(blast_outfmt6_columns)
w2_blast_df.columns = list(blast_outfmt6_columns)

print(w1_blast_df.head())
print(w2_blast_df.head())

     qseqid       sseqid   pident  length  mismatch  gapopen  qstart  qend  \
0  k141_124  NC_026592.1   98.087     732        14        0      30   761   
1  k141_124  NC_026592.1  100.000      37         0        0       1    37   
2   k141_24  NC_074405.1  100.000     136         0        0     338   473   
3  k141_178  NC_026592.1   98.125     320         6        0       1   320   
4  k141_178  NC_026592.1   97.232     289         8        0     315   603   

   sstart  send         evalue  bitscore  
0    4904  5635   0.000000e+00    1275.0  
1    4845  4809   5.020000e-10      69.4  
2     136     1   3.460000e-65     252.0  
3    4845  4526  2.260000e-157     558.0  
4    4296  4584  8.360000e-137     490.0  
    qseqid       sseqid   pident  length  mismatch  gapopen  qstart  qend  \
0  k141_22  NC_050152.1   98.519     540         8        0       1   540   
1  k141_22  NC_050152.1  100.000      83         0        0     536   618   
2  k141_23  NC_054662.1   86.441      59  

1.  qseqid   |   query or source (gene) sequence id

2.  sseqid   |   subject or target (reference genome) sequence id

3.  pident   |   percentage of identical positions

4.  length   |   alignment length (sequence overlap)

5.  mismatch  |  number of mismatches

6.  gapopen  |   number of gap openings

7.  qstart   |   start of alignment in query

8.  qend    |    end of alignment in query

9.  sstart   |   start of alignment in subject

10.  send    |    end of alignment in subject

11.  evalue   |   expect value

12.  bitscore  |  bit score

In [11]:
# Accession -> taxid maps: read as four headerless, tab-separated columns.
taxid_map_columns = ["accession_base", "accession_full", "taxid", "gi"]

w1_taxid_map = pd.read_csv(
    "working_files/w1_taxid_map.txt", sep="\t", header=None, names=taxid_map_columns
)
w2_taxid_map = pd.read_csv(
    "working_files/w2_taxid_map.txt", sep="\t", header=None, names=taxid_map_columns
)

# Only the full accession and its taxid are needed downstream.
w1_taxid_map = w1_taxid_map.loc[:, ["accession_full", "taxid"]]
w2_taxid_map = w2_taxid_map.loc[:, ["accession_full", "taxid"]]

print(w1_taxid_map.head())
print(w2_taxid_map.head())

  accession_full    taxid
0    NC_001330.1    10849
1    NC_001420.2    10843
2    NC_001422.1  2886930
3    NC_001450.1    11665
4    NC_001730.1  2507234
  accession_full    taxid
0    NC_001330.1    10849
1    NC_001420.2    10843
2    NC_001422.1  2886930
3    NC_001450.1    11665
4    NC_001730.1  2507234


In [12]:
# Read back the taxid/lineage tables saved above; they are tab-separated and
# their first row is the header.
lineage_read_opts = {"sep": "\t", "header": 0}

w1_lineage_df = pd.read_csv("working_files/w1_taxid_lineage_output.txt", **lineage_read_opts)
w2_lineage_df = pd.read_csv("working_files/w2_taxid_lineage_output.txt", **lineage_read_opts)

print(w1_lineage_df.head())
print(w2_lineage_df.head())

    taxid                                            lineage
0  104664  Viruses > Riboviria > Orthornavirae > Kitrinov...
1   10682  Viruses > Duplodnaviria > Heunggongvirae > Uro...
2   10843  Viruses > Monodnaviria > Sangervirae > Phixvir...
3   10845  Viruses > Monodnaviria > Sangervirae > Phixvir...
4   10849  Viruses > Monodnaviria > Sangervirae > Phixvir...
     taxid                                            lineage
0    10682  Viruses > Duplodnaviria > Heunggongvirae > Uro...
1    10843  Viruses > Monodnaviria > Sangervirae > Phixvir...
2    10845  Viruses > Monodnaviria > Sangervirae > Phixvir...
3    10849  Viruses > Monodnaviria > Sangervirae > Phixvir...
4  1159907  Viruses > Riboviria > Orthornavirae > Pisuviri...


In [14]:
# Attach a taxid to every blastn hit by matching the subject id (sseqid)
# against the full accession. A left join keeps hits with no match.
# NOTE(review): assumes each accession appears once in the map; duplicates
# would repeat hit rows - confirm.
w1_merged_blast = pd.merge(
    w1_blast_df,
    w1_taxid_map,
    how="left",
    left_on="sseqid",
    right_on="accession_full",
)
w2_merged_blast = pd.merge(
    w2_blast_df,
    w2_taxid_map,
    how="left",
    left_on="sseqid",
    right_on="accession_full",
)

print(w1_merged_blast.head())
print(w2_merged_blast.head())

     qseqid       sseqid   pident  length  mismatch  gapopen  qstart  qend  \
0  k141_124  NC_026592.1   98.087     732        14        0      30   761   
1  k141_124  NC_026592.1  100.000      37         0        0       1    37   
2   k141_24  NC_074405.1  100.000     136         0        0     338   473   
3  k141_178  NC_026592.1   98.125     320         6        0       1   320   
4  k141_178  NC_026592.1   97.232     289         8        0     315   603   

   sstart  send         evalue  bitscore accession_full    taxid  
0    4904  5635   0.000000e+00    1275.0    NC_026592.1  1214459  
1    4845  4809   5.020000e-10      69.4    NC_026592.1  1214459  
2     136     1   3.460000e-65     252.0    NC_074405.1  2786389  
3    4845  4526  2.260000e-157     558.0    NC_026592.1  1214459  
4    4296  4584  8.360000e-137     490.0    NC_026592.1  1214459  
    qseqid       sseqid   pident  length  mismatch  gapopen  qstart  qend  \
0  k141_22  NC_050152.1   98.519     540         8  

In [15]:
# Final table: add the lineage string to every hit by joining on taxid.
# A left join keeps hits whose taxid has no lineage entry.
# NOTE(review): assumes each taxid appears once in the lineage table;
# duplicates would repeat hit rows - confirm.
w1_tax_class = pd.merge(w1_merged_blast, w1_lineage_df, how="left", on="taxid")
w2_tax_class = pd.merge(w2_merged_blast, w2_lineage_df, how="left", on="taxid")

print(w1_tax_class.head())
print(w2_tax_class.head())

     qseqid       sseqid   pident  length  mismatch  gapopen  qstart  qend  \
0  k141_124  NC_026592.1   98.087     732        14        0      30   761   
1  k141_124  NC_026592.1  100.000      37         0        0       1    37   
2   k141_24  NC_074405.1  100.000     136         0        0     338   473   
3  k141_178  NC_026592.1   98.125     320         6        0       1   320   
4  k141_178  NC_026592.1   97.232     289         8        0     315   603   

   sstart  send         evalue  bitscore accession_full    taxid  \
0    4904  5635   0.000000e+00    1275.0    NC_026592.1  1214459   
1    4845  4809   5.020000e-10      69.4    NC_026592.1  1214459   
2     136     1   3.460000e-65     252.0    NC_074405.1  2786389   
3    4845  4526  2.260000e-157     558.0    NC_026592.1  1214459   
4    4296  4584  8.360000e-137     490.0    NC_026592.1  1214459   

                                             lineage  
0  Viruses > Riboviria > Orthornavirae > Kitrinov...  
1  Viruses >

In [16]:
# Write the annotated hit tables as tab-separated files without the index.
for annotated_hits, annotated_path in (
    (w1_tax_class, "working_files/wuhan1_blast_with_lineage.txt"),
    (w2_tax_class, "working_files/wuhan2_blast_with_lineage.txt"),
):
    annotated_hits.to_csv(annotated_path, sep="\t", index=False)