## Taxonomy classification

"Taxonomy lineage information was obtained for each blast hit by matching the accession number with the taxonomy database, which was subsequently used to identify reads of virus origin."

In [1]:
# Print the current working directory to confirm the notebook is running from
# the root of the cloned GitHub repo: the cells below open their inputs through
# paths relative to it (e.g. "working_files/...").
!pwd

/Users/lilliang/Documents/Spring_2025/EEOB5460/EEOB5460_final_project2025


In [2]:
import pandas as pd

def _read_taxids(path):
    """Read a headerless, one-column taxid list into a DataFrame.

    Parameters
    ----------
    path : str
        Location of the text file holding one taxid per line.

    Returns
    -------
    pandas.DataFrame
        A single column named "taxid". Values are read as strings
        (dtype=str), matching the string keys that load_names / load_nodes
        build and that get_lineage looks up.
    """
    return pd.read_csv(path, header=None, names=["taxid"], dtype=str)


# read taxid files (only one column)
w1_taxids = _read_taxids("working_files/w1_taxids.txt")
print(w1_taxids.head())

w2_taxids = _read_taxids("working_files/w2_taxids.txt")
print(w2_taxids.head())

    taxid
0  101850
1   10359
2  104664
3   10682
4   10730
   taxid
0  10682
1  10843
2  10845
3  10849
4  10863


In [3]:
# load names.dmp
def load_names(file_path):
    """Build a taxid -> scientific-name lookup from a pipe-delimited names file.

    Each line is split on "|" and every field is stripped of surrounding
    whitespace. Only rows with at least four fields whose fourth field is
    exactly "scientific name" are kept; the first field becomes the key and
    the second the value. If a taxid appears on several such rows, the last
    one read wins.

    Parameters
    ----------
    file_path : str
        Path to the names file (called with names.dmp in this notebook).

    Returns
    -------
    dict
        Maps taxid (str) to scientific name (str).
    """
    scientific_names = {}
    with open(file_path) as handle:
        for record in handle:
            fields = [field.strip() for field in record.split("|")]
            # Skip short rows and every name class other than the scientific name
            if len(fields) < 4 or fields[3] != "scientific name":
                continue
            tax_id, name_txt = fields[0], fields[1]
            scientific_names[tax_id] = name_txt
    return scientific_names

# load nodes.dmp
def load_nodes(file_path):
    """Build a child-taxid -> parent-taxid lookup from a pipe-delimited nodes file.

    Lines are parsed the same way load_names parses its input: split on "|"
    and strip surrounding whitespace from every field. The first field is the
    child taxid and the second its parent. Blank lines and rows that lack a
    child or parent field are skipped instead of raising IndexError, and a
    row that ends right after the second field no longer leaves delimiter
    characters attached to the parent id.

    Parameters
    ----------
    file_path : str
        Path to the nodes file (called with nodes.dmp in this notebook).

    Returns
    -------
    dict
        Maps child taxid (str) to parent taxid (str). If a child appears on
        several rows, the last one read wins.
    """
    child_to_parent = {}
    with open(file_path) as f:
        for line in f:
            fields = [field.strip() for field in line.split("|")]
            # Need a non-empty child and parent; anything else is malformed
            if len(fields) < 2 or not fields[0] or not fields[1]:
                continue
            child_to_parent[fields[0]] = fields[1]
    return child_to_parent

# Build the lookup tables from the taxonomy dump files:
#   names   -> dict returned by load_names (taxid -> scientific name)
#   parents -> dict returned by load_nodes (child taxid -> parent taxid)
names = load_names("working_files/taxonomy_db/names.dmp")
parents = load_nodes("working_files/taxonomy_db/nodes.dmp")

In [4]:
def get_lineage(taxid, names_dict, parents_dict):
    """Return the lineage of a taxid as a " > "-joined string, root-most name first.

    Starting at `taxid`, the function repeatedly looks up the parent in
    `parents_dict` and collects the matching name from `names_dict`. The walk
    stops when it reaches the id "1" (which is never added to the lineage) or
    an id it has already seen, which guards against cycles.

    Parameters
    ----------
    taxid : str
        Taxid to start from.
    names_dict : dict
        taxid -> name. Ids missing from it are reported as "N/A".
    parents_dict : dict
        child taxid -> parent taxid. Ids missing from it are treated as
        children of "1", which ends the walk.

    Returns
    -------
    str
        Names from the top of the lineage down to `taxid`, joined by " > ".
        Empty string when `taxid` is "1".
    """
    seen = set()
    chain = []
    current = taxid

    while True:
        if current in seen or current == "1":
            break
        seen.add(current)
        chain.append(names_dict.get(current, "N/A"))
        current = parents_dict.get(current, "1")

    # Names were collected leaf-first; flip so the top of the lineage comes first
    chain.reverse()
    return " > ".join(chain)

In [5]:
# Resolve every w1 taxid to its lineage string and store it in a new "lineage" column
w1_taxids["lineage"] = w1_taxids["taxid"].map(
    lambda tid: get_lineage(tid, names, parents)
)
w1_taxids.head()

Unnamed: 0,taxid,lineage
0,101850,Viruses > Viruses incertae sedis > Naldavirice...
1,10359,Viruses > Duplodnaviria > Heunggongvirae > Pep...
2,104664,Viruses > Riboviria > Orthornavirae > Kitrinov...
3,10682,Viruses > Duplodnaviria > Heunggongvirae > Uro...
4,10730,Viruses > Duplodnaviria > Heunggongvirae > Uro...


In [6]:
# Resolve every w2 taxid to its lineage string and store it in a new "lineage" column
w2_taxids["lineage"] = w2_taxids["taxid"].map(
    lambda tid: get_lineage(tid, names, parents)
)
w2_taxids.head()

Unnamed: 0,taxid,lineage
0,10682,Viruses > Duplodnaviria > Heunggongvirae > Uro...
1,10843,Viruses > Monodnaviria > Sangervirae > Phixvir...
2,10845,Viruses > Monodnaviria > Sangervirae > Phixvir...
3,10849,Viruses > Monodnaviria > Sangervirae > Phixvir...
4,10863,Viruses > Monodnaviria > Loebvirae > Hofneivir...


In [12]:
# Save the taxid -> lineage tables as tab-separated text, without the row index.
# These two files are read back in by the "load newly made lineage files" cell,
# so this cell has to run before that one.
w1_taxids.to_csv("working_files/w1_taxid_lineage_output.txt", sep = "\t", index = False)
w2_taxids.to_csv("working_files/w2_taxid_lineage_output.txt", sep = "\t", index = False)

In [7]:
# join blastn hits df w/ lineage df by accession
# Step 1: read the blastn hit tables (tab-separated, no header row)
w1_blast_df = pd.read_csv(
    "working_files/wuhan1_blastn_hits.txt", sep="\t", header=None
)
w2_blast_df = pd.read_csv(
    "working_files/wuhan2_blastn_hits.txt", sep="\t", header=None
)

# Step 2: label both tables with the 12 column names of BLAST output format 6
# (https://www.metagenomics.wiki/tools/blast/blastn-output-format-6)
w1_blast_df.columns = w2_blast_df.columns = [
    "qseqid", "sseqid", "pident", "length", "mismatch", "gapopen",
    "qstart", "qend", "sstart", "send", "evalue", "bitscore",
]

print(w1_blast_df.head())
print(w2_blast_df.head())

           qseqid       sseqid   pident  length  mismatch  gapopen  qstart  \
0  SRR10903402.21  NC_045512.2  100.000     113         0        0       1   
1  SRR10903402.21  NC_045512.2  100.000      28         0        0     112   
2  SRR10903402.24  NC_045512.2  100.000     150         0        0       1   
3  SRR10903402.26  NC_045512.2  100.000     151         0        0       1   
4  SRR10903402.26  NC_004718.3   96.026     151         6        0       1   

   qend  sstart   send        evalue  bitscore  
0   113    8072   7960  4.370000e-53     209.0  
1   139    7944   7971  7.800000e-06      52.8  
2   150    1513   1662  1.300000e-73     278.0  
3   151   29748  29598  3.630000e-74     279.0  
4   151   29605  29455  3.690000e-64     246.0  
           qseqid       sseqid  pident  length  mismatch  gapopen  qstart  \
0   SRR10903401.7  NC_045512.2   100.0     104         0        0      21   
1   SRR10903401.7  NC_045512.2   100.0      29         0        0       1   
2  SRR

1.  qseqid   |   query or source (gene) sequence id

2.  sseqid   |   subject or target (reference genome) sequence id

3.  pident   |   percentage of identical positions

4.  length   |   alignment length (sequence overlap)

5.  mismatch  |  number of mismatches

6.  gapopen  |   number of gap openings

7.  qstart   |   start of alignment in query

8.  qend    |    end of alignment in query

9.  sstart   |   start of alignment in subject

10.  send    |    end of alignment in subject

11.  evalue   |   expect value

12.  bitscore  |  bit score

In [9]:
# load accession to taxid map
# The files are tab-separated with four unlabeled columns; only the versioned
# accession and its taxid are kept.
w1_taxid_map = pd.read_csv(
    "working_files/w1_taxid_map.txt",
    sep="\t",
    header=None,
    names=["accession_base", "accession_full", "taxid", "gi"],
)[["accession_full", "taxid"]]

w2_taxid_map = pd.read_csv(
    "working_files/w2_taxid_map.txt",
    sep="\t",
    header=None,
    names=["accession_base", "accession_full", "taxid", "gi"],
)[["accession_full", "taxid"]]

print(w1_taxid_map.head())
print(w2_taxid_map.head())

  accession_full    taxid
0    NC_000902.1    97081
1    NC_000924.1    10730
2    NC_001330.1    10849
3    NC_001416.1  2681611
4    NC_001420.2    10843
  accession_full    taxid
0    NC_001330.1    10849
1    NC_001416.1  2681611
2    NC_001420.2    10843
3    NC_001422.1  2886930
4    NC_001450.1    11665


In [13]:
# load newly made lineage files
# Reads back the tab-separated taxid/lineage tables written by the "saving files"
# cell; the first row of each file is used as the header.
# NOTE: no dtype is given here, so pandas infers the type of the taxid column,
# unlike the dtype = str used when the taxid lists were first read. The later
# merge on "taxid" joins these frames to the accession-to-taxid maps, which are
# also read without an explicit dtype, so the two sides are presumably inferred
# as the same type -- confirm before changing the dtype on either side.
w1_lineage_df = pd.read_csv("working_files/w1_taxid_lineage_output.txt", sep = "\t", header = 0)
w2_lineage_df = pd.read_csv("working_files/w2_taxid_lineage_output.txt", sep = "\t", header = 0)

print(w1_lineage_df.head())
print(w2_lineage_df.head())

    taxid                                            lineage
0  101850  Viruses > Viruses incertae sedis > Naldavirice...
1   10359  Viruses > Duplodnaviria > Heunggongvirae > Pep...
2  104664  Viruses > Riboviria > Orthornavirae > Kitrinov...
3   10682  Viruses > Duplodnaviria > Heunggongvirae > Uro...
4   10730  Viruses > Duplodnaviria > Heunggongvirae > Uro...
   taxid                                            lineage
0  10682  Viruses > Duplodnaviria > Heunggongvirae > Uro...
1  10843  Viruses > Monodnaviria > Sangervirae > Phixvir...
2  10845  Viruses > Monodnaviria > Sangervirae > Phixvir...
3  10849  Viruses > Monodnaviria > Sangervirae > Phixvir...
4  10863  Viruses > Monodnaviria > Loebvirae > Hofneivir...


In [14]:
# merge accession to taxid into blastn
# Left join: every blast hit is kept; hits whose subject id (sseqid) has no
# matching accession_full in the map get missing values in the added columns.
w1_merged_blast = pd.merge(
    w1_blast_df,
    w1_taxid_map,
    how="left",
    left_on="sseqid",
    right_on="accession_full",
)
w2_merged_blast = pd.merge(
    w2_blast_df,
    w2_taxid_map,
    how="left",
    left_on="sseqid",
    right_on="accession_full",
)

print(w1_merged_blast.head())
print(w2_merged_blast.head())

           qseqid       sseqid   pident  length  mismatch  gapopen  qstart  \
0  SRR10903402.21  NC_045512.2  100.000     113         0        0       1   
1  SRR10903402.21  NC_045512.2  100.000      28         0        0     112   
2  SRR10903402.24  NC_045512.2  100.000     150         0        0       1   
3  SRR10903402.26  NC_045512.2  100.000     151         0        0       1   
4  SRR10903402.26  NC_004718.3   96.026     151         6        0       1   

   qend  sstart   send        evalue  bitscore accession_full    taxid  
0   113    8072   7960  4.370000e-53     209.0    NC_045512.2  2697049  
1   139    7944   7971  7.800000e-06      52.8    NC_045512.2  2697049  
2   150    1513   1662  1.300000e-73     278.0    NC_045512.2  2697049  
3   151   29748  29598  3.630000e-74     279.0    NC_045512.2  2697049  
4   151   29605  29455  3.690000e-64     246.0    NC_004718.3   227984  
           qseqid       sseqid  pident  length  mismatch  gapopen  qstart  \
0   SRR10903401.

In [15]:
# merge taxid to lineage into final df
# Left join on the shared "taxid" column: every blast hit is kept and gains a
# "lineage" column, which is missing where the taxid has no lineage entry.
w1_tax_class = pd.merge(w1_merged_blast, w1_lineage_df, how="left", on="taxid")
w2_tax_class = pd.merge(w2_merged_blast, w2_lineage_df, how="left", on="taxid")

print(w1_tax_class.head())
print(w2_tax_class.head())

           qseqid       sseqid   pident  length  mismatch  gapopen  qstart  \
0  SRR10903402.21  NC_045512.2  100.000     113         0        0       1   
1  SRR10903402.21  NC_045512.2  100.000      28         0        0     112   
2  SRR10903402.24  NC_045512.2  100.000     150         0        0       1   
3  SRR10903402.26  NC_045512.2  100.000     151         0        0       1   
4  SRR10903402.26  NC_004718.3   96.026     151         6        0       1   

   qend  sstart   send        evalue  bitscore accession_full    taxid  \
0   113    8072   7960  4.370000e-53     209.0    NC_045512.2  2697049   
1   139    7944   7971  7.800000e-06      52.8    NC_045512.2  2697049   
2   150    1513   1662  1.300000e-73     278.0    NC_045512.2  2697049   
3   151   29748  29598  3.630000e-74     279.0    NC_045512.2  2697049   
4   151   29605  29455  3.690000e-64     246.0    NC_004718.3   227984   

                                             lineage  
0  Viruses > Riboviria > Orthor

In [16]:
# save final files
# Writes the blast hits annotated with taxid and lineage as tab-separated text,
# without the row index.
w1_tax_class.to_csv("working_files/wuhan1_blast_with_lineage.txt", sep = "\t", index = False)
w2_tax_class.to_csv("working_files/wuhan2_blast_with_lineage.txt", sep = "\t", index = False)