In [4]:
# Set the process name to be human readable in htop
import setproctitle
setproctitle.setproctitle("06_Compile_Viral_Mutations")

# NOTE(review): star import -- `output_dir`, used by later cells, is not defined
# anywhere in this notebook and presumably comes from config; confirm.
from config import *
from helper_functions import get_Fasta, NWSeqAlignment

import pandas as pd
# Allow up to 999 columns when displaying DataFrames
pd.options.display.max_columns = 999

from tqdm import tqdm, tqdm_notebook
# NOTE: this rebinds tqdm_notebook, replacing the name imported on the previous line
from tqdm._tqdm_notebook import tqdm_notebook

# Register tqdm with pandas (presumably to enable progress_apply; no visible
# cell uses it -- TODO confirm whether both calls are still needed)
tqdm.pandas(tqdm_notebook)
tqdm_notebook.pandas()

This notebook compares the sequence of each SARS-CoV-2 protein with its SARS-CoV homolog (where one is available) and compiles a list of all sequence differences (Viral_Muts.txt).

- Inputs:
  - Proteins.txt


- Outputs:
  - Viral_Muts.txt


- Dependencies:
  - Must be run after 03_Generate_Proteins

# Compile Viral Mutations

In [2]:
# Fetch list of SARS1 UniProt IDs / Sequences
# Obtained From https://www.uniprot.org/proteomes/UP000000354
#
# The accession list is kept under its own name so that SARS1_fasta is only
# ever the dict built below (previously the same name was bound to the list
# first and then rebound to the dict).
SARS1_uniprot_ids = [
    "Q7TFA1", "P59595", "P0C6X7", "P0C6U8", "P59636",
    "Q7TFA0", "P59634", "P59633", "Q80H93", "P59635",
    "P59632", "P59594", "P59637", "P59596", "Q7TLC7",
]

# Maps each UniProt accession to whatever get_Fasta returns for it; later
# cells pass these values to NWSeqAlignment as the SARS1 sequences.
SARS1_fasta = {
    uniprot_id: get_Fasta(uniprot_id)
    for uniprot_id in tqdm_notebook(SARS1_uniprot_ids)
}

HBox(children=(IntProgress(value=0, max=15), HTML(value=u'')))




In [3]:
# Read in SARS2 Sequences
# Proteins.txt is a tab-separated table; only its "ID" and "Sequence" columns
# are used here.
proteins_path = "{0}/Proteins.txt".format(output_dir)
proteins_table = pd.read_csv(proteins_path, sep="\t")
id_to_sequence = proteins_table.set_index("ID")["Sequence"].to_dict()

# Keep only the entries whose ID contains the substring "COVID"
COVID_fasta = {}
for protein_id, sequence in id_to_sequence.items():
    if "COVID" in protein_id:
        COVID_fasta[protein_id] = sequence

In [5]:
# Perform Mapping to Determine Best Hits
#
# Every SARS1 entry is aligned against every SARS2 protein, and for each SARS2
# protein the SARS1 entry with the highest score is kept.
#
# Result layout (note that, despite the name, the dict is keyed by the SARS2 ID):
#   sars2covid[COVID_ID] = (SARS1_ID, score, Pident, Coverage1, Coverage2)
#
# The score is computed from the "Pident", "Coverage1" and "Coverage2" entries
# returned by NWSeqAlignment:
#   alpha*Pident*Coverage1 + (1-alpha)*Coverage1
#     + alpha*Pident*Coverage2 + (1-alpha)*Coverage2
# NOTE(review): Coverage1 / Coverage2 presumably refer to the first / second
# sequence passed to NWSeqAlignment -- confirm in helper_functions.

# Weighting constant used in the score; it does not change between iterations,
# so it is set once here instead of inside the inner loop.
alpha = 0.95

sars2covid = dict()
for sars_id, sars_seq in tqdm_notebook(list(SARS1_fasta.items())):
    for covid_id, covid_seq in COVID_fasta.items():
        align = NWSeqAlignment(sars_seq, covid_seq)
        score = alpha*align["Pident"]*align["Coverage1"] + (1-alpha)*align["Coverage1"] + alpha*align["Pident"]*align["Coverage2"] + (1-alpha)*align["Coverage2"]

        # Record the first hit seen for this SARS2 protein, and afterwards only
        # replace it on a strictly better score (so ties keep the earlier hit).
        if covid_id not in sars2covid or score > sars2covid[covid_id][1]:
            sars2covid[covid_id] = (sars_id, score, align["Pident"], align["Coverage1"], align["Coverage2"])

HBox(children=(IntProgress(value=0, max=15), HTML(value=u'')))




In [6]:
# Manually Screen for Bad Alignments / Update Blacklist
# Each entry is a (SARS1_ID, COVID_ID) best-hit pair that should be skipped.
blacklist = [("P0C6U8", "COVID19orf3b"), ("P0C6X7", "COVID19orf8"), ("P0C6U8", "COVID19orf10")]

# Any alignments not filtered by the blacklist will be parsed to identify
# sequence differences.
#
# NOTE: Because SARS1 UniProt Sequences include some proteins (NSP) as one large
#       entry, the positions here aren't a true 1-1 map for SARS1 protein position
#       to SARS2 protein position. But we don't care about having the exactly
#       right SARS1 position at the protein level (combined UniProt level should
#       be fine).

# Column order of the final table
mut_columns = ["COVID_ID", "SARS_ID", "COVID_Pos", "COVID_AA", "SARS_Pos", "SARS_AA"]

# Per-pair mutation tables, concatenated after the loop
mut_frames = []
for covid_id in sars2covid:
    sars_id = sars2covid[covid_id][0]
    if (sars_id, covid_id) in blacklist:
        continue

    # Single-argument print calls behave the same as the original print
    # statements on Python 2 and are also valid on Python 3.
    print("{0} {1}".format(sars_id, covid_id))
    align, posmap = NWSeqAlignment(SARS1_fasta[sars_id], COVID_fasta[covid_id], show_align=True, genPosMap=True)
    print("\n"*10)

    # Keep positions where the SARS2 side has a residue (non-empty Align2_AA)
    # that differs from the aligned SARS1 residue.
    muts = posmap[(posmap["Align2_AA"] != "")&(posmap["Align2_AA"] != posmap["Align1_AA"])].copy()
    # NOTE(review): assumes posmap has exactly four columns ordered as
    # (sequence-1 position, sequence-1 residue, sequence-2 position,
    # sequence-2 residue) -- confirm against NWSeqAlignment.
    muts.columns = ["SARS_Pos", "SARS_AA", "COVID_Pos", "COVID_AA"]
    muts["SARS_ID"] = sars_id
    muts["COVID_ID"] = covid_id
    muts = muts[mut_columns]

    mut_frames.append(muts)

# pd.concat raises ValueError on an empty list (e.g. if every pair is
# blacklisted); fall back to an empty table with the expected columns.
if mut_frames:
    all_muts = pd.concat(mut_frames)
else:
    all_muts = pd.DataFrame(columns=mut_columns)

P0C6U8 COVID19nsp7
Align1:     1 MESLVLGVNE KTHVQLSLPV LQVRDVLVRG FGDSVEEALS EAREHLKNGT CGLVELEKGV 60   
                                                                                     
Align2:     1 ---------- ---------- ---------- ---------- ---------- ---------- 0    

Align1:    61 LPQLEQPYVF IKRSDALSTN HGHKVVELVA EMDGIQYGRS GITLGVLVPH VGETPIAYRN 120  
                                                                                     
Align2:     1 ---------- ---------- ---------- ---------- ---------- ---------- 0    

Align1:   121 VLLRKNGNKG AGGHSYGIDL KSYDLGDELG TDPIEDYEQN WNTKHGSGAL RELTRELNGG 180  
                                                                                     
Align2:     1 ---------- ---------- ---------- ---------- ---------- ---------- 0    

Align1:   181 AVTRYVDNNF CGPDGYPLDC IKDFLARAGK SMCTLSEQLD YIESKRGVYC CRDHEHEIAW 240  
                                                                                     
Align2:     1 ---------- -------

Align1:     1 MESLVLGVNE KTHVQLSLPV LQVRDVLVRG FGDSVEEALS EAREHLKNGT CGLVELEKGV 60   
                                                                                     
Align2:     1 ---------- ---------- ---------- ---------- ---------- ---------- 0    

Align1:    61 LPQLEQPYVF IKRSDALSTN HGHKVVELVA EMDGIQYGRS GITLGVLVPH VGETPIAYRN 120  
                                                                                     
Align2:     1 ---------- ---------- ---------- ---------- ---------- ---------- 0    

Align1:   121 VLLRKNGNKG AGGHSYGIDL KSYDLGDELG TDPIEDYEQN WNTKHGSGAL RELTRELNGG 180  
                                                                                     
Align2:     1 ---------- ---------- ---------- ---------- ---------- ---------- 0    

Align1:   181 AVTRYVDNNF CGPDGYPLDC IKDFLARAGK SMCTLSEQLD YIESKRGVYC CRDHEHEIAW 240  
                                                                                     
Align2:     1 ---------- ---------- ---------- ----

Align1:     1 MESLVLGVNE KTHVQLSLPV LQVRDVLVRG FGDSVEEALS EAREHLKNGT CGLVELEKGV 60   
                                                                                     
Align2:     1 ---------- ---------- ---------- ---------- ---------- ---------- 0    

Align1:    61 LPQLEQPYVF IKRSDALSTN HGHKVVELVA EMDGIQYGRS GITLGVLVPH VGETPIAYRN 120  
                                                                                     
Align2:     1 ---------- ---------- ---------- ---------- ---------- ---------- 0    

Align1:   121 VLLRKNGNKG AGGHSYGIDL KSYDLGDELG TDPIEDYEQN WNTKHGSGAL RELTRELNGG 180  
                                                                                     
Align2:     1 ---------- ---------- ---------- ---------- ---------- ---------- 0    

Align1:   181 AVTRYVDNNF CGPDGYPLDC IKDFLARAGK SMCTLSEQLD YIESKRGVYC CRDHEHEIAW 240  
              |-|||||||| ||||||||+| |||-|||||| +-|||||||| +|++|||||| ||+|||||||      
Align2:     1 AYTRYVDNNF CGPDGYPLEC IKDLLARAGK ASCT

Align1:    1 MSDNGPQSNQ RSAPRITFGG PTDSTDNNQN GGRNGARPKQ RRPQGLPNNT ASWFTALTQH 60  
             ||||||| || |+|||||||| |+|||-+||| |-|+|||-|| |||||||||| ||||||||||     
Align2:    1 MSDNGPQ-NQ RNAPRITFGG PSDSTGSNQN GERSGARSKQ RRPQGLPNNT ASWFTALTQH 59  

Align1:   61 GKEELRFPRG QGVPINTNSG PDDQIGYYRR ATRRVRGGDG KMKELSPRWY FYYLGTGPEA 120 
             |||+|+|||| |||||||||- |||||||||| ||||+||||| |||+|||||| ||||||||||     
Align2:   60 GKEDLKFPRG QGVPINTNSS PDDQIGYYRR ATRRIRGGDG KMKDLSPRWY FYYLGTGPEA 119 

Align1:  121 SLPYGANKEG IVWVATEGAL NTPKDHIGTR NPNNNAATVL QLPQGTTLPK GFYAEGSRGG 180 
             -|||||||+| |+|||||||| |||||||||| ||-||||-|| |||||||||| ||||||||||     
Align2:  120 GLPYGANKDG IIWVATEGAL NTPKDHIGTR NPANNAAIVL QLPQGTTLPK GFYAEGSRGG 179 

Align1:  181 SQASSRSSSR SRGNSRNSTP GSSRGNSPAR MASGGGETAL ALLLLDRLNQ LESKVSGKGQ 240 
             |||||||||| ||-+|||||| |||||-|||| ||--||+-|| |||||||||| ||||+|||||     
Align2:  180 SQASSRSSSR SRNSSRNSTP GSSRGTSPAR MAGNGGDAAL ALLLLDRLNQ LESKM

Align1:     1 MESLVLGVNE KTHVQLSLPV LQVRDVLVRG FGDSVEEALS EAREHLKNGT CGLVELEKGV 60   
                                                                                     
Align2:     1 ---------- ---------- ---------- ---------- ---------- ---------- 0    

Align1:    61 LPQLEQPYVF IKRSDALSTN HGHKVVELVA EMDGIQYGRS GITLGVLVPH VGETPIAYRN 120  
                                                                                     
Align2:     1 ---------- ---------- ---------- ---------- ---------- ---------- 0    

Align1:   121 VLLRKNGNKG AGGHSYGIDL KSYDLGDELG TDPIEDYEQN WNTKHGSGAL RELTRELNGG 180  
                                                                                     
Align2:     1 ---------- ---------- ---------- ---------- ---------- ---------- 0    

Align1:   181 AVTRYVDNNF CGPDGYPLDC IKDFLARAGK SMCTLSEQLD YIESKRGVYC CRDHEHEIAW 240  
                                                                                     
Align2:     1 ---------- ---------- ---------- ----

In [7]:
# Preview the first rows of the compiled mutation table
all_muts.head()

Unnamed: 0,COVID_ID,SARS_ID,COVID_Pos,COVID_AA,SARS_Pos,SARS_AA
3905,COVID19nsp7,P0C6U8,70,K,3906,R
3274,COVID19nsp5,P0C6U8,35,V,3275,T
3285,COVID19nsp5,P0C6U8,46,S,3286,A
3304,COVID19nsp5,P0C6U8,65,N,3305,S
3325,COVID19nsp5,P0C6U8,86,V,3326,L


In [8]:
# Save
# Write the compiled mutation table as a tab-separated file. index=False is the
# documented boolean way to omit the row index (the original passed None and
# relied on it being falsy).
all_muts.to_csv("{0}/Viral_Muts.txt".format(output_dir), sep="\t", index=False)