In [2]:
# Set the process name to be human readable in htop
import setproctitle
setproctitle.setproctitle("X")

import requests
import pandas as pd
# Raise pandas' column display limit to 999 so wide DataFrames are not truncated
pd.options.display.max_columns = 999

import numpy as np
# `helper` is bound to the short alias `my`; later cells call my.get_Fasta and
# my.NWSeqAlignment (presumably a project-local module -- confirm).
import helper as my

import glob
import os
import sys

from tqdm import tqdm, tqdm_notebook
# NOTE: this rebinds the name `tqdm_notebook` imported on the previous line to
# the object exported by the private `tqdm._tqdm_notebook` module.
from tqdm._tqdm_notebook import tqdm_notebook

# Register tqdm progress-bar support with pandas.
# NOTE(review): the two calls below look redundant -- confirm which one the
# installed tqdm version actually requires.
tqdm.pandas(tqdm_notebook)
tqdm_notebook.pandas()


# IPython magic: render matplotlib figures inline in the notebook
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

import seaborn as sns

# NOTE(review): presumably project-local utilities; not used by the cells
# visible in this part of the notebook.
from mjm_tools import zip_res_range, unzip_res_range

from collections import defaultdict

# Parameters for file locations

In [3]:
# Base directory for whole project.
# Defaults to the original author's checkout. Set the SARS2_BASE_DIR
# environment variable to run the notebook from another location without
# editing this cell (`os` is imported in the setup cell at the top).
base_dir = os.environ.get("SARS2_BASE_DIR", "/home/sdw95/3D_SARS2")

# Compile Viral Mutations

In [4]:
# SARS1 reference proteome, keyed by UniProt accession.
# Accessions obtained from https://www.uniprot.org/proteomes/UP000000354
# Each value is whatever my.get_Fasta returns for that accession
# (presumably the protein sequence -- confirm against the helper).
SARS1_fasta = {
    accession: my.get_Fasta(accession)
    for accession in tqdm_notebook([
        "Q7TFA1", "P59595", "P0C6X7", "P0C6U8", "P59636",
        "Q7TFA0", "P59634", "P59633", "Q80H93", "P59635",
        "P59632", "P59594", "P59637", "P59596", "Q7TLC7",
    ])
}

HBox(children=(IntProgress(value=0, max=15), HTML(value=u'')))




In [5]:
# Read in SARS2 Sequences
# Proteins.txt is read as a tab-separated table; only its "ID" and "Sequence"
# columns are used to build an ID -> sequence dictionary.
sequence_by_id = (
    pd.read_csv("{0}/Data/Proteins.txt".format(base_dir), sep="\t")
    .set_index("ID")["Sequence"]
    .to_dict()
)

# Keep only the entries whose ID contains "COVID".
# `.items()` is used instead of the Python-2-only `.iteritems()` so the cell
# runs unchanged on both Python 2 and Python 3.
COVID_fasta = {
    protein_id: sequence
    for protein_id, sequence in sequence_by_id.items()
    if "COVID" in protein_id
}

In [7]:
# Perform Mapping to Determine Best Hits
#
# Every SARS1 sequence is aligned against every SARS2 sequence with
# my.NWSeqAlignment. For each SARS2 protein we keep the SARS1 entry with the
# highest score, so `sars2covid` is keyed by SARS2 protein ID and stores
#   (SARS1 ID, score, Pident, Coverage1, Coverage2).
#
# The score weights identity-times-coverage by `alpha` and plain coverage by
# (1 - alpha), summed over both coverage values reported by the alignment.
alpha = 0.95  # loop-invariant weight, set once instead of on every iteration

sars2covid = dict()
# `.items()` replaces the Python-2-only `.iteritems()`; wrapping in list()
# gives the progress bar a known length.
for sars_id, sars_seq in tqdm_notebook(list(SARS1_fasta.items())):
    for covid_id, covid_seq in COVID_fasta.items():
        align = my.NWSeqAlignment(sars_seq, covid_seq)
        # Terms are kept in their original order so floating-point results
        # (and therefore tie-breaking) are unchanged.
        score = (alpha*align["Pident"]*align["Coverage1"] + (1-alpha)*align["Coverage1"]
                 + alpha*align["Pident"]*align["Coverage2"] + (1-alpha)*align["Coverage2"])
        # Record the first hit seen, then replace it only on a strictly
        # better score (ties keep the earlier hit).
        if covid_id not in sars2covid or score > sars2covid[covid_id][1]:
            sars2covid[covid_id] = (sars_id, score, align["Pident"], align["Coverage1"], align["Coverage2"])

HBox(children=(IntProgress(value=0, max=15), HTML(value=u'')))




In [10]:
# Manually Screen for Bad Alignments / Update Blacklist
# Each entry is a (SARS1 UniProt ID, SARS2 protein ID) pair that is skipped
# when collecting sequence differences below.
blacklist = [("P0C6U8", "COVID19orf3b"), ("P0C6X7", "COVID19orf8"), ("P0C6U8", "COVID19orf10")]

# Any alignments not filtered by the blacklist will be parsed to identify
# sequence differences.
#
# NOTE: Because SARS1 UniProt Sequences include some proteins (NSP) as one large
#       entry, the positions here aren't a true 1-1 map for SARS1 protein position
#       to SARS2 protein position. But we don't care about having the exactly
#       right SARS1 position at the protein level (combined UniProt level should
#       be fine).
mut_frames = []  # one DataFrame of differences per retained alignment
for covid_id, best_hit in sars2covid.items():
    sars_id = best_hit[0]
    if (sars_id, covid_id) in blacklist:
        continue

    # Parenthesised single-argument prints produce the same text on Python 2
    # and Python 3 (the original print statements are Python-2-only).
    print("{0} {1}".format(sars_id, covid_id))
    # Re-run the alignment with the alignment display on (for manual
    # screening) and request the position map alongside the summary.
    align, posmap = my.NWSeqAlignment(SARS1_fasta[sars_id], COVID_fasta[covid_id], show_align=True, genPosMap=True)
    print("\n" * 10)

    # Keep rows where the second (SARS2) sequence has a residue that differs
    # from the aligned first (SARS1) residue.
    differs = (posmap["Align2_AA"] != "") & (posmap["Align2_AA"] != posmap["Align1_AA"])
    muts = posmap[differs].copy()
    # Positional rename -- assumes posmap has exactly these four columns in
    # this order (TODO confirm against my.NWSeqAlignment).
    muts.columns = ["SARS_Pos", "SARS_AA", "COVID_Pos", "COVID_AA"]
    muts["SARS_ID"] = sars_id
    muts["COVID_ID"] = covid_id
    muts = muts[["COVID_ID", "SARS_ID", "COVID_Pos", "COVID_AA", "SARS_Pos", "SARS_AA"]]

    mut_frames.append(muts)

# Combine the per-protein tables; `all_muts` is only ever bound to a DataFrame.
all_muts = pd.concat(mut_frames)

P59632 COVID19orf3a
Align1:    1 MDLFMRFFTL GSITAQPVKI DNASPASTVH ATATIPLQAS LPFGWLVIGV AFLAVFQSAT 60  
             ||||||-||+ |++|-+--+| -+|+|+--|- ||||||+||| ||||||++|| |-|||||||+     
Align2:    1 MDLFMRIFTI GTVTLKQGEI KDATPSDFVR ATATIPIQAS LPFGWLIVGV ALLAVFQSAS 60  

Align1:   61 KIIALNKRWQ LALYKGFQFI CNLLLLFVTI YSHLLLVAAG MEAQFLYLYA LIYFLQCINA 120 
             |||-|-|||| |||-||--|+ |||||||||+ |||||||||| +||-|||||| |+||||-||-     
Align2:   61 KIITLKKRWQ LALSKGVHFV CNLLLLFVTV YSHLLLVAAG LEAPFLYLYA LVYFLQSINF 120 

Align1:  121 CRIIMRCWLC WKCKSKNPLL YDANYFVCWH THNYDYCIPY NSVTDTIVVT EGDGISTPKL 180 
             -|||||-||| |||+|||||| ||||||+||| |+-||||||| ||||-+||+| -|||-++|--     
Align2:  121 VRIIMRLWLC WKCRSKNPLL YDANYFLCWH TNCYDYCIPY NSVTSSIVIT SGDGTTSPIS 180 

Align1:  181 KEDYQIGGYS EDRHSGVKDY VVVHGYFTEV YYQLESTQIT TDTGIENATF FIFNKLVKDP 240 
             +-|||||||+ |---|||||- ||+|-|||-- ||||-|||++ ||||+|+-|| ||+||+|-+|     
Align2:  181 EHDYQIGGYT EKWESGVKDC VVLHSYFTSD YYQLYST

Align1:    1 MSDNGPQSNQ RSAPRITFGG PTDSTDNNQN GGRNGARPKQ RRPQGLPNNT ASWFTALTQH 60  
             ||||||| || |+|||||||| |+|||-+||| |-|+|||-|| |||||||||| ||||||||||     
Align2:    1 MSDNGPQ-NQ RNAPRITFGG PSDSTGSNQN GERSGARSKQ RRPQGLPNNT ASWFTALTQH 59  

Align1:   61 GKEELRFPRG QGVPINTNSG PDDQIGYYRR ATRRVRGGDG KMKELSPRWY FYYLGTGPEA 120 
             |||+|+|||| |||||||||- |||||||||| ||||+||||| |||+|||||| ||||||||||     
Align2:   60 GKEDLKFPRG QGVPINTNSS PDDQIGYYRR ATRRIRGGDG KMKDLSPRWY FYYLGTGPEA 119 

Align1:  121 SLPYGANKEG IVWVATEGAL NTPKDHIGTR NPNNNAATVL QLPQGTTLPK GFYAEGSRGG 180 
             -|||||||+| |+|||||||| |||||||||| ||-||||-|| |||||||||| ||||||||||     
Align2:  120 GLPYGANKDG IIWVATEGAL NTPKDHIGTR NPANNAAIVL QLPQGTTLPK GFYAEGSRGG 179 

Align1:  181 SQASSRSSSR SRGNSRNSTP GSSRGNSPAR MASGGGETAL ALLLLDRLNQ LESKVSGKGQ 240 
             |||||||||| ||-+|||||| |||||-|||| ||--||+-|| |||||||||| ||||+|||||     
Align2:  180 SQASSRSSSR SRNSSRNSTP GSSRGTSPAR MAGNGGDAAL ALLLLDRLNQ LESKM

Align1:     1 MESLVLGVNE KTHVQLSLPV LQVRDVLVRG FGDSVEEALS EAREHLKNGT CGLVELEKGV 60   
                                                                                     
Align2:     1 ---------- ---------- ---------- ---------- ---------- ---------- 0    

Align1:    61 LPQLEQPYVF IKRSDALSTN HGHKVVELVA EMDGIQYGRS GITLGVLVPH VGETPIAYRN 120  
                                                                                     
Align2:     1 ---------- ---------- ---------- ---------- ---------- ---------- 0    

Align1:   121 VLLRKNGNKG AGGHSYGIDL KSYDLGDELG TDPIEDYEQN WNTKHGSGAL RELTRELNGG 180  
                                                                                     
Align2:     1 ---------- ---------- ---------- ---------- ---------- ---------- 0    

Align1:   181 AVTRYVDNNF CGPDGYPLDC IKDFLARAGK SMCTLSEQLD YIESKRGVYC CRDHEHEIAW 240  
                                                                                     
Align2:     1 ---------- ---------- ---------- ----

Align1:     1 MESLVLGVNE KTHVQLSLPV LQVRDVLVRG FGDSVEEALS EAREHLKNGT CGLVELEKGV 60   
                                                                                     
Align2:     1 ---------- ---------- ---------- ---------- ---------- ---------- 0    

Align1:    61 LPQLEQPYVF IKRSDALSTN HGHKVVELVA EMDGIQYGRS GITLGVLVPH VGETPIAYRN 120  
                                                                                     
Align2:     1 ---------- ---------- ---------- ---------- ---------- ---------- 0    

Align1:   121 VLLRKNGNKG AGGHSYGIDL KSYDLGDELG TDPIEDYEQN WNTKHGSGAL RELTRELNGG 180  
                                                                                     
Align2:     1 ---------- ---------- ---------- ---------- ---------- ---------- 0    

Align1:   181 AVTRYVDNNF CGPDGYPLDC IKDFLARAGK SMCTLSEQLD YIESKRGVYC CRDHEHEIAW 240  
                                                                                     
Align2:     1 ---------- ---------- ---------- ----

Align1:     1 MESLVLGVNE KTHVQLSLPV LQVRDVLVRG FGDSVEEALS EAREHLKNGT CGLVELEKGV 60   
                                                                                     
Align2:     1 ---------- ---------- ---------- ---------- ---------- ---------- 0    

Align1:    61 LPQLEQPYVF IKRSDALSTN HGHKVVELVA EMDGIQYGRS GITLGVLVPH VGETPIAYRN 120  
                                                                                     
Align2:     1 ---------- ---------- ---------- ---------- ---------- ---------- 0    

Align1:   121 VLLRKNGNKG AGGHSYGIDL KSYDLGDELG TDPIEDYEQN WNTKHGSGAL RELTRELNGG 180  
                                                                                     
Align2:     1 ---------- ---------- ---------- ---------- ---------- ---------- 0    

Align1:   181 AVTRYVDNNF CGPDGYPLDC IKDFLARAGK SMCTLSEQLD YIESKRGVYC CRDHEHEIAW 240  
                                                                                     
Align2:     1 ---------- ---------- ---------- ----

Align1:     1 MESLVLGVNE KTHVQLSLPV LQVRDVLVRG FGDSVEEALS EAREHLKNGT CGLVELEKGV 60   
                                                                                     
Align2:     1 ---------- ---------- ---------- ---------- ---------- ---------- 0    

Align1:    61 LPQLEQPYVF IKRSDALSTN HGHKVVELVA EMDGIQYGRS GITLGVLVPH VGETPIAYRN 120  
                                                                                     
Align2:     1 ---------- ---------- ---------- ---------- ---------- ---------- 0    

Align1:   121 VLLRKNGNKG AGGHSYGIDL KSYDLGDELG TDPIEDYEQN WNTKHGSGAL RELTRELNGG 180  
                                                                                     
Align2:     1 ---------- ---------- ---------- ---------- ---------- ---------- 0    

Align1:   181 AVTRYVDNNF CGPDGYPLDC IKDFLARAGK SMCTLSEQLD YIESKRGVYC CRDHEHEIAW 240  
                                                                                     
Align2:     1 ---------- ---------- ---------- ----

Align1:     1 MESLVLGVNE KTHVQLSLPV LQVRDVLVRG FGDSVEEALS EAREHLKNGT CGLVELEKGV 60   
                                                                                     
Align2:     1 ---------- ---------- ---------- ---------- ---------- ---------- 0    

Align1:    61 LPQLEQPYVF IKRSDALSTN HGHKVVELVA EMDGIQYGRS GITLGVLVPH VGETPIAYRN 120  
                                                                                     
Align2:     1 ---------- ---------- ---------- ---------- ---------- ---------- 0    

Align1:   121 VLLRKNGNKG AGGHSYGIDL KSYDLGDELG TDPIEDYEQN WNTKHGSGAL RELTRELNGG 180  
                                                                                     
Align2:     1 ---------- ---------- ---------- ---------- ---------- ---------- 0    

Align1:   181 AVTRYVDNNF CGPDGYPLDC IKDFLARAGK SMCTLSEQLD YIESKRGVYC CRDHEHEIAW 240  
                                                                                     
Align2:     1 ---------- ---------- ---------- ----

Align1:     1 MESLVLGVNE KTHVQLSLPV LQVRDVLVRG FGDSVEEALS EAREHLKNGT CGLVELEKGV 60   
                                                                                     
Align2:     1 ---------- ---------- ---------- ---------- ---------- ---------- 0    

Align1:    61 LPQLEQPYVF IKRSDALSTN HGHKVVELVA EMDGIQYGRS GITLGVLVPH VGETPIAYRN 120  
                                                                                     
Align2:     1 ---------- ---------- ---------- ---------- ---------- ---------- 0    

Align1:   121 VLLRKNGNKG AGGHSYGIDL KSYDLGDELG TDPIEDYEQN WNTKHGSGAL RELTRELNGG 180  
                                                                                     
Align2:     1 ---------- ---------- ---------- ---------- ---------- ---------- 0    

Align1:   181 AVTRYVDNNF CGPDGYPLDC IKDFLARAGK SMCTLSEQLD YIESKRGVYC CRDHEHEIAW 240  
                                                                                     
Align2:     1 ---------- ---------- ---------- ----

Align1:     1 MESLVLGVNE KTHVQLSLPV LQVRDVLVRG FGDSVEEALS EAREHLKNGT CGLVELEKGV 60   
                                                                                     
Align2:     1 ---------- ---------- ---------- ---------- ---------- ---------- 0    

Align1:    61 LPQLEQPYVF IKRSDALSTN HGHKVVELVA EMDGIQYGRS GITLGVLVPH VGETPIAYRN 120  
                                                                                     
Align2:     1 ---------- ---------- ---------- ---------- ---------- ---------- 0    

Align1:   121 VLLRKNGNKG AGGHSYGIDL KSYDLGDELG TDPIEDYEQN WNTKHGSGAL RELTRELNGG 180  
                                                                                     
Align2:     1 ---------- ---------- ---------- ---------- ---------- ---------- 0    

Align1:   181 AVTRYVDNNF CGPDGYPLDC IKDFLARAGK SMCTLSEQLD YIESKRGVYC CRDHEHEIAW 240  
                                                                                     
Align2:     1 ---------- ---------- ---------- ----

Align1:     1 MESLVLGVNE KTHVQLSLPV LQVRDVLVRG FGDSVEEALS EAREHLKNGT CGLVELEKGV 60   
                                                                                     
Align2:     1 ---------- ---------- ---------- ---------- ---------- ---------- 0    

Align1:    61 LPQLEQPYVF IKRSDALSTN HGHKVVELVA EMDGIQYGRS GITLGVLVPH VGETPIAYRN 120  
                                                                                     
Align2:     1 ---------- ---------- ---------- ---------- ---------- ---------- 0    

Align1:   121 VLLRKNGNKG AGGHSYGIDL KSYDLGDELG TDPIEDYEQN WNTKHGSGAL RELTRELNGG 180  
                                                                                     
Align2:     1 ---------- ---------- ---------- ---------- ---------- ---------- 0    

Align1:   181 AVTRYVDNNF CGPDGYPLDC IKDFLARAGK SMCTLSEQLD YIESKRGVYC CRDHEHEIAW 240  
                                                                                     
Align2:     1 ---------- ---------- ---------- ----

Align1:     1 MFIFLLFLTL TSGSDLDRCT TFDDVQAPNY TQHTSSMRGV YYPDEIFRSD TLYLTQDLFL 60   
              ||+||+-|-| -| |-----| |---+ -|-| |  -|--||| ||||++|||- -|+-||||||      
Align2:     1 MFVFLVLLPL VS-SQCVNLT TRTQL-PPAY T--NSFTRGV YYPDKVFRSS VLHSTQDLFL 56   

Align1:    61 PFYSNVTGFH TI-----NHT --FGNPVIPF KDGIYFAATE KSNVVRGWVF GSTMNNKSQS 113  
              ||+||||-|| -|     |-|   |-|||+|| -||+|||+|| |||++|||+| |+|+++|+||      
Align2:    57 PFFSNVTWFH AIHVSGTNGT KRFDNPVLPF NDGVYFASTE KSNIIRGWIF GTTLDSKTQS 116  

Align1:   114 VIIINNSTNV VIRACNFELC DNPFFAV--- -SKPMGTQTH TMIFDNAFNC TFEYISDAFS 169  
              ++|+||+||| ||+-|-|+-| ++||--|     +-----++- --++-+|-|| ||||+|--|-      
Align2:   117 LLIVNNATNV VIKVCEFQFC NDPFLGVYYH KNNKSWMESE FRVYSSANNC TFEYVSQPFL 176  

Align1:   170 LDVSEKSGNF KHLREFVFKN KDGFLYVYKG YQPIDVVRDL PSGFNTLKPI FKLPLGINIT 229  
              +|+--|-||| |+|||||||| -||+--+|-- +-||++|||| |-||+-|+|+ --||+|||||      
Align2:   177 MDLEGKQGNF KNLREFVFKN IDGYFKIYSK HTPI

Align1:     1 MESLVLGVNE KTHVQLSLPV LQVRDVLVRG FGDSVEEALS EAREHLKNGT CGLVELEKGV 60   
                                                                                     
Align2:     1 ---------- ---------- ---------- ---------- ---------- ---------- 0    

Align1:    61 LPQLEQPYVF IKRSDALSTN HGHKVVELVA EMDGIQYGRS GITLGVLVPH VGETPIAYRN 120  
                                                                                     
Align2:     1 ---------- ---------- ---------- ---------- ---------- ---------- 0    

Align1:   121 VLLRKNGNKG AGGHSYGIDL KSYDLGDELG TDPIEDYEQN WNTKHGSGAL RELTRELNGG 180  
                                                                                     
Align2:     1 ---------- ---------- ---------- ---------- ---------- ---------- 0    

Align1:   181 AVTRYVDNNF CGPDGYPLDC IKDFLARAGK SMCTLSEQLD YIESKRGVYC CRDHEHEIAW 240  
              |-|||||||| ||||||||+| |||-|||||| +-|||||||| +|++|||||| ||+|||||||      
Align2:     1 AYTRYVDNNF CGPDGYPLEC IKDLLARAGK ASCT

Align1:     1 MESLVLGVNE KTHVQLSLPV LQVRDVLVRG FGDSVEEALS EAREHLKNGT CGLVELEKGV 60   
              |||||-|-|| |||||||||| |||||||||| |||||||-|| |||+|||+|| |||||+||||      
Align2:     1 MESLVPGFNE KTHVQLSLPV LQVRDVLVRG FGDSVEEVLS EARQHLKDGT CGLVEVEKGV 60   

Align1:    61 LPQLEQPYVF IKRSDALSTN HGHKVVELVA EMDGIQYGRS GITLGVLVPH VGETPIAYRN 120  
              |||||||||| ||||||-+-- |||-+||||| |++||||||| |-|||||||| |||-|+|||-      
Align2:    61 LPQLEQPYVF IKRSDARTAP HGHVMVELVA ELEGIQYGRS GETLGVLVPH VGEIPVAYRK 120  

Align1:   121 VLLRKNGNKG AGGHSYGIDL KSYDLGDELG TDPIEDYEQN WNTKHGSGAL RELTRELNGG 180  
              |||||||||| |||||||-|| ||+||||||| |||-||+++| |||||-||-- |||-||||||      
Align2:   121 VLLRKNGNKG AGGHSYGADL KSFDLGDELG TDPYEDFQEN WNTKHSSGVT RELMRELNGG 180  

Align1:   181 AVTRYVDNNF CGPDGYPLDC IKDFLARAGK SMCTLSEQLD YIESKRGVYC CRDHEHEIAW 240  
                                                                                     
Align2:   181 ---------- ---------- ---------- ----

Align1:     1 MESLVLGVNE KTHVQLSLPV LQVRDVLVRG FGDSVEEALS EAREHLKNGT CGLVELEKGV 60   
                                                                                     
Align2:     1 ---------- ---------- ---------- ---------- ---------- ---------- 0    

Align1:    61 LPQLEQPYVF IKRSDALSTN HGHKVVELVA EMDGIQYGRS GITLGVLVPH VGETPIAYRN 120  
                                                                                     
Align2:     1 ---------- ---------- ---------- ---------- ---------- ---------- 0    

Align1:   121 VLLRKNGNKG AGGHSYGIDL KSYDLGDELG TDPIEDYEQN WNTKHGSGAL RELTRELNGG 180  
                                                                                     
Align2:     1 ---------- ---------- ---------- ---------- ---------- ---------- 0    

Align1:   181 AVTRYVDNNF CGPDGYPLDC IKDFLARAGK SMCTLSEQLD YIESKRGVYC CRDHEHEIAW 240  
                                                                                     
Align2:     1 ---------- ---------- ---------- ----

Align1:     1 MESLVLGVNE KTHVQLSLPV LQVRDVLVRG FGDSVEEALS EAREHLKNGT CGLVELEKGV 60   
                                                                                     
Align2:     1 ---------- ---------- ---------- ---------- ---------- ---------- 0    

Align1:    61 LPQLEQPYVF IKRSDALSTN HGHKVVELVA EMDGIQYGRS GITLGVLVPH VGETPIAYRN 120  
                                                                                     
Align2:     1 ---------- ---------- ---------- ---------- ---------- ---------- 0    

Align1:   121 VLLRKNGNKG AGGHSYGIDL KSYDLGDELG TDPIEDYEQN WNTKHGSGAL RELTRELNGG 180  
                                                                                     
Align2:     1 ---------- ---------- ---------- ---------- ---------- ---------- 0    

Align1:   181 AVTRYVDNNF CGPDGYPLDC IKDFLARAGK SMCTLSEQLD YIESKRGVYC CRDHEHEIAW 240  
                                                                                     
Align2:     1 ---------- ---------- ---------- ----

In [11]:
# Preview the first rows of the combined mutation table
all_muts.head()

Unnamed: 0,COVID_ID,SARS_ID,COVID_Pos,COVID_AA,SARS_Pos,SARS_AA
6,COVID19orf3a,P59632,7,I,7,F
9,COVID19orf3a,P59632,10,I,10,L
11,COVID19orf3a,P59632,12,T,12,S
12,COVID19orf3a,P59632,13,V,13,I
14,COVID19orf3a,P59632,15,L,15,A


In [12]:
# Save the mutation table as a tab-separated file under <base_dir>/Data.
# index=False is the documented way to leave the DataFrame index out of the
# file (the original index=None only worked because None is falsy).
all_muts.to_csv("{0}/Data/Viral_Muts.txt".format(base_dir), sep="\t", index=False)