In [35]:
import os
import pandas as pd
import numpy as np

from Bio import SeqIO
from Bio import pairwise2
from Bio.pairwise2 import format_alignment
from tqdm import tqdm

import matplotlib.pyplot as plt
from matplotlib import animation
from matplotlib.animation import FuncAnimation
import seaborn as sns
# Ask IPython for the interactive "notebook" matplotlib backend.
# NOTE(review): availability of this backend depends on the Jupyter front end in use — confirm.
%matplotlib notebook

import warnings
# NOTE(review): this silences every warning for the rest of the session, including
# NumPy RuntimeWarnings (e.g. divide-by-zero) that later numeric cells could raise.
warnings.filterwarnings('ignore')

In [2]:
# Input locations, relative to the notebook's working directory.
# `report`  : CSV of per-accession metadata (read with pandas further down).
# `genomic` : FASTA file of sequences (parsed with Bio.SeqIO further down).
report = './sars_cov2_spike.csv'
genomic = './sars_cov2_spike_aligned/sars_cov2_spike_aligned.fasta'

In [3]:
%%time
sequences = {}
for seq_record in tqdm(SeqIO.parse(genomic, 'fasta')):
    if len(seq_record.seq) == 3822 and (set(seq_record.seq)<=set(['A','T','C','G','-','N','n'])):
        sequences[seq_record.id] = seq_record.seq
len(sequences)

110625it [05:19, 346.01it/s]

Wall time: 5min 19s





109288

In [4]:
%%time
df = pd.read_csv(report)
df = df[df.Accession.isin(sequences.keys())]
df = df.sort_values(by='Accession')
df.head()

Wall time: 990 ms


Unnamed: 0,Accession,ReleaseDate,PangoClass,Location,Length,Gene,Protein,Begin,End,CDS_Length
1,MW422255.1,2020-12-30,alpha,"USA: San Diego, California",29763,S,surface glycoprotein,21500,25312,3813
2,MW422256.1,2020-12-30,alpha,USA,29817,S,surface glycoprotein,21524,25336,3813
3,MW430966.1,2021-01-04,alpha,USA: California,29835,S,surface glycoprotein,21523,25335,3813
4,MW430974.1,2021-01-04,alpha,USA: Florida,29861,S,surface glycoprotein,21551,25363,3813
5,MW440433.1,2021-01-05,alpha,"USA: New York, Saratoga County",29792,S,surface glycoprotein,21514,25326,3813


In [5]:
# Accessions in the (sorted) metadata frame, as a plain Python list, plus their count.
accession_list = df['Accession'].tolist()
len_acc = len(accession_list)

# Display the count.
len_acc

109288

In [40]:
def _reference_plus_variant(frame, variant_label, reference_label='B'):
    """Return the reference rows followed by the rows of one variant.

    Parameters
    ----------
    frame : pandas.DataFrame
        Metadata table with a ``PangoClass`` column.
    variant_label : str
        Exact ``PangoClass`` value to select (comparison is case-sensitive).
    reference_label : str, default 'B'
        Exact ``PangoClass`` value of the rows placed first in the result.

    Returns
    -------
    pandas.DataFrame
        ``frame`` rows equal to ``reference_label`` concatenated with the rows
        equal to ``variant_label``; original index values are preserved.
    """
    pango = frame['PangoClass']
    return pd.concat([frame[pango == reference_label], frame[pango == variant_label]])


# One frame per variant, each prefixed with the 'B' rows.
# NOTE(review): 'alpha' is lower-case while the other labels are capitalised.
# The labels are kept exactly as written; verify them against the values that
# actually occur in df.PangoClass, since a mismatch silently selects zero rows.
alpha_data = _reference_plus_variant(df, 'alpha')
beta_data = _reference_plus_variant(df, 'Beta')
gamma_data = _reference_plus_variant(df, 'Gamma')
delta_data = _reference_plus_variant(df, 'Delta')
lambda_data = _reference_plus_variant(df, 'Lambda')
omicron_data = _reference_plus_variant(df, 'Omicron')

In [67]:
# (rows, columns) of the 'B' + 'Omicron' subset.
# NOTE(review): the recorded output is (1, 10), i.e. a single row in total —
# check whether the 'Omicron' label matches the spelling/case used in PangoClass.
omicron_data.shape

(1, 10)

In [17]:
# Encode every retained sequence as a string of digit characters:
#   A -> '0', T -> '1', C -> '2', G -> '3', anything else -> '4'.
# A lookup table plus ''.join replaces the per-character if/elif chain and the
# repeated string concatenation; the resulting strings are the same.
base_to_digit = {'A': '0', 'T': '1', 'C': '2', 'G': '3'}

sequences_int = {}
for acc in tqdm(accession_list):
    seq = sequences[acc]
    # .get(..., '4') is the catch-all branch for every symbol not in the table.
    seq_int = ''.join(base_to_digit.get(b, '4') for b in seq)
    sequences_int[acc] = seq_int

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 109288/109288 [16:45<00:00, 108.73it/s]


In [18]:
# Spot-check: first 25 encoded symbols of one accession.
print(sequences_int['MW422255.1'][:25])

1311131111121131111011322


In [68]:
# Count, for every alignment position, how many sequences in `alpha_data` carry
# each symbol code. mut_stat[c, j] is the number of sequences whose encoded
# symbol at position j is c (codes are the digit characters in `sequences_int`).
n_codes = 5        # digit codes '0'..'4'
n_positions = 3822  # number of positions counted per sequence

mut_stat = np.zeros((n_codes, n_positions))
position_index = np.arange(n_positions)
for acc in tqdm(alpha_data.Accession.tolist()):
    seq_int = sequences_int[acc]
    # Turn the digit string into an integer array in one step ('0' -> 0, ... '4' -> 4)
    # instead of calling int() on each character in a Python loop.
    codes = np.frombuffer(seq_int[:n_positions].encode('ascii'), dtype=np.uint8) - ord('0')
    # Each (code, position) pair is unique within one sequence because every
    # position appears exactly once, so a fancy-indexed += adds exactly 1 per position.
    mut_stat[codes, position_index] += 1

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 99328/99328 [32:02<00:00, 51.68it/s]


In [69]:
# Display the count matrix (rows: symbol codes, columns: positions).
mut_stat

array([[0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 9.9328e+04, 9.9328e+04,
        9.9324e+04],
       [9.9328e+04, 0.0000e+00, 9.9327e+04, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       [0.0000e+00, 0.0000e+00, 1.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       [0.0000e+00, 9.9328e+04, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        4.0000e+00],
       [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00]])

In [70]:
# Convert counts to per-position relative frequencies: every column of
# `mut_stat` is divided by that column's total.
column_totals = mut_stat.sum(axis=0)
mut_prob = mut_stat / column_totals

# Display the frequency matrix.
mut_prob

array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        1.00000000e+00, 1.00000000e+00, 9.99959729e-01],
       [1.00000000e+00, 0.00000000e+00, 9.99989932e-01, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 1.00676546e-05, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 4.02706186e-05],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]])

In [71]:
# Write the frequency matrix as comma-separated text.
# np.savetxt does not create missing directories, so make sure the target
# folder exists first; otherwise the write fails on a fresh checkout.
probs_path = "./probs/alpha_probs.csv"
os.makedirs(os.path.dirname(probs_path), exist_ok=True)
np.savetxt(probs_path, mut_prob, delimiter=",")