In [24]:
import os
import csv
import pandas as pd

from Bio import SeqIO
from Bio.Seq import Seq
from Bio import pairwise2
from Bio.pairwise2 import format_alignment
from Bio.SeqRecord import SeqRecord
from Bio.Align import MultipleSeqAlignment
from tqdm import tqdm

In [2]:
%%time
fastafn = 'sars_cov2_s_genomic.fasta'
sequences = {}
for seq_record in SeqIO.parse(fastafn,'fasta'):
    sequences[seq_record.id] = seq_record.seq
len(sequences)

Wall time: 5.42 s


110626

In [3]:
%%time
csvfn = 'sars_cov2_s_report.csv'
df = pd.read_csv(csvfn)
df.head()

Wall time: 175 ms


Unnamed: 0,Accession,ReleaseDate,PangoClass,Location,Length,Gene,Protein,Begin,End,CDS_Length
0,NC_045512.2,2020-01-13,B,China,29903,S,surface glycoprotein,21563,25384,3822
1,MW422255.1,2020-12-30,B.1.1.7,"USA: San Diego, California",29763,S,surface glycoprotein,21500,25312,3813
2,MW422256.1,2020-12-30,B.1.1.7,USA,29817,S,surface glycoprotein,21524,25336,3813
3,MW430966.1,2021-01-04,B.1.1.7,USA: California,29835,S,surface glycoprotein,21523,25335,3813
4,MW430974.1,2021-01-04,B.1.1.7,USA: Florida,29861,S,surface glycoprotein,21551,25363,3813


In [19]:
# each alignment will take approx. 1.5s
# total time approx. 42h

refseq = sequences['NC_045512.2']
accession_list = df.Accession.tolist()
# accession_list.remove('NC_045512.2')
len(accession_list)

110626

In [38]:
uni_sequences = []
for acc in tqdm(accession_list):
    cds_len = df[df.Accession==acc].CDS_Length.item()
    sup = '-'*(3850-cds_len)
    new_seq = str(sequences[acc])+sup
    uni_sequences.append(
        SeqRecord(
            Seq(new_seq),
            id=acc,
            name='S',
            description='surface glycoprotein',
        )
    )

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 110626/110626 [16:04<00:00, 114.71it/s]


In [39]:
%%time
multialign = MultipleSeqAlignment(uni_sequences)

Wall time: 191 ms


In [42]:
print(multialign[:,3700:])

Alignment with 110626 rows and 150 columns
TTGCTGTATGACCAGTTGCTGTAGTTGTCTCAAGGGCTGTTGTT...--- NC_045512.2
GACCAGTTGCTGTAGTTGTCTCAAGGGCTGTTGTTCTTGTGGAT...--- MW422255.1
GACCAGTTGCTGTAGTTGTCTCAAGGGCTGTTGTTCTTGTGGAT...--- MW422256.1
GACCAGTTGCTGTAGTTGTCTCAAGGGCTGTTGTTCTTGTGGAT...--- MW430966.1
GACCAGTTGCTGTAGTTGTCTCAAGGGCTGTTGTTCTTGTGGAT...--- MW430974.1
GACCAGTTGCTGTAGTTGTCTCAAGGGCTGTTGTTCTTGTGGAT...--- MW440433.1
GACCAGTTGCTGTAGTTGTCTCAAGGGCTGTTGTTCTTGTGGAT...--- MW447771.1
GACCAGTTGCTGTAGTTGTCTCAAGGGCTGTTGTTCTTGTGGAT...--- MW462650.1
GACCAGTTGCTGTAGTTGTCTCAAGGGCTGTTGTTCTTGTGGAT...--- MW462651.1
GACCAGTTGCTGTAGTTGTCTCAAGGGCTGTTGTTCTTGTGGAT...--- MW462652.1
GACCAGTTGCTGTAGTTGTCTCAAGGGCTGTTGTTCTTGTGGAT...--- MW463056.1
GACCAGTTGCTGTAGTTGTCTCAAGGGCTGTTGTTCTTGTGGAT...--- MW474742.1
GACCAGTTGCTGTAGTTGTCTCAAGGGCTGTTGTTCTTGTGGAT...--- MW485088.1
GACCAGTTGCTGTAGTTGTCTCAAGGGCTGTTGTTCTTGTGGAT...--- MW485292.1
GACCAGTTGCTGTAGTTGTCTCAAGGGCTGTTGTTCTTGTGGAT...--- MW485323.1
GACCAGTTGCTGTAGTTGTCTCAAGG

In [None]:
for seq_id in tqdm(accession_list):
    seq2 = sequences[seq_id]
    alignment = pairwise2.align.globalms(refseq,seq2,2,-1,-10,-0.5,one_alignment_only=True,penalize_end_gaps=False)
    sequences[seq_id] = alignment[0].seqB

In [46]:
accession_list[-10:]

['OK448772.1',
 'OK449040.1',
 'OK449041.1',
 'OK449381.1',
 'OK449382.1',
 'OK449941.1',
 'OK450321.1',
 'OK451410.1',
 'OK452164.1',
 'OK452241.1']

In [47]:
seqtest1, seqtest2 = sequences['NC_045512.2'], sequences['OK452241.1']

In [49]:
%%time
alignment = pairwise2.align.localms(seqtest1, seqtest2, 2,-1,-10,-0.5, one_alignment_only=True, penalize_end_gaps=False)
seqA1 = alignment[0].seqA
seqB1 = alignment[0].seqB
print(len(seqA1),len(seqB1))

3822 3822
Wall time: 7.02 s


In [50]:
%%time
alignment1 = pairwise2.align.localms(seqtest1[:1000], seqtest2[:1000], 2,-1,-10,-0.5, one_alignment_only=True, penalize_end_gaps=False)
alignment2 = pairwise2.align.localms(seqtest1[1000:2000], seqtest2[1000:2000], 2,-1,-10,-0.5, one_alignment_only=True, penalize_end_gaps=False)
alignment3 = pairwise2.align.localms(seqtest1[2000:3000], seqtest2[2000:3000], 2,-1,-10,-0.5, one_alignment_only=True, penalize_end_gaps=False)
alignment4 = pairwise2.align.localms(seqtest1[3000:], seqtest2[3000:], 2,-1,-10,-0.5, one_alignment_only=True, penalize_end_gaps=False)

seqA2 = alignment1[0].seqA+alignment2[0].seqA+alignment3[0].seqA+alignment4[0].seqA
seqB2 = alignment1[0].seqB+alignment2[0].seqB+alignment3[0].seqB+alignment4[0].seqB

Wall time: 1.57 s


In [54]:
print(alignment[0])

Alignment(seqA='TGTTTGTTTTTCTTGTTTTATTGCCACTAGTCTCTAGTCAGTGTGTTAATCTTACAACCAGAACTCAATTACCCCCTGCATACACTAATTCTTTCACACGTGGTGTTTATTACCCTGACAAAGTTTTCAGATCCTCAGTTTTACATTCAACTCAGGACTTGTTCTTACCTTTCTTTTCCAATGTTACTTGGTTCCATGCTATACATGTCTCTGGGACCAATGGTACTAAGAGGTTTGATAACCCTGTCCTACCATTTAATGATGGTGTTTATTTTGCTTCCACTGAGAAGTCTAACATAATAAGAGGCTGGATTTTTGGTACTACTTTAGATTCGAAGACCCAGTCCCTACTTATTGTTAATAACGCTACTAATGTTGTTATTAAAGTCTGTGAATTTCAATTTTGTAATGATCCATTTTTGGGTGTTTATTACCACAAAAACAACAAAAGTTGGATGGAAAGTGAGTTCAGAGTTTATTCTAGTGCGAATAATTGCACTTTTGAATATGTCTCTCAGCCTTTTCTTATGGACCTTGAAGGAAAACAGGGTAATTTCAAAAATCTTAGGGAATTTGTGTTTAAGAATATTGATGGTTATTTTAAAATATATTCTAAGCACACGCCTATTAATTTAGTGCGTGATCTCCCTCAGGGTTTTTCGGCTTTAGAACCATTGGTAGATTTGCCAATAGGTATTAACATCACTAGGTTTCAAACTTTACTTGCTTTACATAGAAGTTATTTGACTCCTGGTGATTCTTCTTCAGGTTGGACAGCTGGTGCTGCAGCTTATTATGTGGGTTATCTTCAACCTAGGACTTTTCTATTAAAATATAATGAAAATGGAACCATTACAGATGCTGTAGACTGTGCACTTGACCCTCTCTCAGAAACAAAGTGTACGTTGAAATCCTTCACTGTAGAAAAAGGAATCTATCAAACTTCTAACTTTAGAGTCCAACCAACAGAATCTATTGTTAGAT