In [105]:
%load_ext autoreload
%autoreload 2
from Bio import SeqIO
from Bio.Seq import Seq
import pandas as pd
import numpy as np
from numpy import dot
from numpy.linalg import norm
from Shared_Functions import *
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import matplotlib.pyplot as plt
from scipy.special import softmax
import torch
import esm

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Load Model into GPU

In [25]:
model, alphabet = esm.pretrained.load_model_and_alphabet('esm2_t36_3B_UR50D')
model.eval()
batch_converter = alphabet.get_batch_converter()
device = torch.device("cuda")
if torch.cuda.is_available():
    model =  model.to(device)
    print("Transferred model to GPU")

# Download Reference Sequence and Embed

In [26]:
from Bio import Entrez
from Bio import SeqIO

Entrez.email = "sample@example.org"

handle = Entrez.efetch(db="nucleotide",
                       id="NC_045512.2",
                       rettype="gb",
                       retmode="gb")
whole_sequence = SeqIO.read(handle, "genbank")

In [27]:
model_layers = 36

# Embed Initial Sequences

In [28]:
ref_spike_seq = 'MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSSVLHSTQDLFLPFFSNVTWFHAIHVSGTNGTKRFDNPVLPFNDGVYFASTEKSNIIRGWIFGTTLDSKTQSLLIVNNATNVVIKVCEFQFCNDPFLGVYYHKNNKSWMESEFRVYSSANNCTFEYVSQPFLMDLEGKQGNFKNLREFVFKNIDGYFKIYSKHTPINLVRDLPQGFSALEPLVDLPIGINITRFQTLLALHRSYLTPGDSSSGWTAGAAAYYVGYLQPRTFLLKYNENGTITDAVDCALDPLSETKCTLKSFTVEKGIYQTSNFRVQPTESIVRFPNITNLCPFGEVFNATRFASVYAWNRKRISNCVADYSVLYNSASFSTFKCYGVSPTKLNDLCFTNVYADSFVIRGDEVRQIAPGQTGKIADYNYKLPDDFTGCVIAWNSNNLDSKVGGNYNYLYRLFRKSNLKPFERDISTEIYQAGSTPCNGVEGFNCYFPLQSYGFQPTNGVGYQPYRVVVLSFELLHAPATVCGPKKSTNLVKNKCVNFNFNGLTGTGVLTESNKKFLPFQQFGRDIADTTDAVRDPQTLEILDITPCSFGGVSVITPGTNTSNQVAVLYQDVNCTEVPVAIHADQLTPTWRVYSTGSNVFQTRAGCLIGAEHVNNSYECDIPIGAGICASYQTQTNSPRRARSVASQSIIAYTMSLGAENSVAYSNNSIAIPTNFTISVTTEILPVSMTKTSVDCTMYICGDSTECSNLLLQYGSFCTQLNRALTGIAVEQDKNTQEVFAQVKQIYKTPPIKDFGGFNFSQILPDPSKPSKRSFIEDLLFNKVTLADAGFIKQYGDCLGDIAARDLICAQKFNGLTVLPPLLTDEMIAQYTSALLAGTITSGWTFGAGAALQIPFAMQMAYRFNGIGVTQNVLYENQKLIANQFNSAIGKIQDSLSSTASALGKLQDVVNQNAQALNTLVKQLSSNFGAISSVLNDILSRLDKVEAEVQIDRLITGRLQSLQTYVTQQLIRAAEIRASANLAATKMSECVLGQSKRVDFCGKGYHLMSFPQSAPHGVVFLHVTYVPAQEKNFTTAPAICHDGKAHFPREGVFVSNGTHWFVTQRNFYEPQIITTDNTFVSGNCDVVIGIVNNTVYDPLQPELDSFKEELDKYFKNHTSPDVDLGDISGINASVVNIQKEIDRLNEVAKNLNESLIDLQELGKYEQYIKWPWYIWLGFIAGLIAIVMVTIMLCCMTSCCSCLKGCCSCGSCCKFDEDDSEPVLKGVKLHYT'

In [43]:
haplotype_sequence_embeddings = process_fasta('Sequences/earliest_haplo_spike_UK.fa','S:0',ref_spike_seq,model,model_layers,batch_converter,device,alphabet,insertions=True)

Semantic score:  1.7094954376443638
Sequence Grammaticality:  -431.3873
Relative Sequence Grammaticality:  -1.273407
Semantic score:  3.1920375193319614
Sequence Grammaticality:  -436.28207
Relative Sequence Grammaticality:  -6.1681824
Semantic score:  3.358635534955283
Sequence Grammaticality:  -437.98254
Relative Sequence Grammaticality:  -7.8686523


In [44]:
compressed_pickle('haplotype_pandemic_spikes',haplotype_sequence_embeddings)

In [45]:
columns = ['label', 'semantic_score', 'sequence_grammaticality', 'relative_sequence_grammaticality', ]
indel_table = []
for key in haplotype_sequence_embeddings['S:0'].keys():
    if key != 'Reference':
        row = pd.DataFrame([haplotype_sequence_embeddings['S:0'][key].get(c) for c in columns]).T
        row.columns = columns
        indel_table.append(row)
indel_table = pd.concat(indel_table)

In [46]:
indel_table.columns = ['label','indel_semantic_score','indel_sequence_grammaticality','indel_relative_sequence_grammaticality']

In [49]:
metadata = pd.read_csv('Metadata/earliest_haplo_spike_UK.tsv',sep='\t')
metadata

Unnamed: 0,SequenceID,ClusterName,ClusterSize,Clade,Pango lineage,Variant,Collection date,Location
0,EPI_ISL_10013279,Cluster 0,50,GRA,BA.1.17.2,Former VOC Omicron GRA (B.1.1.529+BA.*) first ...,,Europe / United Kingdom / England
1,EPI_ISL_1051786,Cluster 1,12,GR,B.1.1.7,Former VOC Alpha GRY (B.1.1.7+Q.*) first detec...,2021-01-11,Europe / United Kingdom / England
2,EPI_ISL_11943412,Cluster 10,1,GRA,BA.1.1.12,Former VOC Omicron GRA (B.1.1.529+BA.*) first ...,2022-01-16,Europe / United Kingdom / England
3,EPI_ISL_1104239,Cluster 100,32,GV,W.4,,2021-01-02,Europe / United Kingdom / Scotland
4,EPI_ISL_551515,Cluster 1000,1,GR,B.1.1,,2020-06-24,Europe / United Kingdom / England
...,...,...,...,...,...,...,...,...
28273,EPI_ISL_6348159,Cluster 9995,2,GK,AY.6,Former VOC Delta GK (B.1.617.2+AY.*) first det...,2021-11-10,Europe / United Kingdom / England
28274,EPI_ISL_6366094,Cluster 9996,2,GK,AY.4,Former VOC Delta GK (B.1.617.2+AY.*) first det...,2021-11-06,Europe / United Kingdom / England
28275,EPI_ISL_6365848,Cluster 9997,2,GK,AY.4,Former VOC Delta GK (B.1.617.2+AY.*) first det...,2021-11-06,Europe / United Kingdom / Wales
28276,EPI_ISL_6347659,Cluster 9998,1,GK,AY.4,Former VOC Delta GK (B.1.617.2+AY.*) first det...,2021-11-10,Europe / United Kingdom / England


In [50]:
initial_table = pd.merge(metadata,indel_table,how='left',left_on='SequenceID',right_on='label')

In [51]:
initial_table = initial_table.sort_values('indel_semantic_score')
initial_table['indel_semantic_rank'] = initial_table.reset_index().index.astype(int) + 1
initial_table = initial_table.sort_values('indel_sequence_grammaticality')
initial_table['indel_sequence_grammatical_rank'] =initial_table.reset_index().index.astype(int) + 1
initial_table['indel_sequence_acquisition_priority'] = initial_table['indel_semantic_rank'] + initial_table['indel_sequence_grammatical_rank']

In [53]:
initial_table.columns = ['label', 'cluster_name', 'cluster_size', 'clade', 'lineage',
       'Voc', 'sample_date', 'location', 'label_x',
       'indel_semantic_score', 'indel_sequence_grammaticality',
       'indel_relative_sequence_grammaticality', 'indel_semantic_rank',
       'indel_sequence_grammatical_rank',
       'indel_sequence_acquisition_priority']

In [54]:
initial_table = initial_table.drop('label_x',axis=1)

In [56]:
initial_table.to_csv('Scores/haplotype_scores.csv',index=False)

# Moving Averages

In [57]:
initial_table = pd.read_csv('Scores/haplotype_scores.csv')

In [58]:
#Load haplotype embeddings 
haplotype_sequence_embeddings = decompress_pickle('haplotype_pandemic_spikes.pbz2')

In [59]:
#For Indel pickle
grouped_embeddings = []
for sequence_id in haplotype_sequence_embeddings['S:0'].keys():
    if sequence_id != 'Reference':
        embedding = haplotype_sequence_embeddings['S:0'][sequence_id]
        if sequence_id != 'Reference':
            row = {"label":sequence_id,
                  'embedding':embedding['Mean_Embedding']}
            grouped_embeddings.append(pd.Series(row))
        
grouped_embeddings = pd.concat(grouped_embeddings,axis=1).T
grouped_embeddings = pd.merge(grouped_embeddings,initial_table[['label','sample_date']],how='left',left_on='label',right_on='label')
grouped_embeddings.sample_date = grouped_embeddings.sample_date.astype('datetime64[ns]')
grouped_embeddings = grouped_embeddings.sort_values('sample_date')
grouped_embeddings

Unnamed: 0,label,embedding,sample_date
0,EPI_ISL_1180015,"[-0.022716747596859932, -0.019655432552099228,...",2021-01-28
1,EPI_ISL_1180033,"[-0.022242441773414612, -0.018704110756516457,...",2021-01-28
2,EPI_ISL_1179711,"[-0.02240000106394291, -0.018780121579766273, ...",2021-02-10


# Load sequences and ungap them

In [60]:
initial_sequences_ungapped =[]  
for fasta in SeqIO.parse('Sequences/earliest_haplo_spike_UK.fa', "fasta"):
    sequence_row = [fasta.id,str(fasta.seq)]
    sequence_row_logits = torch.FloatTensor(haplotype_sequence_embeddings['S:0'][fasta.id]['Logits'])
    sequence_logits = []
    for pos in range(len(sequence_row[1])):
        word = sequence_row[1][pos]
        word_idx = alphabet.get_idx(word)
        prob = sequence_row_logits[pos + 1, word_idx]
        sequence_logits.append(prob.item())
    sequence_row.append(sequence_logits)
    sequence_row = pd.DataFrame(sequence_row).T
    initial_sequences_ungapped.append(sequence_row)
initial_sequences_ungapped =pd.concat(initial_sequences_ungapped)

In [61]:
initial_sequences_ungapped.columns = ['label','sequence','sequence_logits']
initial_sequences_ungapped

Unnamed: 0,label,sequence,sequence_logits
0,EPI_ISL_1180015,MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...,"[-0.0005610798834823072, -0.14159958064556122,..."
0,EPI_ISL_1180033,MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...,"[-0.0004577780782710761, -0.14605365693569183,..."
0,EPI_ISL_1179711,MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...,"[-0.0004472924047149718, -0.14671146869659424,..."


In [62]:
initial_sequences_ungapped = pd.merge(initial_sequences_ungapped,grouped_embeddings,how='left',left_on='label',right_on='label')

# Filter sequences that are too small (Probably not real)

In [63]:
initial_sequences_ungapped['sequence_length'] = [ len(s.replace('*','')) for s in initial_sequences_ungapped.sequence]

In [65]:
metadata = pd.read_csv('Metadata/earliest_haplo_spike_UK.tsv',sep='\t')
metadata

Unnamed: 0,SequenceID,ClusterName,ClusterSize,Clade,Pango lineage,Variant,Collection date,Location
0,EPI_ISL_10013279,Cluster 0,50,GRA,BA.1.17.2,Former VOC Omicron GRA (B.1.1.529+BA.*) first ...,,Europe / United Kingdom / England
1,EPI_ISL_1051786,Cluster 1,12,GR,B.1.1.7,Former VOC Alpha GRY (B.1.1.7+Q.*) first detec...,2021-01-11,Europe / United Kingdom / England
2,EPI_ISL_11943412,Cluster 10,1,GRA,BA.1.1.12,Former VOC Omicron GRA (B.1.1.529+BA.*) first ...,2022-01-16,Europe / United Kingdom / England
3,EPI_ISL_1104239,Cluster 100,32,GV,W.4,,2021-01-02,Europe / United Kingdom / Scotland
4,EPI_ISL_551515,Cluster 1000,1,GR,B.1.1,,2020-06-24,Europe / United Kingdom / England
...,...,...,...,...,...,...,...,...
28273,EPI_ISL_6348159,Cluster 9995,2,GK,AY.6,Former VOC Delta GK (B.1.617.2+AY.*) first det...,2021-11-10,Europe / United Kingdom / England
28274,EPI_ISL_6366094,Cluster 9996,2,GK,AY.4,Former VOC Delta GK (B.1.617.2+AY.*) first det...,2021-11-06,Europe / United Kingdom / England
28275,EPI_ISL_6365848,Cluster 9997,2,GK,AY.4,Former VOC Delta GK (B.1.617.2+AY.*) first det...,2021-11-06,Europe / United Kingdom / Wales
28276,EPI_ISL_6347659,Cluster 9998,1,GK,AY.4,Former VOC Delta GK (B.1.617.2+AY.*) first det...,2021-11-10,Europe / United Kingdom / England


In [66]:
initial_sequences_ungapped = pd.merge(initial_sequences_ungapped,metadata,how='left',left_on='label',right_on='SequenceID')

In [67]:
initial_sequences_ungapped = initial_sequences_ungapped[(initial_sequences_ungapped.sequence_length >= 1265) & (initial_sequences_ungapped.ClusterSize > 1)]

In [None]:
# initial_sequences_ungapped.to_csv('PAPER_SEQUENCES/whole_pandemic_sequences/embeddings_and_logits.tsv',sep='\t')

# Make logits to grammaticalities so that they can be compared (logits have different lengths)

In [68]:
initial_sequences_ungapped['sequence_grammaticality'] = [np.sum(row.sequence_logits[:-1]) for i,row in initial_sequences_ungapped.iterrows()]

# Sliding Windows

In [70]:
from itertools import chain
from datetime import datetime, timedelta
from tqdm import tqdm
dates = {}

# min_date = combined_embedding_tables.sample_date.iloc[0]
min_date = datetime(2019, 12, 1)
max_date = initial_sequences_ungapped.sample_date.max()
weighted = False

time_period=90
sliding_window_embeddings = []
for current_date in tqdm(initial_sequences_ungapped.sort_values('sample_date').sample_date.unique()):
    current_date = pd.Timestamp(current_date)
    vaild_dates  = initial_sequences_ungapped[(initial_sequences_ungapped.sample_date<=current_date) & (initial_sequences_ungapped.sample_date>=(current_date) - timedelta(days=time_period))]
    if len(vaild_dates) !=0:
#         print(vaild_dates)
        internal = []
        if weighted == True:
            for j, internal_row in vaild_dates.iterrows():
                internal+=[internal_row for k in range(internal_row.ClusterSize)]
                internal = pd.concat(internal,axis=1).T
        else:
            internal = vaild_dates
        valid_embeddings = np.vstack(internal.embedding.to_list()).mean(axis=0)
        valid_sequence_grammaticality = internal.sequence_grammaticality.mean(axis=0)
        sliding_window_embeddings.append(pd.Series({"date":current_date,
                                                    "mean_sliding_embedding":valid_embeddings,
                                                    "mean_sliding_sequence_grammaticality":valid_sequence_grammaticality,
                                                    }))
    current_date = current_date +  timedelta(days=time_period)
sliding_window_embeddings = pd.concat(sliding_window_embeddings,axis=1).T
sliding_window_embeddings

100%|██████████| 1/1 [00:00<00:00, 139.10it/s]


Unnamed: 0,date,mean_sliding_embedding,mean_sliding_sequence_grammaticality
0,2021-01-28,"[-0.022479594685137272, -0.019179771654307842,...",-433.834684


In [71]:
sliding_window_embeddings['mean_sliding_embedding_semantic_score'] = [float(sum(abs(target-base) for base, target in zip(haplotype_sequence_embeddings['S:0']['Reference']['Mean_Embedding'],row.mean_sliding_embedding)))
                                                                      for i,row in sliding_window_embeddings.iterrows()]

In [100]:
sliding_window_embeddings.to_csv('Scores/90_day_mean_sliding_embeddings',sep='\t')

In [73]:
sliding_rows = []
for i,row in initial_sequences_ungapped.iterrows():
    embedding_date = str(row.sample_date)[:10]
    if embedding_date == 'NaT':
        sliding_rows.append(pd.DataFrame([row.label, np.nan,  np.nan]).T)
    else:
        embedding_date = datetime.strptime(embedding_date, '%Y-%m-%d')
        embedding = row.embedding
        sliding_row = sliding_window_embeddings[sliding_window_embeddings.date == embedding_date]

        sliding_window_semantic_score = float(sum(abs(target-base) for target, base in zip(embedding ,sliding_row.mean_sliding_embedding.iloc[0])))
        sliding_window_sequence_grammaticality = row["sequence_grammaticality"] - sliding_row.mean_sliding_sequence_grammaticality.iloc[0]
        new_row = pd.DataFrame([row.label,sliding_window_semantic_score, sliding_window_sequence_grammaticality]).T
        sliding_rows.append(new_row)

In [74]:
sliding_rows = pd.concat(sliding_rows)

In [75]:
sliding_rows.columns = ['label','sliding_semantic_score','sliding_sequence_grammaticality']

In [76]:
initial_table = pd.merge(initial_sequences_ungapped,sliding_rows,how='left',left_on='label',right_on='label')

In [77]:
initial_table = pd.merge(initial_table,metadata,how='left',left_on='label',right_on='SequenceID')

In [78]:
initial_table

Unnamed: 0,label,sequence,sequence_logits,embedding,sample_date,sequence_length,SequenceID_x,ClusterName_x,ClusterSize_x,Clade_x,...,sliding_semantic_score,sliding_sequence_grammaticality,SequenceID_y,ClusterName_y,ClusterSize_y,Clade_y,Pango lineage_y,Variant_y,Collection date_y,Location_y
0,EPI_ISL_1180015,MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...,"[-0.0005610798834823072, -0.14159958064556122,...","[-0.022716747596859932, -0.019655432552099228,...",2021-01-28,1273,EPI_ISL_1180015,Cluster 47,7,GV,...,1.374365,2.447372,EPI_ISL_1180015,Cluster 47,7,GV,B.1.177,,2021-01-28,Europe / United Kingdom / England
1,EPI_ISL_1180033,MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...,"[-0.0004577780782710761, -0.14605365693569183,...","[-0.022242441773414612, -0.018704110756516457,...",2021-01-28,1270,EPI_ISL_1180033,Cluster 14310,4,GRY,...,1.374365,-2.447372,EPI_ISL_1180033,Cluster 14310,4,GRY,B.1.1.7,Former VOC Alpha GRY (B.1.1.7+Q.*) first detec...,2021-01-28,Europe / United Kingdom / England


In [79]:
initial_table.columns = [col[:-2] if 'x' in col else col for col in initial_table.columns]

In [80]:
initial_table = initial_table[initial_table.columns[:-8]]

In [81]:
indel_table.columns = ['label','semantic_score','sequence_grammaticality','sequence_relative_grammaticality']

In [82]:
initial_table.columns = ['label', 'sequence', 'sequence_logits','embedding','sample_date',
       'sequence_length','sequence_id', 'cluster_name',
       'cluster_size', 'clade', 'lineage', 'Voc', 'collection_date',
       'location','sequence_grammaticality', 'sliding_semantic_score',
       'sliding_sequence_grammaticality', ]

In [83]:
initial_table

Unnamed: 0,label,sequence,sequence_logits,embedding,sample_date,sequence_length,sequence_id,cluster_name,cluster_size,clade,lineage,Voc,collection_date,location,sequence_grammaticality,sliding_semantic_score,sliding_sequence_grammaticality
0,EPI_ISL_1180015,MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...,"[-0.0005610798834823072, -0.14159958064556122,...","[-0.022716747596859932, -0.019655432552099228,...",2021-01-28,1273,EPI_ISL_1180015,Cluster 47,7,GV,B.1.177,,2021-01-28,Europe / United Kingdom / England,-431.387312,1.374365,2.447372
1,EPI_ISL_1180033,MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...,"[-0.0004577780782710761, -0.14605365693569183,...","[-0.022242441773414612, -0.018704110756516457,...",2021-01-28,1270,EPI_ISL_1180033,Cluster 14310,4,GRY,B.1.1.7,Former VOC Alpha GRY (B.1.1.7+Q.*) first detec...,2021-01-28,Europe / United Kingdom / England,-436.282056,1.374365,-2.447372


In [84]:
initial_table.Voc = initial_table.Voc.fillna('Non-VOC')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  initial_table.Voc = initial_table.Voc.fillna('Non-VOC')


In [85]:
initial_table.rename(columns = {'Voc':'Variant'},inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  initial_table.rename(columns = {'Voc':'Variant'},inplace=True)


In [86]:
initial_table = pd.merge(initial_table,indel_table,how='left',left_on='label',right_on='label')

In [87]:
initial_table = initial_table.drop('sequence_grammaticality_x',axis=1)

In [88]:
initial_table = initial_table.rename(columns={'sequence_grammaticality_y':'sequence_grammaticality'})

In [89]:
initial_table

Unnamed: 0,label,sequence,sequence_logits,embedding,sample_date,sequence_length,sequence_id,cluster_name,cluster_size,clade,lineage,Variant,collection_date,location,sliding_semantic_score,sliding_sequence_grammaticality,semantic_score,sequence_grammaticality,sequence_relative_grammaticality
0,EPI_ISL_1180015,MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...,"[-0.0005610798834823072, -0.14159958064556122,...","[-0.022716747596859932, -0.019655432552099228,...",2021-01-28,1273,EPI_ISL_1180015,Cluster 47,7,GV,B.1.177,Non-VOC,2021-01-28,Europe / United Kingdom / England,1.374365,2.447372,1.709495,-431.387299,-1.273407
1,EPI_ISL_1180033,MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...,"[-0.0004577780782710761, -0.14605365693569183,...","[-0.022242441773414612, -0.018704110756516457,...",2021-01-28,1270,EPI_ISL_1180033,Cluster 14310,4,GRY,B.1.1.7,Former VOC Alpha GRY (B.1.1.7+Q.*) first detec...,2021-01-28,Europe / United Kingdom / England,1.374365,-2.447372,3.192038,-436.282074,-6.168182


In [91]:
final_table = pd.merge(initial_table[['label',
       'sequence_length','sliding_semantic_score',
       'sliding_sequence_grammaticality']],pd.read_csv('Scores/haplotype_scores.csv'),how='left',left_on='label',right_on='label')

In [92]:
final_table.columns = [c.replace('indel_','') for c in final_table.columns]

In [94]:
voc_df = pd.DataFrame([initial_table.Variant.unique(),
                       ['Non-VOC','Alpha','Lambda','Beta','Iota','Kappa','Eta','Mu','Delta','Zeta','Epsilon','Gamma','Omicron','Omicron','Omicron','Omicron','Omicron','Omicron','Omicron','Omicron','Omicron','Omicron','Omicron','Omicron']]).T
voc_df.columns = ['Variant','Voc']
final_table = final_table.rename(columns={"Voc":'Variant'})
final_table = pd.merge(final_table,voc_df,
         how='left',left_on='Variant',right_on='Variant')

In [95]:
final_table.Voc = final_table.Voc.fillna('Non-VOC')

In [96]:
#Filter spikes with no submission time
final_table = final_table[final_table.sample_date.isna() == False]

In [98]:
final_table.to_csv('Scores/90_proper_earliest_haplo_spike_with_sliding_windows_scores.csv')
              

In [99]:
initial_table.to_csv('Scores/updated_90_proper_earliest_haplo_spike_with_sliding_windows.tsv',sep='\t')