A big-picture view of how prevalent each clade is over time.

In [1]:
# set up
import pandas as pd
import os
import plotly.graph_objects as go
import plotly.express as px

# Relative path to the project root: two levels up from the current working directory.
project_folder = os.path.join(os.pardir, os.pardir)

In [2]:
# Load the per-timespan CSVs (T1.csv ... T11.csv) from data/final into a list,
# so that dfs[k] holds the dataframe for timespan k + 1.
final_data_dir = os.path.join(project_folder, 'data', 'final')
dfs = [
    pd.read_csv(os.path.join(final_data_dir, f"T{span_number}.csv"))
    for span_number in range(1, 12)
]

# Peek at one row of the fifth timespan to check the columns.
dfs[4].head(1)

Unnamed: 0,state,p_sequence,p_accession,date,count,n_accession,n_sequence,clade
0,GA,"""MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRS...",QVJ86123,2021,1,MZ217195,ATTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGA...,20B


In [3]:
# Load the tab-separated table that gives the start and end date of each timespan.
# NOTE(review): 'origonal' looks like a misspelling of 'original'; it is kept as-is
# because it has to match the directory name on disk -- confirm before renaming.
timespan_dates_path = os.path.join(project_folder, 'data', 'origonal', 'timespan_dates.txt')
timespan_labels_df = pd.read_csv(timespan_dates_path, sep='\t')
timespan_labels_df.head(2)

Unnamed: 0,timespan,start,end
0,1,"Jan 1, 2020","Mar 20, 2020"
1,2,"Mar 21, 2020","Apr 30, 2020"


In [4]:
# Build {timespan number: "start to end"} so plots can be labelled with date ranges.
span_ids = timespan_labels_df['timespan']
span_bounds = zip(timespan_labels_df['start'], timespan_labels_df['end'])
timestamp_dictionary = {
    span_id: first_day + " to " + last_day
    for span_id, (first_day, last_day) in zip(span_ids, span_bounds)
}
# Spot-check the label for timespan 2.
timestamp_dictionary[2]

'Mar 21, 2020 to Apr 30, 2020'

In [7]:

# Grouped bar chart: clades on the x-axis, one bar series (color) per timespan.
data = []
colors = px.colors.diverging.Spectral
for i, timespan_df in enumerate(dfs):
    # Select the 'count' column before summing. The earlier `.sum('count')` did not
    # pick a column: the string was taken as the positional `numeric_only` argument,
    # so all numeric columns were summed and it only worked because 'count' is truthy.
    clade_counts = timespan_df.groupby("clade")["count"].sum()
    b = go.Bar(
        name=timestamp_dictionary[i + 1],  # labels are keyed by 1-based timespan number
        x=clade_counts.index,
        y=clade_counts,
        # Wrap around the palette so extra timespans cannot raise IndexError.
        marker_color=colors[i % len(colors)],
    )
    data.append(b)

fig = go.Figure(data=data)
fig.update_layout(barmode='group', title = "Total reports of each clade by timestamp")
fig.show()

# Create the output folder if needed so the export does not fail on a fresh checkout.
visualizations_dir = os.path.join(project_folder, "visualizations")
os.makedirs(visualizations_dir, exist_ok=True)
fig.write_image(os.path.join(visualizations_dir, "Total reports of each clade by timestamp.png"), width=2000, height=1000)

In [36]:
# Grouped bar chart with timespans on the x-axis and one bar series (color) per clade.
data = []
colors = px.colors.diverging.Spectral
clades = sorted(['19A','19B','20A','20B','20C', '20G', '20E (EU1)', "21C (Epsilon)", "20D", "20J (Gamma, V3)",
                 "21F (Iota)", "20I (Alpha, V1)", "21B (Kappa)", "21D (Eta)", "21G (Lambda)", "21H (Mu)", "21A (Delta)", "21I (Delta)", "21J (Delta)", "20H (Beta, V2)"])

# Position of each clade in sorted order; used below to pick a palette color.
clade_to_integer = {clade:i for i,clade in enumerate(clades)}
# Print once (the nested print(print(...)) also emitted a stray "None").
print(clade_to_integer)

# clade -> {'x': [0-based timespan indices], 'y': [summed counts]} for the
# timespans in which that clade appears.
clade_info = {}
for i, timespan_df in enumerate(dfs):
    # Select the 'count' column before summing; `.sum('count')` passed the string
    # as the positional `numeric_only` argument instead of choosing a column.
    clade_counts = timespan_df.groupby("clade")["count"].sum()
    for clade, count in clade_counts.items():
        series = clade_info.setdefault(clade, {'x': [], 'y': []})
        series['x'].append(i)
        series['y'].append(count)

for clade_name, info in clade_info.items():
    b = go.Bar(
        name=clade_name,
        # timestamp_dictionary is keyed by 1-based timespan number
        x=[timestamp_dictionary[xi + 1] for xi in info['x']],
        y=info['y'],
        # Cycle through the whole palette (the former `% 10` never reached its last
        # entry). Clades missing from `clades` fall back to the first color.
        marker_color=colors[clade_to_integer.get(clade_name, 0) % len(colors)],
    )
    data.append(b)

fig = go.Figure(data=data)
fig.update_layout(barmode='group', title = "Total reports of each clade by timestamp -- Flipped")
fig.show()

{'19A': 0, '19B': 1, '20A': 2, '20B': 3, '20C': 4, '20D': 5, '20E (EU1)': 6, '20G': 7, '20H (Beta, V2)': 8, '20I (Alpha, V1)': 9, '20J (Gamma, V3)': 10, '21A (Delta)': 11, '21B (Kappa)': 12, '21C (Epsilon)': 13, '21D (Eta)': 14, '21F (Iota)': 15, '21G (Lambda)': 16, '21H (Mu)': 17, '21I (Delta)': 18, '21J (Delta)': 19}
None


In [None]:
# NOTE: Too many colors, and they are not in rainbow order. Could try grouping into ... (note left unfinished)