Accession emergence over time

In [52]:
import pandas as pd
import os
import plotly.graph_objects as go
import plotly.express as px
import numpy as np
import datetime

# Relative path to the project root: two directory levels above the
# notebook's working directory.
project_folder = os.path.join(os.pardir, os.pardir)

In [25]:
# Read the combined dataset from <project>/data/final/all.csv and preview
# its first five rows.
all_csv_path = os.path.join(project_folder, 'data', 'final', "all.csv")
df = pd.read_csv(all_csv_path)
df.head()

Unnamed: 0,state,p_sequence,p_accession,date,count,n_accession,n_sequence,clade,timespan
0,MA,"""MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRS...",QTP71261,2020,2,MW885877,GGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTA...,20A,1
1,WA,"""MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRS...",QLJ57227,2020,1367,MT252714,CTTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGT...,19B,1
2,WA,"""MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRS...",QLJ57383,2020,1,MT252753,CCCTTHAACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTT...,19A,1
3,CA,"""MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRS...",QHW06059,2020-01-29,2,MT027064,ATTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGA...,19A,1
4,WA,"""MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRS...",QKS89879,2020-02-29,26,MT627216,TAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATC...,19B,1


In [41]:
# Two-letter state codes grouped by US region.
# NOTE(review): AK, HI and DC are not listed under any region, so a lookup
# for them finds no region -- confirm that this is intended.
US_regions = {"West":["WA", "OR", "ID", "MT", "WY", "CA", "NV", "UT", "CO", "AZ", "NM"],
              "Midwest":["ND", "MN", "WI", "MI", "OH", "IN", "IL", "MO", "IA", "SD", "NE", "KS"],
              "Northeast":["ME", "VT", "NH", "MA","CT", "RI", "NY", "NJ", "PA"],
              # Tennessee is "TN"; it was previously misspelled as "TH".
              "South":["TX", "OK", "AR", "LA", "MS", "TN", "KY", "WV", "MD", "DE", "VA", "NC", "SC", "GA", "AL", "FL"]}

# Inverted lookup: state code -> region name.
mapping = {
    state: region
    for region, states in US_regions.items()
    for state in states
}

# Look up each row's region from its state code and parse the date column
# into pandas datetimes. States missing from `mapping` get a NaN region.
parsed_dates = pd.to_datetime(df['date'])
state_regions = df['state'].map(mapping)
df['region'] = state_regions
df['date'] = parsed_dates

In [56]:
# Load the timespan table and parse each timespan's start date so that the
# beginning of every timespan can be marked on the figure.
# NOTE(review): 'origonal' is presumably the directory's actual on-disk
# spelling -- confirm before renaming.
timespan_path = os.path.join(project_folder, 'data', 'origonal', 'timespan_dates.txt')
timespan_df = pd.read_csv(timespan_path, sep='\t')
timespan_df['start2'] = pd.to_datetime(timespan_df['start'])
timespan_df

Unnamed: 0,timespan,start,end,start2
0,1,"Jan 1, 2020","Mar 20, 2020",2020-01-01
1,2,"Mar 21, 2020","Apr 30, 2020",2020-03-21
2,3,"May 1, 2020","Sep 20, 2020",2020-05-01
3,4,"Sep 21, 2020","Dec 31, 2020",2020-09-21
4,5,"Jan 1, 2021","Jan 31, 2021",2021-01-01
5,6,"Feb 1, 2021","Feb 28, 2021",2021-02-01
6,7,"Mar 1, 2021","Mar 31, 2021",2021-03-01
7,8,"Apr 1, 2021","Apr 30, 2021",2021-04-01
8,9,"May 1, 2021","May 31, 2021",2021-05-01
9,10,"Jun 1, 2021","June 30, 2021",2021-06-01


In [64]:
def get_clade(df, clade):
    """Return the rows of ``df`` whose ``clade`` equals ``clade``.

    The selected rows are ordered by ``count``, largest first.
    """
    is_clade = df['clade'] == clade
    clade_rows = df[is_clade]
    return clade_rows.sort_values('count', ascending=False)
def get_symbol(clade):
    """Pick the marker symbol for a clade name.

    The name is checked for the substrings '19', '20' and '21', in that
    order, and the symbol of the first one found is returned. Returns
    None when none of them occurs in ``clade``.
    """
    symbol_table = (
        ('19', 'diamond'),
        ('20', 'square'),
        ('21', 'circle'),
    )
    for fragment, symbol in symbol_table:
        if fragment in clade:
            return symbol
    return None

def add_clade_trace(df, fig, name):
    """Add one marker trace to ``fig`` with a point for every row of ``df``.

    Each point sits at the row's ``date`` on the x axis and at its
    (``region``, ``state``) pair on the y axis, and is sized by the row's
    ``count``.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to plot; the ``date``, ``region``, ``state``, ``count`` and
        ``p_accession`` columns are read.
    fig : plotly.graph_objects.Figure
        Figure that receives the trace (modified in place).
    name : str
        Trace name used for the legend and the hover text; also passed to
        ``get_symbol`` to choose the marker symbol.
    """
    # Per-point [count, p_accession] pairs made available to the hover template.
    customdf = np.stack((df['count'], df['p_accession']), axis=-1)
    fig.add_trace(
        go.Scatter(
            mode='markers',
            x=df["date"],
            # Two-level y value: outer category is the region, inner is the state.
            y=[df['region'], df["state"]],
            showlegend=True,
            name=name,
            meta=[name],
            customdata=customdf,
            # Hover text typo fixed: "hear after" -> "hereafter".
            hovertemplate="<b>%{customdata[1]}</b><br>%{customdata[0]} reports hereafter<br>%{meta[0]}<extra>%{y[0]}<br>%{y[1]}<br>%{x}<br></extra>",
            marker=dict(
                size=df["count"],
                sizeref=30,
                sizemin=3,
                opacity=0.65,
                symbol=get_symbol(name),
            ),
        )
    )

fig = go.Figure()

# for every clade, add trace (clade names in descending order)
for c in df['clade'].sort_values(ascending = False).unique():
    add_clade_trace(get_clade(df, c), fig, c)

# timespan start lines
for _, row in timespan_df.iterrows():
    # x is passed as epoch milliseconds (presumably because add_vline cannot
    # place an annotation when x is a date value -- TODO confirm).
    # pandas' Timestamp.timestamp() treats a naive timestamp as UTC, so the
    # line lands on midnight of the start date whatever the machine's local
    # timezone is; the previous round-trip through a naive datetime.datetime
    # shifted every line by the local UTC offset.
    fig.add_vline(x = row['start2'].timestamp() * 1000,
                  annotation_text = 'T' + str(row['timespan']), line_width = 1, line_color = 'black')


fig.update_layout(
            title = dict(text = "Clade Emergence in the USA", x= 0.5),
            plot_bgcolor = 'white',
            legend = dict(itemsizing = 'constant', traceorder = "reversed"),
            height = 1000
            )
# y axis is two-level (region, state); regions appear in the order listed below
fig.update_yaxes(
            title = dict(text = 'State'),
            showline = True,
            gridcolor = 'lightgrey',
            categoryorder = 'array',
            categoryarray = ['West', 'Midwest', 'Northeast', 'South'],
            type = 'multicategory',
            showdividers=True,
            tickson = 'labels'
)
fig.update_xaxes(
            title = dict(text = 'Date')
)
# Same mode-bar configuration for the inline figure and the exported HTML.
config = dict(scrollZoom = False, doubleClick = 'reset', displayModeBar = True,
                  modeBarButtonsToRemove=['zoom2d','zoomIn2d', 'zoomOut2d','autoScale2d','lasso2d','select2d'])
fig.show(config = config)
# Build the output path from project_folder, like the input paths above.
fig.write_html(os.path.join(project_folder, 'visualizations', 'clade_emergence.html'), config = config)