We want to find the top clade in each state for each timespan and plot it on a map of the US states.

In [1]:
# set up
import pandas as pd
import plotly.express as px
import os

# Project root relative to this notebook: two directory levels up.
project_folder = os.path.join(os.pardir, os.pardir)

In [2]:
# Load the combined records from <project_folder>/data/final/all.csv and
# preview the first rows.
df = pd.read_csv(
    os.path.join(project_folder, "data", "final", "all.csv")
)
df.head()

Unnamed: 0,state,p_sequence,p_accession,date,count,n_accession,n_sequence,clade,timespan
0,MA,"""MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRS...",QTP71261,2020,2,MW885877,GGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTA...,20A,1
1,WA,"""MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRS...",QLJ57227,2020,1367,MT252714,CTTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGT...,19B,1
2,WA,"""MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRS...",QLJ57383,2020,1,MT252753,CCCTTHAACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTT...,19A,1
3,CA,"""MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRS...",QHW06059,2020-01-29,2,MT027064,ATTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGA...,19A,1
4,WA,"""MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRS...",QKS89879,2020-02-29,26,MT627216,TAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATC...,19B,1


In [3]:
def transform_to_top_clade(timeperiod_df, single_timespan = False):
    """Keep, for every state (and timespan), the clade with the largest total count.

    Counts are first summed per (state, timespan, clade) -- or per
    (state, clade) when ``single_timespan`` is true. Within each state
    (and timespan) only the clade rows whose summed count equals the
    group's maximum are kept.

    Parameters
    ----------
    timeperiod_df : pandas.DataFrame
        Needs the columns ``state``, ``clade`` and ``count``, plus
        ``timespan`` unless ``single_timespan`` is true. It is not modified.
    single_timespan : bool, default False
        When true, ``timespan`` is ignored and the top clade is determined
        per state only.

    Returns
    -------
    pandas.DataFrame
        Columns ``state``, ``timespan`` (omitted when ``single_timespan``
        is true), ``count`` and ``clade``. The result is sorted by
        ``timespan`` unless ``single_timespan`` is true.

    Notes
    -----
    Ties are not resolved yet: if several clades share the maximum count in
    a group, every tied clade is returned as its own row.
    """
    if single_timespan:
        group_columns = ['state']
    else:
        group_columns = ['state', 'timespan']

    # Total count of each clade within every state (and timespan).
    clade_totals = timeperiod_df.groupby(
        by=group_columns + ['clade'], as_index=False
    ).agg({"count": "sum"})

    # Largest clade total within every state (and timespan).
    group_maxima = clade_totals.groupby(
        by=group_columns, as_index=False
    ).agg({"count": "max"})

    # Join back on the group keys plus the maximum count to recover the clade
    # name(s). The keys are spelled out as a new list: `list.append` returns
    # None, so it must not be used inline to build the `on=` argument.
    top_clades = group_maxima.merge(
        clade_totals, how='inner', on=group_columns + ['count']
    )

    if single_timespan:
        return top_clades
    return top_clades.sort_values(by='timespan')

In [4]:
# Colour palette: px.colors.diverging.Spectral blended.
# NOTE(review): despite the name, this list holds 23 colours, not 20 -- confirm
# whether the extra entries are intentional.
rainbow_20 = [
    'rgb(158,1,66)',
    'rgb(186,32,73)',
    'rgb(213,62,79)',
    'rgb(229,86,73)',
    'rgb(244,109,67)',
    'rgb(249,142,82)',
    'rgb(253,174,97)',
    'rgb(254,199,118)',
    'rgb(254,224,139)',
    'rgb(255,240,165)',
    'rgb(255,255,191)',
    'rgb(243,250,172)',
    'rgb(230,245,152)',
    'rgb(201,233,158)',
    'rgb(171,221,164)',
    'rgb(137,208,165)',
    'rgb(102,194,165)',
    'rgb(76,165,177)',
    'rgb(50,136,189)',
    'rgb(72,108,176)',
    'rgb(94,79,162)',
    'rgb(67,56,116)',
    'rgb(40,34,69)',
]

# Clade labels, kept in sorted order.
clades = sorted([
    '19A',
    '19B',
    '20A',
    '20B',
    '20C',
    '20G',
    '20E (EU1)',
    "21C (Epsilon)",
    "20D",
    "20J (Gamma, V3)",
    "21F (Iota)",
    "20I (Alpha, V1)",
    "21B (Kappa)",
    "21D (Eta)",
    "21G (Lambda)",
    "21H (Mu)",
    "21A (Delta)",
    "21I (Delta)",
    "21J (Delta)",
    "20H (Beta, V2)",
])

In [9]:
# https://plotly.com/python-api-reference/generated/plotly.express.scatter_geo
def plot_top_clade_bubble(per_capita = False, population_dictionary = None, save = False):
    """Show an animated US bubble map of the top clade per state and timespan.

    The top clades are computed from the notebook-level ``df`` with
    ``transform_to_top_clade``. Each state gets one bubble per top clade,
    coloured by clade and animated over ``timespan``.

    Parameters
    ----------
    per_capita : bool, default False
        When false, bubbles are sized by the raw ``count``. When true, they
        are sized by ``count`` per 100,000 people, using
        ``population_dictionary``.
    population_dictionary : mapping, optional
        Maps the values of the ``state`` column to a population. Only used
        when ``per_capita`` is true. States missing from the mapping get a
        per-capita value of 0. Defaults to an empty mapping.
    save : str or False, default False
        When truthy, the figure is also written to
        ``<project_folder>/visualizations/Top Clade Bubble Maps/<save>.html``.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    # Use a None sentinel instead of a shared mutable `{}` default.
    if population_dictionary is None:
        population_dictionary = {}

    temp_df = transform_to_top_clade(df)
    # The 'per_capita' column always drives the bubble size; it equals the raw
    # count unless per-capita scaling is requested below.
    temp_df['per_capita'] = temp_df['count']
    title = "Top clade (most recordings) for each state and timestamp"
    hover_data_dict = {'clade':True, 'count':True, 'state':False, 'timespan':False, 'per_capita':False}
    if per_capita:
        temp_df['per_capita'] = (temp_df['count'] / temp_df['state'].map(population_dictionary) )* 100000
        # States without a population entry map to NaN; size them as 0.
        temp_df['per_capita'] = temp_df['per_capita'].fillna(0)
        hover_data_dict['per_capita'] = True
        title = "Top clade Per Capita (most recordings per 100,000 people) for each state and timestamp" 
    fig = px.scatter_geo(temp_df, locations="state", color="clade", 
                         size="per_capita", size_max = 60,
                         hover_name="state", hover_data = hover_data_dict,
                         animation_frame="timespan",
                         locationmode="USA-states", scope='usa', projection='albers usa', 
                         title = title,
                         category_orders = {"clade":clades}, color_discrete_sequence = rainbow_20)
    fig.show()
    if save:
        output_folder = os.path.join(project_folder, 'visualizations', 'Top Clade Bubble Maps')
        # write_html does not create missing parent folders, so make sure the
        # target folder exists before writing.
        os.makedirs(output_folder, exist_ok=True)
        fig.write_html(os.path.join(output_folder, save + ".html"))

In [11]:
# Raw-count bubble map, also saved under the file name "regular".
plot_top_clade_bubble(save="regular")

Add a per-capita version: size each dot by the count per 100,000 people.

In [7]:
# Read the cleaned state population table.
population_df = pd.read_csv(
    os.path.join(project_folder, "data", "population", "clean_state_population.csv")
)
# Lookup from postal code (key) to population (value).
population_dictionary = dict(
    zip(population_df['Postal Code'], population_df['Population'])
)

In [12]:
# Per-capita bubble map, also saved under the file name "per_capita".
plot_top_clade_bubble(
    per_capita=True,
    population_dictionary=population_dictionary,
    save="per_capita",
)

Next step: see whether a pie chart can show the top 2 clades and "others" as percentages of total reports, with the pie radius scaled by the number of reports.