### Making Predictions

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


### Getting all the datasets

In [197]:
# Load the per-county predictions and give the columns short, consistent names.
# The unnamed first CSV column is the one renamed to 'date'.
df_pred = (
    pd.read_csv('../clean_data/predictions.csv')
    .rename(columns={
        "Unnamed: 0": "date",
        "7dayrollingavg_newlyconfirmed": "7D_roll_pred",
        'County': 'county',
    })
    # Index by county so the later merges can join on the 'county' index level
    .set_index('county')
)

# Load the date-wise vaccine allocation and keep only the California row.
# The drop is chained (not inplace) because mutating a boolean-filtered frame
# in place is the pattern that triggers pandas' SettingWithCopyWarning.
df_vacc_alloc = pd.read_csv('../clean_data/vaccine_population.csv')
df_vacc_alloc = (
    df_vacc_alloc[df_vacc_alloc['jurisdiction'] == 'California']
    .drop(columns=['2019_population', 'hhs_region', 'total_first_allocation'])
)

# Load the county-level case data and reduce it to a single population figure
# per county (the maximum value recorded for that county), kept as a
# one-column DataFrame indexed by county.
df_county_pop = pd.read_csv('../clean_data/cases_with_mask_use.csv')
df_county_pop = df_county_pop.groupby(by='county')['population'].max().to_frame()

### EDA on county population

In [210]:
# Preview the first rows of the per-county population table built above
df_county_pop.head()

Unnamed: 0_level_0,population
county,Unnamed: 1_level_1
Alameda,1671329
Alpine,1129
Amador,39752
Butte,219186
Calaveras,45905


### EDA on prediction data

In [198]:
# Peek at the first two prediction rows (the frame is indexed by county)
df_pred.head(2)

Unnamed: 0_level_0,date,7D_roll_pred
county,Unnamed: 1_level_1,Unnamed: 2_level_1
Alameda,2020-09-27,5.572982
Alameda,2020-09-28,5.402031


### EDA on vaccination data

In [199]:
# Show the California allocation row: one column per weekly allocation date
df_vacc_alloc

Unnamed: 0,jurisdiction,doses_allocated_12_14,doses_allocated_12_21,doses_allocated_12_28,doses_allocated_01_04,doses_allocated_01_10,doses_allocated_01_18,doses_allocated_01_25
4,California,327600,905625,529675,463450,465325,485800,485800


In [200]:
# Flip the single California row on its side so each allocation date becomes its own row
df_vacc_alloc = df_vacc_alloc.transpose()

In [201]:
# After the transpose the only column is still labelled 4 (the old row label);
# give it a descriptive name
df_vacc_alloc = df_vacc_alloc.rename(columns={4: 'dosage_alloc'})

In [202]:
# Discard the 'jurisdiction' row, which holds the state name rather than a dose count
df_vacc_alloc = df_vacc_alloc.drop(index='jurisdiction')

In [203]:
# Move the row labels out of the index into an ordinary column and call it 'date'
df_vacc_alloc = df_vacc_alloc.reset_index().rename(columns={'index': 'date'})

In [204]:
# Strip the literal 'doses_allocated_' prefix so only the MM_DD part of each label remains
df_vacc_alloc = df_vacc_alloc.assign(
    date=lambda frame: frame['date'].str.replace('doses_allocated_', '', regex=False)
)

In [205]:
# Turn MM_DD into MM-DD (literal replacement) ready for the year prefix
df_vacc_alloc['date'] = df_vacc_alloc['date'].str.replace('_', '-', regex=False)

In [206]:
# lets define a function to update the date
def update_date(in_df):
    """Prefix the year onto the 'MM-DD' strings in ``in_df['date']``, in place.

    Values whose first two characters are '12' get the prefix '2020-';
    every other value gets '2021-'.

    The column is rebuilt with a single vectorised assignment rather than
    chained ``in_df['date'][i] = ...`` writes. Chained writes raise
    SettingWithCopyWarning, do not update the frame under pandas
    Copy-on-Write, and only work when the index labels are exactly
    0..len(in_df)-1.

    Parameters
    ----------
    in_df : pandas.DataFrame
        Frame with a string 'date' column. It is modified in place.

    Returns
    -------
    None

    Notes
    -----
    Not idempotent: calling it again on already-prefixed values adds
    another '2021-' prefix.
    """
    month_day = in_df['date']
    # The first two characters are the month; '12' marks the December (2020) rows
    is_december = month_day.str[:2] == '12'
    in_df['date'] = ('2020-' + month_day).where(is_december, '2021-' + month_day)


In [208]:
# Add the year to every allocation date (modifies df_vacc_alloc in place)
update_date(in_df=df_vacc_alloc)

In [209]:
# Check the result: one row per allocation date with its dosage_alloc value
df_vacc_alloc

Unnamed: 0,date,dosage_alloc
0,2020-12-14,327600
1,2020-12-21,905625
2,2020-12-28,529675
3,2021-01-04,463450
4,2021-01-10,465325
5,2021-01-18,485800
6,2021-01-25,485800


### Merge data sets together

#### create a dataframe that has County, Date, Prediction, County population, vaccine allocation

In [213]:
# Attach each county's population to its prediction rows; the left join keeps
# every prediction row even if no population is found for its county
final = df_pred.merge(df_county_pop, how='left', on='county')

In [214]:
# Preview the predictions with the population column merged in
final.head()

Unnamed: 0_level_0,date,7D_roll_pred,population
county,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Alameda,2020-09-27,5.572982,1671329
Alameda,2020-09-28,5.402031,1671329
Alameda,2020-09-29,5.419126,1671329
Alameda,2020-09-30,4.726777,1671329
Alameda,2020-10-01,4.367782,1671329


In [215]:
# The 7-day rolling average is expressed per 100K residents, so scale it by the
# county population and round to get a whole-number count of new patients
final['7D_roll_pred_tot'] = (
    final['7D_roll_pred']
    .mul(final['population'])
    .div(100_000)
    .round()
)

In [216]:
# Show the merged rows dated 2021-01-26 or later; 'date' holds YYYY-MM-DD
# strings, so the comparison is a plain string comparison
final.loc[final['date'].ge('2021-01-26')]


Unnamed: 0_level_0,date,7D_roll_pred,population,7D_roll_pred_tot
county,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Alameda,2021-01-26,8.328146,1671329,139.0
Alameda,2021-01-27,8.905375,1671329,149.0
Alameda,2021-01-28,8.976390,1671329,150.0
Alameda,2021-01-29,8.986604,1671329,150.0
Alameda,2021-01-30,8.988692,1671329,150.0
...,...,...,...,...
Yuba,2021-02-10,19.384285,78668,15.0
Yuba,2021-02-11,19.384285,78668,15.0
Yuba,2021-02-12,19.384285,78668,15.0
Yuba,2021-02-13,19.384285,78668,15.0


### Append the 'county to population ratio' to merged data set.

In [217]:
# Each county's share of the summed county populations; this share is used
# later to split the statewide dose allocation
total_population = df_county_pop['population'].sum()
df_county_pop['ratio_county_pop'] = df_county_pop['population'] / total_population

In [218]:
# Bring the population share onto every prediction row, joining on county
final_2 = final.merge(df_county_pop['ratio_county_pop'], how="left", on="county")

In [220]:
# Inspect the merged frame, which now carries ratio_county_pop on every row
final_2

Unnamed: 0_level_0,date,7D_roll_pred,population,7D_roll_pred_tot,ratio_county_pop
county,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Alameda,2020-09-27,5.572982,1671329,93.0,0.042299
Alameda,2020-09-28,5.402031,1671329,90.0,0.042299
Alameda,2020-09-29,5.419126,1671329,91.0,0.042299
Alameda,2020-09-30,4.726777,1671329,79.0,0.042299
Alameda,2020-10-01,4.367782,1671329,73.0,0.042299
...,...,...,...,...,...
Yuba,2021-02-10,19.384285,78668,15.0,0.001991
Yuba,2021-02-11,19.384285,78668,15.0,0.001991
Yuba,2021-02-12,19.384285,78668,15.0,0.001991
Yuba,2021-02-13,19.384285,78668,15.0,0.001991


In [222]:
# Split the statewide allocation across counties in proportion to population.
# 485800 is the dosage_alloc value listed for 2021-01-25 in df_vacc_alloc.
CA_DOSES_ALLOCATED_01_25 = 485800
final_2['dosage_allocated_01_25'] = (
    final_2['ratio_county_pop'] * CA_DOSES_ALLOCATED_01_25
).round()

In [223]:
# Doses allocated to the county minus its predicted patient count
# (positive = surplus, negative = shortfall)
final_2['delta'] = final_2['dosage_allocated_01_25'].sub(final_2['7D_roll_pred_tot'])

In [224]:
# Inspect the frame with the dosage_allocated_01_25 and delta columns added
final_2

Unnamed: 0_level_0,date,7D_roll_pred,population,7D_roll_pred_tot,ratio_county_pop,dosage_allocated_01_25,delta
county,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Alameda,2020-09-27,5.572982,1671329,93.0,0.042299,20549.0,20456.0
Alameda,2020-09-28,5.402031,1671329,90.0,0.042299,20549.0,20459.0
Alameda,2020-09-29,5.419126,1671329,91.0,0.042299,20549.0,20458.0
Alameda,2020-09-30,4.726777,1671329,79.0,0.042299,20549.0,20470.0
Alameda,2020-10-01,4.367782,1671329,73.0,0.042299,20549.0,20476.0
...,...,...,...,...,...,...,...
Yuba,2021-02-10,19.384285,78668,15.0,0.001991,967.0,952.0
Yuba,2021-02-11,19.384285,78668,15.0,0.001991,967.0,952.0
Yuba,2021-02-12,19.384285,78668,15.0,0.001991,967.0,952.0
Yuba,2021-02-13,19.384285,78668,15.0,0.001991,967.0,952.0


In [None]:
## Approach to solve the problem
## Step 1: create a function that takes the dosage allocated to California and the date as inputs
## Step 2: calculate the dosage for that week
## Step 3: output the dataframe with County, Week, County infection population, Dosage allocated, Shortfall


### Define a function to predict the dosage allocation for a given week

In [None]:
def pred_dosage(dosage, week_dt):
    """Unfinished helper that takes a dosage figure and a week date.

    NOTE(review): only the argument handling below is visible in this part of
    the notebook; no computation or return statement follows here. Presumably
    this is meant to implement the three-step approach described in the cell
    above; confirm before relying on it.
    """
    # Coerce the dosage to an integer; int() raises if the value cannot be converted
    dsge = int(dosage)
    # Keep the week date under a short local name
    dt = week_dt

    