In [1]:
%%capture
%load_ext autoreload
%autoreload 2
from setup_nb_env import *

from epsampling.utils import load_csv
# pd.set_option('display.float_format', lambda x: '%.3f' % x)
from epsampling.utils import drop_sers_with_nans
from epsampling.utils import date_str_to_int

# Root directory of the project's data files; later cells read CSVs from its
# 'processed' subfolder.
# NOTE(review): this is an absolute, user-specific path — adjust it when running elsewhere.
DATA_DIR = '/work/users/k/4/k4thryn/Repos/EpSampling/data/'
# Timestamp of when this cell was executed, formatted as YYYYMMDD-HHMMSS.
# `datetime` is not imported explicitly in this cell; presumably it is provided by
# `from setup_nb_env import *` — TODO confirm.
DT = datetime.today().strftime('%Y%m%d-%H%M%S')

### <font color=blue>Read in the deaths df and the ACS df.</font>

In [2]:
from epsampling.utils import drop_duplicate_cols

## ---------- Naive deaths table ----------
timestamp = '20241002-004543'
fpath = os.path.join(DATA_DIR, 'processed', f'processed_naive_deaths_{timestamp}.csv')

## Load the file and discard every row that contains a NaN.
df_deaths = pd.read_csv(fpath).dropna()

## Keep only samples whose true incident deaths are non-negative.
df_deaths = df_deaths.loc[df_deaths['True_inc_deaths'] >= 0]

## ---------- ACS results table ----------
## Note: `timestamp` and `fpath` are reused; after this cell they refer to the ACS file.
timestamp = '20241001-223952'
fpath = os.path.join(DATA_DIR, 'processed', f'formatted_acs_results_{timestamp}.csv')

## Read with Fips as the index, then remove duplicate columns.
df_acs = drop_duplicate_cols(pd.read_csv(fpath, index_col='Fips'))

### <font color=blue>Remove highly correlated features from the ACS df.</font>

In [3]:
## Drop ACS features that are highly correlated with an earlier feature.

corr_thresh = 0.95

## Absolute pairwise correlations between the ACS columns.
corr_matrix = df_acs.corr().abs()

## Keep only the strict upper triangle (everything else becomes NaN), so every
## pair of columns is looked at exactly once.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1))

## A column is flagged when its correlation with any column before it exceeds corr_thresh.
to_drop = [name for name in upper.columns if (upper[name] > corr_thresh).any()]

## Remove the flagged columns; df_acs now refers to the filtered frame.
df_filt = df_acs.drop(columns=to_drop)
df_acs = df_filt

### <font color=blue>Join the naive deaths df with the filtered ACS df.</font>

In [4]:
## Attach the ACS features to every deaths row of the same county.
## 'Fips' is a column of df_deaths and the index name of df_acs; with merge's
## default how='inner', only counties present in both tables are kept.
df = pd.merge(df_deaths, df_acs, on='Fips')
display(df)

Unnamed: 0,Fips,State_fips,Pop,State_pop,Ratio,Date,Proj_inc_deaths,True_cum_deaths,Cum_deaths_tm1,True_inc_deaths,...,HU_UISOTHER,HU_UIS0304,OCC_NRCM_FFF_x11,HU_VAC,IND_AFFHM,POP_OTH2PLNH,IND_PUBA_x7,POP_ASIANNH,POP_AIANNH,POP_AIAN
0,1001,1,58239,4997675,0.011653,20200620,83.302156,9.0,6.0,3.0,...,4200,400,145,2314,200,1684,2363,647,98,98
1,1001,1,58239,4997675,0.011653,20200627,68.179357,12.0,9.0,3.0,...,4200,400,145,2314,200,1684,2363,647,98,98
2,1001,1,58239,4997675,0.011653,20200704,76.238975,13.0,12.0,1.0,...,4200,400,145,2314,200,1684,2363,647,98,98
3,1001,1,58239,4997675,0.011653,20200711,88.362530,15.0,13.0,2.0,...,4200,400,145,2314,200,1684,2363,647,98,98
4,1001,1,58239,4997675,0.011653,20200718,114.279381,21.0,15.0,6.0,...,4200,400,145,2314,200,1684,2363,647,98,98
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
306758,56045,56,6891,576641,0.011950,20220409,8.000000,18.0,18.0,0.0,...,666,55,10,677,752,455,179,52,88,90
306759,56045,56,6891,576641,0.011950,20220416,5.000000,18.0,18.0,0.0,...,666,55,10,677,752,455,179,52,88,90
306760,56045,56,6891,576641,0.011950,20220423,4.000000,18.0,18.0,0.0,...,666,55,10,677,752,455,179,52,88,90
306761,56045,56,6891,576641,0.011950,20220430,4.000000,18.0,18.0,0.0,...,666,55,10,677,752,455,179,52,88,90


### <font color=blue>Random forest.</font>

In [5]:
def get_chunks(lst, n):
    """Split ``lst`` into consecutive pieces of length ``n``.

    The pieces are returned as a list, in order. The final piece is shorter
    than ``n`` when ``len(lst)`` is not a multiple of ``n``; an empty ``lst``
    gives an empty list.
    """
    return [lst[start:start + n] for start in range(0, len(lst), n)]

In [8]:
## Hold out one block of 4 consecutive report dates for testing; train on all other dates.
##
## Series.unique() returns values in order of first appearance, NOT sorted. Rows were
## dropped upstream (NaNs, negative incident deaths), so a date missing from the first
## county can show up out of chronological order. Sort explicitly so that every chunk
## is a contiguous time window.
dates_all = sorted(df.Date.unique())
chunks = get_chunks(dates_all, 4)

## Index of the held-out chunk (other values tried previously: 2, 10).
idx = 18

dates_test = chunks[idx]
dates_train = [d for d in dates_all if d not in dates_test]

df_train = df[df.Date.isin(dates_train)]
df_test = df[df.Date.isin(dates_test)]

## Model inputs: every column of the filtered ACS frame plus five additional columns.
feats = list(df_acs.columns) + ['Proj_inc_deaths','Cum_deaths_tm1',
                                'Naive_inc_deaths','Pop','Ratio']

X_train = df_train[feats]
X_test = df_test[feats]

## Regression target.
y_train = df_train['True_inc_deaths']
y_test = df_test['True_inc_deaths']

## Naive forecast on the test rows, used as the comparison baseline.
y_naive = df_test['Naive_inc_deaths']

In [9]:
from sklearn.ensemble import RandomForestRegressor

## Timing note carried over from the original cell: training on a random split
## with test_size=0.1 took about 2 min (approximate).
## Fixed random_state so that repeated fits give the same forest.
clf = RandomForestRegressor(max_depth=6, random_state=666)
clf.fit(X_train, y_train)

In [10]:
from epsampling.utils import get_performance

## Random-forest predictions for the held-out dates.
y_pred = clf.predict(X_test)

## Score the forest and the naive forecast against the same targets; the naive
## forecast is also passed as the last argument of get_performance.
## NOTE(review): in the recorded output relMAE for 'Deaths_pred' is ~2.02 even though
## its MAE is lower than the naive MAE — confirm how get_performance defines relMAE.
model_names, model_preds = ['Deaths_pred', 'Deaths_naive'], [y_pred, y_naive]

metrics_dict = get_performance(model_names, model_preds, y_test, y_naive)
metrics_dict

{'Deaths_pred': {'MAE': 1.8195914748446056,
  'MSE': 21.84015502893068,
  'RMSE': 4.673345164754116,
  'r2': 0.612628136677358,
  'relMAE': 2.0214995761123475},
 'Deaths_naive': {'MAE': 1.8284486014710162,
  'MSE': 22.9026792245397,
  'RMSE': 4.78567437510532,
  'r2': 0.59378248393665,
  'relMAE': 1.0}}

### <font color=blue>Do _naive_ metrics vary a lot across counties?</font>

In [12]:
from epsampling.utils import get_performance
import pprint

## Score the naive forecast separately for every county (Fips).
##
## Loop-local names are deliberately distinct from y_test / y_naive / metrics_dict /
## model_names / model_preds, which hold the random-forest test-split values.
## Overwriting them here would make a later re-run of the evaluation cell silently
## use the last county's arrays.
county_res_dict = {}

fipss = df.Fips.unique()

## groupby(sort=False) visits counties in order of first appearance (the same order
## as Fips.unique()) in a single pass, instead of re-filtering the full frame once
## per county.
for fips, df_fips in tqdm(df.groupby('Fips', sort=False), total=len(fipss)):

    naive_fips = df_fips['Naive_inc_deaths'].values
    true_fips = df_fips['True_inc_deaths'].values

    county_metrics = get_performance(['Naive_inc_deaths'], [naive_fips],
                                     true_fips, naive_fips)
    ## Only one model was passed, so take its single metrics entry.
    county_res_dict[fips] = list(county_metrics.values())[0]

## Long format first: one row per (county, metric) pair.
df_res = pd.DataFrame([[county, metric, value]
                       for county, d in county_res_dict.items()
                       for metric, value in d.items()],
                      columns=['Fips', 'metric', 'value'])

## Reshape to one row per county and one column per metric.
## relMAE is dropped; presumably it is uninformative here because the naive
## forecast is being compared against itself.
df_res = (df_res.pivot(index='Fips', columns='metric', values='value')
                .drop('relMAE', axis=1)
                .reset_index())
df_res.columns.name = None
df_res.index.name = None
df_res

100%|██████████| 3131/3131 [00:05<00:00, 528.54it/s]


Unnamed: 0,Fips,MAE,MSE,RMSE,r2
0,1001,1.442469,4.288161,2.070788,0.300375
1,1003,5.212893,65.629623,8.101211,0.171386
2,1005,0.897593,4.181049,2.044761,0.047110
3,1007,0.966661,5.255706,2.292533,0.023673
4,1009,1.489784,5.462142,2.337123,0.373556
...,...,...,...,...,...
3126,56037,1.002291,2.748585,1.657886,0.312222
3127,56039,0.580852,0.667827,0.817207,-2.952518
3128,56041,0.537657,0.670594,0.818898,-0.011463
3129,56043,0.472109,0.945976,0.972613,0.063673


In [14]:
## For each per-county metric, print its variance (rounded to 3 decimals)
## followed by the describe() summary of the column.
report = []
for metric in ['MAE', 'MSE', 'RMSE', 'r2']:
    col = df_res[metric]
    report.append(f'\n\n* {metric} * \nvariance: {round(col.var(),3)}\n')
    report.append(col.describe())

## A single print call with all pieces keeps the default ' ' separator between them.
print(*report)



* MAE * 
variance: 12.723
 count    3131.000000
mean        1.732063
std         3.566985
min         0.000515
25%         0.499713
50%         0.882941
75%         1.674890
max       101.942214
Name: MAE, dtype: float64 

* MSE * 
variance: 929859.891
 count    3.131000e+03
mean     6.701559e+01
std      9.642924e+02
min      4.295054e-07
25%      6.684872e-01
50%      2.087512e+00
75%      7.961485e+00
max      4.629140e+04
Name: MSE, dtype: float64 

* RMSE * 
variance: 57.42
 count    3131.000000
mean        3.100570
std         7.577625
min         0.000655
25%         0.817611
50%         1.444822
75%         2.821610
max       215.154354
Name: RMSE, dtype: float64 

* r2 * 
variance: 0.136
 count    3131.000000
mean        0.088794
std         0.368472
min        -8.753772
25%         0.000000
50%         0.086853
75%         0.219057
max         0.867657
Name: r2, dtype: float64
