In [1]:
%%capture
%load_ext autoreload
%autoreload 2
from setup_nb_env import *

from epsampling.utils import load_csv
# pd.set_option('display.float_format', lambda x: '%.3f' % x)
from epsampling.utils import drop_sers_with_nans
from epsampling.utils import date_str_to_int

import os

# Root of the project's data tree; later cells read CSVs from its `processed/`
# subfolder. The EPSAMPLING_DATA_DIR environment variable overrides the default
# so the notebook can run outside the original cluster account.
DATA_DIR = os.environ.get(
    'EPSAMPLING_DATA_DIR',
    '/work/users/k/4/k4thryn/Repos/EpSampling/data/',
)
# Wall-clock time at which this cell ran, formatted as YYYYmmdd-HHMMSS.
# NOTE(review): `datetime` is presumably exported by `setup_nb_env` — confirm.
DT = datetime.today().strftime('%Y%m%d-%H%M%S')

### <font color=blue>Read in the deaths and ACS dataframes.</font>

In [2]:
from epsampling.utils import drop_duplicate_cols

## --- Naive deaths table ---
timestamp = '20241002-004543'
fpath = os.path.join(DATA_DIR,'processed', f'processed_naive_deaths_{timestamp}.csv')

## Load, discard every row that contains a NaN, then keep only the rows
## whose incident-death count is non-negative.
df_deaths = (
    pd.read_csv(fpath)
    .dropna()
    .loc[lambda frame: frame.True_inc_deaths >= 0]
)

## --- ACS table, indexed by Fips ---
timestamp = '20241001-223952'
fpath = os.path.join(DATA_DIR,'processed',f'formatted_acs_results_{timestamp}.csv')

## Read with Fips as the index, then strip duplicate columns via the project helper.
df_acs = drop_duplicate_cols(pd.read_csv(fpath,index_col='Fips'))

### <font color=blue>Remove highly correlated features from the ACS dataframe.</font>

In [8]:
## Prune ACS features that are near-duplicates of an earlier column.

corr_thresh = 0.95

## Absolute pairwise correlations between all ACS columns.
corr_matrix = df_acs.corr().abs()
## Keep only the strict upper triangle (k=1 excludes the diagonal) so every pair
## is inspected once and a column is compared only with the columns before it.
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(upper_mask)
## A column is flagged when it correlates above corr_thresh with any earlier column.
is_redundant = (upper > corr_thresh).any(axis=0)
to_drop = is_redundant[is_redundant].index.tolist()
## Build the filtered frame and rebind df_acs to it.
df_filt = df_acs.drop(columns=to_drop)
df_acs = df_filt

### <font color=blue>Join the naive deaths dataframe with the filtered ACS dataframe.</font>

In [9]:
## Attach the filtered ACS features to every deaths row that shares its Fips.
## merge defaults to an inner join, so rows whose Fips is absent from either
## table are not carried into df.
df = pd.merge(df_deaths, df_acs, on='Fips')
display(df)

Unnamed: 0,Fips,State_fips,Pop,State_pop,Ratio,Date,Proj_inc_deaths,True_cum_deaths,Cum_deaths_tm1,True_inc_deaths,...,HU_UISOTHER,HU_UIS0304,OCC_NRCM_FFF_x11,HU_VAC,IND_AFFHM,POP_OTH2PLNH,IND_PUBA_x7,POP_ASIANNH,POP_AIANNH,POP_AIAN
0,1001,1,58239,4997675,0.011653,20200620,83.302156,9.0,6.0,3.0,...,4200,400,145,2314,200,1684,2363,647,98,98
1,1001,1,58239,4997675,0.011653,20200627,68.179357,12.0,9.0,3.0,...,4200,400,145,2314,200,1684,2363,647,98,98
2,1001,1,58239,4997675,0.011653,20200704,76.238975,13.0,12.0,1.0,...,4200,400,145,2314,200,1684,2363,647,98,98
3,1001,1,58239,4997675,0.011653,20200711,88.362530,15.0,13.0,2.0,...,4200,400,145,2314,200,1684,2363,647,98,98
4,1001,1,58239,4997675,0.011653,20200718,114.279381,21.0,15.0,6.0,...,4200,400,145,2314,200,1684,2363,647,98,98
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
306758,56045,56,6891,576641,0.011950,20220409,8.000000,18.0,18.0,0.0,...,666,55,10,677,752,455,179,52,88,90
306759,56045,56,6891,576641,0.011950,20220416,5.000000,18.0,18.0,0.0,...,666,55,10,677,752,455,179,52,88,90
306760,56045,56,6891,576641,0.011950,20220423,4.000000,18.0,18.0,0.0,...,666,55,10,677,752,455,179,52,88,90
306761,56045,56,6891,576641,0.011950,20220430,4.000000,18.0,18.0,0.0,...,666,55,10,677,752,455,179,52,88,90


### <font color=blue>Random forest.</font>

In [10]:
def get_chunks(lst, n):
    """Split ``lst`` into consecutive slices of length ``n``.

    The final slice is shorter when ``len(lst)`` is not a multiple of ``n``;
    an empty ``lst`` yields an empty list.
    """
    starts = range(0, len(lst), n)
    return [lst[start:start + n] for start in starts]

In [14]:
## Hold out one block of 4 dates for testing and train on every other date.
## The dates are sorted before chunking: unique() returns values in order of
## first appearance, which is chronological only if the leading rows of df
## happen to cover every date.
all_dates = sorted(df['Date'].unique())
chunks = get_chunks(all_dates, 4)

## TODO: loop over several held-out blocks (e.g. 2, 10, 18) for cross-validation.
idx = 18  # position of the held-out block within chunks; 10 was the noted alternative

dates_test = chunks[idx]
held_out = set(dates_test)
dates_train = [d for d in all_dates if d not in held_out]

## Every row belongs to exactly one side of the split.
is_test = df['Date'].isin(dates_test)
df_train = df[~is_test]
df_test = df[is_test]

## Model inputs: all retained ACS columns plus the projected incident deaths
## and the Cum_deaths_tm1 column.
feats = list(df_acs.columns) + ['Proj_inc_deaths','Cum_deaths_tm1']

X_train = df_train[feats]
X_test = df_test[feats]

y_train = df_train['True_inc_deaths']
y_test = df_test['True_inc_deaths']

## Baseline predictions for the same test rows, used for comparison later.
y_naive = df_test['Naive_inc_deaths']

In [15]:
from sklearn.ensemble import RandomForestRegressor

## Depth-limited random forest (max_depth=6) with a fixed random_state.
## n_jobs=-1 lets scikit-learn build the trees in parallel on all available cores.
## Author's earlier note on the single-process fit: roughly 2 min of training
## for a random split at test_size 0.1.
clf = RandomForestRegressor(random_state=666, max_depth=6, n_jobs=-1)
clf.fit(X_train, y_train)

In [16]:
from epsampling.utils import get_performance

## Forest predictions for the held-out dates.
y_pred = clf.predict(X_test)

## Score the forest and the naive baseline against the true incident deaths.
## y_naive is also passed as get_performance's final argument.
## NOTE(review): presumably that argument is the reference for 'relMAE' — confirm
## in epsampling.utils. In the saved output, relMAE for Deaths_pred (3.82) is not
## MAE_pred / MAE_naive (about 1.13), so check how relMAE is defined.
preds_by_model = {'Deaths_pred': y_pred, 'Deaths_naive': y_naive}
model_names = list(preds_by_model)
model_preds = list(preds_by_model.values())

metrics_dict = get_performance(model_names, model_preds, y_test, y_naive)
metrics_dict

{'Deaths_pred': {'MAE': 2.058096492279519,
  'MSE': 25.55086421669854,
  'r2': 0.5468124714309353,
  'relMAE': 3.8210880945779957},
 'Deaths_naive': {'MAE': 1.8284486014710162,
  'MSE': 22.9026792245397,
  'r2': 0.59378248393665,
  'relMAE': 1.0}}

## <font color=magenta>Next steps (TODO): Look at variance across counties? Set up a for loop for cross-validation. Figure out whether training can run faster. Add mobility data? Look at feature importance?</font>