In [1]:
import pickle
import random
import math
import warnings
import sklearn
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from datetime import datetime
from datetime import timedelta
%matplotlib inline
# plt.matplotlib.rcParams.update({'font.size': 50})
# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas / scikit-learn deprecation warnings - confirm that is intended.
warnings.filterwarnings('ignore')
plt.rcParams['figure.figsize'] = (12.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'  # default interpolation for image plots
plt.rcParams['image.cmap'] = 'gray'   # default colormap for image plots
plt.rcParams["patch.force_edgecolor"] = False  # do not force edges on patches (bars, histograms)
plt.rc('figure', titlesize=25)  # font size of figure-level titles

In [93]:
# Pickle file that the data-preparation cell loads with pd.read_pickle.
PICKLE_PATH = '../augmented_datasets/pickles/hopkins_confirmed_for_regression0904.pkl'
# Template for the Graphviz files written by dt(); '{0}' is replaced by '<label>_run<fold index>'.
OUTPUT_TREE_PATH = '../products/decision_tree/decision_tree_{0}.dot'
# Depth limit passed to DecisionTreeRegressor in decision_tree_train().
MAX_DT_DEPTH = 5
# (low, high) quantile bounds for outlier removal.
# NOTE(review): not referenced in the visible cells - regression() and dt() default to the same literal.
OUTLIER_QUARTILES = (0.02, 0.98) # Min, max quartiles

In [158]:
from sklearn.model_selection import KFold
from sklearn import tree, metrics

def train_test_split(X, n_splits=5):
    '''
    Produces shuffled K-fold train/test row indices for X.
    :param X: array-like of samples, one row per sample, e.g. np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
    :param n_splits: number of folds.
    :return: (train_folds, test_folds) - two lists of length n_splits; element i of each list holds
    the row indices used for training / testing in round i. The shuffle uses a fixed seed.
    '''
    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=2346)

    # Materialise the (train, test) index pairs once, then unzip them into two lists.
    fold_pairs = list(splitter.split(X))
    train_folds = [train_rows for train_rows, _ in fold_pairs]
    test_folds = [test_rows for _, test_rows in fold_pairs]

    return train_folds, test_folds


def decision_tree_train(X_train, y_train, random_state=2346):
    '''
    Fits a decision-tree regressor whose depth is capped at MAX_DT_DEPTH.
    :param X_train: training features, one row per sample.
    :param y_train: training targets.
    :param random_state: seed for the tree builder. scikit-learn permutes the features at every
    split, so without a seed equally good splits can be resolved differently between runs.
    The default is the same seed value the K-fold splitter in this notebook uses.
    :return: the fitted DecisionTreeRegressor.
    '''
    regressor = tree.DecisionTreeRegressor(max_depth=MAX_DT_DEPTH, random_state=random_state)
    return regressor.fit(X_train, y_train)


def linear_regression_train(X_train, y_train):
    '''
    Fits an ordinary least-squares linear regression.
    :param X_train: training features, one row per sample.
    :param y_train: training targets.
    :return: the fitted LinearRegression model.
    '''
    # Import the estimator explicitly: `import sklearn` alone does not guarantee that the
    # `sklearn.linear_model` submodule has been loaded, so attribute access on the package
    # only works when some other import happened to pull it in.
    from sklearn.linear_model import LinearRegression

    return LinearRegression().fit(X_train, y_train)

def remove_outlier(df, range_, columns):
    '''
    Drops the rows whose value in any of `columns` lies outside a quantile band.
    :param df: pandas DataFrame; it is not modified.
    :param range_: (low, high) quantiles, e.g. (0.02, 0.98).
    :param columns: names of the numeric columns to filter on.
    :return: new DataFrame holding the rows that are strictly between the low and high quantile
    in every listed column. Rows with a NaN in any column are dropped as well.
    '''
    low, high = range_[0], range_[1]
    columns = list(columns)
    if not columns:
        return df.dropna()

    # Quantiles are only needed for the filtered columns. Asking for them on the whole frame
    # fails in recent pandas as soon as the frame contains a non-numeric column.
    bounds = df[columns].quantile([low, high])

    # Build one row mask instead of re-assigning filtered columns one by one.
    keep = pd.Series(True, index=df.index)
    for col in columns:
        inside = (df[col] > bounds.loc[low, col]) & (df[col] < bounds.loc[high, col])
        keep &= inside

    return df[keep].dropna()

def _cross_validate(data_, feature_cols, label_cols, outlier_quartiles, model_fn,
                    n_splits=5, export_trees=False):
    '''
    Shared K-fold evaluation loop behind regression() and dt().
    :param data_: DataFrame containing the feature columns, the label columns and 'avg_interval_tmp'.
    :param feature_cols: list of feature column names.
    :param label_cols: list of label column names; each label is evaluated independently.
    :param outlier_quartiles: (low, high) quantiles handed to remove_outlier().
    :param model_fn: callable (X_train, y_train) -> fitted model exposing predict().
    :param n_splits: number of cross-validation rounds.
    :param export_trees: when True, every fitted model is written to OUTPUT_TREE_PATH as Graphviz.
    :return: (summary, regr_rslt). regr_rslt is indexed by (run, mse) with run in 1..n_splits and
    mse in 'test'/'train'/'naive' and holds the mean squared error per label. summary holds the
    average of (naive - test) and (naive - train) per label. Labels that could not be split
    stay NaN in both frames.
    '''
    metric_fn = metrics.mean_squared_error
    label_cols = list(label_cols)

    idx = pd.MultiIndex.from_product(
        [range(1, n_splits + 1), ['test', 'train', 'naive']],
        names=('run', 'mse'))
    regr_rslt = pd.DataFrame(np.nan, index=idx, columns=label_cols)
    summary = pd.DataFrame(np.nan,
                           index=['avg(naive - test)', 'avg(naive - train)'],
                           columns=label_cols)

    for label_col in label_cols:
        # Keep strictly positive labels only, then trim outliers of the label and the temperature.
        data = data_[data_[label_col] > 0].copy()
        data = remove_outlier(data, outlier_quartiles, [label_col, 'avg_interval_tmp'])

        X, y = np.array(data[feature_cols]), np.array(data[[label_col]])
        # Split the data into train, test for n_splits train-test rounds
        try:
            train_folds, test_folds = train_test_split(X, n_splits=n_splits)
        except ValueError:
            # Raised when there are fewer rows than folds: skip the label and leave its column NaN.
            continue

        # Train-test the model for each of the n_splits:
        for fold, (train_index, test_index) in enumerate(zip(train_folds, test_folds)):
            run = fold + 1
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            trained = model_fn(X_train, y_train)

            if export_trees:
                # File suffix uses the 0-based fold index. Render with: dot -Tpng tree.dot -o tree.png
                suffix = label_col + '_run' + str(fold)
                tree.export_graphviz(trained, OUTPUT_TREE_PATH.format(suffix),
                                     feature_names=feature_cols)

            # Write with a single .loc call: the chained form `df.loc[a, b][col] = v` assigns to a
            # temporary and is lost under pandas Copy-on-Write.
            # evaluate on test
            y_pred = trained.predict(X_test)
            regr_rslt.loc[(run, 'test'), label_col] = metric_fn(y_test, y_pred)

            # Compare to train data
            y_pred_train = trained.predict(X_train)
            regr_rslt.loc[(run, 'train'), label_col] = metric_fn(y_train, y_pred_train)

            # Compare to a naive mean-value model:
            y_pred_naive = np.full(y_pred.shape, y_train.mean())
            regr_rslt.loc[(run, 'naive'), label_col] = metric_fn(y_test, y_pred_naive)

    for col in regr_rslt.columns:
        naive = regr_rslt[col].xs('naive', level='mse')
        test = regr_rslt[col].xs('test', level='mse')
        train = regr_rslt[col].xs('train', level='mse')
        summary.loc['avg(naive - test)', col] = (naive - test).mean()
        summary.loc['avg(naive - train)', col] = (naive - train).mean()

    return summary, regr_rslt


def regression(data_, feature_cols, label_cols, outlier_quartiles=(0.02, 0.98)):
    '''
    5-fold cross-validated linear regression for every label column.
    :param data_: DataFrame containing the feature columns, the label columns and 'avg_interval_tmp'.
    :param feature_cols: list of feature column names.
    :param label_cols: list of label column names.
    :param outlier_quartiles: (low, high) quantiles used to trim outliers before fitting.
    :return: (summary, regr_rslt) - per-label average MSE gain over a mean-value baseline, and
    the per-round test / train / naive MSE table.
    '''
    return _cross_validate(data_, feature_cols, label_cols, outlier_quartiles,
                           linear_regression_train)


def dt(data_, feature_cols, label_cols, outlier_quartiles=(0.02, 0.98)):
    '''
    5-fold cross-validated decision-tree regression for every label column.
    Each fitted tree is also exported to OUTPUT_TREE_PATH as a Graphviz file.
    :param data_: DataFrame containing the feature columns, the label columns and 'avg_interval_tmp'.
    :param feature_cols: list of feature column names.
    :param label_cols: list of label column names.
    :param outlier_quartiles: (low, high) quantiles used to trim outliers before fitting.
    :return: (summary, regr_rslt) - per-label average MSE gain over a mean-value baseline, and
    the per-round test / train / naive MSE table.
    '''
    return _cross_validate(data_, feature_cols, label_cols, outlier_quartiles,
                           decision_tree_train, export_trees=True)



###### Prepare data

Some of the `first_7` values are NaN (samples where n > 20 but 7 days have not yet passed).

Some of the `Province_State` values are NaN (not all countries have provinces).

In [97]:
# NOTE(review): unpickling can execute arbitrary code - only load pickle files from a trusted source.
hopkins_confirmed = pd.read_pickle(PICKLE_PATH)

# Report every column that has NaNs in its 'data' rows, and remember which coordinates are
# affected for the two columns of interest.
nan_countries = []
nan_first_7 = []
for column in hopkins_confirmed.columns:
    nulls = hopkins_confirmed[column].loc[:, 'data'].isna().tolist()
    any_nan = any(nulls)
    if any_nan:
        size = sum(nulls)
        print('{0}: {1}, {2}'.format(column, any_nan, size))
        for i, is_null in enumerate(nulls):
            # Record only the rows that are actually NaN (previously every row was recorded).
            if not is_null:
                continue
            # NOTE(review): the positional lookup assumes every row of the frame is a 'data' row - confirm.
            if column == 'Country_Region':
                nc = hopkins_confirmed['Country_Region'].index[i][0]
                nan_countries.append(nc)
            if column == 'first_7':
                nc = hopkins_confirmed['first_7'].index[i][0]
                nan_first_7.append(nc)

# Rows are removed with dropna() below rather than by coordinate, so the lists above are informational.
hopkins_confirmed['Province_State'] = hopkins_confirmed['Province_State'].fillna('')
hopkins_confirmed.dropna(inplace=True)

# Normalize first_7: (x + 1) ** (1/7) - 1.
# NOTE(review): presumably converts a 7-day growth figure into an average daily rate - confirm.
hopkins_confirmed['first_7'] = ((hopkins_confirmed['first_7'] + 1) ** (1/7) - 1)

# Derived features. Each one is inserted at column position 4, so they end up in reverse order
# of insertion: tpc*max_cases, urb**2, tmp*rh, tmp**2.
hopkins_confirmed.insert(4, 'tmp**2', hopkins_confirmed['avg_interval_tmp'] ** 2)
hopkins_confirmed.insert(4, 'tmp*rh',
                         hopkins_confirmed['avg_interval_tmp'] * hopkins_confirmed['avg_interval_RH'])
hopkins_confirmed.insert(4, 'urb**2', hopkins_confirmed['Urbanization'] ** 2)
# The column name contains a literal backslash, written as an escaped '\\'.
hopkins_confirmed.insert(4, 'tpc*max_cases',
                         hopkins_confirmed['Tests \\ Pop'] * hopkins_confirmed['Max_Cases'])


In [99]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
hopkins_confirmed.describe()

Unnamed: 0,GDP,Urbanization,tpc*max_cases,urb**2,tmp*rh,tmp**2,Median Age,State Population,Total Tests,Tests \ Pop,...,3/29/2020,3/30/2020,3/31/2020,4/1/2020,4/2/2020,4/3/2020,4/4/2020,4/5/2020,4/6/2020,4/7/2020
count,1063.0,1063.0,1063.0,1063.0,1063.0,1063.0,1063.0,1063.0,1063.0,1063.0,...,1063.0,1063.0,1063.0,1063.0,1063.0,1063.0,1063.0,1063.0,1063.0,1063.0
mean,52010.850986,74.345655,4.04961,5741.04176,889.387968,216.82117,37.55842,8420330.0,53037.816557,0.005411,...,801.33302,877.954845,973.146754,1065.43556,1168.716839,1275.482596,1402.503293,1498.430856,1593.976482,1697.771402
std,21134.743739,14.627599,44.223118,2007.52404,528.155304,214.44704,4.25356,8973202.0,69192.186907,0.004496,...,6814.353811,7483.586652,8312.659775,9136.641046,10074.0171,11074.739093,12233.869566,13114.647699,14028.638112,14993.075916
min,396.0,14.338,0.0,205.578244,-791.065974,0.000947,15.1,0.0,0.0,0.0,...,0.0,0.0,2.0,2.0,4.0,4.0,0.0,0.0,0.0,17.0
25%,46232.98962,66.3,0.094008,4395.69,472.749874,48.06213,36.55,1934408.0,11246.0,0.003057,...,11.0,13.0,16.0,18.0,21.0,23.5,27.0,30.5,33.0,37.0
50%,55172.0,75.1,0.256355,5640.01,809.339645,143.700156,38.5,6045680.0,31090.0,0.004467,...,28.0,33.0,38.0,44.0,51.0,58.0,66.0,73.0,79.0,88.0
75%,61594.0,86.2,0.870187,7430.44,1245.540278,321.092273,39.3,10617420.0,74655.0,0.006904,...,118.0,135.0,152.0,173.5,203.5,222.0,251.0,265.5,286.0,308.5
max,200277.0,100.0,1343.831024,10000.0,2403.747846,1232.01,53.1,39512220.0,340058.0,0.017481,...,140909.0,161837.0,188172.0,213372.0,243616.0,275586.0,308850.0,337072.0,366667.0,396223.0


###### Regression

In [149]:
# Display the prepared frame. To inspect only the European rows, index it with
# .loc[euro_coords, :, :] once euro_coords has been defined.
hopkins_confirmed

Unnamed: 0_level_0,Unnamed: 1_level_0,Province_State,Country_Region,GDP,Urbanization,tpc*max_cases,urb**2,tmp*rh,tmp**2,Median Age,Democracy,...,3/29/2020,3/30/2020,3/31/2020,4/1/2020,4/2/2020,4/3/2020,4/4/2020,4/5/2020,4/6/2020,4/7/2020
coordinate,information,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
"(-41.4545, 145.9707)",data,Tasmania,Australia,57373.68668,86.012,0.000000,7398.064144,1011.709440,176.464656,37.900002,9.09,...,66.0,66.0,69.0,69.0,72.0,74.0,80.0,82.0,86.0,89.0
"(-40.9006, 174.886)",data,,New Zealand,41945.33167,86.538,0.000000,7488.825444,1191.210938,220.336914,37.900002,9.26,...,514.0,589.0,647.0,708.0,797.0,868.0,950.0,1039.0,1106.0,1160.0
"(-38.4161, -63.6167)",data,,Argentina,11683.94962,91.870,0.000000,8440.096900,950.908587,341.277008,31.900000,7.02,...,745.0,820.0,1054.0,1054.0,1133.0,1265.0,1451.0,1451.0,1554.0,1628.0
"(-37.8136, 144.9631)",data,Victoria,Australia,57373.68668,86.012,0.000000,7398.064144,1172.411570,290.237686,37.900002,9.09,...,769.0,821.0,917.0,968.0,1036.0,1085.0,1115.0,1135.0,1158.0,1191.0
"(-35.6751, -71.543)",data,,Chile,15923.35874,87.564,0.000000,7667.454096,1081.266667,273.902500,35.400002,8.08,...,2139.0,2449.0,2738.0,3031.0,3404.0,3737.0,4161.0,4471.0,4815.0,5116.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"(61.8926, -6.9118)",data,Faroe Islands,Denmark,61350.34791,87.874,0.000000,7721.839876,382.921330,25.210970,42.299999,9.22,...,159.0,168.0,169.0,173.0,177.0,179.0,181.0,181.0,183.0,184.0
"(63.0, 16.0)",data,,Sweden,54608.36025,87.431,0.000000,7644.179761,-28.533608,0.165981,41.000000,9.39,...,3700.0,4028.0,4435.0,4947.0,5568.0,6131.0,6443.0,6830.0,7206.0,7693.0
"(64.0, 26.0)",data,,Finland,50152.34014,85.382,0.000000,7290.085924,-191.987520,7.420176,42.799999,9.25,...,1240.0,1352.0,1418.0,1446.0,1518.0,1615.0,1882.0,1927.0,2176.0,2308.0
"(64.80726247, -146.5692662)",data,Alaska,US,73205.00000,66.000,0.614241,4356.000000,-344.814815,29.641975,34.000000,7.96,...,23.0,28.0,30.0,35.0,40.0,42.0,46.0,53.0,53.0,65.0


In [182]:
# Feature sets. The US set additionally uses the testing-related columns.
# The 'Tests \ Pop' column name contains a literal backslash, written as an escaped '\\'.
us_feature_cols = ['GDP', 'Urbanization', 'avg_interval_tmp', 'avg_interval_RH',
                   'tmp**2', 'tmp*rh', 'urb**2', 'tpc*max_cases', 'Democracy', 'Median Age',
                   'Tests \\ Pop']

global_feature_cols = ['GDP', 'Urbanization', 'avg_interval_tmp', 'avg_interval_RH',
                       'tmp**2', 'tmp*rh', 'urb**2', 'Democracy', 'Median Age']

label_cols = ['first_7', 'GF_Q1', 'GF_Q2', 'GF_Q3', 'EXP_GF_Q1', 'EXP_GF_Q2', 'EXP_GF_Q3']

# Europe subset: coordinates whose first element is above 36.7 and whose second element lies
# between -16 and 25. NOTE(review): presumably (latitude, longitude) - confirm.
coords = list(set(hopkins_confirmed.index.get_level_values(0)))
euro_coords = [coord for coord in coords if (coord[0] > 36.7 and coord[1] > -16 and coord[1] < 25)]

# Threshold subset: combine both conditions into one mask instead of chaining two boolean
# selections, which depends on pandas re-aligning the second mask to the filtered frame.
thrsh_mask = (hopkins_confirmed['GDP'] >= 40000) & (hopkins_confirmed['Urbanization'] >= 88)
thrsh_data = hopkins_confirmed[thrsh_mask].copy()

# Build each subset once and reuse it for both model families.
us_data = hopkins_confirmed[hopkins_confirmed['Country_Region'] == 'US'].copy()
euro_data = hopkins_confirmed.loc[euro_coords, :, :].copy()

us_regr_sum, us_regr_full = regression(us_data, us_feature_cols, label_cols)

us_dt_sum, us_dt_full = dt(us_data, us_feature_cols, label_cols)

euro_regr_sum, euro_regr_full = regression(euro_data, global_feature_cols, label_cols)

euro_dt_sum, euro_dt_full = dt(euro_data, global_feature_cols, label_cols)

global_regr_sum, global_regr_full = regression(hopkins_confirmed, global_feature_cols, label_cols)

global_dt_sum, global_dt_full = dt(hopkins_confirmed, global_feature_cols, label_cols)

thrsh_regr_sum, thrsh_regr_full = regression(thrsh_data, global_feature_cols, label_cols)

thrsh_dt_sum, thrsh_dt_full = dt(thrsh_data, global_feature_cols, label_cols)

# Use display() for every table: with default IPython settings a bare expression is rendered
# only when it is the last statement of the cell. display() is provided by the IPython kernel.
print('\nregression - US')
display(us_regr_sum)
print('Decision tree - US')
display(us_dt_sum)

print('\nregression - Europe')
display(euro_regr_sum)
print('Decision tree - Europe')
display(euro_dt_sum)

print('\nregression - global with thresholds')
display(thrsh_regr_sum)
print('Decision tree - global with thresholds')
display(thrsh_dt_sum)

print('\nregression - global')
display(global_regr_sum)
print('Decision tree - global')
display(global_dt_sum)



regression - US


Unnamed: 0,first_7,GF_Q1,GF_Q2,GF_Q3,EXP_GF_Q1,EXP_GF_Q2,EXP_GF_Q3
avg(naive - test),0.000102,4.9e-05,0.00015,-7.9e-05,-0.000477,-0.000245,0.000236
avg(naive - train),0.000209,0.000209,0.000217,0.000458,0.000204,0.000242,0.000637


Decision tree - US


Unnamed: 0,first_7,GF_Q1,GF_Q2,GF_Q3,EXP_GF_Q1,EXP_GF_Q2,EXP_GF_Q3
avg(naive - test),-0.000317,0.000287,-7.2e-05,-0.00019,-0.000546,-0.000132,-0.001207
avg(naive - train),0.000967,0.000728,0.00095,0.001667,0.000672,0.001291,0.002479



regression - Europe


Unnamed: 0,first_7,GF_Q1,GF_Q2,GF_Q3,EXP_GF_Q1,EXP_GF_Q2,EXP_GF_Q3
avg(naive - test),-0.000696,-0.000208,-0.000628,-0.000808,,-0.001499,-0.00228
avg(naive - train),0.001253,0.000249,0.000293,0.000664,,0.000859,0.001647


Decision tree - Europe


Unnamed: 0,first_7,GF_Q1,GF_Q2,GF_Q3,EXP_GF_Q1,EXP_GF_Q2,EXP_GF_Q3
avg(naive - test),-0.000752,-0.00018,-0.000376,-0.001871,,-0.002754,-0.006962
avg(naive - train),0.001867,0.000404,0.00074,0.001516,,0.001713,0.003848



regression - global with thresholds


Unnamed: 0,first_7,GF_Q1,GF_Q2,GF_Q3,EXP_GF_Q1,EXP_GF_Q2,EXP_GF_Q3
avg(naive - test),-0.000109,-0.000135,-0.000848,-0.00121,-0.125721,-0.001385,-0.001417
avg(naive - train),0.000507,0.000334,0.000262,0.000798,0.000286,0.00072,0.001366


Decision tree - global with thresholds


Unnamed: 0,first_7,GF_Q1,GF_Q2,GF_Q3,EXP_GF_Q1,EXP_GF_Q2,EXP_GF_Q3
avg(naive - test),-0.002178,-0.000349,-0.000594,-0.001042,-0.000816,-0.002121,-0.008416
avg(naive - train),0.002257,0.000978,0.001253,0.002103,0.000723,0.002494,0.006953



regression - global


Unnamed: 0,first_7,GF_Q1,GF_Q2,GF_Q3,EXP_GF_Q1,EXP_GF_Q2,EXP_GF_Q3
avg(naive - test),2.1e-05,2.8e-05,0.000107,0.000537,-0.000188,8.9e-05,0.000468
avg(naive - train),0.000102,0.000104,0.000171,0.000691,0.000194,0.0002,0.000801


Decision tree - global


Unnamed: 0,first_7,GF_Q1,GF_Q2,GF_Q3,EXP_GF_Q1,EXP_GF_Q2,EXP_GF_Q3
avg(naive - test),-0.00051,3.2e-05,9.8e-05,4.9e-05,-0.000566,-5.1e-05,-0.000107
avg(naive - train),0.00077,0.000513,0.000738,0.002082,0.000702,0.001014,0.002528


In [189]:
hopkins_confirmed.columns[8:]

Index(['Median Age', 'Democracy', 'State Population', 'Total Tests',
       'Tests \ Pop', 'avg_m_tmp', 'avg_m_RH', 'avg_m_precip', 'avg_m_wind',
       'Max_Cases', 'first_7', 'last relevant date', 'Max_Date', '5%_Date',
       'avg_interval_tmp', 'avg_interval_RH', 'GF_Q1', 'GF_Q2', 'GF_Q3',
       'EXP_GF_Q1', 'EXP_GF_Q2', 'EXP_GF_Q3', '1/22/2020', '1/23/2020',
       '1/24/2020', '1/25/2020', '1/26/2020', '1/27/2020', '1/28/2020',
       '1/29/2020', '1/30/2020', '1/31/2020', '2/1/2020', '2/2/2020',
       '2/3/2020', '2/4/2020', '2/5/2020', '2/6/2020', '2/7/2020', '2/8/2020',
       '2/9/2020', '2/10/2020', '2/11/2020', '2/12/2020', '2/13/2020',
       '2/14/2020', '2/15/2020', '2/16/2020', '2/17/2020', '2/18/2020',
       '2/19/2020', '2/20/2020', '2/21/2020', '2/22/2020', '2/23/2020',
       '2/24/2020', '2/25/2020', '2/26/2020', '2/27/2020', '2/28/2020',
       '2/29/2020', '3/1/2020', '3/2/2020', '3/3/2020', '3/4/2020', '3/5/2020',
       '3/6/2020', '3/7/2020', '3/8/2020', '3

In [184]:
# Re-run the decision trees on the US rows so that the Graphviz .dot files on disk are
# regenerated (dt() writes one file per label and fold); the returned tables are discarded.
# Alternative input (thresholded global data): _ = dt(thrsh_data, global_feature_cols, label_cols)
_ = dt(hopkins_confirmed[hopkins_confirmed['Country_Region'] =='US'].copy(), us_feature_cols, label_cols)
!dot -Tpng ../products/decision_tree/decision_tree_GF_Q1_run4.dot -o tree.png && open tree.png