In [1]:
# Libraries
import pandas as pd
import os
import numpy as np
from matplotlib import pyplot as plt
import datetime as dt
import xgboost as xgb
from sklearn import preprocessing

# Written code
import utils
import preprocess

In [2]:
train = pd.read_csv('train_trendency.csv')
test = pd.read_csv('test.csv')

In [3]:
TRAIN_START = train.Date.min()
TEST_START = test.Date.min()
TRAIN_END = train.Date.max()
TEST_END = test.Date.max()

DATEFORMAT = '%m-%d-%Y'

In [4]:
# Preprocessing

train, test = preprocess.get_data()
train_clean = preprocess.process_by_state(train)

train_clean = train_clean[[
        'Province_State', 'Date',
        'LogConfirmed', 'LogDeaths',
        'LogConfirmedDelta', 'LogDeathsDelta'
]]
print(train_clean)

location_encoder = preprocessing.LabelEncoder()
train_clean['EncodedLocation'] = location_encoder.fit_transform(train_clean.Province_State)

train_features = preprocess.generate_rolling_features(train_clean)
print(train_features)

     Province_State        Date  LogConfirmed  LogDeaths  LogConfirmedDelta  \
0           Alabama  01-12-2021     12.918652   8.625868           0.007686   
50          Alabama  01-13-2021     12.926339   8.658866           0.008692   
100         Alabama  01-14-2021     12.935031   8.690474           0.007078   
150         Alabama  01-15-2021     12.942109   8.704668           0.007523   
200         Alabama  01-16-2021     12.949632   8.719317           0.004547   
...             ...         ...           ...        ...                ...   
3749        Wyoming  03-27-2021     10.933946   6.545350           0.000000   
3799        Wyoming  03-28-2021     10.933946   6.545350           0.002566   
3849        Wyoming  03-29-2021     10.936512   6.545350           0.000818   
3899        Wyoming  03-30-2021     10.937330   6.545350           0.001315   
3949        Wyoming  03-31-2021     10.938645   6.552508                NaN   

      LogDeathsDelta  
0           0.032998  
50   

In [5]:
# Train

features = [
        'Rolled_LogConfirmed_21d', 'Rolled_LogDeaths_21d', 'Rolled_LogConfirmed_10d', 'Rolled_LogDeaths_10d',
        'Rolled_LogConfirmed_7d', 'Rolled_LogDeaths_7d', 'Rolled_LogConfirmed_3d', 'Rolled_LogDeaths_3d',
        'Rolled_LogConfirmed_1d', 'Rolled_LogDeaths_1d', 'Rolled_LogConfirmed_0d', 'Rolled_LogDeaths_0d', 
        'EncodedLocation', 'DC_Ratio'
]

config = dict(
        min_child_weight=5,
        eta=0.01, colsample_bytree=0.8, 
        max_depth=5, subsample=0.9, nthread=2, booster='gbtree',
        eval_metric='rmse', objective='reg:squarederror'
)

X_train = train_features[(train_features.Date >= TRAIN_START) & (train_features.Date < TRAIN_END)].copy()
X_train['DaysUntil'] = -(pd.to_datetime(train_features.Date) - dt.datetime.strptime(TRAIN_END, DATEFORMAT)).dt.days

print(X_train)

     Province_State        Date  LogConfirmed  LogDeaths  LogConfirmedDelta  \
0           Alabama  01-12-2021     12.918652   8.625868           0.007686   
50          Alabama  01-13-2021     12.926339   8.658866           0.008692   
100         Alabama  01-14-2021     12.935031   8.690474           0.007078   
150         Alabama  01-15-2021     12.942109   8.704668           0.007523   
200         Alabama  01-16-2021     12.949632   8.719317           0.004547   
...             ...         ...           ...        ...                ...   
3699        Wyoming  03-26-2021     10.933946   6.545350           0.000000   
3749        Wyoming  03-27-2021     10.933946   6.545350           0.000000   
3799        Wyoming  03-28-2021     10.933946   6.545350           0.002566   
3849        Wyoming  03-29-2021     10.936512   6.545350           0.000818   
3899        Wyoming  03-30-2021     10.937330   6.545350           0.001315   

      LogDeathsDelta  EncodedLocation  Rolled_LogCo

In [6]:
D_confirmed = xgb.DMatrix(X_train[features].round(2), label=X_train.LogConfirmedDelta, weight=utils.calc_weight(X_train.DaysUntil))
D_deaths = xgb.DMatrix(X_train[features].round(2), label=X_train.LogDeathsDelta, weight=utils.calc_weight(X_train.DaysUntil))

model_confirmed = xgb.train(config, D_confirmed, 900, evals=[(D_confirmed, 'train-confirmed')], verbose_eval=100)
model_deaths = xgb.train(config, D_deaths, 900, evals=[(D_deaths, 'train-deaths')], verbose_eval=100)

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-confirmed-rmse:0.49152
[100]	train-confirmed-rmse:0.17999
[200]	train-confirmed-rmse:0.06596
[300]	train-confirmed-rmse:0.02426
[400]	train-confirmed-rmse:0.00913
[500]	train-confirmed-rmse:0.00381
[600]	train-confirmed-rmse:0.00221
[700]	train-confirmed-rmse:0.00183
[800]	train-confirmed-rmse:0.00172
[899]	train-confirmed-rmse:0.00168
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-deaths-rmse:0.49069
[100]	train-deaths-rmse:0.17976
[200]	train-deaths-rmse:0.

In [10]:
# Predict
DECAY = 0.99
PRECISION = 2

predictions = X_train.copy()
predictions = train_features[(train_features.Date >= TRAIN_START) & (train_features.Date <= TRAIN_END)].copy()
predictions.LogConfirmedDelta = np.nan
predictions.LogFatalitiesDelta = np.nan

for i, d in enumerate(pd.date_range(TRAIN_END, utils.add_days(TEST_END, 1)).strftime(DATEFORMAT)):
    last_day = str(d).split(' ')[0]
    next_day = dt.datetime.strptime(last_day, DATEFORMAT) + dt.timedelta(days=1)
    next_day = next_day.strftime(DATEFORMAT)

    p_next_day = predictions[predictions.Date == last_day].copy()
    p_next_day.Date = next_day

    p_next_day['p_logc'] = model_confirmed.predict(xgb.DMatrix(p_next_day[features].round(PRECISION)))
    p_next_day['p_logd'] = model_deaths.predict(xgb.DMatrix(p_next_day[features].round(PRECISION)))
    p_next_day.LogConfirmed = p_next_day.LogConfirmed + np.clip(p_next_day['p_logc'], 0, None) * DECAY ** i
    p_next_day.LogDeaths = p_next_day.LogDeaths + np.clip(p_next_day['p_logd'], 0, None) * DECAY ** i

    predictions = pd.concat([predictions, p_next_day], sort=True)
    predictions = preprocess.generate_rolling_features(predictions)

predictions['p_expc'] = utils.to_exp(predictions.LogConfirmed)
predictions['p_expd'] = utils.to_exp(predictions.LogDeaths)

predictions.head()

Unnamed: 0,DC_Ratio,Date,EncodedLocation,LogConfirmed,LogConfirmedDelta,LogDeaths,LogDeathsDelta,Province_State,Rolled_LogConfirmed_0d,Rolled_LogConfirmed_10d,...,Rolled_LogDeaths_0d,Rolled_LogDeaths_10d,Rolled_LogDeaths_1d,Rolled_LogDeaths_21d,Rolled_LogDeaths_3d,Rolled_LogDeaths_7d,p_logc,p_logd,p_expc,p_expd
0,0.013667,01-12-2021,0,12.918652,,8.625868,0.032998,Alabama,12.918652,,...,8.625868,,,,,,,,407848.0,5573.0
50,0.014017,01-13-2021,0,12.926339,,8.658866,0.031608,Alabama,12.926339,,...,8.658866,,8.625868,,,,,,410995.0,5760.0
100,0.014342,01-14-2021,0,12.935031,,8.690474,0.014194,Alabama,12.935031,,...,8.690474,,8.658866,,,,,,414583.0,5945.0
150,0.014445,01-15-2021,0,12.942109,,8.704668,0.014649,Alabama,12.942109,,...,8.704668,,8.690474,,8.625868,,,,417528.0,6030.0
200,0.014548,01-16-2021,0,12.949632,,8.719317,0.000163,Alabama,12.949632,,...,8.719317,,8.704668,,8.658866,,,,420681.0,6119.0


In [11]:
submission = test.copy()
submission = submission.merge(predictions)

submission['ID'] = range(0, len(submission))
submission['Confirmed'] = submission['p_expc']
submission['Deaths'] = submission['p_expd']
submission

Unnamed: 0.1,Unnamed: 0,Province_State,Date,Confirmed,Deaths,DateTime,DC_Ratio,EncodedLocation,LogConfirmed,LogConfirmedDelta,...,Rolled_LogDeaths_10d,Rolled_LogDeaths_1d,Rolled_LogDeaths_21d,Rolled_LogDeaths_3d,Rolled_LogDeaths_7d,p_logc,p_logd,p_expc,p_expd,ID
0,0,Alabama,04-01-2021,516288.546942,10585.295357,2021-04-01,0.020505,0,13.154423,,...,9.253112,9.264355,9.237469,9.261699,9.259607,0.001746,0.002961,516288.546942,10585.295357,0
1,27,Nevada,04-01-2021,304211.307708,5260.065856,2021-04-01,0.017294,27,12.625481,,...,8.551595,8.565983,8.533263,8.563695,8.559486,0.001478,0.002106,304211.307708,5260.065856,1
2,28,New Hampshire,04-01-2021,84466.998302,1240.802344,2021-04-01,0.014701,28,11.344128,,...,7.104965,7.122060,7.083388,7.121252,7.114769,0.003451,0.002259,84466.998302,1240.802344,2
3,29,New Jersey,04-01-2021,912579.057609,24615.604468,2021-04-01,0.026975,29,13.724031,,...,10.093364,10.108956,10.078071,10.102543,10.099507,0.004132,0.002221,912579.057609,24615.604468,3
4,30,New Mexico,04-01-2021,191947.917447,3948.911868,2021-04-01,0.020578,30,12.164985,,...,8.266935,8.278428,8.254789,8.275376,8.273081,0.001543,0.003020,191947.917447,3948.911868,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1495,1469,Maryland,04-30-2021,430520.191624,8780.680588,2021-04-30,0.020398,19,12.972752,,...,9.067542,9.079303,9.045579,9.076959,9.071823,0.002218,0.002708,430520.191624,8780.680588,1495
1496,1470,Massachusetts,04-30-2021,687391.992348,17898.549052,2021-04-30,0.026040,20,13.440661,,...,9.783091,9.791706,9.768464,9.789998,9.786232,0.004475,0.001995,687391.992348,17898.549052,1496
1497,1471,Michigan,04-30-2021,822344.019066,17969.409573,2021-04-30,0.021853,21,13.619915,,...,9.785166,9.795495,9.767707,9.793433,9.788926,0.004536,0.002388,822344.019066,17969.409573,1497
1498,1460,Hawaii,04-30-2021,33225.696749,497.334384,2021-04-30,0.014998,10,10.411109,,...,6.190323,6.208801,6.162871,6.205260,6.195820,0.005095,0.005975,33225.696749,497.334384,1498


In [12]:
submission[['ID', 'Confirmed', 'Deaths']].to_csv('Team2.csv', index=False)