In [3]:
# Libraries
import pandas as pd
import os
import numpy as np
from matplotlib import pyplot as plt
import datetime as dt
import xgboost as xgb
from sklearn import preprocessing

# Written code
import utils
import preprocess

In [4]:
train = pd.read_csv('train_trendency.csv')
test = pd.read_csv('test.csv')

In [5]:
TRAIN_START = train.Date.min()
TEST_START = test.Date.min()
TRAIN_END = train.Date.max()
TEST_END = test.Date.max()

DATEFORMAT = '%m-%d-%Y'

In [6]:
def generate_prediction():
    train, test = preprocess.get_data()
    train_clean = preprocess.process_by_state(train)
    train_clean = train_clean[[
        'Province_State', 'Date', 'LogConfirmed', 
        'LogDeaths', 'LogConfirmedDelta', 'LogDeathsDelta', 'encoded_location'
    ]]
    train_features = preprocess.generate_rolling_features(train_clean)

    features = [
        'logc_7d', 'logd_7d', 'logc_3d', 'logd_3d', 'encoded_location',
        'logc_1d', 'logd_1d', 'logc_0d', 'logd_0d', 'dc_ratio'
    ]

    config = dict(
        min_child_weight=5,
        eta=0.01, colsample_bytree=0.8, 
        max_depth=5, subsample=0.9, nthread=2, booster='gbtree',
        eval_metric='rmse', objective='reg:squarederror'
    )

    data = train_features[(train_features.Date >= TRAIN_START) & (train_features.Date < TRAIN_END)].copy()
    data['day_until'] = -(pd.to_datetime(train_features.Date) - dt.datetime.strptime(TRAIN_END, DATEFORMAT)).dt.days

    dm_logc = xgb.DMatrix(data[features].round(2), label=data.LogConfirmedDelta, weight=utils.calc_weight(data.day_until))
    dm_logd = xgb.DMatrix(data[features].round(2), label=data.LogDeathsDelta, weight=utils.calc_weight(data.day_until))

    model_lc = xgb.train(config, dm_logc, 800, evals=[(dm_logc, 'train-logc')], verbose_eval=100)
    model_lf = xgb.train(config, dm_logd, 800, evals=[(dm_logd, 'train-logd')], verbose_eval=100)

    # Predict
    predictions = Xtr.copy()
    predictions = train_features[(train_features.Date >= TRAIN_START) & (train_features.Date <= TRAIN_END)].copy()
    predictions.LogConfirmedDelta = np.nan
    predictions.LogFatalitiesDelta = np.nan
    
    decay = 0.99
    for i, d in enumerate(pd.date_range(TRAIN_END, add_days(TEST_END, 1)).strftime(DATEFORMAT)):
        last_day = str(d).split(' ')[0]
        next_day = dt.datetime.strptime(last_day, DATEFORMAT) + dt.timedelta(days=1)
        next_day = next_day.strftime(DATEFORMAT)

        p_next_day = predictions[predictions.Date == last_day].copy()
        p_next_day.Date = next_day
        p_next_day['p_logc'] = model_lc.predict(xgb.DMatrix(p_next_day[features].round(2)))
        p_next_day['p_logd'] = model_lf.predict(xgb.DMatrix(p_next_day[features].round(2)))

        p_next_day.LogConfirmed = p_next_day.LogConfirmed + np.clip(p_next_day['p_logc'], 0, None) * decay ** i
        p_next_day.LogDeaths = p_next_day.LogDeaths + np.clip(p_next_day['p_logd'], 0, None) * decay ** i

        predictions = pd.concat([predictions, p_next_day], sort=True)
        predictions = preprocess.generate_rolling_features(predictions)

    predictions['p_expc'] = utils.to_exp(predictions.LogConfirmed)
    predictions['p_expd'] = utils.to_exp(predictions.LogDeaths)
    return predictions

In [7]:
result = generate_prediction()

NameError: name 'weighting' is not defined

In [20]:
submission = test.copy()
submission = submission.merge(result)

submission['ID'] = range(0, len(submission))
submission['Confirmed'] = submission['p_expc']
submission['Deaths'] = submission['p_expd']
submission

Unnamed: 0.1,Unnamed: 0,Province_State,Date,Confirmed,Deaths,DC_Ratio,EncodedLocation,LogConfirmed,LogConfirmedDelta,LogDeaths,...,Rolled_LogDeaths_10d,Rolled_LogDeaths_1d,Rolled_LogDeaths_21d,Rolled_LogDeaths_3d,Rolled_LogDeaths_7d,p_logc,p_logd,p_expc,p_expd,ID
0,0,Alabama,04-01-2021,5.165745e+05,10587.738842,0.020498,0,13.154977,,9.267546,...,9.253112,9.264355,9.237469,9.261699,9.259607,0.002299,0.003191,5.165745e+05,10587.738842,0
1,1,Alaska,04-01-2021,6.308200e+04,314.273532,0.004998,1,11.052207,,5.753441,...,5.739793,5.749393,5.723585,5.749393,5.749393,0.002873,0.004048,6.308200e+04,314.273532,1
2,2,Arizona,04-01-2021,8.437897e+05,17021.516377,0.020174,2,13.645660,,9.742292,...,9.725915,9.739084,9.708810,9.736193,9.733589,0.002348,0.003208,8.437897e+05,17021.516377,2
3,3,Arkansas,04-01-2021,3.311289e+05,5640.734224,0.017038,3,12.710266,,8.637947,...,8.620652,8.635332,8.597482,8.630879,8.625509,0.002210,0.002615,3.311289e+05,5640.734224,3
4,4,California,04-01-2021,3.674327e+06,59521.646742,0.016200,4,15.116881,,10.994112,...,10.962371,10.989943,10.929117,10.985818,10.974660,0.001520,0.004169,3.674327e+06,59521.646742,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1495,1495,Virginia,04-30-2021,6.633125e+05,11046.683725,0.016655,45,13.405003,,9.309976,...,9.286829,9.307740,9.259426,9.303200,9.293817,0.002710,0.002993,6.633125e+05,11046.683725,1495
1496,1496,Washington,04-30-2021,4.103313e+05,5803.064392,0.014145,46,12.924723,,8.666314,...,8.634929,8.663150,8.597828,8.656726,8.644439,0.007011,0.004234,4.103313e+05,5803.064392,1496
1497,1497,West Virginia,04-30-2021,1.535403e+05,3123.211396,0.020348,47,11.941725,,8.046937,...,7.999816,8.042445,7.942170,8.033324,8.014466,0.003132,0.006012,1.535403e+05,3123.211396,1497
1498,1498,Wisconsin,04-30-2021,6.661854e+05,8206.887416,0.012321,48,13.409325,,9.012851,...,8.979676,9.009686,8.934445,9.003260,8.990016,0.001893,0.004236,6.661854e+05,8206.887416,1498


In [21]:
submission[['ID', 'Confirmed', 'Deaths']].to_csv('Team2.csv', index=False)