In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

import statsmodels.api as sm


In [2]:
# Local-checkout location of the competition data; immediately overridden
# below by the Kaggle input mount, which is the path actually read from.
path = 'covid19-global-forecasting-week-5'
path = '../input/covid19-global-forecasting-week-5'

# Load the competition's train and test tables.
train = pd.read_csv(f'{path}/train.csv')
test = pd.read_csv(f'{path}/test.csv')

In [3]:
# Parse the date strings into datetimes in both frames.
for frame in (train, test):
    frame['Date'] = pd.to_datetime(frame['Date'])

# Drop the sub-national location columns, and rename the test identifier so
# that both frames share the same 'Id' column name.
drop = ['County','Province_State']
train = train.drop(columns=drop)
test = test.drop(columns=drop).rename(columns={'ForecastId':'Id'})

# Remember where train ends so the stacked frame can be split again later.
train_len = len(train)
df = pd.concat([train, test])

In [4]:
# Row/column counts of the train, test and stacked frames.
train.shape, test.shape, df.shape

((754934, 7), (311670, 6), (1066604, 7))

In [5]:
# Number of missing values in each train column.
train.isnull().sum()

Id                0
Country_Region    0
Population        0
Weight            0
Date              0
Target            0
TargetValue       0
dtype: int64

In [6]:
# Preview the first rows of train.
train.head()

Unnamed: 0,Id,Country_Region,Population,Weight,Date,Target,TargetValue
0,1,Afghanistan,27657145,0.058359,2020-01-23,ConfirmedCases,0.0
1,2,Afghanistan,27657145,0.583587,2020-01-23,Fatalities,0.0
2,3,Afghanistan,27657145,0.058359,2020-01-24,ConfirmedCases,0.0
3,4,Afghanistan,27657145,0.583587,2020-01-24,Fatalities,0.0
4,5,Afghanistan,27657145,0.058359,2020-01-25,ConfirmedCases,0.0


In [7]:
# Summary statistics of the numeric train columns.
train.describe()

Unnamed: 0,Id,Population,Weight,TargetValue
count,754934.0,754934.0,754934.0,754934.0
mean,484789.5,2719395.0,0.530872,9.746414
std,279911.138301,34777620.0,0.451909,261.845643
min,1.0,86.0,0.047491,-10034.0
25%,242364.25,12133.0,0.096838,0.0
50%,484789.5,30531.0,0.349413,0.0
75%,727214.75,105612.0,0.968379,0.0
max,969578.0,1395773000.0,2.239186,36163.0


In [8]:
#plt.plot_date(x=train['Date'],y=train['TargetValue'])

In [9]:
# Fraction of missing values in each test column (null count / row count).
test.isnull().sum()/test.shape[0]

Id                0.0
Country_Region    0.0
Population        0.0
Weight            0.0
Date              0.0
Target            0.0
dtype: float64

In [10]:
# Summary of the test 'Date' column.
test['Date'].describe()

count                  311670
unique                     45
top       2020-05-18 00:00:00
freq                     6926
first     2020-04-27 00:00:00
last      2020-06-10 00:00:00
Name: Date, dtype: object

In [11]:
# Summary of the train 'Date' column, for comparison with the test dates.
train['Date'].describe()

count                  754934
unique                    109
top       2020-02-23 00:00:00
freq                     6926
first     2020-01-23 00:00:00
last      2020-05-10 00:00:00
Name: Date, dtype: object

In [12]:
# Preview the last rows of train.
train.tail()

Unnamed: 0,Id,Country_Region,Population,Weight,Date,Target,TargetValue
754929,969574,Zimbabwe,14240168,0.607106,2020-05-08,Fatalities,0.0
754930,969575,Zimbabwe,14240168,0.060711,2020-05-09,ConfirmedCases,1.0
754931,969576,Zimbabwe,14240168,0.607106,2020-05-09,Fatalities,0.0
754932,969577,Zimbabwe,14240168,0.060711,2020-05-10,ConfirmedCases,1.0
754933,969578,Zimbabwe,14240168,0.607106,2020-05-10,Fatalities,0.0


In [13]:
# Number of train rows per 'Target' value.
train['Target'].value_counts()

Fatalities        377467
ConfirmedCases    377467
Name: Target, dtype: int64

In [14]:
# Number of test rows per 'Target' value.
test['Target'].value_counts()

ConfirmedCases    155835
Fatalities        155835
Name: Target, dtype: int64

In [15]:
def date_time_feature(df,col):
    """Add calendar features from a datetime column, then integer-encode it.

    The frame is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the datetime column ``col``.
    col : str
        Name of the datetime column. It is used both as the input column and
        as the prefix of the generated feature columns.

    Returns
    -------
    pandas.DataFrame
        ``df`` with four added columns -- ``<col>_month``, ``<col>_day``,
        ``<col>_week`` and ``<col>_weekofyear`` -- and with ``col`` replaced
        by its ``YYYYMMDD`` integer form.

    Notes
    -----
    ``<col>_week`` and ``<col>_weekofyear`` hold the same ISO week number;
    both are kept because downstream cells reference both names.
    Because ``col`` is overwritten with integers, calling this twice on the
    same frame fails on the ``.dt`` accessor.
    """
    # Read from the requested column rather than a hard-coded 'Date'.
    dates = df[col]

    df[col+'_month'] = dates.dt.month
    df[col+'_day'] = dates.dt.day

    # Series.dt.week / .dt.weekofyear were deprecated in pandas 1.1 and
    # removed in 2.0; isocalendar().week is the replacement. Cast to int64 so
    # the dtype matches what the old accessors returned.
    if hasattr(dates.dt, 'isocalendar'):
        iso_week = dates.dt.isocalendar().week.astype('int64')
    else:
        # Older pandas without isocalendar().
        iso_week = dates.dt.week
    df[col+'_week'] = iso_week
    df[col+'_weekofyear'] = iso_week

    # Replace the datetime with a sortable integer such as 20200123.
    df[col] = dates.dt.strftime('%Y%m%d').astype(int)
    return df

In [16]:
# Derive the calendar features on the stacked train+test frame.
# NOTE: date_time_feature overwrites 'Date' with integers, so this cell
# cannot be re-run without first rebuilding `df`.
col= 'Date'
df = date_time_feature(df,col)

In [17]:
from sklearn.preprocessing import LabelEncoder

In [18]:
def encode(df,col):
    """Label-encode the given columns of ``df`` in place and return ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are overwritten with integer codes.
    col : iterable of str
        Names of the columns to encode.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with each listed column replaced by the
        output of ``LabelEncoder.fit_transform``.
    """
    encoder = LabelEncoder()
    for name in col:
        # fit_transform re-fits the encoder, so each column gets its own codes.
        codes = encoder.fit_transform(df[name])
        df[name] = codes
    return df

In [19]:
# Label-encode the two string columns on the stacked frame, so train and
# test rows are encoded together.
col =['Country_Region','Target']
df = encode(df, col)


In [20]:
# One-hot encode the country code and the calendar features; drop_first=True
# omits the first level of each column.
# NOTE(review): 'Date_week' and 'Date_weekofyear' look like the same
# week-of-year value from date_time_feature, so their dummy columns are
# presumably duplicates -- confirm before removing one.
col = ['Country_Region','Date_month','Date_day','Date_week','Date_weekofyear']
df = pd.get_dummies(data=df,columns=col,drop_first=True)


In [21]:
# Preview the stacked frame after encoding.
df.head()

Unnamed: 0,Id,Population,Weight,Date,Target,TargetValue,Country_Region_1,Country_Region_2,Country_Region_3,Country_Region_4,...,Date_weekofyear_15,Date_weekofyear_16,Date_weekofyear_17,Date_weekofyear_18,Date_weekofyear_19,Date_weekofyear_20,Date_weekofyear_21,Date_weekofyear_22,Date_weekofyear_23,Date_weekofyear_24
0,1,27657145,0.058359,20200123,0,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,27657145,0.583587,20200123,1,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,27657145,0.058359,20200124,0,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,27657145,0.583587,20200124,1,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,27657145,0.058359,20200125,0,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Model

In [22]:
# Feature matrix: everything except the integer date, the row id and the target.
df1 = df.drop(['Date','Id','TargetValue'],axis=1)

In [23]:
# Undo the earlier stacking by position: the first `train_len` rows are the
# training rows, the remainder are the test rows.
X_train = df1.iloc[:train_len]
X_test = df1.iloc[train_len:]
y_train = df['TargetValue'].iloc[:train_len]

In [24]:
# Replace the indices carried over from the stacked frame with fresh
# 0..n-1 indices on each split.
X_train = X_train.reset_index(drop = True)
X_test = X_test.reset_index(drop = True)
y_train = y_train.reset_index(drop = True)

In [25]:
from sklearn.model_selection import RandomizedSearchCV

In [26]:
# param = {
#     'learning_rate':np.linspace(0.001,0.5),
#     'n_estimators':np.arange(10,500),
# }

In [27]:
# rsCV  = RandomizedSearchCV(model,param_distributions=param, n_iter=3,n_jobs= -1)
# rsCV.fit(X_train,y_train)

In [28]:
import lightgbm as lgb

In [29]:
def lgb_model(q, X=None, y=None, X_pred=None):
    """Fit a LightGBM quantile regressor for quantile ``q`` and predict.

    Parameters
    ----------
    q : float
        Target quantile, passed to LightGBM as ``alpha``.
    X, y : optional
        Training features and target. Default to the notebook-level
        ``X_train`` and ``y_train``, so ``lgb_model(q)`` behaves as before.
    X_pred : optional
        Features to predict on. Defaults to the notebook-level ``X_test``.

    Returns
    -------
    Predictions of the fitted model for ``X_pred``.

    Side effects
    ------------
    Prints the model's ``score`` on the data it was trained on.
    """
    # Fall back to the notebook globals so existing one-argument calls still work.
    X = X_train if X is None else X
    y = y_train if y is None else y
    X_pred = X_test if X_pred is None else X_pred

    model = lgb.LGBMRegressor(
        objective = 'quantile',
        alpha = q,
        learning_rate = 0.05,
        n_estimators = 1000,
        min_data_in_leaf=5,
        # NOTE(review): with max_depth = 10 a tree has at most 2**10 leaves,
        # so this value presumably never binds -- confirm.
        num_leaves = 100000,
        # NOTE(review): LightGBM documents that bagging_fraction only takes
        # effect when bagging_freq is non-zero; it is not set here. Left as is
        # so that predictions do not change.
        bagging_fraction=0.95,
        feature_fraction = 0.95,
        max_depth = 10,
        random_state = 12,
        num_threads = -1
    )
    model.fit(X, y)

    # In-sample value of the estimator's default `score`, which is R^2 for
    # scikit-learn regressors. It is neither a held-out estimate nor the
    # quantile (pinball) loss being optimised.
    score = model.score(X, y)
    print(f'quantile {q} score: {round(score,3)}')

    return model.predict(X_pred)

In [30]:
# Wide submission frame: one row per forecast id, one column per quantile.
sub = pd.DataFrame({'Id': test.Id})
for quantile in (0.05, 0.5, 0.95):
    # Column names come out as 'q0.05', 'q0.5' and 'q0.95'.
    sub[f'q{quantile}'] = lgb_model(quantile)

quantile 0.05 score: 0.644
quantile 0.5 score: 0.809
quantile 0.95 score: 0.718


In [31]:
# Reshape wide -> long: one row per (Id, quantile) pair.
melted = sub.melt(id_vars=['Id'], value_vars=['q0.05','q0.5','q0.95'])

# Strip the leading 'q' so the label is the bare quantile, e.g. '0.05'.
quantile_label = melted['variable'].str.replace("q","", regex=False)

# Build the two-column submission: '<Id>_<quantile>' key and predicted value.
sub = pd.DataFrame({
    'ForecastId_Quantile': melted['Id'].astype(str) + '_' + quantile_label,
    'TargetValue': melted['value'],
})
sub = sub.reset_index(drop=True)

sub.to_csv("submission.csv",index=False)
sub.head()

Unnamed: 0,ForecastId_Quantile,TargetValue
0,1_0.05,0.0
1,2_0.05,0.0
2,3_0.05,0.0
3,4_0.05,0.0
4,5_0.05,0.0
