In [1]:
import pandas as pd
from os.path import exists
import warnings
from datetime import date
import math

warnings.filterwarnings('ignore')

In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [3]:
train = train.drop('Unnamed: 0', axis=1)
test = test.drop('Unnamed: 0', axis=1)

In [4]:
def date_to_int(d):
    initial_date = date(2020, 1, 21)
    curr_date = [int(x) for x in d.split('-')]
    final_date = date(curr_date[0], curr_date[1], curr_date[2])
    delta = final_date - initial_date
    return delta.days

In [5]:
one_hot_train = pd.get_dummies(train['Country'])
train = train.join(one_hot_train)
train['Day'] = train['Date'].apply(lambda x: date_to_int(x))

In [6]:
one_hot_test = pd.get_dummies(test['Country'])
test = test.join(one_hot_train)
test['Day'] = test['Date'].apply(lambda x: date_to_int(x))

In [7]:
train['log_Confirmed'] = train['Confirmed'].apply(lambda x: round(math.log(x+1),3))
train['log_Deaths'] = train['Deaths'].apply(lambda x: round(math.log(x+1),3))
train['log_Recovered'] = train['Recovered'].apply(lambda x: round(math.log(x+1),3))
train['log_Active'] = train['Active'].apply(lambda x: round(math.log(x+1),3))

In [8]:
test['log_Confirmed'] = test['Confirmed'].apply(lambda x: round(math.log(x+1),3))
test['log_Deaths'] = test['Deaths'].apply(lambda x: round(math.log(x+1),3))
test['log_Recovered'] = test['Recovered'].apply(lambda x: round(math.log(x+1),3))
test['log_Active'] = test['Active'].apply(lambda x: round(math.log(x+1),3))

In [9]:
train.head()

Unnamed: 0,Country,Date,Confirmed,Deaths,Recovered,Active,Afghanistan,Albania,Algeria,Andorra,...,West Bank and Gaza,Western Sahara,Yemen,Zambia,Zimbabwe,Day,log_Confirmed,log_Deaths,log_Recovered,log_Active
0,Afghanistan,2020-01-22,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0.0,0.0,0.0,0.0
1,Albania,2020-01-22,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0.0,0.0,0.0,0.0
2,Algeria,2020-01-22,0,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0.0,0.0,0.0,0.0
3,Andorra,2020-01-22,0,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0.0,0.0,0.0,0.0
4,Angola,2020-01-22,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0.0,0.0,0.0,0.0


In [10]:
train.drop(train.iloc[:, 0:6], inplace = True, axis = 1)
test.drop(test.iloc[:, 0:6], inplace = True, axis = 1)

In [11]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_log_error, mean_squared_error
from sklearn.model_selection import TimeSeriesSplit
from statistics import mean

In [12]:
# Grabbing x/y train test sets for 'Confirmed' cases
confirmed_train_x = train.drop(train.columns[[-1,-2,-3,-4]], axis = 1)
confirmed_train_y = train['log_Confirmed']

confirmed_test_x = test.drop(test.columns[[-1,-2,-3,-4]], axis = 1)
confirmed_test_y = test['log_Confirmed']

# Performing 5-fold cross validation on rolling basis for 'Confirmed' RF model
all_train = confirmed_train_x.append(confirmed_test_x).to_numpy()
all_values = confirmed_train_y.append(confirmed_test_y).to_numpy()

tscv = TimeSeriesSplit(n_splits=5)
rmse_scores = []
rmsle_scores = []

for train_index, test_index in tscv.split(all_train):
    x_train, x_test = all_train[train_index], all_train[test_index]
    y_train, y_test = all_values[train_index], all_values[test_index]
    
    rf = RandomForestRegressor(n_estimators=30, max_depth = 20, max_features = 4, min_samples_split = 5, n_jobs=-1, random_state=7)
    rf.fit(x_train, y_train)
    predict = rf.predict(x_test)
    rmse_scores.append(round(mean_squared_error(y_test, predict),3))
    rmsle_scores.append(round(mean_squared_log_error(y_test, predict),3))

print('Now predicting Confirmed Cases...')
print('Cross validated root mean squared errors are:', rmse_scores)
print('Average RMSE using RF to predict Confirmed cases is:', mean(rmse_scores))
print()
print('Cross validated root mean squared log errors are:', rmsle_scores)
print('Average RMSLE using RF to predict Confirmed cases is:', mean(rmsle_scores))

Now predicting Confirmed Cases...
Cross validated root mean squared errors are: [0.686, 2.306, 9.777, 16.187, 15.807]
Average RMSE using RF to predict Confirmed cases is: 8.9526

Cross validated root mean squared log errors are: [0.075, 0.266, 0.772, 0.682, 0.438]
Average RMSLE using RF to predict Confirmed cases is: 0.4466


In [13]:
# Grabbing x/y train test sets for 'Deaths' cases
deaths_train_x = train.drop(train.columns[[-1,-2,-3,-4]], axis = 1)
deaths_train_y = train['log_Deaths']

deaths_test_x = test.drop(test.columns[[-1,-2,-3,-4]], axis = 1)
deaths_test_y = test['log_Deaths']

# Performing 5-fold cross validation on rolling basis for 'Deaths' RF model
all_train = deaths_train_x.append(deaths_test_x).to_numpy()
all_values = deaths_train_y.append(deaths_test_y).to_numpy()

tscv = TimeSeriesSplit(n_splits=5)
rmse_scores = []
rmsle_scores = []

for train_index, test_index in tscv.split(all_train):
    x_train, x_test = all_train[train_index], all_train[test_index]
    y_train, y_test = all_values[train_index], all_values[test_index]
    
    rf = RandomForestRegressor(n_estimators=30, max_depth = 20, max_features = 4, min_samples_split = 5, n_jobs=-1, random_state=7)
    rf.fit(x_train, y_train)
    predict = rf.predict(x_test)
    rmse_scores.append(round(mean_squared_error(y_test, predict),3))
    rmsle_scores.append(round(mean_squared_log_error(y_test, predict),3))

print('Now predicting Deaths Cases...')
print('Cross validated root mean squared errors are:', rmse_scores)
print('Average RMSE using RF to predict Deaths cases is:', mean(rmse_scores))
print()
print('Cross validated root mean squared log errors are:', rmsle_scores)
print('Average RMSLE using RF to predict Deaths cases is:', mean(rmsle_scores))

Now predicting Deaths Cases...
Cross validated root mean squared errors are: [0.119, 0.387, 1.605, 5.376, 7.085]
Average RMSE using RF to predict Deaths cases is: 2.9144

Cross validated root mean squared log errors are: [0.008, 0.048, 0.223, 0.572, 0.5]
Average RMSLE using RF to predict Deaths cases is: 0.2702


In [14]:
# Grabbing x/y train test sets for 'Recovered' cases
recovered_train_x = train.drop(train.columns[[-1,-2,-3,-4]], axis = 1)
recovered_train_y = train['log_Recovered']

recovered_test_x = test.drop(test.columns[[-1,-2,-3,-4]], axis = 1)
recovered_test_y = test['log_Recovered']

# Performing 5-fold cross validation on rolling basis for 'Recovered' RF model
all_train = recovered_train_x.append(recovered_test_x).to_numpy()
all_values = recovered_train_y.append(recovered_test_y).to_numpy()

tscv = TimeSeriesSplit(n_splits=5)
rmse_scores = []
rmsle_scores = []

for train_index, test_index in tscv.split(all_train):
    x_train, x_test = all_train[train_index], all_train[test_index]
    y_train, y_test = all_values[train_index], all_values[test_index]
    
    rf = RandomForestRegressor(n_estimators=30, max_depth = 20, max_features = 4, min_samples_split = 5, n_jobs=-1, random_state=7)
    rf.fit(x_train, y_train)
    predict = rf.predict(x_test)
    rmse_scores.append(round(mean_squared_error(y_test, predict),3))
    rmsle_scores.append(round(mean_squared_log_error(y_test, predict),3))

print('Now predicting Recovered Cases...')
print('Cross validated root mean squared errors are:', rmse_scores)
print('Average RMSE using RF to predict Recovered cases is:', mean(rmse_scores))
print()
print('Cross validated root mean squared log errors are:', rmsle_scores)
print('Average RMSLE using RF to predict Recovered cases is:', mean(rmsle_scores))

Now predicting Recovered Cases...
Cross validated root mean squared errors are: [0.401, 0.839, 2.281, 8.091, 13.323]
Average RMSE using RF to predict Recovered cases is: 4.987

Cross validated root mean squared log errors are: [0.064, 0.087, 0.255, 0.697, 0.71]
Average RMSLE using RF to predict Recovered cases is: 0.3626


In [15]:
# Grabbing x/y train test sets for 'Active' cases
active_train_x = train.drop(train.columns[[-1,-2,-3,-4]], axis = 1)
active_train_y = train['log_Active']

active_test_x = test.drop(test.columns[[-1,-2,-3,-4]], axis = 1)
active_test_y = test['log_Active']

# Performing 5-fold cross validation on rolling basis for 'Active' RF model
all_train = active_train_x.append(active_test_x).to_numpy()
all_values = active_train_y.append(active_test_y).to_numpy()

tscv = TimeSeriesSplit(n_splits=5)
rmse_scores = []

for train_index, test_index in tscv.split(all_train):
    x_train, x_test = all_train[train_index], all_train[test_index]
    y_train, y_test = all_values[train_index], all_values[test_index]
    
    rf = RandomForestRegressor(n_estimators=30, max_depth = 20, max_features = 4, min_samples_split = 5, n_jobs=-1, random_state=7)
    rf.fit(x_train, y_train)
    predict = rf.predict(x_test)
    rmse_scores.append(round(mean_squared_error(y_test, predict),3))
    rmsle_scores.append(round(mean_squared_log_error(y_test, predict),3))


print('Now predicting Active Cases...')
print('Cross validated root mean squared errors are:', rmse_scores)
print('Average RMSE using RF to predict Active cases is:', round(mean(rmse_scores),4))
print()
print('Cross validated root mean squared log errors are:', rmsle_scores)
print('Average RMSLE using RF to predict Active cases is:', mean(rmsle_scores))

Now predicting Active Cases...
Cross validated root mean squared errors are: [0.564, 2.187, 9.488, 15.375, 14.458]
Average RMSE using RF to predict Active cases is: 8.4144

Cross validated root mean squared log errors are: [0.064, 0.087, 0.255, 0.697, 0.71, 0.064, 0.265, 0.773, 0.679, 0.443]
Average RMSLE using RF to predict Active cases is: 0.4037
