# Predict Covid Spread - SVM

In [8]:
# imports
import pandas as pd
from os.path import exists
import warnings
from datetime import date
import math

from sklearn.svm import LinearSVR
from sklearn.metrics import mean_squared_log_error, mean_squared_error
from sklearn.model_selection import TimeSeriesSplit
from statistics import mean

# Suppress every warning for the rest of the session.
# NOTE(review): this blanket filter also hides library deprecation/convergence warnings — consider narrowing it.
warnings.filterwarnings('ignore')

In [3]:
# Load the cleaned train/test CSVs and discard the 'Unnamed: 0' index column
# that pandas added to each file.
train = pd.read_csv('../[Cleaned] Complete Data/train.csv').drop(columns=['Unnamed: 0'])
test = pd.read_csv('../[Cleaned] Complete Data/test.csv').drop(columns=['Unnamed: 0'])

In [4]:
# Helper: turn a date string into an integer day offset
def date_to_int(d):
    """Return the number of days between 2020-01-21 and the date string `d`.

    `d` is a '-'-separated 'year-month-day' string. Every '-'-separated piece
    is parsed as an integer and the first three are used as year, month, day.
    The result is 0 for 2020-01-21 itself and negative for earlier dates.
    """
    parts = list(map(int, d.split('-')))
    return (date(parts[0], parts[1], parts[2]) - date(2020, 1, 21)).days

# Feature Engineering - One hot encoding for country and Numeric Day value
one_hot_train = pd.get_dummies(train['Country'])
train = train.join(one_hot_train)
train['Day'] = train['Date'].apply(date_to_int)

# Encode the test set from its own Country column (not from `one_hot_train`,
# whose rows would be matched to test rows by index label only). Reindexing
# onto the training columns keeps the feature layout identical to the training
# frame: a country missing from test becomes an all-zero column, and a country
# unseen in training is not given a column of its own.
one_hot_test = pd.get_dummies(test['Country']).reindex(columns=one_hot_train.columns, fill_value=0)
test = test.join(one_hot_test)
test['Day'] = test['Date'].apply(date_to_int)

# Applying Outcome Log Transformation: log(x + 1), rounded to 3 decimals,
# so a zero count maps to 0.0 instead of -inf.
for outcome in ('Confirmed', 'Deaths', 'Recovered', 'Active'):
    for frame in (train, test):
        frame['log_' + outcome] = frame[outcome].apply(lambda x: round(math.log(x + 1), 3))

train.head()

Unnamed: 0,Country,Date,Confirmed,Deaths,Recovered,Active,Afghanistan,Albania,Algeria,Andorra,...,West Bank and Gaza,Western Sahara,Yemen,Zambia,Zimbabwe,Day,log_Confirmed,log_Deaths,log_Recovered,log_Active
0,Afghanistan,2020-01-22,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0.0,0.0,0.0,0.0
1,Albania,2020-01-22,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0.0,0.0,0.0,0.0
2,Algeria,2020-01-22,0,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0.0,0.0,0.0,0.0
3,Andorra,2020-01-22,0,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0.0,0.0,0.0,0.0
4,Angola,2020-01-22,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0.0,0.0,0.0,0.0


In [5]:
# Removing old features and outputs. Columns are dropped by name rather than
# by position so that re-running this cell is a no-op instead of silently
# deleting the first six one-hot country columns.
raw_columns = ['Country', 'Date', 'Confirmed', 'Deaths', 'Recovered', 'Active']
train = train.drop(columns=raw_columns, errors='ignore')
test = test.drop(columns=raw_columns, errors='ignore')

In [6]:
# Check the result of the column drop: the one-hot country columns, Day and the four log targets should remain.
train.head()

Unnamed: 0,Afghanistan,Albania,Algeria,Andorra,Angola,Antigua and Barbuda,Argentina,Armenia,Australia,Austria,...,West Bank and Gaza,Western Sahara,Yemen,Zambia,Zimbabwe,Day,log_Confirmed,log_Deaths,log_Recovered,log_Active
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0.0,0.0,0.0,0.0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0.0,0.0,0.0,0.0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0.0,0.0,0.0,0.0
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0.0,0.0,0.0,0.0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0.0,0.0,0.0,0.0


In [7]:
# Inspect the last rows of the test frame to check its features and log targets.
test.tail()

Unnamed: 0,Afghanistan,Albania,Algeria,Andorra,Angola,Antigua and Barbuda,Argentina,Armenia,Australia,Austria,...,West Bank and Gaza,Western Sahara,Yemen,Zambia,Zimbabwe,Day,log_Confirmed,log_Deaths,log_Recovered,log_Active
7101,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,188,9.271,4.369,8.23,8.824
7102,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,188,2.398,0.693,2.197,0.693
7103,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,188,7.434,6.182,6.726,5.93
7104,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,188,8.424,4.949,7.943,7.377
7105,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,188,7.903,3.611,6.297,7.662


In [None]:
# Grabbing x/y train and test sets for 'Confirmed', 'Deaths', 'Recovered', and 'Active' Cases
target_columns = ['log_Confirmed', 'log_Deaths', 'log_Recovered', 'log_Active']

# Features are everything except the four log targets (dropped by name so the
# result does not depend on column position).
train_x = train.drop(columns=target_columns)
test_x = test.drop(columns=target_columns)

# Stack train on top of test to form one feature matrix for rolling
# cross validation. pd.concat is used because DataFrame.append was removed in
# pandas 2.0. The explicit float dtype keeps the array numeric even when the
# one-hot columns are boolean.
all_train = pd.concat([train_x, test_x]).to_numpy(dtype=float)

confirmed_train_y = train['log_Confirmed']
confirmed_test_y = test['log_Confirmed']

deaths_train_y = train['log_Deaths']
deaths_test_y = test['log_Deaths']

recovered_train_y = train['log_Recovered']
recovered_test_y = test['log_Recovered']

active_train_y = train['log_Active']
active_test_y = test['log_Active']

In [12]:
# Target vector for the whole timeline: train rows followed by test rows.
# pd.concat is used because Series.append was removed in pandas 2.0.
all_values = pd.concat([confirmed_train_y, confirmed_test_y]).to_numpy()

tscv = TimeSeriesSplit(n_splits=5)
rmse_scores = []
rmsle_scores = []

# Performing 5-fold cross validation on rolling basis for 'Confirmed' SVR model
# NOTE: the targets are already log(x + 1)-transformed, so the RMSE below is on
# the log scale and the "RMSLE" applies a second log on top of it.
for train_index, test_index in tscv.split(all_train):
    x_train, x_test = all_train[train_index], all_train[test_index]
    y_train, y_test = all_values[train_index], all_values[test_index]

    # Fixed seed so the solver produces the same scores on every run
    svr = LinearSVR(C=10, epsilon=0.1, random_state=0)
    svr.fit(x_train, y_train)
    # Clamp negative predictions to 0; mean_squared_log_error rejects negative values
    predict = svr.predict(x_test).clip(min=0)
    # Take the square root explicitly: the `squared=False` argument was
    # deprecated in scikit-learn 1.4 and removed in 1.6.
    rmse_scores.append(round(math.sqrt(mean_squared_error(y_test, predict)), 3))
    rmsle_scores.append(round(math.sqrt(mean_squared_log_error(y_test, predict)), 3))

print('Now predicting Confirmed Cases...')
print('Cross validated root mean squared errors are:', rmse_scores)
print('Average RMSE using SVM to predict Confirmed cases is:', round(mean(rmse_scores), 4))
print()
print('Cross validated root mean squared log errors are:', rmsle_scores)
print('Average RMSLE using SVM to predict Confirmed cases is:', round(mean(rmsle_scores), 4))

Now predicting Confirmed Cases...
Cross validated root mean squared errors are: [2.112, 3.57, 3.348, 4.365, 6.542]
Average RMSE using SVM to predict Confirmed cases is: 3.9874

Cross validated root mean squared log errors are: [0.674, 0.619, 0.457, 0.582, 1.063]
Average RMSLE using SVM to predict Confirmed cases is: 0.679


In [15]:
# Target vector for the whole timeline: train rows followed by test rows.
# pd.concat is used because Series.append was removed in pandas 2.0.
all_values = pd.concat([deaths_train_y, deaths_test_y]).to_numpy()

tscv = TimeSeriesSplit(n_splits=5)
rmse_scores = []
rmsle_scores = []

# Performing 5-fold cross validation on rolling basis for 'Deaths' SVR model
# NOTE: the targets are already log(x + 1)-transformed, so the RMSE below is on
# the log scale and the "RMSLE" applies a second log on top of it.
for train_index, test_index in tscv.split(all_train):
    x_train, x_test = all_train[train_index], all_train[test_index]
    y_train, y_test = all_values[train_index], all_values[test_index]

    # Fixed seed so the solver produces the same scores on every run
    svr = LinearSVR(C=10, epsilon=0.1, random_state=0)
    svr.fit(x_train, y_train)
    # Clamp negative predictions to 0; mean_squared_log_error rejects negative values
    predict = svr.predict(x_test).clip(min=0)
    # Take the square root explicitly: the `squared=False` argument was
    # deprecated in scikit-learn 1.4 and removed in 1.6.
    rmse_scores.append(round(math.sqrt(mean_squared_error(y_test, predict)), 3))
    rmsle_scores.append(round(math.sqrt(mean_squared_log_error(y_test, predict)), 3))

print('Now predicting Deaths Cases...')
print('Cross validated root mean squared errors are:', rmse_scores)
print('Average RMSE using SVM to predict Deaths cases is:', round(mean(rmse_scores), 4))
print()
print('Cross validated root mean squared log errors are:', rmsle_scores)
print('Average RMSLE using SVM to predict Deaths cases is:', round(mean(rmsle_scores), 4))

Now predicting Deaths Cases...
Cross validated root mean squared errors are: [0.999, 3.115, 3.951, 4.43, 4.858]
Average RMSE using SVM to predict Deaths cases is: 3.4706

Cross validated root mean squared log errors are: [0.405, 0.998, 1.098, 1.188, 1.277]
Average RMSLE using SVM to predict Deaths cases is: 0.9932


In [16]:
# Target vector for the whole timeline: train rows followed by test rows.
# pd.concat is used because Series.append was removed in pandas 2.0.
all_values = pd.concat([recovered_train_y, recovered_test_y]).to_numpy()

tscv = TimeSeriesSplit(n_splits=5)
rmse_scores = []
rmsle_scores = []

# Performing 5-fold cross validation on rolling basis for 'Recovered' SVR model
# NOTE: the targets are already log(x + 1)-transformed, so the RMSE below is on
# the log scale and the "RMSLE" applies a second log on top of it.
for train_index, test_index in tscv.split(all_train):
    x_train, x_test = all_train[train_index], all_train[test_index]
    y_train, y_test = all_values[train_index], all_values[test_index]

    # Fixed seed so the solver produces the same scores on every run
    svr = LinearSVR(C=10, epsilon=0.1, random_state=0)
    svr.fit(x_train, y_train)
    # Clamp negative predictions to 0; mean_squared_log_error rejects negative values
    predict = svr.predict(x_test).clip(min=0)
    # Take the square root explicitly: the `squared=False` argument was
    # deprecated in scikit-learn 1.4 and removed in 1.6.
    rmse_scores.append(round(math.sqrt(mean_squared_error(y_test, predict)), 3))
    rmsle_scores.append(round(math.sqrt(mean_squared_log_error(y_test, predict)), 3))

print('Now predicting Recovered Cases...')
print('Cross validated root mean squared errors are:', rmse_scores)
print('Average RMSE using SVM to predict Recovered cases is:', round(mean(rmse_scores), 4))
print()
print('Cross validated root mean squared log errors are:', rmsle_scores)
print('Average RMSLE using SVM to predict Recovered cases is:', round(mean(rmsle_scores), 4))

Now predicting Recovered Cases...
Cross validated root mean squared errors are: [1.265, 3.025, 5.483, 5.901, 7.794]
Average RMSE using SVM to predict Recovered cases is: 4.6936

Cross validated root mean squared log errors are: [0.474, 0.743, 1.235, 1.114, 0.845]
Average RMSLE using SVM to predict Recovered cases is: 0.8822


In [19]:
# Target vector for the whole timeline: train rows followed by test rows.
# pd.concat is used because Series.append was removed in pandas 2.0.
all_values = pd.concat([active_train_y, active_test_y]).to_numpy()

tscv = TimeSeriesSplit(n_splits=5)
rmse_scores = []
rmsle_scores = []

# Performing 5-fold cross validation on rolling basis for 'Active' SVR model
# NOTE: the targets are already log(x + 1)-transformed, so the RMSE below is on
# the log scale and the "RMSLE" applies a second log on top of it.
for train_index, test_index in tscv.split(all_train):
    x_train, x_test = all_train[train_index], all_train[test_index]
    y_train, y_test = all_values[train_index], all_values[test_index]

    # Fixed seed so the solver produces the same scores on every run
    svr = LinearSVR(C=10, epsilon=0.1, random_state=0)
    svr.fit(x_train, y_train)
    # Clamp negative predictions to 0; mean_squared_log_error rejects negative values
    predict = svr.predict(x_test).clip(min=0)
    # Take the square root explicitly: the `squared=False` argument was
    # deprecated in scikit-learn 1.4 and removed in 1.6.
    rmse_scores.append(round(math.sqrt(mean_squared_error(y_test, predict)), 3))
    rmsle_scores.append(round(math.sqrt(mean_squared_log_error(y_test, predict)), 3))

print('Now predicting Active Cases...')
print('Cross validated root mean squared errors are:', rmse_scores)
print('Average RMSE using SVM to predict Active cases is:', round(mean(rmse_scores), 4))
print()
print('Cross validated root mean squared log errors are:', rmsle_scores)
print('Average RMSLE using SVM to predict Active cases is:', round(mean(rmsle_scores), 4))

Now predicting Active Cases...
Cross validated root mean squared errors are: [2.336, 5.33, 3.419, 6.404, 5.377]
Average RMSE using SVM to predict Active cases is: 4.5732

Cross validated root mean squared log errors are: [0.692, 1.375, 0.594, 0.949, 0.928]
Average RMSLE using SVM to predict Active cases is: 0.9076
