# Predict Covid Spread - SVM

In [1]:
# imports
import pandas as pd
from os.path import exists
import warnings
from datetime import date
import math

from sklearn.svm import LinearSVR
from sklearn.metrics import mean_squared_log_error, mean_squared_error
from sklearn.model_selection import TimeSeriesSplit
from statistics import mean

# NOTE: this silences every warning category for the rest of the session,
# so deprecation and solver-convergence warnings will not be shown.
warnings.filterwarnings('ignore')

In [2]:
# Load the cleaned train/test CSVs and discard the 'Unnamed: 0' column
# (the pandas index that was written out when the files were saved).
train = pd.read_csv('../[Cleaned] Forecasting Data/train.csv').drop(columns='Unnamed: 0')
test = pd.read_csv('../[Cleaned] Forecasting Data/test.csv').drop(columns='Unnamed: 0')

In [3]:
# Helper: turn a 'YYYY-MM-DD' date string into an integer day offset
def date_to_int(d):
    """Return the number of days between 2020-01-21 and the date string `d`.

    `d` is split on '-' and its first three fields are read as year, month
    and day. Dates earlier than 2020-01-21 yield a negative number.
    """
    fields = tuple(map(int, d.split('-')))
    year, month, day = fields[0], fields[1], fields[2]
    return (date(year, month, day) - date(2020, 1, 21)).days

# Feature Engineering - One hot encoding for country and Numeric Day value.
# dtype=int keeps the indicators as 0/1 integers on every pandas version
# (pandas >= 2.0 produces booleans by default).
one_hot_train = pd.get_dummies(train['Country'], dtype=int)
train = train.join(one_hot_train)
train['Day'] = train['Date'].apply(date_to_int)

# Encode the test countries from the test rows themselves, then align the
# result to the training columns so both frames expose the same features in
# the same order. Joining the training dummies onto `test` would instead
# attach, by index label, the country of an unrelated training row.
# Countries absent from the test set become all-zero columns; countries
# unseen in training are dropped because no model feature exists for them.
one_hot_test = (
    pd.get_dummies(test['Country'], dtype=int)
    .reindex(columns=one_hot_train.columns, fill_value=0)
)
test = test.join(one_hot_test)
test['Day'] = test['Date'].apply(date_to_int)

# Applying Outcome Log Transformation: log(x + 1), rounded to 3 decimals.
# The columns are appended in this fixed order for both frames.
for frame in (train, test):
    for outcome in ('Confirmed', 'Deaths', 'Recovered', 'Active'):
        frame['log_' + outcome] = frame[outcome].apply(
            lambda x: round(math.log(x + 1), 3)
        )

train.head()

Unnamed: 0,Country,Date,Confirmed,Deaths,Recovered,Active,Afghanistan,Albania,Algeria,Andorra,...,West Bank and Gaza,Western Sahara,Yemen,Zambia,Zimbabwe,Day,log_Confirmed,log_Deaths,log_Recovered,log_Active
0,Afghanistan,2020-01-22,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0.0,0.0,0.0,0.0
1,Albania,2020-01-22,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0.0,0.0,0.0,0.0
2,Algeria,2020-01-22,0,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0.0,0.0,0.0,0.0
3,Andorra,2020-01-22,0,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0.0,0.0,0.0,0.0
4,Angola,2020-01-22,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0.0,0.0,0.0,0.0


In [4]:
# Removing old features and outputs.
# The columns are dropped by name and the frames are rebound (not mutated in
# place), so re-running this cell is a no-op instead of silently removing the
# next six feature columns.
RAW_COLUMNS = ['Country', 'Date', 'Confirmed', 'Deaths', 'Recovered', 'Active']
train = train.drop(columns=RAW_COLUMNS, errors='ignore')
test = test.drop(columns=RAW_COLUMNS, errors='ignore')

In [5]:
# Preview the first rows of the training frame
train.head()

Unnamed: 0,Afghanistan,Albania,Algeria,Andorra,Angola,Antigua and Barbuda,Argentina,Armenia,Australia,Austria,...,West Bank and Gaza,Western Sahara,Yemen,Zambia,Zimbabwe,Day,log_Confirmed,log_Deaths,log_Recovered,log_Active
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0.0,0.0,0.0,0.0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0.0,0.0,0.0,0.0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0.0,0.0,0.0,0.0
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0.0,0.0,0.0,0.0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0.0,0.0,0.0,0.0


In [6]:
# Preview the last rows of the test frame
test.tail()

Unnamed: 0,Afghanistan,Albania,Algeria,Andorra,Angola,Antigua and Barbuda,Argentina,Armenia,Australia,Austria,...,West Bank and Gaza,Western Sahara,Yemen,Zambia,Zimbabwe,Day,log_Confirmed,log_Deaths,log_Recovered,log_Active
8036,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,93,5.82,1.099,4.317,5.565
8037,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,93,1.946,0.0,1.792,0.693
8038,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,93,0.693,0.0,0.0,0.693
8039,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,93,4.344,1.386,3.638,3.611
8040,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,93,3.367,1.609,1.099,3.135


In [7]:
# Grabbing x/y train and test sets for 'Confirmed', 'Deaths', 'Recovered', and 'Active' cases.
# The targets are removed by name so the feature matrix does not depend on
# the log_* columns happening to be the last four.
TARGET_COLUMNS = ['log_Confirmed', 'log_Deaths', 'log_Recovered', 'log_Active']
train_x = train.drop(columns=TARGET_COLUMNS)
test_x = test.drop(columns=TARGET_COLUMNS)

# Stack train rows followed by test rows into one feature matrix for the
# cross-validation cells. DataFrame.append was removed in pandas 2.0;
# pd.concat performs the same row-wise stacking.
all_train = pd.concat([train_x, test_x]).to_numpy()

confirmed_train_y = train['log_Confirmed']
confirmed_test_y = test['log_Confirmed']

deaths_train_y = train['log_Deaths']
deaths_test_y = test['log_Deaths']

recovered_train_y = train['log_Recovered']
recovered_test_y = test['log_Recovered']

active_train_y = train['log_Active']
active_test_y = test['log_Active']

In [8]:
# Stack train targets followed by test targets, matching the row order of all_train.
# Series.append was removed in pandas 2.0; pd.concat is the replacement.
all_values = pd.concat([confirmed_train_y, confirmed_test_y]).to_numpy()

tscv = TimeSeriesSplit(n_splits=5)
rmse_scores = []
rmsle_scores = []

# Performing 5-fold cross validation on rolling basis for 'Confirmed' SVR model
for train_index, test_index in tscv.split(all_train):
    x_train, x_test = all_train[train_index], all_train[test_index]
    y_train, y_test = all_values[train_index], all_values[test_index]

    # Fixed seed so repeated runs of the notebook give the same scores
    svr = LinearSVR(C=10, epsilon=0.1, random_state=0)
    svr.fit(x_train, y_train)
    predict = svr.predict(x_test)
    # mean_squared_log_error rejects negative values, so clip predictions at 0
    predict[predict < 0] = 0
    # Take the root explicitly; the `squared=False` flag is deprecated in
    # newer scikit-learn releases.
    # NOTE(review): the targets are already log-transformed, so the RMSLE
    # applies a second log on top - confirm this is intended.
    rmse_scores.append(round(math.sqrt(mean_squared_error(y_test, predict)), 3))
    rmsle_scores.append(round(math.sqrt(mean_squared_log_error(y_test, predict)), 3))

print('Now predicting Confirmed Cases...')
print('Cross validated root mean squared errors are:', rmse_scores)
print('Average RMSE using SVM to predict Confirmed cases is:', round(mean(rmse_scores),4))
print()
print('Cross validated root mean squared log errors are:', rmsle_scores)
print('Average RMSLE using SVM to predict Confirmed cases is:', round(mean(rmsle_scores),4))

Now predicting Confirmed Cases...
Cross validated root mean squared errors are: [0.489, 1.309, 2.834, 4.893, 3.54]
Average RMSE using SVM to predict Confirmed cases is: 2.613

Cross validated root mean squared log errors are: [0.192, 0.465, 0.767, 1.24, 0.563]
Average RMSLE using SVM to predict Confirmed cases is: 0.6454


In [9]:
# Stack train targets followed by test targets, matching the row order of all_train.
# Series.append was removed in pandas 2.0; pd.concat is the replacement.
all_values = pd.concat([deaths_train_y, deaths_test_y]).to_numpy()

tscv = TimeSeriesSplit(n_splits=5)
rmse_scores = []
rmsle_scores = []

# Performing 5-fold cross validation on rolling basis for 'Deaths' SVR model
for train_index, test_index in tscv.split(all_train):
    x_train, x_test = all_train[train_index], all_train[test_index]
    y_train, y_test = all_values[train_index], all_values[test_index]

    # Fixed seed so repeated runs of the notebook give the same scores
    svr = LinearSVR(C=10, epsilon=0.1, random_state=0)
    svr.fit(x_train, y_train)
    predict = svr.predict(x_test)
    # mean_squared_log_error rejects negative values, so clip predictions at 0
    predict[predict < 0] = 0
    # Take the root explicitly; the `squared=False` flag is deprecated in
    # newer scikit-learn releases.
    # NOTE(review): the targets are already log-transformed, so the RMSLE
    # applies a second log on top - confirm this is intended.
    rmse_scores.append(round(math.sqrt(mean_squared_error(y_test, predict)), 3))
    rmsle_scores.append(round(math.sqrt(mean_squared_log_error(y_test, predict)), 3))

print('Now predicting Deaths Cases...')
print('Cross validated root mean squared errors are:', rmse_scores)
print('Average RMSE using SVM to predict Deaths cases is:', round(mean(rmse_scores),4))
print()
print('Cross validated root mean squared log errors are:', rmsle_scores)
print('Average RMSLE using SVM to predict Deaths cases is:', round(mean(rmsle_scores),4))

Now predicting Deaths Cases...
Cross validated root mean squared errors are: [0.32, 0.525, 1.285, 2.582, 3.491]
Average RMSE using SVM to predict Deaths cases is: 1.6406

Cross validated root mean squared log errors are: [0.251, 0.251, 0.483, 0.87, 1.096]
Average RMSLE using SVM to predict Deaths cases is: 0.5902


In [10]:
# Stack train targets followed by test targets, matching the row order of all_train.
# Series.append was removed in pandas 2.0; pd.concat is the replacement.
all_values = pd.concat([recovered_train_y, recovered_test_y]).to_numpy()

tscv = TimeSeriesSplit(n_splits=5)
rmse_scores = []
rmsle_scores = []

# Performing 5-fold cross validation on rolling basis for 'Recovered' SVR model
for train_index, test_index in tscv.split(all_train):
    x_train, x_test = all_train[train_index], all_train[test_index]
    y_train, y_test = all_values[train_index], all_values[test_index]

    # Fixed seed so repeated runs of the notebook give the same scores
    svr = LinearSVR(C=10, epsilon=0.1, random_state=0)
    svr.fit(x_train, y_train)
    predict = svr.predict(x_test)
    # mean_squared_log_error rejects negative values, so clip predictions at 0
    predict[predict < 0] = 0
    # Take the root explicitly; the `squared=False` flag is deprecated in
    # newer scikit-learn releases.
    # NOTE(review): the targets are already log-transformed, so the RMSLE
    # applies a second log on top - confirm this is intended.
    rmse_scores.append(round(math.sqrt(mean_squared_error(y_test, predict)), 3))
    rmsle_scores.append(round(math.sqrt(mean_squared_log_error(y_test, predict)), 3))

print('Now predicting Recovered Cases...')
print('Cross validated root mean squared errors are:', rmse_scores)
print('Average RMSE using SVM to predict Recovered cases is:', round(mean(rmse_scores),4))
print()
print('Cross validated root mean squared log errors are:', rmsle_scores)
print('Average RMSLE using SVM to predict Recovered cases is:', round(mean(rmsle_scores),4))

Now predicting Recovered Cases...
Cross validated root mean squared errors are: [0.51, 0.851, 1.463, 2.586, 3.888]
Average RMSE using SVM to predict Recovered cases is: 1.8596

Cross validated root mean squared log errors are: [0.279, 0.373, 0.637, 0.733, 0.9]
Average RMSLE using SVM to predict Recovered cases is: 0.5844


In [11]:
# Stack train targets followed by test targets, matching the row order of all_train.
# Series.append was removed in pandas 2.0; pd.concat is the replacement.
all_values = pd.concat([active_train_y, active_test_y]).to_numpy()

tscv = TimeSeriesSplit(n_splits=5)
rmse_scores = []
rmsle_scores = []

# Performing 5-fold cross validation on rolling basis for 'Active' SVR model
for train_index, test_index in tscv.split(all_train):
    x_train, x_test = all_train[train_index], all_train[test_index]
    y_train, y_test = all_values[train_index], all_values[test_index]

    # Fixed seed so repeated runs of the notebook give the same scores
    svr = LinearSVR(C=10, epsilon=0.1, random_state=0)
    svr.fit(x_train, y_train)
    predict = svr.predict(x_test)
    # mean_squared_log_error rejects negative values, so clip predictions at 0
    predict[predict < 0] = 0
    # Take the root explicitly; the `squared=False` flag is deprecated in
    # newer scikit-learn releases.
    # NOTE(review): the targets are already log-transformed, so the RMSLE
    # applies a second log on top - confirm this is intended.
    rmse_scores.append(round(math.sqrt(mean_squared_error(y_test, predict)), 3))
    rmsle_scores.append(round(math.sqrt(mean_squared_log_error(y_test, predict)), 3))

print('Now predicting Active Cases...')
print('Cross validated root mean squared errors are:', rmse_scores)
print('Average RMSE using SVM to predict Active cases is:', round(mean(rmse_scores),4))
print()
print('Cross validated root mean squared log errors are:', rmsle_scores)
print('Average RMSLE using SVM to predict Active cases is:', round(mean(rmsle_scores),4))

Now predicting Active Cases...
Cross validated root mean squared errors are: [0.517, 1.245, 3.275, 4.399, 4.897]
Average RMSE using SVM to predict Active cases is: 2.8666

Cross validated root mean squared log errors are: [0.352, 0.555, 1.021, 1.01, 1.018]
Average RMSLE using SVM to predict Active cases is: 0.7912
