# Predict Covid Spread - SVM Scratch

In [1]:
# imports
import math
import warnings

import pandas as pd
import numpy as np 

from os.path import exists
from datetime import date
from sklearn.metrics import mean_squared_log_error, mean_squared_error
from sklearn.model_selection import TimeSeriesSplit
from statistics import mean

warnings.filterwarnings('ignore')

In [2]:
# Reading in files
train = pd.read_csv('../[Cleaned] Forecasting Data/train.csv')
test = pd.read_csv('../[Cleaned] Forecasting Data/test.csv')

# Dropping added pandas index column
train = train.drop('Unnamed: 0', axis=1)
test = test.drop('Unnamed: 0', axis=1)

In [3]:
# Helper function for converting date variable into integer
def date_to_int(d):
    initial_date = date(2020, 1, 21)
    curr_date = [int(x) for x in d.split('-')]
    final_date = date(curr_date[0], curr_date[1], curr_date[2])
    delta = final_date - initial_date
    return delta.days

# Feature Engineering - One hot encoding for country and Numeric Day value
one_hot_train = pd.get_dummies(train['Country'])
train = train.join(one_hot_train)
train['Day'] = train['Date'].apply(lambda x: date_to_int(x))

one_hot_test = pd.get_dummies(test['Country'])
test = test.join(one_hot_train)
test['Day'] = test['Date'].apply(lambda x: date_to_int(x))

# Applying Outcome Log Transformation
train['log_Confirmed'] = train['Confirmed'].apply(lambda x: round(math.log(x+1),3))
train['log_Deaths'] = train['Deaths'].apply(lambda x: round(math.log(x+1),3))
train['log_Recovered'] = train['Recovered'].apply(lambda x: round(math.log(x+1),3))
train['log_Active'] = train['Active'].apply(lambda x: round(math.log(x+1),3))

test['log_Confirmed'] = test['Confirmed'].apply(lambda x: round(math.log(x+1),3))
test['log_Deaths'] = test['Deaths'].apply(lambda x: round(math.log(x+1),3))
test['log_Recovered'] = test['Recovered'].apply(lambda x: round(math.log(x+1),3))
test['log_Active'] = test['Active'].apply(lambda x: round(math.log(x+1),3))

train.head()

Unnamed: 0,Country,Date,Confirmed,Deaths,Recovered,Active,Afghanistan,Albania,Algeria,Andorra,...,West Bank and Gaza,Western Sahara,Yemen,Zambia,Zimbabwe,Day,log_Confirmed,log_Deaths,log_Recovered,log_Active
0,Afghanistan,2020-01-22,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0.0,0.0,0.0,0.0
1,Albania,2020-01-22,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0.0,0.0,0.0,0.0
2,Algeria,2020-01-22,0,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0.0,0.0,0.0,0.0
3,Andorra,2020-01-22,0,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0.0,0.0,0.0,0.0
4,Angola,2020-01-22,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0.0,0.0,0.0,0.0


In [4]:
# Removing old features and outputs
train.drop(train.iloc[:, 0:6], inplace = True, axis = 1)
test.drop(test.iloc[:, 0:6], inplace = True, axis = 1)

In [5]:
train.head()

Unnamed: 0,Afghanistan,Albania,Algeria,Andorra,Angola,Antigua and Barbuda,Argentina,Armenia,Australia,Austria,...,West Bank and Gaza,Western Sahara,Yemen,Zambia,Zimbabwe,Day,log_Confirmed,log_Deaths,log_Recovered,log_Active
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0.0,0.0,0.0,0.0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0.0,0.0,0.0,0.0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0.0,0.0,0.0,0.0
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0.0,0.0,0.0,0.0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0.0,0.0,0.0,0.0


In [6]:
test.tail()

Unnamed: 0,Afghanistan,Albania,Algeria,Andorra,Angola,Antigua and Barbuda,Argentina,Armenia,Australia,Austria,...,West Bank and Gaza,Western Sahara,Yemen,Zambia,Zimbabwe,Day,log_Confirmed,log_Deaths,log_Recovered,log_Active
8036,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,93,5.82,1.099,4.317,5.565
8037,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,93,1.946,0.0,1.792,0.693
8038,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,93,0.693,0.0,0.0,0.693
8039,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,93,4.344,1.386,3.638,3.611
8040,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,93,3.367,1.609,1.099,3.135


In [7]:
# Support Vector Regression with Linear Kernel
class LinearSVR:

    def __init__(self, learn_rate=0.00001, C=10, epochs=1000, epsilon = 0.001):
        self.lr = learn_rate
        self.C = C
        self.epochs = epochs
        self.epsilon = epsilon
        
    def loss(self, w, x, y):
        n = x.shape[0]
        y_pred = np.dot(x,w)
        error = abs(y_pred - y) -  self.epsilon
        error[error < 0] = 0
        ans = np.dot(w,w)/2 + self.C * (np.sum(error)/n)
        return ans
    
    def gradient(self, w, x, y):
        y_pred = np.dot(x,w)
        error = abs(y_pred - y) - self.epsilon
        
        if max(0, error) == 0 or y_pred == y:
            dw = w
        elif y_pred > y:
            dw = w + self.C*x
        else:
            dw = w - self.C*x
        return dw
    
    def fit(self, x, y):
        points, features = x.shape
        w = np.zeros(features)
        n = 0
        prev = math.inf

        for e in range(self.epochs):
            for i, v in enumerate(x):
                descent = self.gradient(w, v, y[i])
                w = w - (self.lr * descent)
        
            if e == 2**n:
                temp = self.loss(w, x, y)
                if abs(prev - temp) < 0.01 * prev:
                    return w
                prev = temp
                n += 1
        return w

    def predict(self, x, w):
        return np.dot(x, w)

In [8]:
# Grabbing x/y train and sets for 'Confirmed', 'Deaths', 'Recovered', and 'Active' Cases
confirmed_train_x = train.drop(train.columns[[-1,-2,-3,-4]], axis = 1).to_numpy()
confirmed_train_y = train['log_Confirmed'].to_numpy()
confirmed_test_x = test.drop(test.columns[[-1,-2,-3,-4]], axis = 1).to_numpy()
confirmed_test_y = test['log_Confirmed'].to_numpy()


deaths_train_x = train.drop(train.columns[[-1,-2,-3,-4]], axis = 1).to_numpy()
deaths_train_y = train['log_Deaths'].to_numpy()
deaths_test_x = test.drop(test.columns[[-1,-2,-3,-4]], axis = 1).to_numpy()
deaths_test_y = test['log_Deaths'].to_numpy()

recovered_train_x = train.drop(train.columns[[-1,-2,-3,-4]], axis = 1).to_numpy()
recovered_train_y = train['log_Recovered'].to_numpy()
recovered_test_x = test.drop(test.columns[[-1,-2,-3,-4]], axis = 1).to_numpy()
recovered_test_y = test['log_Recovered'].to_numpy()

active_train_x = train.drop(train.columns[[-1,-2,-3,-4]], axis = 1).to_numpy()
active_train_y = train['log_Active'].to_numpy()
active_test_x = test.drop(test.columns[[-1,-2,-3,-4]], axis = 1).to_numpy()
active_test_y = test['log_Active'].to_numpy()

In [10]:
# Adding intercept b value for x_test
confirmed_train_x = np.append(confirmed_train_x, np.zeros((confirmed_train_x.shape[0],1)), axis=1)
confirmed_test_x = np.append(confirmed_test_x, np.zeros((confirmed_test_x.shape[0],1)), axis=1)

all_train = np.append(confirmed_train_x, confirmed_test_x, axis=0)
all_values = np.append(confirmed_train_y, confirmed_test_y, axis=0)

tscv = TimeSeriesSplit(n_splits=5)
rmse_scores = []
rmsle_scores = []

# Performing 5-fold cross validation on rolling basis for 'Confirmed' SVR model
for train_index, test_index in tscv.split(all_train):
    x_train, x_test = all_train[train_index], all_train[test_index]
    y_train, y_test = all_values[train_index], all_values[test_index]
    
    svr = LinearSVR()
    weights = svr.fit(x_train, y_train)
    predict = svr.predict(x_test, weights)
    predict[predict < 0] = 0 # y = max(0, y)
    rmse_scores.append(round(mean_squared_error(y_test, predict, squared=False),3))
    rmsle_scores.append(round(mean_squared_log_error(y_test, predict, squared=False),3))

print('Now predicting Confirmed Cases...')
print('Cross validated root mean squared errors are:', rmse_scores)
print('Average RMSE using SVM to predict Confirmed cases is:', mean(rmse_scores))
print()
print('Cross validated root mean squared log errors are:', rmsle_scores)
print('Average RMSLE using SVM to predict Confirmed cases is:', round(mean(rmsle_scores),4))

Now predicting Confirmed Cases...
Cross validated root mean squared errors are: [1.174, 1.884, 3.227, 3.152, 2.977]
Average RMSE using SVM to predict Confirmed cases is: 2.4828

Cross validated root mean squared log errors are: [0.451, 0.669, 0.866, 0.599, 0.537]
Average RMSLE using SVM to predict Confirmed cases is: 0.6244


In [12]:
# Adding intercept b value for x_test
deaths_train_x = np.append(deaths_train_x, np.zeros((deaths_train_x.shape[0],1)), axis=1)
deaths_test_x = np.append(deaths_test_x, np.zeros((deaths_test_x.shape[0],1)), axis=1)

all_train = np.append(deaths_train_x, deaths_test_x, axis=0)
all_values = np.append(deaths_train_y, deaths_test_y, axis=0)

tscv = TimeSeriesSplit(n_splits=5)
rmse_scores = []
rmsle_scores = []

# Performing 5-fold cross validation on rolling basis for 'Deaths' SVR model
for train_index, test_index in tscv.split(all_train):
    x_train, x_test = all_train[train_index], all_train[test_index]
    y_train, y_test = all_values[train_index], all_values[test_index]
    
    svr = LinearSVR()
    weights = svr.fit(x_train, y_train)
    predict = svr.predict(x_test, weights)
    predict[predict < 0] = 0 # y = max(0, y)
    rmse_scores.append(round(mean_squared_error(y_test, predict, squared=False),3))
    rmsle_scores.append(round(mean_squared_log_error(y_test, predict, squared=False),3))

print('Now predicting Deaths Cases...')
print('Cross validated root mean squared errors are:', rmse_scores)
print('Average RMSE using SVM to predict Deaths cases is:', mean(rmse_scores))
print()
print('Cross validated root mean squared log errors are:', rmsle_scores)
print('Average RMSLE using SVM to predict Deaths cases is:', round(mean(rmsle_scores),4))

Now predicting Deaths Cases...
Cross validated root mean squared errors are: [0.529, 0.786, 1.495, 2.464, 2.613]
Average RMSE using SVM to predict Deaths cases is: 1.5774

Cross validated root mean squared log errors are: [0.166, 0.288, 0.588, 0.76, 0.711]
Average RMSLE using SVM to predict Deaths cases is: 0.5026


In [13]:
# Adding intercept b value for x_test
recovered_train_x = np.append(recovered_train_x, np.zeros((recovered_train_x.shape[0],1)), axis=1)
recovered_test_x = np.append(recovered_test_x, np.zeros((recovered_test_x.shape[0],1)), axis=1)

all_train = np.append(recovered_train_x, recovered_test_x, axis=0)
all_values = np.append(recovered_train_y, recovered_test_y, axis=0)

tscv = TimeSeriesSplit(n_splits=5)
rmse_scores = []
rmsle_scores = []

# Performing 5-fold cross validation on rolling basis for 'Recovered' SVR model
for train_index, test_index in tscv.split(all_train):
    x_train, x_test = all_train[train_index], all_train[test_index]
    y_train, y_test = all_values[train_index], all_values[test_index]
    
    svr = LinearSVR()
    weights = svr.fit(x_train, y_train)
    predict = svr.predict(x_test, weights)
    predict[predict < 0] = 0 # y = max(0, y)
    rmse_scores.append(round(mean_squared_error(y_test, predict, squared=False),3))
    rmsle_scores.append(round(mean_squared_log_error(y_test, predict, squared=False),3))

print('Now predicting Recovered Cases...')
print('Cross validated root mean squared errors are:', rmse_scores)
print('Average RMSE using SVM to predict Recovered cases is:', mean(rmse_scores))
print()
print('Cross validated root mean squared log errors are:', rmsle_scores)
print('Average RMSLE using SVM to predict Recovered cases is:', round(mean(rmsle_scores),4))

Now predicting Recovered Cases...
Cross validated root mean squared errors are: [0.804, 1.219, 1.837, 3.109, 2.933]
Average RMSE using SVM to predict Recovered cases is: 1.9804

Cross validated root mean squared log errors are: [0.313, 0.457, 0.65, 0.89, 0.7]
Average RMSLE using SVM to predict Recovered cases is: 0.602


In [14]:
# Adding intercept b value for x_test
active_train_x = np.append(active_train_x, np.zeros((active_train_x.shape[0],1)), axis=1)
active_test_x = np.append(active_test_x, np.zeros((active_test_x.shape[0],1)), axis=1)

all_train = np.append(active_train_x, active_test_x, axis=0)
all_values = np.append(active_train_y, active_test_y, axis=0)

tscv = TimeSeriesSplit(n_splits=5)
rmse_scores = []
rmsle_scores = []

# Performing 5-fold cross validation on rolling basis for 'Active' SVR model
for train_index, test_index in tscv.split(all_train):
    x_train, x_test = all_train[train_index], all_train[test_index]
    y_train, y_test = all_values[train_index], all_values[test_index]
    
    svr = LinearSVR()
    weights = svr.fit(x_train, y_train)
    predict = svr.predict(x_test, weights)
    predict[predict < 0] = 0 # y = max(0, y)
    rmse_scores.append(round(mean_squared_error(y_test, predict, squared=False),3))
    rmsle_scores.append(round(mean_squared_log_error(y_test, predict, squared=False),3))

print('Now predicting Active Cases...')
print('Cross validated root mean squared errors are:', rmse_scores)
print('Average RMSE using SVM to predict Active cases is:', mean(rmse_scores))
print()
print('Cross validated root mean squared log errors are:', rmsle_scores)
print('Average RMSLE using SVM to predict Active cases is:', round(mean(rmsle_scores),4))

Now predicting Active Cases...
Cross validated root mean squared errors are: [1.102, 1.785, 3.187, 2.953, 2.762]
Average RMSE using SVM to predict Active cases is: 2.3578

Cross validated root mean squared log errors are: [0.42, 0.644, 0.876, 0.59, 0.538]
Average RMSLE using SVM to predict Active cases is: 0.6136
