# Predict Covid Spread - SVM

In [1]:
# imports
import math
import warnings

import pandas as pd
import numpy as np 

from os.path import exists
from datetime import date
from sklearn.metrics import mean_squared_log_error

warnings.filterwarnings('ignore')

In [2]:
# Reading in files
train = pd.read_csv('[Cleaned] Forecasting Data/train.csv')
test = pd.read_csv('[Cleaned] Forecasting Data/test.csv')

train = train.drop('Unnamed: 0', axis=1)
test = test.drop('Unnamed: 0', axis=1)

def date_to_int(d):
    initial_date = date(2020, 1, 21)
    curr_date = [int(x) for x in d.split('-')]
    final_date = date(curr_date[0], curr_date[1], curr_date[2])
    delta = final_date - initial_date
    return delta.days

# Feature Engineering - One hot encoding for country and Numeric Day value
one_hot_train = pd.get_dummies(train['Country'])
train = train.join(one_hot_train)
train['Day'] = train['Date'].apply(lambda x: date_to_int(x))

one_hot_test = pd.get_dummies(test['Country'])
test = test.join(one_hot_train)
test['Day'] = test['Date'].apply(lambda x: date_to_int(x))

# Outcome Log Transformation
train['log_Confirmed'] = train['Confirmed'].apply(lambda x: round(math.log(x+1),3))
train['log_Deaths'] = train['Deaths'].apply(lambda x: round(math.log(x+1),3))
train['log_Recovered'] = train['Recovered'].apply(lambda x: round(math.log(x+1),3))
train['log_Active'] = train['Active'].apply(lambda x: round(math.log(x+1),3))

test['log_Confirmed'] = test['Confirmed'].apply(lambda x: round(math.log(x+1),3))
test['log_Deaths'] = test['Deaths'].apply(lambda x: round(math.log(x+1),3))
test['log_Recovered'] = test['Recovered'].apply(lambda x: round(math.log(x+1),3))
test['log_Active'] = test['Active'].apply(lambda x: round(math.log(x+1),3))

train.head()

Unnamed: 0,Country,Date,Confirmed,Deaths,Recovered,Active,Afghanistan,Albania,Algeria,Andorra,...,West Bank and Gaza,Western Sahara,Yemen,Zambia,Zimbabwe,Day,log_Confirmed,log_Deaths,log_Recovered,log_Active
0,Afghanistan,2020-01-22,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0.0,0.0,0.0,0.0
1,Albania,2020-01-22,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0.0,0.0,0.0,0.0
2,Algeria,2020-01-22,0,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0.0,0.0,0.0,0.0
3,Andorra,2020-01-22,0,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0.0,0.0,0.0,0.0
4,Angola,2020-01-22,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0.0,0.0,0.0,0.0


In [3]:
# Preparing vectorized features to input into model
train.drop(train.iloc[:, 0:6], inplace = True, axis = 1)
test.drop(test.iloc[:, 0:6], inplace = True, axis = 1)

In [4]:
train.head()

Unnamed: 0,Afghanistan,Albania,Algeria,Andorra,Angola,Antigua and Barbuda,Argentina,Armenia,Australia,Austria,...,West Bank and Gaza,Western Sahara,Yemen,Zambia,Zimbabwe,Day,log_Confirmed,log_Deaths,log_Recovered,log_Active
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0.0,0.0,0.0,0.0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0.0,0.0,0.0,0.0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0.0,0.0,0.0,0.0
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0.0,0.0,0.0,0.0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0.0,0.0,0.0,0.0


In [5]:
test.tail()

Unnamed: 0,Afghanistan,Albania,Algeria,Andorra,Angola,Antigua and Barbuda,Argentina,Armenia,Australia,Austria,...,West Bank and Gaza,Western Sahara,Yemen,Zambia,Zimbabwe,Day,log_Confirmed,log_Deaths,log_Recovered,log_Active
8036,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,93,5.82,1.099,4.317,5.565
8037,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,93,1.946,0.0,1.792,0.693
8038,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,93,0.693,0.0,0.0,0.693
8039,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,93,4.344,1.386,3.638,3.611
8040,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,93,3.367,1.609,1.099,3.135


In [6]:
# Support Vector Regression with Linear Kernel
class LinearSVR:

    def __init__(self, learn_rate=0.001, C=10, epochs=1000, epsilon = 0.1):
        self.lr = learn_rate
        self.C = C
        self.epochs = epochs
        self.epsilon = epsilon
        
    def loss(self, w, x, y):
        n = x.shape[0]
        y_pred = np.dot(x,w)
        error = abs(y_pred - y) -  self.epsilon
        error[error < 0] = 0
        ans = np.dot(w,w)/2 + self.C * (np.sum(error)/n)
        return ans
    
    def gradient(self, w, x, y):
        y_pred = np.dot(x,w)
        error = abs(y_pred - y) - self.epsilon
        
        if max(0, error) == 0 or y_pred == y:
            dw = w
        elif y_pred > y:
            dw = w + self.C*x
        else:
            dw = w - self.C*x
        return dw
    
    def fit(self, x, y):
        points, features = x.shape
        w = np.zeros(features)
        n = 0
        prev = math.inf

        for e in range(self.epochs):
            for i, v in enumerate(x):
                descent = self.gradient(w, v, y[i])
                w = w - (self.lr * descent)
        
            if e == 2**n:
                temp = self.loss(w, x, y)
                if abs(prev - temp) < 0.01 * prev:
                    return w
                prev = temp
                n += 1
        return w

    def predict(self, x, w):
        return np.dot(x, w)

In [7]:
# Support Vector Regression with RBF Kernel
class KernelSVR:

    def __init__(self, learn_rate=0.001, C=10, epochs=1000, epsilon = 0.1):
        self.lr = learn_rate
        self.C = C
        self.epochs = epochs
        self.epsilon = epsilon
    
    def rbf(self, x, y):
        diff = x - y
        return np.exp(-np.dot(diff,diff) * len(x)/2)
        
    def loss(self, w, x, y):
        n = x.shape[0]
        y_pred = np.dot(x,w)
        error = abs(y_pred - y) -  self.epsilon
        error[error < 0] = 0
        ans = np.dot(w,w)/2 + self.C * (np.sum(error)/n)
        return ans
    
    def gradient(self, w, x, y):
        y_pred = np.dot(x,w)
        error = abs(y_pred - y) - self.epsilon
        
        if max(0, error) == 0 or y_pred == y:
            dw = w
        elif y_pred > y:
            dw = w + self.C*x
        else:
            dw = w - self.C*x
        return dw
    
    def fit(self, x, y):
        print('test1')
        print(x.shape)
        kxx = np.apply_along_axis(lambda a: np.apply_along_axis(lambda b: self.rbf(a, b), 1, x), 1, x)
        print('test2')
        print(kxx.shape)
        return 
        points, features = kxx.shape
        w = np.zeros(features)
        n = 0
        prev = math.inf

        for e in range(self.epochs):
            for i, v in enumerate(kxx):
                descent = self.gradient(w, v, y[i])
                w = w - (self.lr * descent)
        
            if e == 2**n:
                temp = self.loss(w, kxx, y)
                if abs(prev - temp) < 0.01 * prev:
                    print('converged')
                    return w
                prev = temp
                n += 1
        print('max epoch reached')
        return w

    def predict(self, x, w):
        ans = np.dot(x,w)
        ans[ans<0] = 0
        return ans

In [8]:
# Grabbing x/y train test sets for 'Confirmed' cases

confirmed_train_x = train.drop(train.columns[[-1,-2,-3,-4]], axis = 1).to_numpy()
confirmed_train_y = train['log_Confirmed'].to_numpy()

confirmed_test_x = test.drop(test.columns[[-1,-2,-3,-4]], axis = 1).to_numpy()
confirmed_test_y = test['log_Confirmed'].to_numpy()


In [9]:
# Adding intercept b value for x_test
confirmed_train_x = np.append(confirmed_train_x, np.zeros((confirmed_train_x.shape[0],1)), axis=1)
confirmed_test_x = np.append(confirmed_test_x, np.zeros((confirmed_test_x.shape[0],1)), axis=1)

model = LinearSVR()
weights = model.fit(confirmed_train_x,confirmed_train_y)
confirmed_pred_y = model.predict(confirmed_test_x, weights)
rmsle = round(mean_squared_log_error(confirmed_test_y, confirmed_pred_y, squared=False),3)
print('Root mean squared error is', rmsle)


Root mean squared error is 1.484


In [None]:
# Adding intercept b value
confirmed_train_x = np.append(confirmed_train_x, np.zeros((confirmed_train_x.shape[0],1)), axis=1)
confirmed_test_x = np.append(confirmed_test_x, np.zeros((confirmed_test_x.shape[0],1)), axis=1)

model = KernelSVR()
weights = model.fit(confirmed_train_x,confirmed_train_y)
confirmed_pred_y = model.predict(confirmed_test_x, weights)
rmsle = round(mean_squared_log_error(confirmed_test_y, confirmed_pred_y, squared=False),3)
print('Root mean squared error is', rmsle)

In [15]:
with open('pred.txt', 'w') as f:
    for item in confirmed_pred_y:
        f.write("%s\n" % item)

In [108]:
with open('w.txt', 'w') as f:
    for item in weights:
        f.write("%s\n" % item)

In [109]:
with open('test.txt', 'w') as f:
    for item in confirmed_test_y:
        f.write("%s\n" % item)