# Predict Covid Spread - SVM

In [1]:
import pandas as pd
from os.path import exists
import warnings
from datetime import date
import math

warnings.filterwarnings('ignore')

# Reading in files
train = pd.read_csv('[Cleaned] Forecasting Data/train.csv')
test = pd.read_csv('[Cleaned] Forecasting Data/test.csv')

train = train.drop('Unnamed: 0', axis=1)
test = test.drop('Unnamed: 0', axis=1)

def date_to_int(d):
    initial_date = date(2020, 1, 21)
    curr_date = [int(x) for x in d.split('-')]
    final_date = date(curr_date[0], curr_date[1], curr_date[2])
    delta = final_date - initial_date
    return delta.days

# Feature Engineering - One hot encoding for country and Numeric Day value
one_hot_train = pd.get_dummies(train['Country'])
train = train.join(one_hot_train)
train['Day'] = train['Date'].apply(lambda x: date_to_int(x))

one_hot_test = pd.get_dummies(test['Country'])
test = test.join(one_hot_train)
test['Day'] = test['Date'].apply(lambda x: date_to_int(x))

# Outcome Log Transformation
train['log_Confirmed'] = train['Confirmed'].apply(lambda x: round(math.log(x+1),3))
train['log_Deaths'] = train['Deaths'].apply(lambda x: round(math.log(x+1),3))
train['log_Recovered'] = train['Recovered'].apply(lambda x: round(math.log(x+1),3))
train['log_Active'] = train['Active'].apply(lambda x: round(math.log(x+1),3))

test['log_Confirmed'] = test['Confirmed'].apply(lambda x: round(math.log(x+1),3))
test['log_Deaths'] = test['Deaths'].apply(lambda x: round(math.log(x+1),3))
test['log_Recovered'] = test['Recovered'].apply(lambda x: round(math.log(x+1),3))
test['log_Active'] = test['Active'].apply(lambda x: round(math.log(x+1),3))

train.head()

Unnamed: 0,Country,Date,Confirmed,Deaths,Recovered,Active,Afghanistan,Albania,Algeria,Andorra,...,West Bank and Gaza,Western Sahara,Yemen,Zambia,Zimbabwe,Day,log_Confirmed,log_Deaths,log_Recovered,log_Active
0,Afghanistan,2020-01-22,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0.0,0.0,0.0,0.0
1,Albania,2020-01-22,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0.0,0.0,0.0,0.0
2,Algeria,2020-01-22,0,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0.0,0.0,0.0,0.0
3,Andorra,2020-01-22,0,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0.0,0.0,0.0,0.0
4,Angola,2020-01-22,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0.0,0.0,0.0,0.0


In [2]:
# Preparing vectorized features to input into model
train.drop(train.iloc[:, 0:6], inplace = True, axis = 1)
test.drop(test.iloc[:, 0:6], inplace = True, axis = 1)

In [3]:
train.head()

Unnamed: 0,Afghanistan,Albania,Algeria,Andorra,Angola,Antigua and Barbuda,Argentina,Armenia,Australia,Austria,...,West Bank and Gaza,Western Sahara,Yemen,Zambia,Zimbabwe,Day,log_Confirmed,log_Deaths,log_Recovered,log_Active
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0.0,0.0,0.0,0.0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0.0,0.0,0.0,0.0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0.0,0.0,0.0,0.0
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0.0,0.0,0.0,0.0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0.0,0.0,0.0,0.0


In [4]:
test.tail()

Unnamed: 0,Afghanistan,Albania,Algeria,Andorra,Angola,Antigua and Barbuda,Argentina,Armenia,Australia,Austria,...,West Bank and Gaza,Western Sahara,Yemen,Zambia,Zimbabwe,Day,log_Confirmed,log_Deaths,log_Recovered,log_Active
8036,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,93,5.82,1.099,4.317,5.565
8037,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,93,1.946,0.0,1.792,0.693
8038,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,93,0.693,0.0,0.0,0.693
8039,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,93,4.344,1.386,3.638,3.611
8040,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,93,3.367,1.609,1.099,3.135


In [5]:
from sklearn.svm import LinearSVR
from sklearn.metrics import mean_squared_log_error, mean_squared_error
from sklearn.model_selection import TimeSeriesSplit
from statistics import mean

# Grabbing x/y train test sets for 'Confirmed' cases
confirmed_train_x = train.drop(train.columns[[-1,-2,-3,-4]], axis = 1)
confirmed_train_y = train['log_Confirmed']

confirmed_test_x = test.drop(test.columns[[-1,-2,-3,-4]], axis = 1)
confirmed_test_y = test['log_Confirmed']

# Performing 5-fold cross validation on rolling basis for 'Confirmed' SVR model
all_train = confirmed_train_x.append(confirmed_test_x).to_numpy()
all_values = confirmed_train_y.append(confirmed_test_y).to_numpy()

tscv = TimeSeriesSplit(n_splits=5)
rmse_scores = []
rmsle_scores = []

flag = 0
for train_index, test_index in tscv.split(all_train):
    x_train, x_test = all_train[train_index], all_train[test_index]
    y_train, y_test = all_values[train_index], all_values[test_index]
    
    svr = LinearSVR(C=10, epsilon=0.1)
    svr.fit(x_train, y_train)
    predict = svr.predict(x_test)
    predict[predict < 0] = 0
    if flag == 3:
        with open('pred.txt', 'w') as f:
            for item in predict:
                f.write("%s\n" % item)
    else:
        flag += 1
    rmse_scores.append(round(mean_squared_error(y_test, predict, squared=False),3))
    rmsle_scores.append(round(mean_squared_log_error(y_test, predict, squared=False),3))

print('Now predicting Confirmed Cases...')
print('Cross validated root mean squared errors are:', rmse_scores)
print('Average RMSE using SVM to predict Confirmed cases is:', mean(rmse_scores))
print()
print('Cross validated root mean squared log errors are:', rmsle_scores)
print('Average RMSLE using SVM to predict Confirmed cases is:', mean(rmsle_scores))

Now predicting Confirmed Cases...
Cross validated root mean squared errors are: [0.494, 1.551, 3.348, 3.944, 4.944]
Average RMSE using SVM to predict Confirmed cases is: 2.8562

Cross validated root mean squared log errors are: [0.291, 0.587, 1.051, 0.798, 0.961]
Average RMSLE using SVM to predict Confirmed cases is: 0.7376


In [6]:
with open('test.txt', 'w') as f:
    for item in confirmed_test_y:
        f.write("%s\n" % item)

In [83]:
# Grabbing x/y train test sets for 'Deaths' cases
deaths_train_x = train.drop(train.columns[[-1,-2,-3,-4]], axis = 1)
deaths_train_y = train['log_Deaths']

deaths_test_x = test.drop(test.columns[[-1,-2,-3,-4]], axis = 1)
deaths_test_y = test['log_Deaths']

# Performing 5-fold cross validation on rolling basis for 'Deaths' SVR model
all_train = deaths_train_x.append(deaths_test_x).to_numpy()
all_values = deaths_train_y.append(deaths_test_y).to_numpy()

tscv = TimeSeriesSplit(n_splits=5)
rmse_scores = []
rmsle_scores = []

for train_index, test_index in tscv.split(all_train):
    x_train, x_test = all_train[train_index], all_train[test_index]
    y_train, y_test = all_values[train_index], all_values[test_index]
    
    svr = SVR(C=10, kernel='linear', epsilon=0.1, gamma='auto')
    svr.fit(x_train, y_train)
    predict = svr.predict(x_test)
    rmse_scores.append(round(mean_squared_error(y_test, predict, squared=False),3))
    rmsle_scores.append(round(mean_squared_log_error(y_test, predict, squared=False),3))

print('Now predicting Deaths Cases...')
print('Cross validated root mean squared errors are:', rmse_scores)
print('Average RMSE using SVM to predict Deaths cases is:', mean(rmse_scores))
print()
print('Cross validated root mean squared log errors are:', rmsle_scores)
print('Average RMSLE using SVM to predict Deaths cases is:', mean(rmsle_scores))

Now predicting Deaths Cases...
Cross validated root mean squared errors are: [0.502, 0.741, 1.284, 2.348, 2.246]
Average RMSE using SVM to predict Deaths cases is: 1.4242

Cross validated root mean squared log errors are: [0.192, 0.299, 0.465, 0.689, 0.593]
Average RMSLE using SVM to predict Deaths cases is: 0.4476


In [84]:
# Grabbing x/y train test sets for 'Deaths' cases
recovered_train_x = train.drop(train.columns[[-1,-2,-3,-4]], axis = 1)
recovered_train_y = train['log_Recovered']

recovered_test_x = test.drop(test.columns[[-1,-2,-3,-4]], axis = 1)
recovered_test_y = test['log_Recovered']

# Performing 5-fold cross validation on rolling basis for 'Recovered' SVR model
all_train = recovered_train_x.append(recovered_test_x).to_numpy()
all_values = recovered_train_y.append(recovered_test_y).to_numpy()

tscv = TimeSeriesSplit(n_splits=5)
rmse_scores = []
rmsle_scores = []

for train_index, test_index in tscv.split(all_train):
    x_train, x_test = all_train[train_index], all_train[test_index]
    y_train, y_test = all_values[train_index], all_values[test_index]
    
    svr = SVR(C=10, kernel='linear', epsilon=0.1, gamma='auto')
    svr.fit(x_train, y_train)
    predict = svr.predict(x_test)
    rmse_scores.append(round(mean_squared_error(y_test, predict, squared=False),3))
    rmsle_scores.append(round(mean_squared_log_error(y_test, predict, squared=False),3))

print('Now predicting Recovered Cases...')
print('Cross validated root mean squared errors are:', rmse_scores)
print('Average RMSE using SVM to predict Recovered cases is:', mean(rmse_scores))
print()
print('Cross validated root mean squared log errors are:', rmsle_scores)
print('Average RMSLE using SVM to predict Recovered cases is:', mean(rmsle_scores))

Now predicting Recovered Cases...
Cross validated root mean squared errors are: [0.743, 1.078, 1.58, 2.893, 2.433]
Average RMSE using SVM to predict Recovered cases is: 1.7454

Cross validated root mean squared log errors are: [0.279, 0.424, 0.519, 0.778, 0.631]
Average RMSLE using SVM to predict Recovered cases is: 0.5262


In [91]:
# Grabbing x/y train test sets for 'Deaths' cases
active_train_x = train.drop(train.columns[[-1,-2,-3,-4]], axis = 1)
active_train_y = train['log_Active']

active_test_x = test.drop(test.columns[[-1,-2,-3,-4]], axis = 1)
active_test_y = test['log_Active']

# svr_active = SVR(C=10, kernel='rbf', epsilon=0.1, gamma='auto')
# svr_active.fit(active_train_x, active_train_y)

# active_predict = svr_active.predict(active_test_x)
# print('Now predicting Active Cases...')
# print('Root mean squared error using SVM =', mean_squared_error(active_test_y, active_predict, squared=False))
# print('Root mean squared log error using SVM =', mean_squared_log_error(active_test_y, active_predict, squared=False))



# Performing 5-fold cross validation on rolling basis for 'Active' SVR model
all_train = active_train_x.append(active_test_x).to_numpy()
all_values = active_train_y.append(active_test_y).to_numpy()

tscv = TimeSeriesSplit(n_splits=5)
rmse_scores = []

for train_index, test_index in tscv.split(all_train):
    x_train, x_test = all_train[train_index], all_train[test_index]
    y_train, y_test = all_values[train_index], all_values[test_index]
    
    svr = SVR(C=10, kernel='linear', epsilon=0.1, gamma='auto')
    svr.fit(x_train, y_train)
    predict = svr.predict(x_test)
    rmse_scores.append(round(mean_squared_error(y_test, predict, squared=False),3))

print('Now predicting Active Cases...')
print('Cross validated root mean squared errors are:', rmse_scores)
print('Average RMSE using SVM to predict Active cases is:', round(mean(rmse_scores),4))

Now predicting Active Cases...
Cross validated root mean squared errors are: [0.956, 1.522, 2.57, 2.332, 2.354]
Average RMSE using SVM to predict Active cases is: 1.9468


In [94]:
# Performing prediction with mean squared log error, was getting a negative value for some reason in previous code block
active_train_x = train.drop(train.columns[[-1,-2,-3,-4]], axis = 1)
active_train_y = train['log_Active']

active_test_x = test.drop(test.columns[[-1,-2,-3,-4]], axis = 1)
active_test_y = test['log_Active']

svr_active = SVR(C=10, kernel='linear', epsilon=0.1, gamma='auto')
svr_active.fit(active_train_x, active_train_y)

active_predict = svr_active.predict(active_test_x)
print('Root mean squared log error using SVM for Active Cases =', round(mean_squared_log_error(active_test_y, active_predict, squared=False),3))

Root mean squared log error using SVM for Active Cases = 0.623
