# Importing necessary packages

In [51]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
import pickle
import json

%matplotlib inline

# Importing the dataset and checking for missing values

In [52]:
# Load the raw dataset. `df` is left as None when the CSV is absent so the
# notebook state makes the failed load explicit.
df = None

try:
    df = pd.read_csv("dataset/data.csv")
except FileNotFoundError:
    # Only a genuinely missing file is reported here; any other failure
    # (malformed CSV, permission problems, ...) propagates with its real
    # traceback instead of being mislabelled as "not found".
    print("File not found.")
else:
    print("File found and imported successfully!")

File found and imported successfully!


# Data preprocessing

In [53]:
# Number of missing entries in each column.
df.isna().sum()

location                        0
date                            0
total_cases_per_million      3615
total_deaths_per_million    13205
dtype: int64

In [54]:
# Substitute 0 for every missing value in the frame.
df = df.fillna(value=0)

In [55]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df.describe()

Unnamed: 0,total_cases_per_million,total_deaths_per_million
count,59091.0,59091.0
mean,3275.143496,78.471887
std,6727.495436,171.056173
min,0.0,0.0
25%,56.3145,0.153
50%,511.035,7.961
75%,3423.5845,59.16
max,85549.731,1401.513


In [56]:
# Split the date string into numeric year / month / day features.
# The slices assume a fixed-width layout: day at [0:2], month at [3:5],
# year from index 6 onwards.
# The parts are cast to int so the feature matrix is genuinely numeric
# instead of relying on implicit string -> float coercion at fit time.
df['year'] = df.date.str.slice(start=6).astype(int)
df['month'] = df.date.str.slice(start=3, stop=5).astype(int)
df['day'] = df.date.str.slice(start=0, stop=2).astype(int)

# One-hot encode the location, then drop the raw columns that have been
# replaced by the derived features above.
df = pd.concat([df, pd.get_dummies(df.location)], axis = 'columns')
df = df.drop(['date', 'location'], axis = 'columns')

# Creating target and input variables

In [57]:
# Feature matrix: every column except the two regression targets.
target_columns = ['total_cases_per_million', 'total_deaths_per_million']
X = df.drop(columns=target_columns)
X.head()

Unnamed: 0,year,month,day,Afghanistan,Albania,Algeria,Andorra,Angola,Anguilla,Antigua and Barbuda,...,Uzbekistan,Vanuatu,Vatican,Venezuela,Vietnam,Wallis and Futuna,Western Sahara,Yemen,Zambia,Zimbabwe
0,2019,12,31,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2020,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2020,1,2,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2020,1,3,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2020,1,4,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [58]:
# Save the ordered list of feature column names as JSON.
data = {'columns': list(X.columns)}
serialized_columns = json.dumps(data)
with open('website/static/data.json', 'w') as f:
    f.write(serialized_columns)

# TOTAL CASES MODEL

In [59]:
# Target column: total_cases_per_million.
y = df['total_cases_per_million']

# Hold out 20% of the rows for evaluation; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=9,
)

# The model is fitted on the underlying array, i.e. without feature names.
lr = LinearRegression()
lr.fit(X_train.values, y_train)

In [60]:
# Model Evaluation
# The model was fitted on a plain array (`X_train.values`), so predict on the
# same representation; passing the DataFrame here makes scikit-learn warn
# about feature names the estimator never saw.
y_predict = lr.predict(X_test.values)

# Compute the MSE once and derive the RMSE from it.
mse = mean_squared_error(y_test, y_predict)
print("Mean Absolute Error : ", mean_absolute_error(y_test, y_predict))
print("Mean Squared Error : ", mse)
print("Root Mean Squared Error : ", np.sqrt(mse))
print("R2 Score : ", r2_score(y_test, y_predict))

Mean Absolute Error :  2840.7575532949836
Mean Squared Error :  19341347.834434286
Root Mean Squared Error :  4397.879924967743
R2 Score :  0.5626370765339409




In [61]:
# Persist the fitted total-cases regressor as a pickle file.
with open("website/ml_models/total_cases_model.pickle", "wb") as model_file:
    pickle.dump(lr, model_file)

# TOTAL DEATHS MODEL

In [62]:
# Target column: total_deaths_per_million.
y = df['total_deaths_per_million']

# Hold out 20% of the rows for evaluation; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=9,
)

# The model is fitted on the underlying array, i.e. without feature names.
lr = LinearRegression()
lr.fit(X_train.values, y_train)

In [63]:
# Model Evaluation
# The model was fitted on a plain array (`X_train.values`), so predict on the
# same representation; passing the DataFrame here makes scikit-learn warn
# about feature names the estimator never saw.
y_predict = lr.predict(X_test.values)

# Compute the MSE once and derive the RMSE from it.
mse = mean_squared_error(y_test, y_predict)
print("Mean Absolute Error : ", mean_absolute_error(y_test, y_predict))
print("Mean Squared Error : ", mse)
print("Root Mean Squared Error : ", np.sqrt(mse))
print("R2 Score : ", r2_score(y_test, y_predict))

Mean Absolute Error :  60.52659885702042
Mean Squared Error :  9132.45640538318
Root Mean Squared Error :  95.56388651254814
R2 Score :  0.6798891421782824




In [64]:
# Persist the fitted total-deaths regressor as a pickle file.
with open("website/ml_models/total_deaths_model.pickle", "wb") as model_file:
    pickle.dump(lr, model_file)