# IMPORTING NECESSARY PACKAGES

In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

In [30]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [31]:
import pickle
import json

# READING CSV FILE

In [32]:
# Read the COVID dataset from the working directory and preview the first rows.
df = pd.read_csv(filepath_or_buffer="covid-data.csv")
df.head(n=5)

Unnamed: 0,continent,location,total_cases,new_cases,total_deaths,new_deaths,population,date
0,North America,Aruba,93.0,1,1.0,1,106766,1586995200
1,North America,Aruba,95.0,2,1.0,0,106766,1587081600
2,North America,Aruba,96.0,1,2.0,1,106766,1587168000
3,North America,Aruba,96.0,0,2.0,0,106766,1587254400
4,North America,Aruba,97.0,1,2.0,0,106766,1587340800


In [33]:
# Group rows by (continent, location, population). The group keys are read
# again further down when the location lookup is written to data.json.
data = df.groupby(by=['continent', 'location', 'population'])
# Show the first non-null value of every remaining column for each group.
data.first()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,total_cases,new_cases,total_deaths,new_deaths,date
continent,location,population,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Africa,Algeria,43851043,25.0,5,1.0,1,1584057600
Africa,Angola,32866268,7.0,3,2.0,2,1585526400
Africa,Benin,12123198,26.0,4,1.0,1,1586217600
Africa,Botswana,2351625,4.0,1,1.0,1,1585785600
Africa,Burkina Faso,20903278,26.0,6,1.0,1,1584576000
...,...,...,...,...,...,...,...
South America,Paraguay,7132530,5.0,4,1.0,0,1583884800
South America,Peru,32971846,7.0,6,2.0,0,1583712000
South America,Suriname,586634,10.0,0,1.0,1,1585958400
South America,Uruguay,3473727,295.0,36,1.0,1,1585526400


# DATA PREPROCESSING

In [34]:
# Count missing values per column (isna is the canonical spelling of isnull).
df.isna().sum()

continent          0
location           0
total_cases        1
new_cases          0
total_deaths    1889
new_deaths         0
population         0
date               0
dtype: int64

In [35]:
# Discard every row that has at least one missing value.
df = df.dropna(axis='index', how='any')

In [36]:
# One-hot encode the location names and append the indicator columns to the frame.
df = pd.concat(objs=[df, pd.get_dummies(df['location'])], axis=1)
df.head(n=5)

Unnamed: 0,continent,location,total_cases,new_cases,total_deaths,new_deaths,population,date,Afghanistan,Albania,...,United States,United States Virgin Islands,Uruguay,Uzbekistan,Venezuela,Vietnam,Western Sahara,Yemen,Zambia,Zimbabwe
0,North America,Aruba,93.0,1,1.0,1,106766,1586995200,0,0,...,0,0,0,0,0,0,0,0,0,0
1,North America,Aruba,95.0,2,1.0,0,106766,1587081600,0,0,...,0,0,0,0,0,0,0,0,0,0
2,North America,Aruba,96.0,1,2.0,1,106766,1587168000,0,0,...,0,0,0,0,0,0,0,0,0,0
3,North America,Aruba,96.0,0,2.0,0,106766,1587254400,0,0,...,0,0,0,0,0,0,0,0,0,0
4,North America,Aruba,97.0,1,2.0,0,106766,1587340800,0,0,...,0,0,0,0,0,0,0,0,0,0


In [37]:
# One-hot encode the continent names and append the indicator columns to the frame.
df = pd.concat(objs=[df, pd.get_dummies(df['continent'])], axis=1)
df.head(n=5)

Unnamed: 0,continent,location,total_cases,new_cases,total_deaths,new_deaths,population,date,Afghanistan,Albania,...,Western Sahara,Yemen,Zambia,Zimbabwe,Africa,Asia,Europe,North America,Oceania,South America
0,North America,Aruba,93.0,1,1.0,1,106766,1586995200,0,0,...,0,0,0,0,0,0,0,1,0,0
1,North America,Aruba,95.0,2,1.0,0,106766,1587081600,0,0,...,0,0,0,0,0,0,0,1,0,0
2,North America,Aruba,96.0,1,2.0,1,106766,1587168000,0,0,...,0,0,0,0,0,0,0,1,0,0
3,North America,Aruba,96.0,0,2.0,0,106766,1587254400,0,0,...,0,0,0,0,0,0,0,1,0,0
4,North America,Aruba,97.0,1,2.0,0,106766,1587340800,0,0,...,0,0,0,0,0,0,0,1,0,0


In [38]:
# The raw text columns are now represented by their indicator columns, so remove them.
df = df.drop(columns=['continent', 'location'])
df.head(n=5)

Unnamed: 0,total_cases,new_cases,total_deaths,new_deaths,population,date,Afghanistan,Albania,Algeria,Andorra,...,Western Sahara,Yemen,Zambia,Zimbabwe,Africa,Asia,Europe,North America,Oceania,South America
0,93.0,1,1.0,1,106766,1586995200,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,95.0,2,1.0,0,106766,1587081600,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,96.0,1,2.0,1,106766,1587168000,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,96.0,0,2.0,0,106766,1587254400,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,97.0,1,2.0,0,106766,1587340800,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [39]:
# Feature matrix: everything except the four columns that are used as targets below.
X = df.drop(columns=['total_cases', 'new_cases', 'total_deaths', 'new_deaths'])
X.head(n=5)

Unnamed: 0,population,date,Afghanistan,Albania,Algeria,Andorra,Angola,Antigua and Barbuda,Argentina,Armenia,...,Western Sahara,Yemen,Zambia,Zimbabwe,Africa,Asia,Europe,North America,Oceania,South America
0,106766,1586995200,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,106766,1587081600,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,106766,1587168000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,106766,1587254400,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,106766,1587340800,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [40]:
# Write data.json with two entries:
#   'columns' - the column labels of X, in order
#   'data'    - {continent: {location: population}} built from the groupby keys
# NOTE: this cell rebinds `data` from the GroupBy object to a plain dict, so it
# can only be re-run after the groupby cell has been run again.
keys = data.groups.keys()
data = {
    'columns': list(X.columns),
    'data': {}
}
for (continent, country, population) in keys:
    # int() guards against numpy integer scalars, which json cannot serialise.
    data['data'].setdefault(continent, {})[country] = int(population)
with open('data.json', 'w') as f:
    json.dump(data, f)

# NEW CASES MODEL

In [41]:
# Regression target for this section: the new_cases column.
y = df['new_cases']
y.head(n=5)

0    1
1    2
2    1
3    0
4    1
Name: new_cases, dtype: int64

In [42]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=9,
)

In [43]:
# Fit a linear regression on the training rows (fit returns the estimator itself)
# and report its R^2 score on the held-out rows.
lr = LinearRegression().fit(X_train, y_train)
lr.score(X_test, y_test)

0.6230888343458343

In [44]:
# Serialise the fitted new-cases model to disk.
with open('new_cases_model.pickle', mode='wb') as f:
    pickle.dump(lr, f)

# TOTAL CASES MODEL

In [45]:
# Regression target for this section: the total_cases column.
y = df['total_cases']
y.head(n=5)

0    93.0
1    95.0
2    96.0
3    96.0
4    97.0
Name: total_cases, dtype: float64

In [46]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=9,
)

In [47]:
# Fit a linear regression on the training rows (fit returns the estimator itself)
# and report its R^2 score on the held-out rows.
lr = LinearRegression().fit(X_train, y_train)
lr.score(X_test, y_test)

0.5836633948737515

In [48]:
# Serialise the fitted total-cases model to disk.
with open('total_cases_model.pickle', mode='wb') as f:
    pickle.dump(lr, f)

# NEW DEATHS MODEL

In [49]:
# Regression target for this section: the new_deaths column.
y = df['new_deaths']
y.head(n=5)

0    1
1    0
2    1
3    0
4    0
Name: new_deaths, dtype: int64

In [50]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=9,
)

In [51]:
# Fit a linear regression on the training rows (fit returns the estimator itself)
# and report its R^2 score on the held-out rows.
lr = LinearRegression().fit(X_train, y_train)
lr.score(X_test, y_test)

0.5881902433864177

In [52]:
# Serialise the fitted new-deaths model to disk.
with open('new_deaths_model.pickle', mode='wb') as f:
    pickle.dump(lr, f)

# TOTAL DEATHS MODEL

In [53]:
# Regression target for this section: the total_deaths column.
y = df['total_deaths']
y.head(n=5)

0    1.0
1    1.0
2    2.0
3    2.0
4    2.0
Name: total_deaths, dtype: float64

In [54]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=9,
)

In [55]:
# Fit a linear regression on the training rows (fit returns the estimator itself)
# and report its R^2 score on the held-out rows.
lr = LinearRegression().fit(X_train, y_train)
lr.score(X_test, y_test)

0.706590607968922

In [56]:
# Serialise the fitted total-deaths model to disk.
with open('total_deaths_model.pickle', mode='wb') as f:
    pickle.dump(lr, f)