In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn import svm

## Tiền xử lý và gộp dữ liệu từ ngày 13/04/2021 đến ngày 19/05/2021 (danh sách ngày bên dưới không có ngày 11/05/2021)

In [2]:
def pretreatment(df, acceptable_limit):
    """Remove unused columns and sparse rows from ``df``, editing it in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It must contain the ``'Bla'`` and ``'Position'``
        columns; both are removed.
    acceptable_limit : number
        Largest count of missing values a row may have and still be kept.
        Rows with strictly more missing values are removed.

    Returns
    -------
    None
        Nothing is returned; the caller's frame itself is modified.
    """
    for unused_col in ('Bla', 'Position'):
        df.drop(columns=unused_col, inplace=True)

    # Missing values are counted per row only after the two columns are gone.
    missing_per_row = df.isna().sum(axis=1)
    too_sparse = df.index[missing_per_row > acceptable_limit]
    df.drop(index=too_sparse, inplace=True)

In [3]:
def data(d, acceptable_limit=2):
    """Load and clean the snapshot stored in ``<d>.csv``.

    Steps, in order: read the tab-separated file, remove every comma from the
    string cells, run ``pretreatment`` (drops the ``'Bla'`` and ``'Position'``
    columns and rows with too many missing values), convert the count columns
    to numeric dtype, replace the remaining missing values with 0 and tag
    every row with ``d`` in a new ``'Day'`` column.

    Parameters
    ----------
    d : str
        File name without the ``.csv`` extension; it is also the value written
        to the ``'Day'`` column.
    acceptable_limit : number, default 2
        Forwarded to ``pretreatment``: rows with more missing values than this
        are dropped.

    Returns
    -------
    pandas.DataFrame
        The cleaned snapshot.
    """
    df = pd.read_csv(d + '.csv', sep='\t', index_col=False)

    # Remove commas at frame level and keep the returned frame. Calling
    # Series.replace(..., inplace=True) inside DataFrame.apply only changes the
    # frame when apply happens to hand out views, and is a silent no-op under
    # pandas Copy-on-Write.
    # NOTE(review): the commas are presumably thousands separators; this also
    # strips commas from any text column, as the original code did.
    df = df.replace(',', '', regex=True)

    pretreatment(df, acceptable_limit)

    # Assumes every column listed here exists in the file - TODO confirm.
    num_col = ['Total_Cases', 'New_Cases', 'Total_Deaths', 'New_Deaths',
               'Total_Recovered', 'New_Recovered', 'Active_Cases', 'Serious_Critical',
               'Tot_Cases/1M_pop', 'Deaths/1M_pop', 'Total_Tests', 'Tests/1M_pop', 'Population']
    for col in num_col:
        df[col] = pd.to_numeric(df[col])

    df = df.fillna(0)
    df['Day'] = d
    return df

In [4]:
# Snapshot dates as DD-MM-YYYY strings, in chronological order, covering
# 13-04-2021 through 19-05-2021.
# NOTE(review): 11-05-2021 is skipped - presumably no file exists for that
# day; confirm before adding it.
days = [
    stamp.strftime('%d-%m-%Y')
    for stamp in pd.date_range(start='2021-04-13', end='2021-05-19', freq='D')
    if stamp.strftime('%d-%m-%Y') != '11-05-2021'
]

In [5]:
# Load every daily snapshot and concatenate once. Calling pd.concat inside the
# loop copies the accumulated frame again on each iteration.
# ignore_index=True gives the combined frame a unique 0..n-1 index instead of
# repeating each file's row labels.
df = pd.concat([data(d) for d in days], ignore_index=True)

# Persist the combined data, then read it back so the cell displays what was
# actually written to disk.
df.to_csv('model_data.csv', index=False)
pd.read_csv('model_data.csv')

Unnamed: 0,Country,Total_Cases,New_Cases,Total_Deaths,New_Deaths,Total_Recovered,New_Recovered,Active_Cases,Serious_Critical,Tot_Cases/1M_pop,Deaths/1M_pop,Total_Tests,Tests/1M_pop,Population,Region,Day
0,China,90435,9.0,4636,0.0,85506,11.0,293,4.0,63,3.0,160000000,111163,1439323776,Asia,13-04-2021
1,USA,32070784,77720.0,577177,819.0,24626410,65554.0,6867197,9540.0,96446,1736.0,422126761,1269452,332526757,North America,13-04-2021
2,India,13871321,185248.0,172115,1026.0,12332688,82248.0,1366518,8944.0,9975,124.0,259207108,186394,1390642113,Asia,13-04-2021
3,Brazil,13601566,80157.0,358718,3687.0,12074798,117730.0,1168050,8318.0,63634,1678.0,28600000,133804,213745314,South America,13-04-2021
4,France,5106329,39113.0,99480,345.0,3972182,25998.0,1034667,5952.0,78094,1521.0,70448755,1077411,65387072,Europe,13-04-2021
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5715,St. Vincent Grenadines,1949,2.0,12,0.0,1780,6.0,157,2.0,17519,108.0,47584,427718,111251,North America,19-05-2021
5716,Laos,1737,50.0,2,0.0,708,22.0,1027,0.0,236,0.3,227269,30844,7368283,Asia,19-05-2021
5717,Caribbean Netherlands,1605,1.0,17,0.0,1570,1.0,18,0.0,60710,643.0,8550,323410,26437,North America,19-05-2021
5718,Antigua and Barbuda,1252,1.0,42,0.0,1184,2.0,26,2.0,12692,426.0,16700,169297,98643,North America,19-05-2021


## Tách tập train, test

In [6]:
# Features are every column except the target; the target is 'New_Cases'.
X = df.drop(columns='New_Cases')
y = df['New_Cases']

In [7]:
# Hold out 20% of the rows as the test set; the seed is fixed so the split is repeatable.
train_X, test_X, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [8]:
# Re-attach the target to a copy of the training features so both can be saved in one file.
train_data = train_X.assign(New_Cases=train_y)

In [9]:
# Write each split to its own CSV file, without the row index.
for csv_name, split in {
    'train_data.csv': train_data,
    'test_X.csv': test_X,
    'test_y.csv': test_y,
}.items():
    split.to_csv(csv_name, index=False)

In [10]:
# Reload the saved training split (features plus the 'New_Cases' target) from disk.
train_df = pd.read_csv('train_data.csv')

## Tách tập train, validation

In [11]:
# Separate features and target again, this time from the reloaded training data only.
X = train_df.drop(columns='New_Cases')
y = train_df['New_Cases']

In [12]:
# Set aside 20% of the training data as the validation set (fixed seed).
train_X, val_X, train_y, val_y = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

## Preprocessing
- Bỏ các cột `Country`, `Region`, `Day`
- Class `ColAdderDropper` ở dưới sẽ thực hiện xóa các cột đó

In [13]:
class ColAdderDropper(TransformerMixin, BaseEstimator):
    """Stateless transformer that removes the configured columns from a DataFrame.

    Parameters
    ----------
    dropcols : label or list of labels, optional
        Columns to remove in ``transform``. ``None`` (the default) removes
        nothing, so a default-constructed instance passes data through.
    """

    def __init__(self, dropcols=None):
        # Stored unmodified; the None case is handled in transform.
        self.dropcols = dropcols

    def fit(self, X_df, y=None):
        """Learn nothing and return ``self``; both arguments are ignored."""
        return self

    def transform(self, X_df, y=None):
        """Return a copy of ``X_df`` without ``dropcols``; the input is not modified.

        ``y`` is ignored.
        """
        if self.dropcols is None:
            # DataFrame.drop(None, axis=1) raises, so return an untouched copy.
            return X_df.copy()
        return X_df.drop(columns=self.dropcols)

## Modeling
- Sử dụng mô hình `Linear Regression` và `SVM` để dự đoán

### Linear Regression

In [14]:
# Preprocessing steps: standardise the features after the three listed columns are dropped.
scaler = StandardScaler()
eadc = ColAdderDropper(
    dropcols=['Country', 'Region', 'Day'],
)

In [15]:
# Chain the steps: drop columns -> standardise -> ordinary linear regression.
linear = LinearRegression()
linear_pipe = make_pipeline(
    eadc,
    scaler,
    linear,
)

In [16]:
# Fit on the training split, then print the pipeline's score on the training
# and validation splits.
linear_pipe.fit(train_X, train_y)
for split_name, split_X, split_y in (
    ('Train', train_X, train_y),
    ('Validation', val_X, val_y),
):
    print(split_name + ' score:', linear_pipe.score(split_X, split_y))

Train score: 0.917399159577572
Validation score: 0.9364471181538294


### SVM

In [17]:
# Support-vector regressor; its kernel and C are set by the search loop below.
svr = svm.SVR()

# Drop the configured columns, standardise, then fit the SVR.
full_pipeline = Pipeline(steps=[
    ('eadc', eadc),
    ('standard', StandardScaler()),
    ('svr', svr),
])

# One entry per (kernel, C) combination, in the order they are tried.
svr_train_scores = []
svr_val_scores = []
c_s = [0.2, 0.5, 0.8, 1.0]

# Best validation score seen so far and the settings that produced it.
best_val_score = float('-inf')
best_c_svr = best_k_svr = None

for k in ('linear', 'poly', 'rbf', 'sigmoid'):
    print(k)
    for c in c_s:
        print(c)
        full_pipeline.set_params(svr__kernel=k, svr__C=c)
        full_pipeline.fit(train_X, train_y)
        train_score = full_pipeline.score(train_X, train_y)
        val_score = full_pipeline.score(val_X, val_y)
        print(train_score, val_score)
        svr_train_scores += [train_score]
        svr_val_scores += [val_score]
        # Strict comparison: on a tie the earlier combination is kept.
        if val_score > best_val_score:
            best_val_score, best_c_svr, best_k_svr = val_score, c, k
        print()

linear
0.2
0.0475794270434825 0.04916193665755608

0.5
0.12942910860407386 0.13226748130479837

0.8
0.17879303064470375 0.18103309393135447

1.0
0.21603002018884054 0.21765580048773436

poly
0.2
0.7101362956588412 0.7505886319325619

0.5
0.7825041584562493 0.834675398582085

0.8
0.8064766102654795 0.8561577240956271

1.0
0.8198026637571022 0.8654004575021833

rbf
0.2
-0.024929708590829325 -0.026120204773745437

0.5
-0.02400751017688152 -0.02523797455383936

0.8
-0.022979511138755404 -0.024254948860073178

1.0
-0.022371283147185173 -0.02367264133738134

sigmoid
0.2
-0.02325969600619815 -0.02445242521910318

0.5
-0.01986428837873966 -0.021099424453555837

0.8
-0.01646367157892481 -0.017747186629987866

1.0
-0.01421364326780128 -0.015527205477055661



In [18]:
# Highest validation score recorded during the SVR kernel/C search.
best_val_score

0.8654004575021833

### Huấn luyện lại mô hình tốt nhất trên toàn bộ tập train + validation
- Mô hình `Linear Regression` cho điểm trên tập validation (≈ 0.936) cao hơn cấu hình `SVM` tốt nhất (kernel `poly`, C = 1.0, ≈ 0.865), vì vậy ta chọn `Linear Regression`.

In [19]:
# Rebuild the linear-regression pipeline from fresh, unfitted steps.
linreg = LinearRegression()
scaler = StandardScaler()
eadc = ColAdderDropper(dropcols=['Country', 'Region', 'Day'])

# NOTE(review): X and y are expected to be the features/target taken from the
# whole of train_data.csv (train + validation) - confirm no later cell has
# rebound them.
full_pipeline = make_pipeline(eadc, scaler, linreg).fit(X, y)
full_pipeline

Pipeline(steps=[('coladderdropper',
                 ColAdderDropper(dropcols=['Country', 'Region', 'Day'])),
                ('standardscaler', StandardScaler()),
                ('linearregression', LinearRegression())])

### Đánh giá mô hình

In [20]:
# Reload the held-out test split from disk.
test_X = pd.read_csv('test_X.csv')
# test_y.csv has a single column; squeeze it to a 1-D Series so the test target
# has the same shape as the targets used for fitting.
test_y = pd.read_csv('test_y.csv').squeeze('columns')

In [21]:
# Score of the refitted pipeline on the held-out test split.
full_pipeline.score(test_X, test_y)

0.9148098882375496