In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.metrics import r2_score

In [2]:
data = pd.read_csv("covid19_italy.csv")

In [3]:
print(data.head())

   SNo                 Date Country  RegionCode RegionName  ProvinceCode  \
0    0  2020-02-24 18:00:00     ITA          13    Abruzzo            69   
1    1  2020-02-24 18:00:00     ITA          13    Abruzzo            66   
2    2  2020-02-24 18:00:00     ITA          13    Abruzzo            68   
3    3  2020-02-24 18:00:00     ITA          13    Abruzzo            67   
4    4  2020-02-24 18:00:00     ITA          13    Abruzzo           979   

                           ProvinceName ProvinceAbbreviation   Latitude  \
0                                Chieti                   CH  42.351032   
1                              L'Aquila                   AQ  42.351222   
2                               Pescara                   PE  42.464584   
3                                Teramo                   TE  42.658918   
4  In fase di definizione/aggiornamento                  NaN   0.000000   

   Longitude  TotalPositiveCases  
0  14.167546                   0  
1  13.398438          

In [4]:
data['Date'].str.replace("-","")

0       20200224 18:00:00
1       20200224 18:00:00
2       20200224 18:00:00
3       20200224 18:00:00
4       20200224 18:00:00
5       20200224 18:00:00
6       20200224 18:00:00
7       20200224 18:00:00
8       20200224 18:00:00
9       20200224 18:00:00
10      20200224 18:00:00
11      20200224 18:00:00
12      20200224 18:00:00
13      20200224 18:00:00
14      20200224 18:00:00
15      20200224 18:00:00
16      20200224 18:00:00
17      20200224 18:00:00
18      20200224 18:00:00
19      20200224 18:00:00
20      20200224 18:00:00
21      20200224 18:00:00
22      20200224 18:00:00
23      20200224 18:00:00
24      20200224 18:00:00
25      20200224 18:00:00
26      20200224 18:00:00
27      20200224 18:00:00
28      20200224 18:00:00
29      20200224 18:00:00
              ...        
3042    20200318 17:00:00
3043    20200318 17:00:00
3044    20200318 17:00:00
3045    20200318 17:00:00
3046    20200318 17:00:00
3047    20200318 17:00:00
3048    20200318 17:00:00
3049    2020

In [5]:
data['Date'] = data['Date'].astype('datetime64[ns]') 

In [6]:
date = data['Date'].dt.strftime("%Y%m%d").astype(int)

In [7]:
date = pd.DataFrame(date.values,columns = ["date"])

In [8]:
print(date.head())

       date
0  20200224
1  20200224
2  20200224
3  20200224
4  20200224


In [9]:
region = pd.DataFrame(data['RegionCode'].values,columns = ["RegionCode"])

In [10]:
print(region.head())

   RegionCode
0          13
1          13
2          13
3          13
4          13


In [11]:
province= data['ProvinceCode']

In [12]:
print(province.head())

0     69
1     66
2     68
3     67
4    979
Name: ProvinceCode, dtype: int64


In [13]:
positive_cases = data['TotalPositiveCases']

In [14]:
print(positive_cases.head())

0    0
1    0
2    0
3    0
4    0
Name: TotalPositiveCases, dtype: int64


In [15]:
result = pd.concat([date,province,region],axis=1) 

In [16]:
print(result)

          date  ProvinceCode  RegionCode
0     20200224            69          13
1     20200224            66          13
2     20200224            68          13
3     20200224            67          13
4     20200224           979          13
5     20200224            77          17
6     20200224            76          17
7     20200224           980          17
8     20200224            21           4
9     20200224           981           4
10    20200224            79          18
11    20200224            78          18
12    20200224           101          18
13    20200224            80          18
14    20200224           102          18
15    20200224           982          18
16    20200224            64          15
17    20200224            62          15
18    20200224            61          15
19    20200224            63          15
20    20200224            65          15
21    20200224           983          15
22    20200224            37           8
23    20200224  

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
x_train, x_test,y_train,y_test = train_test_split(result,positive_cases,test_size=0.33, random_state=0)

In [19]:
from sklearn.linear_model import LinearRegression

In [20]:
l_reg = LinearRegression()

In [21]:
l_reg.fit(x_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [22]:
l_pre = l_reg.predict(x_test)

In [23]:
print(r2_score(y_test,l_pre))

0.0636491244506261


In [24]:
from sklearn.ensemble import RandomForestRegressor

In [25]:
r_reg = RandomForestRegressor(10,random_state=10)

In [26]:
r_reg.fit(x_train,y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=10, verbose=0,
                      warm_start=False)

In [27]:
p_pred = r_reg.predict(x_test)

In [28]:
print(r2_score(y_test,p_pred))

0.9473474830057714


In [29]:
from sklearn.tree import DecisionTreeRegressor

In [30]:
d_reg = DecisionTreeRegressor(random_state=0)

In [31]:
d_reg.fit(x_train,y_train)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=0, splitter='best')

In [32]:
d_pred = d_reg.predict(x_test)

In [33]:
print(r2_score(y_test,d_pred))

0.9555733315909787
