In [1]:
import pandas as pd
import datetime

import numpy as np
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error
import datetime
import operator

import warnings
warnings.filterwarnings("ignore")

In [2]:
url='https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
covid_data_RAW = pd.read_csv(url)

In [3]:
covid_data = covid_data_RAW.rename(
        columns = {'Province/State':'subregion'
                   ,'Country/Region':'country'
                   ,'Lat':'lat'
                   ,'Long':'long'
                   }
        )

In [4]:
covid_data = (covid_data.melt(id_vars = ['country','subregion','lat','long']
                 ,var_name = 'date_RAW'
                 ,value_name = 'confirmed'
                 )
)

In [5]:
(covid_data
    .filter(['date_RAW'])
)

Unnamed: 0,date_RAW
0,1/22/20
1,1/22/20
2,1/22/20
3,1/22/20
4,1/22/20
...,...
310092,12/29/22
310093,12/29/22
310094,12/29/22
310095,12/29/22


In [6]:
(covid_data
    .assign(date = pd.to_datetime(covid_data.date_RAW, format='%m/%d/%y'))
    .filter(['date','date_RAW','confirmed'])
    .groupby(['date','date_RAW'])
    .agg('sum')
    .sort_values('date')
)

Unnamed: 0_level_0,Unnamed: 1_level_0,confirmed
date,date_RAW,Unnamed: 2_level_1
2020-01-22,1/22/20,557
2020-01-23,1/23/20,657
2020-01-24,1/24/20,944
2020-01-25,1/25/20,1437
2020-01-26,1/26/20,2120
...,...,...
2022-12-25,12/25/22,657127276
2022-12-26,12/26/22,657640732
2022-12-27,12/27/22,658286056
2022-12-28,12/28/22,658908677


In [7]:
covid_data = covid_data.assign(
         date = pd.to_datetime(covid_data.date_RAW, format='%m/%d/%y')
         )

In [8]:
covid_data = (covid_data
               .filter(['country', 'subregion', 'date', 'lat', 'long', 'confirmed'])
               .sort_values(['country','subregion','lat','long','date'])
               )


In [9]:
covid_data.set_index('country', inplace = True)

In [10]:
pd.set_option('display.max_rows', 155)

In [11]:
(covid_data
    .reset_index()
    .filter(['country'])
    .drop_duplicates()
    .head(n = 200)
)

Unnamed: 0,country
0,Afghanistan
1073,Albania
2146,Algeria
3219,Andorra
4292,Angola
...,...
303659,Vietnam
304732,West Bank and Gaza
305805,Winter Olympics 2022
306878,Yemen


In [12]:
pd.reset_option('display.max_rows')

In [13]:
covid_data.loc['Indonesia']

Unnamed: 0_level_0,subregion,date,lat,long,confirmed
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Indonesia,,2020-01-22,-0.7893,113.9213,0
Indonesia,,2020-01-23,-0.7893,113.9213,0
Indonesia,,2020-01-24,-0.7893,113.9213,0
Indonesia,,2020-01-25,-0.7893,113.9213,0
Indonesia,,2020-01-26,-0.7893,113.9213,0
...,...,...,...,...,...
Indonesia,,2022-12-25,-0.7893,113.9213,6716124
Indonesia,,2022-12-26,-0.7893,113.9213,6716592
Indonesia,,2022-12-27,-0.7893,113.9213,6717395
Indonesia,,2022-12-28,-0.7893,113.9213,6718090


In [14]:
def covid_rename_columns(input_data):
    output_data = input_data.rename(
                              columns = {'Province/State':'subregion'
                                         ,'Country/Region':'country'
                                         ,'Lat':'lat'
                                         ,'Long':'long'
                                         }
                              )
    return(output_data)

In [15]:
def covid_fill_missing(input_data):
    output_data = input_data.fillna(value = {'subregion':''})
    return(output_data)

In [16]:
def covid_melt_data(input_data, value_var_name):
    output_data = input_data.melt(id_vars = ['country','subregion','lat','long']
                                  ,var_name = 'date_RAW'
                                  ,value_name = value_var_name
                                  )
    return(output_data)

In [17]:
def covid_convert_dates(input_data):
    output_data = input_data.assign(
         date = pd.to_datetime(input_data.date_RAW, format='%m/%d/%y')
         )
    output_data.drop(columns = ['date_RAW'], inplace = True)
    return(output_data)

In [18]:
def covid_rearrange_data(input_data,value_var_name):
    output_data = (input_data
                   .filter(['country', 'subregion', 'date', 'lat', 'long', value_var_name])
                   .sort_values(['country','subregion','date','lat','long'])
                   .reset_index(drop = True)
                   )
    return(output_data)

In [19]:
def covid_get_data(input_url, value_var_name):
    covid_data_inprocess = pd.read_csv(input_url)
    covid_data_inprocess = covid_rename_columns(covid_data_inprocess)
    covid_data_inprocess = covid_fill_missing(covid_data_inprocess)
    covid_data_inprocess = covid_melt_data(covid_data_inprocess,value_var_name)
    covid_data_inprocess = covid_convert_dates(covid_data_inprocess)
    covid_data_inprocess = covid_rearrange_data(covid_data_inprocess, value_var_name)
    return(covid_data_inprocess)

In [20]:
url_confirmed = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
url_deaths = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv'
url_recovered = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv'


covid_confirmed = covid_get_data(url_confirmed,'confirmed')
covid_deaths = covid_get_data(url_deaths,'dead')
covid_recovered = covid_get_data(url_recovered,'recovered')

In [21]:
len(covid_confirmed) # 17204
len(covid_deaths)    # 17204
len(covid_recovered) # 16252

294002

In [22]:
covid_deaths.drop(columns = ['lat','long'], inplace = True)
covid_recovered.drop(columns = ['lat','long'], inplace = True)

In [23]:
covid_data = (covid_confirmed
                .merge(covid_deaths, on = ['country','subregion','date'], how = 'left')
                .merge(covid_recovered, on = ['country','subregion','date'], how = 'left')
             )

In [24]:
print(covid_data)

            country subregion       date        lat       long  confirmed  \
0       Afghanistan           2020-01-22  33.939110  67.709953          0   
1       Afghanistan           2020-01-23  33.939110  67.709953          0   
2       Afghanistan           2020-01-24  33.939110  67.709953          0   
3       Afghanistan           2020-01-25  33.939110  67.709953          0   
4       Afghanistan           2020-01-26  33.939110  67.709953          0   
...             ...       ...        ...        ...        ...        ...   
310092     Zimbabwe           2022-12-25 -19.015438  29.154857     259981   
310093     Zimbabwe           2022-12-26 -19.015438  29.154857     259981   
310094     Zimbabwe           2022-12-27 -19.015438  29.154857     259981   
310095     Zimbabwe           2022-12-28 -19.015438  29.154857     259981   
310096     Zimbabwe           2022-12-29 -19.015438  29.154857     259981   

        dead  recovered  
0          0        0.0  
1          0        0.0

In [25]:
#making for specifically indonesia
covidDF=(covid_data.query('country=="Indonesia"').reset_index(drop=True))

In [26]:
covidDF=(covidDF.assign(new_cases=covidDF.confirmed.diff()))

In [27]:
covidDF.drop('lat', inplace=True, axis=1)
covidDF.drop('long', inplace=True, axis=1)
covidDF.drop('recovered', inplace=True, axis=1)
covidDF.drop('subregion', inplace=True, axis=1)

In [28]:
covidDF

Unnamed: 0,country,date,confirmed,dead,new_cases
0,Indonesia,2020-01-22,0,0,
1,Indonesia,2020-01-23,0,0,0.0
2,Indonesia,2020-01-24,0,0,0.0
3,Indonesia,2020-01-25,0,0,0.0
4,Indonesia,2020-01-26,0,0,0.0
...,...,...,...,...,...
1068,Indonesia,2022-12-25,6716124,160537,538.0
1069,Indonesia,2022-12-26,6716592,160551,468.0
1070,Indonesia,2022-12-27,6717395,160560,803.0
1071,Indonesia,2022-12-28,6718090,160574,695.0


In [29]:
covidDF["date"] = pd.to_datetime(covidDF["date"], format = "%Y-%m-%d")
covidDF

Unnamed: 0,country,date,confirmed,dead,new_cases
0,Indonesia,2020-01-22,0,0,
1,Indonesia,2020-01-23,0,0,0.0
2,Indonesia,2020-01-24,0,0,0.0
3,Indonesia,2020-01-25,0,0,0.0
4,Indonesia,2020-01-26,0,0,0.0
...,...,...,...,...,...
1068,Indonesia,2022-12-25,6716124,160537,538.0
1069,Indonesia,2022-12-26,6716592,160551,468.0
1070,Indonesia,2022-12-27,6717395,160560,803.0
1071,Indonesia,2022-12-28,6718090,160574,695.0


In [30]:
covidDF=covidDF.dropna()
covidDF

Unnamed: 0,country,date,confirmed,dead,new_cases
1,Indonesia,2020-01-23,0,0,0.0
2,Indonesia,2020-01-24,0,0,0.0
3,Indonesia,2020-01-25,0,0,0.0
4,Indonesia,2020-01-26,0,0,0.0
5,Indonesia,2020-01-27,0,0,0.0
...,...,...,...,...,...
1068,Indonesia,2022-12-25,6716124,160537,538.0
1069,Indonesia,2022-12-26,6716592,160551,468.0
1070,Indonesia,2022-12-27,6717395,160560,803.0
1071,Indonesia,2022-12-28,6718090,160574,695.0


In [39]:
yTrain = covidDF.iloc[705:997, 4]
xTrain = covidDF.iloc[705:997, 1]

yTest = covidDF.iloc[997:1070, 4]
xTest = covidDF.iloc[997:1070, 1]

xTrain = xTrain.to_numpy()
xTest = xTest.to_numpy()

xTrain = xTrain.reshape(-1, 1)
xTest = xTest.reshape(-1, 1)




In [40]:
SVM = SVR(kernel ="rbf", gamma = 1.0, C = 0.1, epsilon = 0.01)
SVM.fit(xTrain,yTrain)

In [41]:
yPredict = SVM.predict(xTest)

In [42]:
yPredict

array([2164.5, 2164.5, 2164.5, 2164.5, 2164.5, 2164.5, 2164.5, 2164.5,
       2164.5, 2164.5, 2164.5, 2164.5, 2164.5, 2164.5, 2164.5, 2164.5,
       2164.5, 2164.5, 2164.5, 2164.5, 2164.5, 2164.5, 2164.5, 2164.5,
       2164.5, 2164.5, 2164.5, 2164.5, 2164.5, 2164.5, 2164.5, 2164.5,
       2164.5, 2164.5, 2164.5, 2164.5, 2164.5, 2164.5, 2164.5, 2164.5,
       2164.5, 2164.5, 2164.5, 2164.5, 2164.5, 2164.5, 2164.5, 2164.5,
       2164.5, 2164.5, 2164.5, 2164.5, 2164.5, 2164.5, 2164.5, 2164.5,
       2164.5, 2164.5, 2164.5, 2164.5, 2164.5, 2164.5, 2164.5, 2164.5,
       2164.5, 2164.5, 2164.5, 2164.5, 2164.5, 2164.5, 2164.5, 2164.5,
       2164.5])

In [44]:
mean_absolute_error(yTest, yPredict)

ValueError: Classification metrics can't handle a mix of multiclass and continuous targets

In [36]:
yTrain = covidDF.iloc[705:1070 , 2]
xTrain = covidDF.iloc[705:1070, 1]

yTest = covidDF.iloc[705:1070, 2]
xTest = covidDF.iloc[705:1070, 1]

xTrain = xTrain.to_numpy()
xTest = xTest.to_numpy()

xTrain = xTrain.reshape(-1, 1)
xTest = xTest.reshape(-1, 1)

In [37]:
SVM = SVR(kernel ="rbf", gamma = 1.0, C = 0.1, epsilon = 0.01)
SVM.fit(xTrain,yTrain)

yPredict = SVM.predict(xTest)

In [38]:
yPredict

array([6084062.9, 6084062.9, 6084062.9, 6084062.9, 6084062.9, 6084062.9,
       6084062.9, 6084062.9, 6084062.9, 6084062.9, 6084062.9, 6084062.9,
       6084062.9, 6084062.9, 6084062.9, 6084062.9, 6084062.9, 6084062.9,
       6084062.9, 6084062.9, 6084062.9, 6084062.9, 6084062.9, 6084062.9,
       6084062.9, 6084062.9, 6084062.9, 6084062.9, 6084062.9, 6084062.9,
       6084062.9, 6084062.9, 6084062.9, 6084062.9, 6084062.9, 6084062.9,
       6084062.9, 6084062.9, 6084062.9, 6084062.9, 6084062.9, 6084062.9,
       6084062.9, 6084062.9, 6084062.9, 6084062.9, 6084062.9, 6084062.9,
       6084062.9, 6084062.9, 6084062.9, 6084062.9, 6084062.9, 6084062.9,
       6084062.9, 6084062.9, 6084062.9, 6084062.9, 6084062.9, 6084062.9,
       6084062.9, 6084062.9, 6084062.9, 6084062.9, 6084062.9, 6084062.9,
       6084062.9, 6084062.9, 6084062.9, 6084062.9, 6084062.9, 6084062.9,
       6084062.9, 6084062.9, 6084062.9, 6084062.9, 6084062.9, 6084062.9,
       6084062.9, 6084062.9, 6084062.9, 6084062.9, 

In [49]:
mean_absolute_error(yTest, yPredict)

4865655.004950495