In [1]:
import numpy as np 
import pandas as pd 

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler 

# model
from catboost import Pool
from catboost import CatBoostRegressor
from catboost import CatBoostClassifier
#from xgboost import XGBRegressor
#from sklearn.ensemble import GradientBoostingRegressor
#from sklearn.ensemble import BaggingRegressor

#plot
pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import os
# Print the full path of every file found under the Kaggle input directory.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

/kaggle/input/world-happiness/2015.csv
/kaggle/input/world-happiness/2017.csv
/kaggle/input/world-happiness/2016.csv
/kaggle/input/world-happiness/2018.csv
/kaggle/input/world-happiness/2019.csv
/kaggle/input/covid19-global-forecasting-week-5/train.csv
/kaggle/input/covid19-global-forecasting-week-5/submission.csv
/kaggle/input/covid19-global-forecasting-week-5/test.csv


In [2]:
# Read the competition files (submission template, train, test) and the 2019
# World Happiness report that is joined in later as extra country features.
# NOTE: the `Date` column is still loaded as plain strings (see the `.info()`
# output); `parse_dates=True` does not name any column to parse.
happiest_data = pd.read_csv('/kaggle/input/world-happiness/2019.csv')
testing_data = pd.read_csv(
    '/kaggle/input/covid19-global-forecasting-week-5/test.csv',
    index_col='ForecastId',
    parse_dates=True,
)
training_data = pd.read_csv(
    '/kaggle/input/covid19-global-forecasting-week-5/train.csv',
    index_col='Id',
    parse_dates=True,
)
subm = pd.read_csv('/kaggle/input/covid19-global-forecasting-week-5/submission.csv')

In [3]:
# Give the happiness report's country column the same name as in the
# competition data (`Country_Region`).
happiest_data = happiest_data.rename(columns={'Country or region': 'Country_Region'})

In [4]:
# Print the schema / non-null counts of both frames.
# `info()` prints directly and returns None, so the calls are made as separate
# statements instead of a tuple (which displayed a stray `(None, None)`).
testing_data.info()
training_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 311670 entries, 1 to 311670
Data columns (total 7 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   County          282870 non-null  object 
 1   Province_State  294840 non-null  object 
 2   Country_Region  311670 non-null  object 
 3   Population      311670 non-null  int64  
 4   Weight          311670 non-null  float64
 5   Date            311670 non-null  object 
 6   Target          311670 non-null  object 
dtypes: float64(1), int64(1), object(5)
memory usage: 19.0+ MB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 748008 entries, 1 to 969576
Data columns (total 8 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   County          678888 non-null  object 
 1   Province_State  707616 non-null  object 
 2   Country_Region  748008 non-null  object 
 3   Population      748008 non-null  int64  
 4   Weight          748008

(None, None)

In [5]:
# Summarise the object-dtype (string) columns: count, unique, top, freq.
training_data.describe(include=['O'])

Unnamed: 0,County,Province_State,Country_Region,Date,Target
count,678888,707616,748008,748008,748008
unique,1840,133,187,108,2
top,Washington,Texas,US,2020-05-07,Fatalities
freq,6696,55080,690768,6926,374004


In [6]:
# Show only the training rows whose Country_Region is 'US'.
is_us = training_data['Country_Region'] == 'US'
training_data.loc[is_us]

Unnamed: 0_level_0,County,Province_State,Country_Region,Population,Weight,Date,Target,TargetValue
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
67761,Autauga,Alabama,US,55869,0.091485,2020-01-23,ConfirmedCases,0.0
67762,Autauga,Alabama,US,55869,0.914848,2020-01-23,Fatalities,0.0
67763,Autauga,Alabama,US,55869,0.091485,2020-01-24,ConfirmedCases,0.0
67764,Autauga,Alabama,US,55869,0.914848,2020-01-24,Fatalities,0.0
67765,Autauga,Alabama,US,55869,0.091485,2020-01-25,ConfirmedCases,0.0
...,...,...,...,...,...,...,...,...
963132,,,US,324141489,0.510290,2020-05-07,Fatalities,2231.0
963133,,,US,324141489,0.051029,2020-05-08,ConfirmedCases,26906.0
963134,,,US,324141489,0.510290,2020-05-08,Fatalities,1518.0
963135,,,US,324141489,0.051029,2020-05-09,ConfirmedCases,25620.0


In [7]:
# Enrich the training rows with the country-level happiness indicators.
# The join key is the `Country_Region` column only: `left_index=True` must not
# be combined with `on=` (recent pandas rejects that combination), and the
# test-data merge uses the same column-only join.
# `validate` makes the merge fail loudly if a country appears more than once in
# the happiness table, which would otherwise duplicate training rows.
train_data = training_data.merge(
    happiest_data,
    how='left',
    on=['Country_Region'],
    validate='many_to_one',
)
# `merge` returns a fresh positional index; restore the original `Id` index
# (a left merge keeps the left frame's row order, so rows still line up).
train_data.index = training_data.index

In [8]:
# Same enrichment for the test rows: left-join the happiness indicators on the
# country name, then put the original ForecastId index back on the result.
merged_test = testing_data.merge(happiest_data, on=['Country_Region'], how='left')
merged_test.index = testing_data.index
test_data = merged_test

In [9]:
# Preview the merged test set (original columns plus the happiness indicators).
test_data

Unnamed: 0_level_0,County,Province_State,Country_Region,Population,Weight,Date,Target,Overall rank,Score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption
ForecastId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,,,Afghanistan,27657145,0.058359,2020-04-27,ConfirmedCases,154.0,3.203,0.350,0.517,0.361,0.000,0.158,0.025
2,,,Afghanistan,27657145,0.583587,2020-04-27,Fatalities,154.0,3.203,0.350,0.517,0.361,0.000,0.158,0.025
3,,,Afghanistan,27657145,0.058359,2020-04-28,ConfirmedCases,154.0,3.203,0.350,0.517,0.361,0.000,0.158,0.025
4,,,Afghanistan,27657145,0.583587,2020-04-28,Fatalities,154.0,3.203,0.350,0.517,0.361,0.000,0.158,0.025
5,,,Afghanistan,27657145,0.058359,2020-04-29,ConfirmedCases,154.0,3.203,0.350,0.517,0.361,0.000,0.158,0.025
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
311666,,,Zimbabwe,14240168,0.607106,2020-06-08,Fatalities,146.0,3.663,0.366,1.114,0.433,0.361,0.151,0.089
311667,,,Zimbabwe,14240168,0.060711,2020-06-09,ConfirmedCases,146.0,3.663,0.366,1.114,0.433,0.361,0.151,0.089
311668,,,Zimbabwe,14240168,0.607106,2020-06-09,Fatalities,146.0,3.663,0.366,1.114,0.433,0.361,0.151,0.089
311669,,,Zimbabwe,14240168,0.060711,2020-06-10,ConfirmedCases,146.0,3.663,0.366,1.114,0.433,0.361,0.151,0.089


In [10]:
# Count the missing values in each column of the merged training data.
train_data.isna().sum()

County                           69120
Province_State                   40392
Country_Region                       0
Population                           0
Weight                               0
Date                                 0
Target                               0
TargetValue                          0
Overall rank                    700056
Score                           700056
GDP per capita                  700056
Social support                  700056
Healthy life expectancy         700056
Freedom to make life choices    700056
Generosity                      700056
Perceptions of corruption       700056
dtype: int64

In [11]:
# Encode each date as the integer MMDD (month and day digits, e.g. 04-27 -> 427)
# in both the training and the test frame.
for frame in (train_data, test_data):
    frame['Date'] = pd.to_datetime(frame['Date']).dt.strftime("%m%d").astype(int)

In [12]:
# Smallest and largest encoded (MMDD) date in the training data.
train_data['Date'].min(), train_data['Date'].max()

(123, 509)

In [13]:
# Smallest and largest encoded (MMDD) date in the test data.
test_data['Date'].min(), test_data['Date'].max()

(427, 610)

In [14]:
# Every integer from the first test date up to the last training date
# (inclusive): the encoded dates that appear in both sets. The range also
# contains integers that are not real MMDD values; they simply never match.
first_test_date = test_data['Date'].min()
last_train_date = train_data['Date'].max()
drop_date = list(range(first_test_date, last_train_date + 1))

In [15]:
# see it
#drop_date

In [16]:
# Drop the training rows whose date also occurs in the test period.
# The result is copied explicitly because later cells modify `train_data`
# in place (dropping the target, filling missing values); without the copy
# pandas treats the frame as a slice of the original and emits
# chained-assignment warnings.
keep_rows = ~train_data['Date'].isin(drop_date)
train_data = train_data.loc[keep_rows].copy()

In [17]:
# Check the minimum and maximum encoded dates left in the training data.
train_data['Date'].min(), train_data['Date'].max()

(123, 426)

In [18]:
# Split the target off the features: `pop` returns the TargetValue column
# and removes it from `train_data` in one step.
y = train_data.pop('TargetValue')

In [19]:
# Names of the object-dtype (string) columns of the training features.
categorical_cols = [
    col for col, kind in train_data.dtypes.items()
    if kind == "object"
]

In [20]:
# Names of the float64 columns of the training features (Weight and the
# happiness indicators); these are the numeric columns that get their
# missing values filled later.
# The dtype is compared against float64 explicitly: `dtype == None` only
# selected these columns on NumPy versions that treat None as the default
# (float64) dtype, and selects nothing where that comparison is False.
non_cols = [
    col for col, kind in train_data.dtypes.items()
    if kind == np.float64
]

In [21]:
# Inspect the two column lists that drive the missing-value handling.
non_cols, categorical_cols

(['Weight',
  'Overall rank',
  'Score',
  'GDP per capita',
  'Social support',
  'Healthy life expectancy',
  'Freedom to make life choices',
  'Generosity',
  'Perceptions of corruption'],
 ['County', 'Province_State', 'Country_Region', 'Target'])

In [22]:
# Fill missing categorical values with the placeholder '-' in both frames.
for frame in (train_data, test_data):
    frame[categorical_cols] = frame[categorical_cols].fillna('-')

In [23]:
# Re-count missing values per column after filling the categorical columns.
train_data.isna().sum()

County                               0
Province_State                       0
Country_Region                       0
Population                           0
Weight                               0
Date                                 0
Target                               0
Overall rank                    615790
Score                           615790
GDP per capita                  615790
Social support                  615790
Healthy life expectancy         615790
Freedom to make life choices    615790
Generosity                      615790
Perceptions of corruption       615790
dtype: int64

In [24]:
# Fill missing values of the `non_cols` columns with 0 in both frames.
for frame in (train_data, test_data):
    frame[non_cols] = frame[non_cols].fillna(0)

In [25]:
# Final check: count the missing values per column after both fill steps.
train_data.isna().sum()

County                          0
Province_State                  0
Country_Region                  0
Population                      0
Weight                          0
Date                            0
Target                          0
Overall rank                    0
Score                           0
GDP per capita                  0
Social support                  0
Healthy life expectancy         0
Freedom to make life choices    0
Generosity                      0
Perceptions of corruption       0
dtype: int64

In [26]:
# Encode the categorical columns (categorical_cols) with an OrdinalEncoder.
# The encoder is fitted on the training data only and the same fitted
# encoder is then applied to the test data.
# NOTE: `state_encoder` and `counrty_encoder` are created here but are not
# used by this cell.
state_encoder = LabelEncoder()
counrty_encoder = LabelEncoder()
ord_encoder = OrdinalEncoder()

ord_encoder.fit(train_data[categorical_cols])

encod_train_data = train_data.copy()
encod_train_data[categorical_cols] = ord_encoder.transform(train_data[categorical_cols])

encod_test_data = test_data.copy()
encod_test_data[categorical_cols] = ord_encoder.transform(test_data[categorical_cols])


In [27]:
# Spot-check the encoded row labelled 120 in each frame. The labels come from
# different index columns (Id vs ForecastId), so these are unrelated records.
encod_train_data.loc[120], encod_test_data.loc[120]

(County                          0.000000e+00
 Province_State                  0.000000e+00
 Country_Region                  0.000000e+00
 Population                      2.765714e+07
 Weight                          5.835874e-01
 Date                            3.220000e+02
 Target                          1.000000e+00
 Overall rank                    1.540000e+02
 Score                           3.203000e+00
 GDP per capita                  3.500000e-01
 Social support                  5.170000e-01
 Healthy life expectancy         3.610000e-01
 Freedom to make life choices    0.000000e+00
 Generosity                      1.580000e-01
 Perceptions of corruption       2.500000e-02
 Name: 120, dtype: float64,
 County                          0.000000e+00
 Province_State                  0.000000e+00
 Country_Region                  1.000000e+00
 Population                      2.886026e+06
 Weight                          6.722512e-01
 Date                            5.110000e+02
 Targe

In [28]:
def rmse_score(iterations, X=None, y=None, cv=3):
    """Return the cross-validated RMSE of a CatBoost regressor.

    Parameters
    ----------
    iterations : int
        Passed to ``CatBoostRegressor(iterations=...)``.
    X, y : optional
        Features and target to cross-validate on. When omitted, the notebook
        globals ``X_train`` / ``y_train`` are used (the original behaviour),
        so those must already exist when the function is called.
    cv : int, default 3
        Forwarded to ``cross_val_score`` as its ``cv`` argument.

    Returns
    -------
    numpy.ndarray
        The square root of the negated ``neg_mean_squared_error`` scores
        returned by ``cross_val_score``.
    """
    # Fall back to the notebook-level training split only at call time, so the
    # function can be defined before that split exists.
    if X is None:
        X = X_train
    if y is None:
        y = y_train

    estimator = CatBoostRegressor(iterations=iterations,
                                  depth=6,
                                  learning_rate=0.4,
                                  loss_function='RMSE',
                                  #random_seed=random_seed,
                                  verbose=False)
    # The scorer returns negated MSE values; flip the sign before the root.
    neg_mse = cross_val_score(estimator, X, y,
                              scoring="neg_mean_squared_error", cv=cv)
    return np.sqrt(-neg_mse)

In [29]:
#metrics = [4000, 8000, 12000]
#results = {}
#for x in metrics:
    #results[x] = rmse_score(x)

In [30]:
#results

In [31]:
#plt.figure(figsize=(12,8))
#for i in results:
    #sns.lineplot(data=results[i], label=i)

In [32]:
#for x in metrics:
    #print(x, results[x].mean())

In [33]:
# Hold out 5% of the encoded training rows, together with the matching
# entries of the target vector `y`, as a validation set. The split uses a
# fixed random_state.
split = train_test_split(
    encod_train_data,
    y,
    test_size=0.05,
    train_size=0.95,
    random_state=0,
)
X_train, X_valid, y_train, y_valid = split

In [34]:
# Configure the CatBoost regressor used for the final fit.
model_params = dict(
    iterations=12000,
    depth=9,
    learning_rate=0.4,
    loss_function='RMSE',
    verbose=False,
)
model = CatBoostRegressor(**model_params)

In [35]:
# Train the model on the training split; `plot = True` requests CatBoost's
# training-metrics widget (the MetricVisualizer shown in the output).
model.fit(X_train,y_train, plot = True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostRegressor at 0x7f5fe96bb350>

In [36]:
# Predict on the held-out validation rows and report the mean absolute error.
preds = model.predict(X_valid)
mae = mean_absolute_error(y_valid, preds)

print('MAE:', mae)

MAE: 2.7194994861844095


In [37]:
# Predict on the validation rows again and report the mean squared error.
preds = model.predict(X_valid)
mse = mean_squared_error(y_valid, preds)

print('MSE:', mse)

MSE: 1917.9197111039775


In [38]:
# `model.score` on the training and validation splits, expressed as a
# percentage rounded to two decimals.
x_list = [X_train, X_valid]
y_list = [y_train, y_valid]

scoring = [
    round(model.score(features, target) * 100, 2)
    for features, target in zip(x_list, y_list)
]
scoring

[100.0, 95.95]

In [39]:
# Predict the target for every row of the encoded test set.
final_preds = model.predict(encod_test_data)

In [40]:
# Pair each test-set index value (stored in a column named 'Id') with its prediction.
pd_preds = pd.DataFrame({'Id':test_data.index, 'TargetValue':final_preds})

In [41]:
# Build the submission rows: one row per (ForecastId, quantile).
# The 0.5 quantile is the point prediction; the 0.05 / 0.95 quantiles are the
# prediction scaled by 0.85 and 1.15. For a negative prediction those two
# scaled values swap order, so the smaller one is always used as the lower
# quantile and the larger one as the upper quantile. Non-negative predictions
# give exactly the same values as plain 0.85 * p and 1.15 * p.
test_id = test_data.index.astype(str)
scaled_down = 0.85 * final_preds
scaled_up = 1.15 * final_preds
lower_preds = np.minimum(scaled_down, scaled_up)
upper_preds = np.maximum(scaled_down, scaled_up)

quant_05 = pd.DataFrame({'ForecastId_Quantile': test_id + '_0.05', 'TargetValue': lower_preds})
quant_50 = pd.DataFrame({'ForecastId_Quantile': test_id + '_0.5', 'TargetValue': final_preds})
quant_95 = pd.DataFrame({'ForecastId_Quantile': test_id + '_0.95', 'TargetValue': upper_preds})
all_predict = pd.concat([quant_05, quant_50, quant_95])

**Final prediction**

In [42]:
# Write the combined quantile predictions to the submission file, omitting the index.
all_predict.to_csv("submission.csv",index=False)

In [43]:
# Per-Id 0.05 / 0.5 / 0.95 quantiles of the predictions, each as a two-column
# frame (`number` = the Id, plus the quantile value).
# NOTE(review): the displayed output shows the three quantiles are identical
# for every Id, so this does not produce a real spread.
preds_by_id = pd_preds.groupby('Id')['TargetValue']

q05 = preds_by_id.quantile(q=0.05).reset_index()
q05.columns = ['number', '0.05']

q50 = preds_by_id.quantile(q=0.5).reset_index()
q50.columns = ['number', '0.5']

q95 = preds_by_id.quantile(q=0.95).reset_index()
q95.columns = ['number', '0.95']

In [44]:
# Put the three quantile columns side by side, one row per id.
# `axis` is passed by keyword: passing it positionally to `pd.concat` is
# deprecated and no longer accepted by recent pandas versions.
all_data = pd.concat([q05, q50['0.5'], q95['0.95']], axis=1)
all_data.head(10)

Unnamed: 0,number,0.05,0.5,0.95
0,1,67.789105,67.789105,67.789105
1,2,3.162276,3.162276,3.162276
2,3,67.789105,67.789105,67.789105
3,4,3.162276,3.162276,3.162276
4,5,67.789105,67.789105,67.789105
5,6,3.162276,3.162276,3.162276
6,7,67.789105,67.789105,67.789105
7,8,3.162276,3.162276,3.162276
8,9,67.789105,67.789105,67.789105
9,10,3.162276,3.162276,3.162276


In [45]:
# Reshape the wide quantile table into the long submission layout:
# one row per (number, quantile) with an id of the form "<number>_<quantile>".
long_form = all_data.melt(id_vars=['number'], value_vars=['0.05','0.5','0.95'])
long_form['ForecastId_Quantile'] = long_form['number'].astype(str) + '_' + long_form['variable']
long_form['TargetValue'] = long_form['value']

predict = long_form[['ForecastId_Quantile','TargetValue']].reset_index(drop=True)
#predict.to_csv("submission.csv",index=False)
predict.tail(10)

Unnamed: 0,ForecastId_Quantile,TargetValue
935000,311661_0.95,0.306294
935001,311662_0.95,0.262435
935002,311663_0.95,0.306294
935003,311664_0.95,0.262435
935004,311665_0.95,0.306294
935005,311666_0.95,0.262435
935006,311667_0.95,0.306294
935007,311668_0.95,0.262435
935008,311669_0.95,0.306294
935009,311670_0.95,0.262435
