In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import numpy as np

from sklearn import preprocessing
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.utils.fixes import loguniform

In [None]:
import warnings

# Mute every warning category for the rest of the session. This also hides
# any deprecation or data-conversion notices raised by the cells below.
warnings.simplefilter('ignore')

In [None]:
import pandas as pd

# Load the dataset. dtype="category" is applied to every column, so numeric
# fields arrive as categoricals and must be converted before modelling.
df = pd.read_csv(
    'states_covid_stringency.csv',
    dtype="category",
    parse_dates=True,
)

# (rows, columns) as a quick sanity check on the load.
print(df.shape)

In [None]:
# Display the last rows of the loaded frame as a visual check.
df.tail()

In [None]:
# Columns excluded from the candidate feature matrix.
non_feature_cols = [
    'CountryName', 'CountryCode', 'RegionName', 'RegionCode',
    'Jurisdiction', 'Date', 'date', 'ConfirmedCases', 'ConfirmedDeaths',
]
X = df.drop(columns=non_feature_cols)

# Target kept as a single-column DataFrame, selected with a boolean column mask.
target_mask = df.columns == 'ConfirmedDeaths'
y = df.loc[:, target_mask]

print(X.dtypes)
print(y.dtypes)

In [None]:
# Columns of X that are converted to numeric dtypes.
numeric_cols = [
    'StringencyIndex',
    'StringencyLegacyIndex',
    'GovernmentResponseIndex',
    'ContainmentHealthIndex',
    'ContainmentHealthIndexForDisplay',
    'deathIncrease',
]
X[numeric_cols] = X[numeric_cols].apply(pd.to_numeric)

In [None]:
# Cast the target to 64-bit floats, then show the dtypes of both frames.
y = y.astype('float64')
for frame in (X, y):
    print(frame.dtypes)

In [None]:
## Rank the candidate features with SelectKBest (univariate f_regression scores).

from sklearn.feature_selection import SelectKBest, f_regression

# k=10 is the number of top-scoring features the selector would retain.
fs = SelectKBest(score_func=f_regression, k=10)

# fit() returns the fitted selector itself, not a reduced copy of X.
X_selected = fs.fit(X, y)

# One-column frames: the score of each feature and the matching column names.
df_scores = pd.DataFrame(fs.scores_)
df_columns = pd.DataFrame(X.columns)

for frame in (df_scores, df_columns):
    print(frame)

In [None]:
# Put names and scores side by side, one row per feature.
feature_scores = pd.concat([df_columns, df_scores], axis=1)
feature_scores.columns = ['Feature_Name', 'Score']

# Keep the 10 highest-scoring features and show them.
df_univ_feat = feature_scores.nlargest(10, 'Score')
print(df_univ_feat)

# Optional export of the selected features to .csv:
#df_univ_feat.to_csv('feature_selection_UNIVARIATE.csv', index=False)

In [None]:
# Final modelling inputs. They are re-selected from `df`, which was loaded with
# dtype="category", so the columns must be converted to numbers here: the
# earlier numeric conversion was applied to the previous X/y, not to df itself.
feature_cols = ['deathIncrease', 'GovernmentResponseIndex',
                'H6_Facial Coverings', 'H7_Vaccination policy']
X = df[feature_cols].apply(pd.to_numeric)

# Target stays a single-column DataFrame, stored as float64.
y = df[['ConfirmedDeaths']].apply(pd.to_numeric).astype(np.float64)

print(X.head())
print(y.head())

In [None]:
# Search space for Ridge. Candidates must be real numbers / booleans: the
# strings 'True' and 'False' are both truthy, and a string is not a valid
# regularisation strength.
param_dist = {'alpha': [0.1, 0.25, 0.40, 0.5, 0.75, 1.0, 10.0, 50.0, 100.0, 500.0, 1000.0],
              'fit_intercept': [True, False],
              # NOTE(review): `normalize` is only accepted by older scikit-learn
              # releases (deprecated in 1.0); drop it when upgrading.
              'normalize': [True, False],
              'tol': loguniform(1e-4, 1e0),}

ridge_model = Ridge()

# 100 random draws from the space above, each scored with 10-fold CV.
rrandom = RandomizedSearchCV(estimator=ridge_model, param_distributions=param_dist, cv=10, n_iter=100, random_state=42)
rrandom.fit(X, y)

print(rrandom.best_score_)
print(rrandom.best_estimator_.alpha)
print(rrandom.best_params_)

In [None]:
# Hold out 30% of the rows; the fixed seed makes the split (and every metric
# derived from it) reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
# Ridge model with fixed hyper-parameters, fitted on the training split.
final_ridge_params = {
    'alpha': 0.40,
    'fit_intercept': True,
    'normalize': True,
    'tol': 0.006870614282613298,
}
rfinal = Ridge(**final_ridge_params)
rfinal.fit(X_train, y_train)


In [None]:
# In-sample predictions: the model is applied to the same rows it was fitted on.
pred = rfinal.predict(X_train)

In [None]:
# Training-set RMSE and R^2. The square root is taken exactly once: combining
# np.sqrt with squared=False would report the fourth root of the MSE.
print('Train RMSE:', np.sqrt(mean_squared_error(y_train, pred)))
print('Train R2:', r2_score(y_train, pred))

# Same metrics on the held-out split, which the training-set numbers above
# cannot substitute for.
test_pred = rfinal.predict(X_test)
print('Test RMSE:', np.sqrt(mean_squared_error(y_test, test_pred)))
print('Test R2:', r2_score(y_test, test_pred))

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Discrete search grid: 6 x 6 = 36 combinations in total.
param_dist = {"n_estimators": [10, 25, 50, 100, 500, 1000],
             "max_depth": [1,5,10,25,50,100]}

# Seed the forest so the selected hyper-parameters are reproducible.
rfr = RandomForestRegressor(random_state=42)

# Every distribution is a list, so at most len(grid) distinct candidates exist;
# asking for more iterations than that only triggers a warning.
n_candidates = len(param_dist["n_estimators"]) * len(param_dist["max_depth"])
rrandom = RandomizedSearchCV(estimator=rfr, param_distributions=param_dist, cv=10,
                             n_iter=min(100, n_candidates), random_state=42)

# Tree ensembles expect a 1-D target; flatten the single-column frame.
rrandom.fit(X, np.ravel(y))

print(rrandom.best_score_)
print(rrandom.best_estimator_)
print(rrandom.best_params_)

In [None]:
# Random forest with fixed hyper-parameters, seeded for reproducible scores.
rfr = RandomForestRegressor(max_depth=5, n_estimators=500, random_state=42)

# 5-fold CV repeated 3 times (15 scores in total).
cv = RepeatedKFold(n_splits = 5, n_repeats = 3, random_state=1)

# Flatten the single-column target to the 1-D shape the estimator expects.
scores = cross_val_score(rfr, X, np.ravel(y), cv=cv)

# Cross Validation Scores (Averaged)
# np.mean / np.std: bare `mean` and `std` are not defined in this notebook.
print('Cross Val Score: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Only the number of boosting stages is tuned: 6 candidates.
param_dist = {"n_estimators": [10, 25, 50, 100, 500, 1000]}

# Seeded so the search result is reproducible.
gbr = GradientBoostingRegressor(random_state=42)

# The space is a single list, so no more than len(list) distinct candidates
# can be drawn; n_iter is matched to that size.
rrandom = RandomizedSearchCV(estimator=gbr, param_distributions=param_dist, cv=10,
                             n_iter=len(param_dist["n_estimators"]), random_state=42)

# Flatten the single-column target to the 1-D shape the estimator expects.
rrandom.fit(X, np.ravel(y))

print(rrandom.best_score_)
# Show the whole fitted estimator. Its `alpha` is the huber/quantile loss
# parameter, not a tuned value, so printing it alone is not informative.
print(rrandom.best_estimator_)
print(rrandom.best_params_)

In [None]:
# Gradient boosting with fixed settings. `alpha` is omitted on purpose: 0.9 is
# already the default and it only affects the huber/quantile losses.
gbr = GradientBoostingRegressor(n_estimators=100, random_state=42)

# 5-fold CV repeated 3 times (15 scores in total).
cv = RepeatedKFold(n_splits = 5, n_repeats = 3, random_state=1)

# Flatten the single-column target to the 1-D shape the estimator expects.
scores = cross_val_score(gbr, X, np.ravel(y), cv=cv)

# Cross Validation Scores (Averaged)
# np.mean / np.std: bare `mean` and `std` are not defined in this notebook.
print('Cross Val Score: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))

In [None]:
!pip install xgboost

In [None]:
from xgboost import XGBRegressor

In [None]:
# XGBoost regressor with a squared-error objective and otherwise default settings.
xgbr = XGBRegressor(objective='reg:squarederror')

# 5-fold CV repeated 3 times (15 scores in total).
cv = RepeatedKFold(n_splits = 5, n_repeats = 3, random_state=1)

# Flatten the single-column target to a 1-D array of labels.
scores = cross_val_score(xgbr, X, np.ravel(y), cv=cv)

# Cross Validation Scores (Averaged)
# np.mean / np.std: bare `mean` and `std` are not defined in this notebook.
print('Cross Val Score: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))