In [14]:
import pandas as pd
import numpy as np

import pyarrow

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

#!conda install -c conda-forge xgboost
import xgboost as xgb

%matplotlib inline

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet

from sklearn.metrics import r2_score

from pickle import dump
from pickle import load

In [3]:
data_path = '/Users/ksatola/Documents/git/air-polution/agh/data/'
data_file = data_path + 'dfpm25_2008-2018_ml_7days_lags.hdf'

df = pd.read_hdf(path_or_buf=data_file, key="df")
print(f'Dataframe size: {df.shape}')
df.head()

Dataframe size: (4012, 17)


Unnamed: 0,t,t-1,t-2,t-3,t-4,t-5,t-6,t-7,year,month,day,hour,dayofyear,weekofyear,dayofweek,quarter,season
0,45.041667,46.083333,36.0625,57.3125,42.979167,46.104167,30.958333,53.586957,2008,1,8,0,8,2,1,1,1
1,101.375,45.041667,46.083333,36.0625,57.3125,42.979167,46.104167,30.958333,2008,1,9,0,9,2,2,1,1
2,110.083333,101.375,45.041667,46.083333,36.0625,57.3125,42.979167,46.104167,2008,1,10,0,10,2,3,1,1
3,141.833333,110.083333,101.375,45.041667,46.083333,36.0625,57.3125,42.979167,2008,1,11,0,11,2,4,1,1
4,47.625,141.833333,110.083333,101.375,45.041667,46.083333,36.0625,57.3125,2008,1,12,0,12,2,5,1,1


In [4]:
# Split dataset into independent variables dataset columns and dependent variable column
X = df.iloc[:, 1:]
y = df.iloc[:, :1]
#y.head()

In [5]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size = 0.5, 
                                                    random_state = 123)

In [19]:
num_folds = 6
seed = 123

In [8]:
models = []
models.append(('LR', LinearRegression()))
models.append(('EN', ElasticNet()))

In [22]:
def score_models(models, scoring):
    names = []
    results = []
    output = []
    
    for name, model in models:
        names.append(name)
        try:
            # Not all scoring metrics are available for all models
            
            kfold = KFold(n_splits=num_folds, random_state=seed)
            cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
            results.append(cv_results)
            output.append('Model {}, mean accuracy {:0.2f}, (std. dev. {:0.2f})'.format(name, cv_results.mean(), cv_results.std()))
        except:
            output.append('Model {} {} metric unavailable)'.format(name, scoring))
        
    return output, results, names

In [25]:
scores, results, names = score_models(models, 'r2')
for score in scores:
    print(score)
    print(results)

Model LR, mean accuracy 0.52, (std. dev. 0.10)
[array([0.63629187, 0.45294277, 0.33198449, 0.60397151, 0.57510284,
       0.53279603]), array([0.62203061, 0.43764003, 0.34750155, 0.60964924, 0.57797759,
       0.52230395])]
Model EN, mean accuracy 0.52, (std. dev. 0.10)
[array([0.63629187, 0.45294277, 0.33198449, 0.60397151, 0.57510284,
       0.53279603]), array([0.62203061, 0.43764003, 0.34750155, 0.60964924, 0.57797759,
       0.52230395])]




In [24]:
scores, results, names = score_models(models, 'explained_variance')
for score in scores:
    print(score)

Model LR, mean accuracy 0.53, (std. dev. 0.10)
Model EN, mean accuracy 0.53, (std. dev. 0.10)


