# Final Project Model Training

In [10]:
import pandas as pd
import numpy as np
import streamlit as st
from matplotlib import pyplot as plt # Matplotlib
import datetime
from datetime import datetime

# Package to implement ML Algorithms
import sklearn
from sklearn.tree import DecisionTreeRegressor     # Decision Tree
from sklearn.ensemble import RandomForestRegressor # Random Forest

# Package for data partitioning
from sklearn.model_selection import train_test_split

# Package for generating confusion matrix
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Package for generating classification report
from sklearn.metrics import classification_report

# Import packages to implement Stratified K-fold CV
from sklearn.model_selection import KFold # For creating folds

# Import Package to implement GridSearch CV
from sklearn.model_selection import GridSearchCV

# Importing package for Randomized Search CV
from sklearn.model_selection import RandomizedSearchCV

# Package to record time
import time

# Package for Data pretty printer
from pprint import pprint

# Module to save and load Python objects to and from files
import pickle 

# Ignore Deprecation Warnings
import warnings
warnings.filterwarnings('ignore')

# Display inline plots as vector-based (svg)
%config InlineBackend.figure_formats = ['svg']

%matplotlib inline

In [11]:
# Read the AQI modelling table from the CSV next to the notebook and preview
# its first five rows.
aqi = pd.read_csv(filepath_or_buffer="mlm_aqi_data.csv")
aqi.head(n=5)

Unnamed: 0,State,urban_perc_state,County,percentage20_Donald_Trump,percentage20_Joe_Biden,avg_sal_2022,white_perc,baa_perc,asian_perc,native_perc,pi_perc,his_perc,AQI,CO_perc,NO2_perc,O3_perc,PM2.5_perc,PM10_perc
0,Alabama,0.577,Baldwin,0.762,0.223,56747.0,0.893656,0.094556,0.017246,0.017071,0.002414,0.050362,40,0.0,0.0,0.808511,0.191489,0.0
1,Alabama,0.577,Clay,0.808,0.183,39876.0,0.851035,0.15305,0.004719,0.012326,0.001056,0.031835,27,0.0,0.0,0.0,1.0,0.0
2,Alabama,0.577,DeKalb,0.844,0.146,40558.0,0.946637,0.026404,0.007236,0.037126,0.00682,0.159921,37,0.0,0.0,0.92562,0.07438,0.0
3,Alabama,0.577,Elmore,0.736,0.252,49071.0,0.764367,0.230173,0.011433,0.011121,0.001999,0.031799,37,0.0,0.0,1.0,0.0,0.0
4,Alabama,0.577,Etowah,0.745,0.242,42951.0,0.825877,0.163812,0.011262,0.015201,0.004152,0.047736,42,0.0,0.0,0.424581,0.575419,0.0


In [12]:
# Split the table into predictors (X) and the regression target (y = AQI).
#
# "Unnamed: 0" is the row index that was saved into the CSV (it mirrors the
# 0, 1, 2, ... index in the preview). Leaving it in X would let the models
# learn from row order instead of real features, so it is dropped here.
# errors="ignore" keeps the cell working if the CSV is later re-exported
# without that index column.
X = aqi.drop(columns=["AQI"]).drop(columns=["Unnamed: 0"], errors="ignore")
y = aqi["AQI"]

In [13]:
# One hot encoding for categorical variables
cat_var = ["County", "State"]
X_encoded = pd.get_dummies(X, columns = cat_var)

X_encoded.head()

Unnamed: 0,urban_perc_state,percentage20_Donald_Trump,percentage20_Joe_Biden,avg_sal_2022,white_perc,baa_perc,asian_perc,native_perc,pi_perc,his_perc,...,State_South Dakota,State_Tennessee,State_Texas,State_Utah,State_Vermont,State_Virginia,State_Washington,State_West Virginia,State_Wisconsin,State_Wyoming
0,0.577,0.762,0.223,56747.0,0.893656,0.094556,0.017246,0.017071,0.002414,0.050362,...,False,False,False,False,False,False,False,False,False,False
1,0.577,0.808,0.183,39876.0,0.851035,0.15305,0.004719,0.012326,0.001056,0.031835,...,False,False,False,False,False,False,False,False,False,False
2,0.577,0.844,0.146,40558.0,0.946637,0.026404,0.007236,0.037126,0.00682,0.159921,...,False,False,False,False,False,False,False,False,False,False
3,0.577,0.736,0.252,49071.0,0.764367,0.230173,0.011433,0.011121,0.001999,0.031799,...,False,False,False,False,False,False,False,False,False,False
4,0.577,0.745,0.242,42951.0,0.825877,0.163812,0.011262,0.015201,0.004152,0.047736,...,False,False,False,False,False,False,False,False,False,False


In [14]:
# Data partitioning into train and test sets
# Hold out 20% of the rows as a test set; the fixed seed makes the split
# repeatable between runs.
train_X, test_X, train_y, test_y = train_test_split(
    X_encoded,
    y,
    test_size=0.2,
    random_state=42,
)

## Random Forest Model (currently fitted with HistGradientBoostingRegressor)

In [24]:
# Estimator for this section. The RandomForestRegressor(random_state = 42)
# variant is switched off; the active model is a histogram-based gradient
# boosting regressor.
regressor = sklearn.ensemble.HistGradientBoostingRegressor(random_state=42)

# 3-fold shuffled cross-validation with a fixed seed
folds = KFold(n_splits=3, shuffle=True, random_state=42)

# Candidate hyperparameter values (5 x 6 = 30 combinations).
# 'min_samples_split' and 'n_estimators' were previously listed here but are
# switched off.
param_grid = {
    "max_depth": [5, 15, 25, 40, 60],
    "min_samples_leaf": [5, 15, 25, 40, 60, 100],
}

# Randomized search used as a first pass before hyperparameter tuning.
# n_iter (100) exceeds the 30 available combinations; the fit log in this
# notebook reports 30 candidates.
random_cv = RandomizedSearchCV(
    estimator=regressor,
    param_distributions=param_grid,
    n_iter=100,
    scoring="r2",
    cv=folds,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)





In [25]:
# Fit the randomized search on the training split and report the wall-clock
# duration together with the best cross-validated R^2 and its parameters.
start = time.time()
random_cv.fit(train_X, train_y)
stop = time.time()

elapsed = stop - start
print(f"Training time: {elapsed}s")
print("Initial score: ", random_cv.best_score_)
print("Initial parameters: ", random_cv.best_params_)


Fitting 3 folds for each of 30 candidates, totalling 90 fits
[CV] END ...................max_depth=5, min_samples_leaf=15; total time=   8.0s
[CV] END ....................max_depth=5, min_samples_leaf=5; total time=   9.5s
[CV] END ....................max_depth=5, min_samples_leaf=5; total time=   9.7s
[CV] END ....................max_depth=5, min_samples_leaf=5; total time=   9.6s
[CV] END ...................max_depth=5, min_samples_leaf=15; total time=   6.3s
[CV] END ...................max_depth=5, min_samples_leaf=25; total time=   5.4s
[CV] END ...................max_depth=5, min_samples_leaf=25; total time=   5.8s
[CV] END ...................max_depth=5, min_samples_leaf=15; total time=   6.7s
[CV] END ...................max_depth=5, min_samples_leaf=40; total time=   4.1s
[CV] END ...................max_depth=5, min_samples_leaf=25; total time=   4.6s
[CV] END ...................max_depth=5, min_samples_leaf=40; total time=   4.1s
[CV] END ...................max_depth=5, min_sam

In [None]:
# Disabled cell: the whole GridSearchCV step is wrapped in a triple-quoted
# string, so nothing below runs (the cell only evaluates to that string).
# Before re-enabling:
#   * every list in param_grid is empty and must be filled with candidates;
#   * NOTE(review): the active `regressor` in this section is
#     HistGradientBoostingRegressor, which does not appear to accept
#     'min_samples_split' or 'n_estimators' - confirm against the estimator.
'''
#generating final model w GridSearchCV()
param_grid = {
    'max_depth': [],
    'min_samples_leaf': [],
    'min_samples_split': [],
    'n_estimators': []
    }

model_cv = GridSearchCV(estimator = regressor, 
                        param_grid = param_grid, 
                        scoring= 'r2', 
                        cv = folds, 
                        verbose = 1,
                        n_jobs = -1)
start = time.time()

#fitting random forest model
model_cv.fit(train_X, train_y)  
stop = time.time()            
print(f"Training time: {stop - start}s")
print('Improved score: ', model_cv.best_score_)
print('Improved parameters: ', model_cv.best_params_)
'''

In [None]:
# Disabled cell (triple-quoted string): feature-importance bar chart for the
# tuned model. It depends on `model_cv`, which is only created by the disabled
# GridSearchCV cell.
# NOTE(review): `feature_importances_` exists on tree/forest estimators; the
# currently active HistGradientBoostingRegressor may not provide it - confirm.
# The figure is saved before tight_layout() is called, so the layout
# adjustment does not affect the saved SVG.
'''
# Storing importance values from the trained model
importance = model_cv.best_estimator_.feature_importances_

# Storing feature importance as a dataframe
feature_imp = pd.DataFrame(list(zip(train_X.columns, importance)),
               columns = ['Feature', 'Importance'])

feature_imp = feature_imp.sort_values('Importance', ascending = False).reset_index(drop = True)
feature_imp = feature_imp[feature_imp['Importance'] > 0.001]
# Bar plot
plt.figure(figsize=(10, 5))
plt.barh(feature_imp['Feature'], feature_imp['Importance'], color = ['mediumslateblue', 'deeppink'])

plt.xlabel("Importance", fontsize = 12)
plt.ylabel("Input Feature", fontsize = 12)
plt.title('Which features are the most important for air quality prediction?', fontsize = 12) 
plt.yticks(fontsize = 8) # fontsize of yticks
plt.xticks(fontsize = 8) # fontsize of xticks
plt.savefig("rf_feature_imp.svg", bbox_inches="tight")
plt.tight_layout();
'''

In [None]:
# Disabled cell (triple-quoted string): would pickle the whole fitted search
# object `model_cv` (not only its best estimator) to 'rf_aqi.pickle'.
# The file is opened and closed by hand; a `with open(...)` block would also
# close it if pickle.dump raised.
'''
# Creating the file where we want to write the model
rf_pickle = open('rf_aqi.pickle', 'wb') 

# Write RF model to the file
pickle.dump(model_cv, rf_pickle) 

# Close the file
rf_pickle.close()
'''

## Decision Tree Model

In [None]:
# Decision tree model: rebinds `regressor` and `folds` for this section
# (3-fold shuffled CV, fixed seed).
regressor = DecisionTreeRegressor(random_state = 42)
folds = KFold(n_splits = 3, shuffle = True, random_state = 42)
# The randomized-search setup below is disabled by the triple-quoted string.
# As written it ends with an unmatched `}` after the RandomizedSearchCV call,
# which would be a syntax error if the quotes were simply removed.
'''
param_grid = {
    'max_depth': [5, 15, 25, 40, 60],
    'min_samples_leaf': [5, 15, 25, 40, 60, 100],
    'min_samples_split': [5, 15, 25, 40, 60, 100]
    }

#randomized for before hyperparameter tuning
random_cv = RandomizedSearchCV(estimator = regressor,
                              param_distributions = param_grid,
                              n_iter = 100,
                              scoring = 'r2',
                              cv = folds,
                              verbose = 2,
                              random_state = 42,
                              n_jobs = -1)

}
'''

In [None]:
# Disabled cell (triple-quoted string): fit/report step for the decision tree.
# NOTE(review): it fits `model_cv`, while the disabled setup in this section
# builds `random_cv`, and it never sets `start` before computing
# `stop - start` - both need fixing before this is re-enabled.
'''
model_cv.fit(train_X, train_y)  
stop = time.time()            
print(f"Training time: {stop - start}s")
print('Initial score: ', model_cv.best_score_)
print('Initial parameters: ', model_cv.best_params_)
'''

In [None]:
# Disabled cell (triple-quoted string): GridSearchCV step for the decision
# tree. Before re-enabling:
#   * the param_grid dict is never closed (no `}` before `model_cv = ...`);
#   * every candidate list is empty;
#   * NOTE(review): 'n_estimators' looks copied from the forest section and is
#     presumably not a DecisionTreeRegressor parameter - confirm.
'''
#generating final model w GridSearchCV()
param_grid = {
    'max_depth': [],
    'min_samples_leaf': [],
    'min_samples_split': [],
    'n_estimators': []

model_cv = GridSearchCV(estimator = regressor, 
                        param_grid = param_grid, 
                        scoring= 'r2', 
                        cv = folds, 
                        verbose = 1,
                        n_jobs = -1)
start = time.time()

#fitting random forest model
model_cv.fit(train_X, train_y)  
stop = time.time()            
print(f"Training time: {stop - start}s")
print('Improved score: ', model_cv.best_score_)
print('Improved parameters: ', model_cv.best_params_)
'''

In [None]:
# Disabled cell (triple-quoted string): feature-importance bar chart for the
# decision tree search result `model_cv`.
# NOTE(review): the figure is saved as "rf_feature_imp.svg", the same file
# name used in the random-forest section, so enabling both would overwrite
# one plot with the other. The save also happens before tight_layout().
'''
# Storing importance values from the trained model
importance = model_cv.best_estimator_.feature_importances_

# Storing feature importance as a dataframe
feature_imp = pd.DataFrame(list(zip(train_X.columns, importance)),
               columns = ['Feature', 'Importance'])

feature_imp = feature_imp.sort_values('Importance', ascending = False).reset_index(drop = True)
feature_imp = feature_imp[feature_imp['Importance'] > 0.001]
# Bar plot
plt.figure(figsize=(10, 5))
plt.barh(feature_imp['Feature'], feature_imp['Importance'], color = ['mediumslateblue', 'deeppink'])

plt.xlabel("Importance", fontsize = 12)
plt.ylabel("Input Feature", fontsize = 12)
plt.title('Which features are the most important for air quality prediction?', fontsize = 12) 
plt.yticks(fontsize = 8) # fontsize of yticks
plt.xticks(fontsize = 8) # fontsize of xticks
plt.savefig("rf_feature_imp.svg", bbox_inches="tight")
plt.tight_layout();
'''

In [None]:
# Disabled cell (triple-quoted string): would pickle the decision tree search
# object `model_cv` to 'dt_aqi.pickle'. The inner "Write RF model" remark is a
# leftover from the forest section. The file is opened and closed by hand
# rather than with a `with` block.
'''
# Creating the file where we want to write the model
dt_pickle = open('dt_aqi.pickle', 'wb') 

# Write RF model to the file
pickle.dump(model_cv, dt_pickle) 

# Close the file
dt_pickle.close()
'''

## AdaBoost Model

In [None]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import StratifiedKFold

In [None]:
# AdaBoost regressor for this section; the seed keeps boosting reproducible.
regressor = AdaBoostRegressor(random_state=42)

# AQI is a numeric regression target, so use plain shuffled K-fold splitting.
# StratifiedKFold balances class labels per fold: it would treat each distinct
# AQI value as its own class and does not support continuous targets.
# KFold is already imported in the notebook's import cell.
folds = KFold(n_splits = 5, shuffle = True, random_state = 42)

In [None]:
# Disabled cell (triple-quoted string): randomized search over AdaBoost's
# n_estimators (10 values from 5 to 500) and learning_rate (0.1 steps from
# 0.1 up to about 2.0).
# NOTE(review): scoring is 'f1_macro', a classification metric, while the
# estimator is a regressor and every other search in this notebook scores
# with 'r2' - switch the metric before re-enabling.
'''
n_estimators = [int(x) for x in np.linspace(start = 5, stop = 500, num = 10)]

learning_rate = [x for x in np.arange(0.1, 2.1, 0.1)]

param_grid = {'n_estimators': n_estimators,
               'learning_rate': learning_rate
}

model_cv = RandomizedSearchCV(estimator = regressor,
                              param_distributions = param_grid,
                              n_iter = 100,
                              scoring = 'f1_macro',
                              cv = folds,
                              verbose = 2,
                              random_state = 42,
                              n_jobs = -1)
'''

In [None]:
# Disabled cell (triple-quoted string): fit/report step for the AdaBoost
# randomized search.
# NOTE(review): `start` is never assigned in this cell, so `stop - start`
# would use a stale value from an earlier cell or raise NameError on a fresh
# kernel.
'''
model_cv.fit(train_X, train_y)  
stop = time.time()            
print(f"Training time: {stop - start}s")
print('Initial score: ', model_cv.best_score_)
print('Initial parameters: ', model_cv.best_params_)
'''

In [None]:
# Disabled cell (triple-quoted string): GridSearchCV refinement for AdaBoost,
# scored with R^2 and timed with time.time().
# Both candidate lists in param_grid are empty placeholders and must be
# filled in (the "best model" remark inside is also blank) before this can
# run.
'''
#best model:
#'learning_rate': , 'n_estimators': 
param_grid = {'n_estimators': [],
               'learning_rate': []
}

# Call GridSearchCV()
model_cv = GridSearchCV(estimator = regressor, 
                        param_grid = param_grid, 
                        scoring= 'r2', 
                        cv = folds, 
                        verbose = 1,
                        n_jobs = -1)

# Fit the model
start = time.time()            # Start Time
model_cv.fit(train_X, train_y)  
stop = time.time()             # End Time
print(f"Training time: {stop - start}s")
print('Improved score: ', model_cv.best_score_)
print('Improved parameters: ', model_cv.best_params_)
'''

In [None]:
# Disabled cell (triple-quoted string): feature-importance bar chart for the
# AdaBoost search result, saved as "ad_feature_imp.svg".
# NOTE(review): the plot title mentions "traffic volume prediction", whereas
# the target in this notebook is AQI - looks like a leftover from another
# project; update it before re-enabling. The save also happens before
# tight_layout().
'''
# Storing importance values from the trained model
importance = model_cv.best_estimator_.feature_importances_

# Storing feature importance as a dataframe
feature_imp = pd.DataFrame(list(zip(train_X.columns, importance)),
               columns = ['Feature', 'Importance'])

feature_imp = feature_imp.sort_values('Importance', ascending = False).reset_index(drop = True)
feature_imp = feature_imp[feature_imp['Importance'] > 0.001]
# Bar plot
plt.figure(figsize=(10, 5))
plt.barh(feature_imp['Feature'], feature_imp['Importance'], color = ['mediumslateblue', 'deeppink'])

plt.xlabel("Importance", fontsize = 12)
plt.ylabel("Input Feature", fontsize = 12)
plt.title('Which features are the most important for traffic volume prediction?', fontsize = 12) 
plt.yticks(fontsize = 8) # fontsize of yticks
plt.xticks(fontsize = 8) # fontsize of xticks
plt.savefig("ad_feature_imp.svg", bbox_inches="tight")
plt.tight_layout();
'''

In [None]:
# Disabled cell (triple-quoted string): would pickle the AdaBoost search
# object `model_cv`.
# NOTE(review): the output name is 'ad_traffic.pickle' although the other
# sections use '*_aqi.pickle', and the inner remark says "RF model" - both
# look like copy-paste leftovers; confirm the intended file name.
'''
#saving adaboost model
ad_pickle = open('ad_traffic.pickle', 'wb') 

# Write RF model to the file
pickle.dump(model_cv, ad_pickle) 

# Close the file
ad_pickle.close()
'''