In [110]:
# Imports
import pymssql
import pandas as pd
import numpy as np
from config import database
from config import username
from config import password
from config import server
from config import asthma_table
from config import aq_table
from config import census_table
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from joblib import dump
from sklearn.impute import SimpleImputer

In [111]:
# Read in the data from the SQL Database
conn = pymssql.connect(server,username,password,database)
cursor = conn.cursor()
query = f"""
    SELECT * FROM {asthma_table}
    INNER JOIN {aq_table} ON {aq_table}.AQ_ID = {asthma_table}.AQ_ID 
    WHERE {asthma_table}.YEAR != 2021
"""
df = pd.read_sql(query,conn)
print(len(df))
df.head(10)



247


Unnamed: 0,ASTHMA_ID,COUNTY_ID,YEAR,NUM_ED_VISITS,AGE_ADJ_ED_VISITS,AQ_ID,AQ_ID.1,COUNTY_ID.1,YEAR.1,LEAD_MEAN,...,PM25_METRIC_ID,PM25_UNITS_ID,SO2_MEAN,SO2_1STMAX,SO2_99PERC,SO2_STD,SO2_2NDMAX,SO2_METHOD_ID,SO2_METRIC_ID,SO2_UNITS_ID
0,220,862,2018,1770,38.0,767,767,862,2018,,...,2,2,,,,,,3,2,2
1,49,809,2015,3281,60.7,768,768,809,2015,,...,2,2,,,,,,3,2,2
2,119,65,2017,198,55.1,769,769,65,2017,,...,2,2,,,,,,3,2,2
3,236,46,2019,5910,54.2,770,770,46,2019,,...,2,2,,,,,,3,2,2
4,74,603,2016,487,79.4,771,771,603,2016,,...,2,2,,,,,,3,2,2
5,100,1737,2016,5534,29.1,772,772,1737,2016,,...,2,2,0.537047,1.8,1.6,0.381402,1.7,3,2,2
6,19,3118,2015,52227,53.1,773,773,3118,2015,0.012374,...,2,2,1.849615,21.666667,8.3,1.838851,9.7,3,2,2
7,94,2343,2016,10068,30.9,774,774,2343,2016,,...,2,2,0.22535,1.2,0.65,0.154843,0.65,3,2,2
8,147,586,2017,10076,43.2,775,775,586,2017,0.005121,...,2,2,0.617355,2.5,1.9,0.414118,2.3,3,2,2
9,87,2491,2016,8878,29.2,776,776,2491,2016,,...,2,2,0.465903,3.3,2.1,0.496436,3.2,3,2,2


In [112]:
# Exploratory
df.drop(['ASTHMA_ID',"NUM_ED_VISITS","COUNTY_ID","AQ_ID","YEAR"],axis=1, inplace=True, errors="ignore")
params = ["LEAD","NO2","OZONE","PM10","PM25","SO2"]
for param in params:
    df.drop([f"{param}_METHOD_ID",f"{param}_METRIC_ID",f"{param}_UNITS_ID"],axis=1,inplace=True,errors="ignore")

for column in df.columns:
    nan=df[df[column].isna()]
    print(column,f"{1-len(nan)/len(df):.2f}")

AGE_ADJ_ED_VISITS 1.00
LEAD_MEAN 0.07
LEAD_1STMAX 0.07
LEAD_99PERC 0.07
LEAD_STD 0.07
LEAD_2NDMAX 0.07
NO2_MEAN 0.66
NO2_1STMAX 0.66
NO2_99PERC 0.66
NO2_STD 0.66
NO2_2NDMAX 0.66
OZONE_MEAN 0.97
OZONE_1STMAX 0.97
OZONE_99PERC 0.97
OZONE_STD 0.97
OZONE_2NDMAX 0.97
PM10_MEAN 0.06
PM10_1STMAX 0.06
PM10_99PERC 0.06
PM10_STD 0.06
PM10_2NDMAX 0.06
PM25_MEAN 0.43
PM25_1STMAX 0.43
PM25_99PERC 0.43
PM25_STD 0.43
PM25_2NDMAX 0.43
SO2_MEAN 0.17
SO2_1STMAX 0.17
SO2_99PERC 0.17
SO2_STD 0.17
SO2_2NDMAX 0.17


In [113]:
### Clean it
# Drop unnecessary columns
df.drop(['ASTHMA_ID',"NUM_ED_VISITS","COUNTY_ID","AQ_ID","YEAR"],axis=1, inplace=True, errors="ignore")
params = ["LEAD","NO2","OZONE","PM10","PM25","SO2"]
for param in params:
    df.drop([f"{param}_METHOD_ID",f"{param}_METRIC_ID",f"{param}_UNITS_ID"],axis=1,inplace=True,errors="ignore")

# Remove columns where at least 50% of the values are missing
df.dropna(axis=1,thresh=len(df)*0.4,inplace=True)
# print(df.columns)

# # Remove any rows that still have nulls
df.dropna(axis=0,how="any",inplace=True)

df[:]

Unnamed: 0,AGE_ADJ_ED_VISITS,NO2_MEAN,NO2_1STMAX,NO2_99PERC,NO2_STD,NO2_2NDMAX,OZONE_MEAN,OZONE_1STMAX,OZONE_99PERC,OZONE_STD,OZONE_2NDMAX,PM25_MEAN,PM25_1STMAX,PM25_99PERC,PM25_STD,PM25_2NDMAX
3,54.2,12.650939,39.325000,33.325000,7.373577,35.825000,0.041853,0.093000,0.076500,0.011707,0.086250,6.133333,25.500000,25.500000,3.794034,17.200
5,29.1,23.385278,51.650000,46.000000,9.673512,50.250000,0.044737,0.088250,0.077750,0.011794,0.082750,8.045263,22.700000,22.700000,3.834162,20.300
6,53.1,29.475561,73.842857,62.385714,12.842313,67.307143,0.054857,0.112308,0.099923,0.017255,0.105385,10.928321,51.410000,40.335000,6.861900,42.685
7,30.9,19.314232,57.666667,46.666667,10.879726,50.555556,0.050642,0.085727,0.080727,0.010884,0.082818,8.451769,23.533333,19.866667,3.427879,18.600
8,43.2,17.986095,50.114286,41.800000,9.269775,43.857143,0.060352,0.114364,0.104545,0.017974,0.111636,12.455025,46.150000,40.600000,7.090441,40.875
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
240,65.7,23.236892,61.200000,53.100000,12.087939,59.750000,0.049152,0.095500,0.086000,0.013842,0.091750,10.794251,41.050000,32.900000,6.765846,34.825
241,67.0,19.062693,55.100000,48.625000,9.693162,50.500000,0.055849,0.108800,0.098000,0.018154,0.101400,16.477120,84.025000,77.400000,15.593288,72.100
243,60.2,19.701047,47.450000,42.100000,9.399910,43.750000,0.043386,0.079250,0.072000,0.011732,0.077250,9.082204,31.150000,22.200000,4.833899,22.225
245,42.2,15.176731,51.900000,40.500000,8.251028,47.400000,0.052732,0.094750,0.082250,0.012370,0.088750,13.773410,411.700000,192.500000,32.422845,299.900


In [114]:
### Create a training set that's 75% of your dat set and a complementary test set with the remaining 25%. Specify random_state = 0 ###
X = df.drop('AGE_ADJ_ED_VISITS',axis=1)
y = df['AGE_ADJ_ED_VISITS']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

In [115]:
### Run a Random Forest Regressor algorithm and compare ###

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
# Optimization - Could try using RandomizedSearchCV to speed this up

param_grid= {
    'n_estimators': [100,500,1000,1500,2000,2500,3000],
    'max_features': ['auto','sqrt','log2'],
    'max_depth': [8,9,10,11,12,None],
    'min_samples_split':[2,3],
    'min_samples_leaf':[1,2],
    'bootstrap':[True,False]
}

rf=RandomForestRegressor(random_state=0)
rf_grid=GridSearchCV(estimator=rf,param_grid=param_grid,n_jobs=-1,verbose=3,cv=5, error_score='raise')

rf_grid.fit(X_train, y_train)
print(f"The best parameters are: {rf_grid.best_params_}")

y_predict_forest = rf_grid.predict(X_test)

Fitting 5 folds for each of 1008 candidates, totalling 5040 fits
The best parameters are: {'bootstrap': True, 'max_depth': 9, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 3000}


In [116]:
# Save the model
dump(rf_grid, 'random_forest_model.model')


['random_forest_model.model']

In [117]:
# Read in the data from the SQL Database
df = pd.read_sql(query,conn)

# Drop unnecessary columns
df.drop(['ASTHMA_ID',"NUM_ED_VISITS","COUNTY_ID","AQ_ID","YEAR"],axis=1, inplace=True, errors="ignore")
params = ["LEAD","NO2","OZONE","PM10","PM25","SO2"]
for param in params:
    df.drop([f"{param}_METHOD_ID",f"{param}_METRIC_ID",f"{param}_UNITS_ID"],axis=1,inplace=True,errors="ignore")

# Remove columns where at least 50% of the values are missing
df.dropna(axis=1,thresh=len(df)*0.4,inplace=True)

# Create a training set that's 75% of your dat set and a complementary test set with the remaining 25%. Specify random_state = 0 ###
X = df.drop('AGE_ADJ_ED_VISITS',axis=1)
y = df['AGE_ADJ_ED_VISITS']
X_train, X_test_imputed, y_train, y_test_imputed = train_test_split(X, y, test_size=0.25, random_state=0)

# impute any rows that still have nulls
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp = imp.fit(X_train)
X_train = imp.transform(X_train)
X_test_imputed = imp.transform(X_test_imputed)

### Run a Random Forest Regressor algorithm and compare ###
param_grid= {
    'n_estimators': [100,500,1000,1500,2000,2500,3000],
    'max_features': ['auto','sqrt','log2'],
    'max_depth': [8,9,10,11,12,None],
    'min_samples_split':[2,3],
    'min_samples_leaf':[1,2],
    'bootstrap':[True,False]
}
rf=RandomForestRegressor(random_state=0)
rf_grid_imputed=GridSearchCV(estimator=rf,param_grid=param_grid,n_jobs=-1,verbose=3,cv=5, error_score='raise')
rf_grid_imputed.fit(X_train, y_train)
print(f"The best parameters are: {rf_grid.best_params_}")
y_predict_forest_imputed = rf_grid.predict(X_test_imputed)

# Calculating Mean Square Error for both Linear Regression and Random Forest Regressor
MSE_forest=0
MSE_forest_imputed=0
for i in range(len(y_predict_forest)):
    MSE_forest += (list(y_test)[i]-y_predict_forest[i])**2/len(y_test)
    MSE_forest_imputed += (list(y_test_imputed)[i]-y_predict_forest_imputed[i])**2/len(y_test_imputed)
print(f"The MSE for forest was {MSE_forest:.2f}, and MSE for forest imputed was {MSE_forest_imputed:.2f}")
print(f"The score for non-imputed is {rf_grid.score(X_test,y_test)}")
print(f"The score for imputed is {rf_grid_imputed.score(X_test_imputed,y_test_imputed)}")




Fitting 5 folds for each of 1008 candidates, totalling 5040 fits
The best parameters are: {'bootstrap': True, 'max_depth': 9, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 3000}
The MSE for forest was 153.82, and MSE for forest imputed was 43.90




The score for non-imputed is 0.19595873502884698
The score for imputed is 0.053058349803256744
