In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, classification_report

In [2]:
# Read in the data
df = pd.read_csv(Path('merged_air_asthma.csv'))
# drop rows with null values
df = df.dropna()

# print out the dataframe
df

Unnamed: 0,city,pm25,pm10,population,color_pm10,color_pm25,uniquezip,state,data_value_type,data_value
0,Livermore,7.500000,15.639667,89115,green,green,0641992,California,Crude prevalence,8.9
1,Las Vegas,9.100000,18.976129,2270187,green,green,3240000,Nevada,Crude prevalence,10.1
2,Camden,9.700000,20.227303,79877,yellow,green,3410000,New Jersey,Crude prevalence,11.2
3,Cleveland,9.700000,20.227303,1772872,yellow,green,3916000-39035103900,Ohio,Crude prevalence,10.6
4,Scranton,8.200000,17.099369,378605,green,green,4269000-42069101200,Pennsylvania,Crude prevalence,10.6
...,...,...,...,...,...,...,...,...,...,...
12249,Yakima,8.550000,17.829220,243231,green,green,5380010,Washington,Crude prevalence,11.3
12250,Yakima,8.550000,17.829220,243231,green,green,5380010-53077001000,Washington,Crude prevalence,11.1
12251,Tacoma,7.066667,14.736042,198397,green,green,5370000-53053063300,Washington,Crude prevalence,11.3
12252,Tacoma,7.066667,14.736042,198397,green,green,5370000,Washington,Age-adjusted prevalence,11.3


In [3]:
# create data frame for the columns that we do not need to scale or convert
unchanged_df = df[['pm25','pm10','population','data_value']]
unchanged_df.head()

Unnamed: 0,pm25,pm10,population,data_value
0,7.5,15.639667,89115,8.9
1,9.1,18.976129,2270187,10.1
2,9.7,20.227303,79877,11.2
3,9.7,20.227303,1772872,10.6
4,8.2,17.099369,378605,10.6


In [4]:
# check types for each column
df.dtypes

city                object
pm25               float64
pm10               float64
population           int64
color_pm10          object
color_pm25          object
uniquezip           object
state               object
data_value_type     object
data_value         float64
dtype: object

In [5]:
# Convert columns that are objects to integers using onehotencoder
enc = OneHotEncoder(sparse=False)

# Fit and transform the onehotencoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(df[['color_pm25','color_pm10','data_value_type']]))

# Add the encoded variable names to the dataframe
encode_df.columns = enc.get_feature_names(['color_pm25','color_pm10','data_value_type'])
encode_df.head()

Unnamed: 0,color_pm25_green,color_pm25_yellow,color_pm10_darkred,color_pm10_green,color_pm10_yellow,data_value_type_Age-adjusted prevalence,data_value_type_Crude prevalence
0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
1,1.0,0.0,0.0,1.0,0.0,0.0,1.0
2,1.0,0.0,0.0,0.0,1.0,0.0,1.0
3,1.0,0.0,0.0,0.0,1.0,0.0,1.0
4,1.0,0.0,0.0,1.0,0.0,0.0,1.0


In [6]:
# Use pandas.get_dummies to convert columns with a large amount of unique entries

dummies_df = pd.get_dummies(df[['city','state']], columns=['city', 'state'])

dummies_df.head()

Unnamed: 0,city_Albany,city_Albuquerque,city_Allentown,city_Anaheim,city_Anchorage,city_Apple Valley,city_Atlanta,city_Auburn,city_Baltimore,city_Baton Rouge,...,state_Rhode Island,state_South Carolin,state_South Dakota,state_Tennessee,state_Texas,state_Utah,state_Vermont,state_Virginia,state_Washington,state_West Virginia
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# Merge encoded dataframes with the remaining columns
ml_df = pd.concat([encode_df, unchanged_df, dummies_df],axis=1) # took out dummies_df
ml_df.head()

Unnamed: 0,color_pm25_green,color_pm25_yellow,color_pm10_darkred,color_pm10_green,color_pm10_yellow,data_value_type_Age-adjusted prevalence,data_value_type_Crude prevalence,pm25,pm10,population,...,state_Rhode Island,state_South Carolin,state_South Dakota,state_Tennessee,state_Texas,state_Utah,state_Vermont,state_Virginia,state_Washington,state_West Virginia
0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,7.5,15.639667,89115,...,0,0,0,0,0,0,0,0,0,0
1,1.0,0.0,0.0,1.0,0.0,0.0,1.0,9.1,18.976129,2270187,...,0,0,0,0,0,0,0,0,0,0
2,1.0,0.0,0.0,0.0,1.0,0.0,1.0,9.7,20.227303,79877,...,0,0,0,0,0,0,0,0,0,0
3,1.0,0.0,0.0,0.0,1.0,0.0,1.0,9.7,20.227303,1772872,...,0,0,0,0,0,0,0,0,0,0
4,1.0,0.0,0.0,1.0,0.0,0.0,1.0,8.2,17.099369,378605,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# check to make sure all of the data types are correct for ML
ml_df.dtypes

color_pm25_green       float64
color_pm25_yellow      float64
color_pm10_darkred     float64
color_pm10_green       float64
color_pm10_yellow      float64
                        ...   
state_Utah               uint8
state_Vermont            uint8
state_Virginia           uint8
state_Washington         uint8
state_West Virginia      uint8
Length: 170, dtype: object

In [9]:
# Use standard scaler to help scale the data to train the model
scaler = StandardScaler()

# Want to scale the columns from data that are originally integers
cols_to_scale = ['pm10','pm25','population']
scaled_data = scaler.fit_transform(ml_df[cols_to_scale])
scaled_df = pd.DataFrame(scaled_data, columns=cols_to_scale)
scaled_df

Unnamed: 0,pm10,pm25,population
0,-0.318275,-0.318275,-0.924888
1,0.513507,0.513507,-0.169448
2,0.825426,0.825426,-0.928088
3,0.825426,0.825426,-0.341699
4,0.045629,0.045629,-0.824620
...,...,...,...
12249,0.227582,0.227582,-0.871508
12250,0.227582,0.227582,-0.871508
12251,-0.543550,-0.543550,-0.887037
12252,-0.543550,-0.543550,-0.887037


In [16]:
# drop the original columns that we scaled and add in the scaled columns to our ml_df
ml_df.drop(columns=cols_to_scale,inplace=True)
ml_df = pd.concat([ml_df, scaled_df],axis=1)
ml_df

Unnamed: 0,color_pm25_green,color_pm25_yellow,color_pm10_darkred,color_pm10_green,color_pm10_yellow,data_value_type_Age-adjusted prevalence,data_value_type_Crude prevalence,data_value,city_Albany,city_Albuquerque,...,state_Tennessee,state_Texas,state_Utah,state_Vermont,state_Virginia,state_Washington,state_West Virginia,pm10,pm25,population
0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,8.9,0,0,...,0,0,0,0,0,0,0,-0.318275,-0.318275,-0.924888
1,1.0,0.0,0.0,1.0,0.0,0.0,1.0,10.1,0,0,...,0,0,0,0,0,0,0,0.513507,0.513507,-0.169448
2,1.0,0.0,0.0,0.0,1.0,0.0,1.0,11.2,0,0,...,0,0,0,0,0,0,0,0.825426,0.825426,-0.928088
3,1.0,0.0,0.0,0.0,1.0,0.0,1.0,10.6,0,0,...,0,0,0,0,0,0,0,0.825426,0.825426,-0.341699
4,1.0,0.0,0.0,1.0,0.0,0.0,1.0,10.6,0,0,...,0,0,0,0,0,0,0,0.045629,0.045629,-0.824620
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12249,1.0,0.0,0.0,1.0,0.0,0.0,1.0,11.3,0,0,...,0,0,0,0,0,1,0,0.227582,0.227582,-0.871508
12250,1.0,0.0,0.0,1.0,0.0,0.0,1.0,11.1,0,0,...,0,0,0,0,0,1,0,0.227582,0.227582,-0.871508
12251,1.0,0.0,0.0,1.0,0.0,0.0,1.0,11.3,0,0,...,0,0,0,0,0,1,0,-0.543550,-0.543550,-0.887037
12252,1.0,0.0,0.0,1.0,0.0,1.0,0.0,11.3,0,0,...,0,0,0,0,0,1,0,-0.543550,-0.543550,-0.887037


In [17]:
# Split preprocessed data into features and target arrays
y = ml_df['data_value'].values
X = ml_df.drop(['data_value'],1).values

  This is separate from the ipykernel package so we can avoid doing imports until


In [18]:
# check to see the shape of X
X.shape

(12254, 169)

In [19]:
y.shape # this is one column so that is what we want

(12254,)

In [20]:
# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [21]:
# Create an object for linear regression
model = LinearRegression()

# time how long it takes to fit the model
start = datetime.now()

# Fit the linear regression model to the training set
model.fit(X_train,y_train)
end = datetime.now()

# print total time to fit and the RAM and CPU for machine
time_to_fit = end - start
print('Time to fit the model:',time_to_fit)
print('This code was run on a computer with memory: 4 GB 1600 MHz DDR3 and processor: 1.6 GHz Dual-Core Intel Core i5.')

Time to fit the model: 0:00:00.131867
This code was run on a computer with memory: 4 GB 1600 MHz DDR3 and processor: 1.6 GHz Dual-Core Intel Core i5.


In [22]:
# Predict the test set results
y_pred= model.predict(X_test)

In [23]:
# print summary stats to get r squared score
from statsmodels.api import OLS
OLS(y_test, X_test).fit().summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.503
Model:,OLS,Adj. R-squared:,0.474
Method:,Least Squares,F-statistic:,17.88
Date:,"Mon, 23 May 2022",Prob (F-statistic):,3.6e-263
Time:,21:04:11,Log-Likelihood:,-4070.1
No. Observations:,2451,AIC:,8404.0
Df Residuals:,2319,BIC:,9170.0
Df Model:,131,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,3.5179,0.126,27.833,0.000,3.270,3.766
x2,2.9612,0.199,14.885,0.000,2.571,3.351
x3,0.9173,0.215,4.267,0.000,0.496,1.339
x4,2.9379,0.132,22.186,0.000,2.678,3.198
x5,2.6238,0.129,20.325,0.000,2.371,2.877
x6,3.0985,0.188,16.470,0.000,2.730,3.467
x7,3.3806,0.120,28.285,0.000,3.146,3.615
x8,-0.4030,0.965,-0.417,0.676,-2.296,1.490
x9,0.2133,0.175,1.218,0.223,-0.130,0.557

0,1,2,3
Omnibus:,94.295,Durbin-Watson:,2.0
Prob(Omnibus):,0.0,Jarque-Bera (JB):,109.368
Skew:,0.453,Prob(JB):,1.7799999999999997e-24
Kurtosis:,3.501,Cond. No.,2.54e+16


In [24]:
# get more summary stats to see mean squared error
import sklearn.metrics as metrics
def regression_results(y_test, y_pred):

    # Regression metrics
    explained_variance=metrics.explained_variance_score(y_test, y_pred)
    mean_absolute_error=metrics.mean_absolute_error(y_test, y_pred) 
    mse=metrics.mean_squared_error(y_test, y_pred) 
    mean_squared_log_error=metrics.mean_squared_log_error(y_test, y_pred)
    median_absolute_error=metrics.median_absolute_error(y_test, y_pred)
    r2=metrics.r2_score(y_test, y_pred)

    print('explained_variance: ', round(explained_variance,4))    
    print('mean_squared_log_error: ', round(mean_squared_log_error,4))
    print('r2: ', round(r2,4))
    print('MAE: ', round(mean_absolute_error,4))
    print('MSE: ', round(mse,4))
    print('RMSE: ', round(np.sqrt(mse),4))

regression_results(y_test,y_pred)

explained_variance:  0.4745
mean_squared_log_error:  0.0135
r2:  0.4742
MAE:  1.0083
MSE:  1.7138
RMSE:  1.3091
