In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import tensorflow as tf

Load dataset

In [2]:
data_df = pd.read_csv("https://raw.githubusercontent.com/joshuanallen/Air_Quality_Prediction/main/clean_data.csv")
data_df.head()

Unnamed: 0.1,Unnamed: 0,Census Tract,Total Population,ZIP,Longitude,Latitude,Ozone,PM2.5,Diesel PM,Pesticides,Tox. Release,Traffic,Asthma,Low Birth Weight,Cardiovascular Disease,Poverty,Unemployment
0,0,6019001100,3174,93706,-119.781696,36.709695,0.065,15.4,48.524,2.75,18551.95719,909.14,131.64,7.44,14.13,76.3,17.6
1,1,6071001600,6133,91761,-117.618013,34.05778,0.062,13.31,38.556,1.37,7494.236622,782.26,60.66,7.04,12.94,72.5,12.3
2,2,6019000200,3167,93706,-119.805504,36.735491,0.062,15.4,47.445,3.03,12454.94841,576.52,142.12,10.16,14.96,86.8,16.1
3,3,6077000801,6692,95203,-121.314524,37.940517,0.046,12.54,24.117,12.93,2387.782922,1305.01,142.17,6.23,14.72,61.3,19.6
4,4,6019001500,2206,93725,-119.717843,36.6816,0.065,15.4,18.846,3518.41,21790.70672,435.16,90.48,4.5,12.82,66.4,18.6


Drop unecessary columns

In [3]:
drop_loc_data_df = data_df[["Ozone", "PM2.5", "Diesel PM", "Pesticides", "Tox. Release", "Traffic", "Asthma", "Low Birth Weight", "Cardiovascular Disease", "Poverty", "Unemployment"]]
drop_loc_data_df.head()

Unnamed: 0,Ozone,PM2.5,Diesel PM,Pesticides,Tox. Release,Traffic,Asthma,Low Birth Weight,Cardiovascular Disease,Poverty,Unemployment
0,0.065,15.4,48.524,2.75,18551.95719,909.14,131.64,7.44,14.13,76.3,17.6
1,0.062,13.31,38.556,1.37,7494.236622,782.26,60.66,7.04,12.94,72.5,12.3
2,0.062,15.4,47.445,3.03,12454.94841,576.52,142.12,10.16,14.96,86.8,16.1
3,0.046,12.54,24.117,12.93,2387.782922,1305.01,142.17,6.23,14.72,61.3,19.6
4,0.065,15.4,18.846,3518.41,21790.70672,435.16,90.48,4.5,12.82,66.4,18.6


Check for missing data

In [4]:
print(drop_loc_data_df.isnull().sum())

Ozone                     0
PM2.5                     0
Diesel PM                 0
Pesticides                0
Tox. Release              0
Traffic                   0
Asthma                    0
Low Birth Weight          0
Cardiovascular Disease    0
Poverty                   0
Unemployment              0
dtype: int64


Summarize dataset

In [5]:
drop_loc_data_df.describe()

Unnamed: 0,Ozone,PM2.5,Diesel PM,Pesticides,Tox. Release,Traffic,Asthma,Low Birth Weight,Cardiovascular Disease,Poverty,Unemployment
count,7712.0,7712.0,7712.0,7712.0,7712.0,7712.0,7712.0,7712.0,7712.0,7712.0,7712.0
mean,0.047426,10.420606,19.348765,320.518047,3204.873215,953.325455,52.503613,4.980663,8.320344,36.412202,10.182923
std,0.01022,2.568878,16.469596,2822.298975,12652.368968,913.59337,30.330867,1.550364,2.929641,20.251424,4.957414
min,0.026,1.869001,0.021,0.0,0.0,22.41,3.63,0.0,1.56,1.0,0.0
25%,0.04,8.697944,9.172,0.0,103.800931,450.0475,30.42,3.96,6.14,19.2,6.6
50%,0.046,10.37,16.7255,0.0,489.912078,706.89,45.9,4.92,8.005,33.6,9.3
75%,0.055,12.05,24.74075,0.36,3571.025854,1207.335,66.43,5.93,10.08,51.6,12.8
max,0.068,19.6,208.4,91316.19,842751.3293,45687.87,223.3,14.89,21.26,94.9,60.5


Bin continuous variables for classification

In [9]:
# Asthma Binning
asthma_bins = [0, 25, 50, 75, 100, 125, 150, 175, 200, 225]

asthma_labels =[1,2,3,4,5,6,7,8,9]

drop_loc_data_df['Asthma_binned'] = pd.cut(drop_loc_data_df["Asthma"], bins = asthma_bins, labels = asthma_labels, include_lowest=True)


# Low birth weight binning
lbw_bins = [-1, 3, 6, 9, 12, 15]

lbw_labels =[1,2,3,4,5]

drop_loc_data_df['LBW_binned'] = pd.cut(drop_loc_data_df["Low Birth Weight"], bins = lbw_bins, labels = lbw_labels, include_lowest=True)


# Cardiovascular Disease Binning
cvd_bins = [0, 3, 6, 9, 12, 14, 16, 18, 20, 22]

cvd_labels =[1,2,3,4,5,6,7,8,9]

drop_loc_data_df['CVD_binned'] = pd.cut(drop_loc_data_df["Cardiovascular Disease"], bins = cvd_bins, labels = cvd_labels, include_lowest=True)


# Poverty
poverty_bins = [0, 20, 40, 60, 80, 100]

poverty_labels =[1,2,3,4,5]

drop_loc_data_df['Poverty_binned'] = pd.cut(drop_loc_data_df["Poverty"], bins = poverty_bins, labels = poverty_labels, include_lowest=True)


# Unemployment
ue_bins = [0, 5, 10, 15, 20, 25, 100]

ue_labels =[1,2,3,4,5,6]

drop_loc_data_df['Unemployment_binned'] = pd.cut(drop_loc_data_df["Unemployment"], bins = ue_bins, labels = ue_labels, include_lowest=True)


drop_loc_data_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stabl

Unnamed: 0,Ozone,PM2.5,Diesel PM,Pesticides,Tox. Release,Traffic,Asthma,Low Birth Weight,Cardiovascular Disease,Poverty,Unemployment,Asthma_binned,LBW_binned,CVD_binned,Poverty_binned,Unemployment_binned
0,0.065,15.4,48.524,2.75,18551.95719,909.14,131.64,7.44,14.13,76.3,17.6,6,3,6,4,4
1,0.062,13.31,38.556,1.37,7494.236622,782.26,60.66,7.04,12.94,72.5,12.3,3,3,5,4,3
2,0.062,15.4,47.445,3.03,12454.94841,576.52,142.12,10.16,14.96,86.8,16.1,6,4,6,5,4
3,0.046,12.54,24.117,12.93,2387.782922,1305.01,142.17,6.23,14.72,61.3,19.6,6,3,6,4,4
4,0.065,15.4,18.846,3518.41,21790.70672,435.16,90.48,4.5,12.82,66.4,18.6,4,2,5,4,4


Set feature and target variables

In [10]:
# Features
X_air_data = drop_loc_data_df[["Ozone", "PM2.5", "Diesel PM", "Pesticides", "Tox. Release", "Traffic"]]

# targets
y_Asthma = drop_loc_data_df["Asthma_binned"]
y_Low_birth_weight = drop_loc_data_df["LBW_binned"]
y_Cardiovascular_disease = drop_loc_data_df["CVD_binned"]
y_Poverty = drop_loc_data_df["Poverty_binned"]
y_Unemployment = drop_loc_data_df["Unemployment_binned"]

Standarize numerical variables

1. Split into training and testing datasets
2. Scale the feature data
3. Train and evaluate the Random Forest Classifier

In [11]:
X = X_air_data
y = y_Asthma

# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

# Create a StandardScaler instance
scaler = StandardScaler()
# scaler = MinMaxScaler()
# scaler = RobustScaler()


# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78)

# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

# Evaluate the model
y_pred = rf_model.predict(X_test_scaled)
print(f" Random forest predictive accuracy: {accuracy_score(y_test,y_pred):.3f}")

 Random forest predictive accuracy: 0.612


**Low Birth Weight** Random Forest Classifier model

In [12]:
X = X_air_data
y = y_Low_birth_weight

# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

# Create a StandardScaler instance
scaler = StandardScaler()
# scaler = MinMaxScaler()
# scaler = RobustScaler()


# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78)

# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

# Evaluate the model
y_pred = rf_model.predict(X_test_scaled)
print(f" Random forest predictive accuracy: {accuracy_score(y_test,y_pred):.3f}")

 Random forest predictive accuracy: 0.666


**Cardiovascular disease** Random Forest Classifier model

In [13]:
X = X_air_data
y = y_Cardiovascular_disease

# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

# Create a StandardScaler instance
scaler = StandardScaler()
# scaler = MinMaxScaler()
# scaler = RobustScaler()


# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78)

# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

# Evaluate the model
y_pred = rf_model.predict(X_test_scaled)
print(f" Random forest predictive accuracy: {accuracy_score(y_test,y_pred):.3f}")

 Random forest predictive accuracy: 0.599


**Poverty** Random Forest Classifier model


In [14]:
X = X_air_data
y = y_Poverty

# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

# Create a StandardScaler instance
scaler = StandardScaler()
# scaler = MinMaxScaler()
# scaler = RobustScaler()


# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78)

# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

# Evaluate the model
y_pred = rf_model.predict(X_test_scaled)
print(f" Random forest predictive accuracy: {accuracy_score(y_test,y_pred):.3f}")

 Random forest predictive accuracy: 0.493


**Unemployment** Random Forest Classifier model

In [15]:
X = X_air_data
y = y_Unemployment

# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

# Create a StandardScaler instance
scaler = StandardScaler()
# scaler = MinMaxScaler()
# scaler = RobustScaler()


# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78)

# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

# Evaluate the model
y_pred = rf_model.predict(X_test_scaled)
print(f" Random forest predictive accuracy: {accuracy_score(y_test,y_pred):.3f}")

 Random forest predictive accuracy: 0.442
