In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as skl
import tensorflow as tf
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression
import numpy as np


Load in dataset

In [2]:
# Source CSV for the cleaned dataset, hosted in the project's GitHub repository.
DATA_URL = "https://raw.githubusercontent.com/joshuanallen/Air_Quality_Prediction/main/datasets/clean_data.csv"

# Read the full dataset into memory, then preview the first rows.
data_df = pd.read_csv(DATA_URL)
data_df.head()

Unnamed: 0.1,Unnamed: 0,Census Tract,Total Population,ZIP,Longitude,Latitude,Ozone,PM2.5,Diesel PM,Pesticides,Tox. Release,Traffic,Asthma,Low Birth Weight,Cardiovascular Disease,Poverty,Unemployment
0,0,6019001100,3174,93706,-119.781696,36.709695,0.065,15.4,48.524,2.75,18551.95719,909.14,131.64,7.44,14.13,76.3,17.6
1,1,6071001600,6133,91761,-117.618013,34.05778,0.062,13.31,38.556,1.37,7494.236622,782.26,60.66,7.04,12.94,72.5,12.3
2,2,6019000200,3167,93706,-119.805504,36.735491,0.062,15.4,47.445,3.03,12454.94841,576.52,142.12,10.16,14.96,86.8,16.1
3,3,6077000801,6692,95203,-121.314524,37.940517,0.046,12.54,24.117,12.93,2387.782922,1305.01,142.17,6.23,14.72,61.3,19.6
4,4,6019001500,2206,93725,-119.717843,36.6816,0.065,15.4,18.846,3518.41,21790.70672,435.16,90.48,4.5,12.82,66.4,18.6


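Remove location, population, and leftover index columns from the dataset, keeping only the air-quality features and the target variables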

In [3]:
# Columns kept for modelling: the six air-quality measures first, then the
# five outcome measures. Every other column of data_df is left behind.
air_quality_columns = ["Ozone", "PM2.5", "Diesel PM", "Pesticides", "Tox. Release", "Traffic"]
outcome_columns = ["Asthma", "Low Birth Weight", "Cardiovascular Disease", "Poverty", "Unemployment"]

drop_loc_data_df = data_df[air_quality_columns + outcome_columns]
drop_loc_data_df.head()

Unnamed: 0,Ozone,PM2.5,Diesel PM,Pesticides,Tox. Release,Traffic,Asthma,Low Birth Weight,Cardiovascular Disease,Poverty,Unemployment
0,0.065,15.4,48.524,2.75,18551.95719,909.14,131.64,7.44,14.13,76.3,17.6
1,0.062,13.31,38.556,1.37,7494.236622,782.26,60.66,7.04,12.94,72.5,12.3
2,0.062,15.4,47.445,3.03,12454.94841,576.52,142.12,10.16,14.96,86.8,16.1
3,0.046,12.54,24.117,12.93,2387.782922,1305.01,142.17,6.23,14.72,61.3,19.6
4,0.065,15.4,18.846,3518.41,21790.70672,435.16,90.48,4.5,12.82,66.4,18.6


Split dataset into feature and target variable datasets

Features:
1. Ozone
2. PM2.5
3. Diesel PM
4. Pesticides
5. Tox. Release
6. Traffic

We chose five target variables and build a separate predictive model for each one.

Targets:
1. Asthma
2. Low Birth Weight
3. Cardiovascular Disease
4. Poverty
5. Unemployment



In [4]:
# Feature matrix: the six air-quality columns, in the order used by the models below.
X_air_data = drop_loc_data_df.loc[
    :, ["Ozone", "PM2.5", "Diesel PM", "Pesticides", "Tox. Release", "Traffic"]
]

# One target Series per outcome column, unpacked in the same order as the names listed.
(
    y_Asthma,
    y_Low_birth_weight,
    y_Cardiovascular_disease,
    y_Poverty,
    y_unemployment,
) = (
    drop_loc_data_df[target_name]
    for target_name in (
        "Asthma",
        "Low Birth Weight",
        "Cardiovascular Disease",
        "Poverty",
        "Unemployment",
    )
)

Scale feature data using `StandardScaler()` on X_air_data


In [5]:
# Standardise the feature matrix before fitting any model.

# Scaler under test. The alternatives below can be swapped in for comparison
# (RobustScaler is not imported at the top of the notebook).
scaler = StandardScaler()
# scaler = MinMaxScaler()
# scaler = RobustScaler()

# Learn the scaling parameters from X_air_data and apply them in one call.
scaled_data = scaler.fit_transform(X_air_data)

# Wrap the scaled array in a DataFrame that reuses the original column labels.
transformed_scaled_data = pd.DataFrame(data=scaled_data, columns=X_air_data.columns)
transformed_scaled_data.head()

Unnamed: 0,Ozone,PM2.5,Diesel PM,Pesticides,Tox. Release,Traffic
0,1.719711,1.938479,1.771575,-0.112599,1.21306,-0.048368
1,1.426149,1.124842,1.166299,-0.113088,0.339039,-0.187257
2,1.426149,1.938479,1.706056,-0.1125,0.731142,-0.41247
3,-0.13951,0.825081,0.289536,-0.108992,-0.064584,0.384971
4,1.719711,1.938479,-0.030529,1.133154,1.469056,-0.56721


Create the scaled feature data

In [6]:
# Scaled feature matrix used by every regression below (all six air-quality columns).
X_air_data_scaled = transformed_scaled_data.loc[
    :, ["Ozone", "PM2.5", "Diesel PM", "Pesticides", "Tox. Release", "Traffic"]
]

# Single-feature column vectors, currently unused; kept for one-feature-at-a-time experiments.
# X_ozone_scaled = X_air_data_scaled["Ozone"].values.reshape(-1,1)
# X_pm25_scaled = X_air_data_scaled["PM2.5"].values.reshape(-1,1)
# X_diesel_pm_scaled = X_air_data_scaled["Diesel PM"].values.reshape(-1,1)
# X_pesticides_scaled = X_air_data_scaled["Pesticides"].values.reshape(-1,1)
# X_tox_release_scaled = X_air_data_scaled["Tox. Release"].values.reshape(-1,1)
# X_traffic_scaled = X_air_data_scaled["Traffic"].values.reshape(-1,1)

Build regression models for all 5 target variables

**Asthma** Linear Regression Model

In [7]:
# Linear regression of the Asthma target on the six scaled air-quality features.
X = X_air_data_scaled
y = y_Asthma

# fit() returns the estimator itself, so construction and fitting can be chained.
model = LinearRegression().fit(X, y)

# Fitted parameters and in-sample R^2 (scored on the same data used for fitting).
coefficients, intercept = model.coef_, model.intercept_
r_squared = model.score(X, y)

# Display labels in the same order as the columns of X.
feature_labels = ("Ozone", "PM 2.5", "Diesel PM", "Pesticides", "Tox. Release", "Traffic")
for label, coefficient in zip(feature_labels, coefficients):
    print(f"{label} coefficient: {coefficient:.3f}")
print(f"Y-Intercept: {intercept:.3f}")
print(f"R-square: {r_squared}")

# Adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1), with n observations and p features.
n_obs, n_features = len(y), X.shape[1]
adjusted_r_squared = 1 - (1 - r_squared) * (n_obs - 1) / (n_obs - n_features - 1)
print(f'Adjusted R-Square {adjusted_r_squared}')

Ozone coefficient: 3.309
PM 2.5 coefficient: 0.267
Diesel PM coefficient: 6.852
Pesticides coefficient: 0.234
Tox. Release coefficient: -1.571
Traffic coefficient: -1.869
Y-Intercept: 52.504
R-square: 0.05411954503733396
Adjusted R-Square 0.053382973625292984


**Low Birth Weight** Linear Regression Model

In [8]:
# Linear regression of the Low Birth Weight target on the six scaled air-quality features.
X = X_air_data_scaled
y = y_Low_birth_weight

# fit() returns the estimator itself, so construction and fitting can be chained.
model = LinearRegression().fit(X, y)

# Fitted parameters and in-sample R^2 (scored on the same data used for fitting).
coefficients, intercept = model.coef_, model.intercept_
r_squared = model.score(X, y)

# Display labels in the same order as the columns of X.
feature_labels = ("Ozone", "PM 2.5", "Diesel PM", "Pesticides", "Tox. Release", "Traffic")
for label, coefficient in zip(feature_labels, coefficients):
    print(f"{label} coefficient: {coefficient:.3f}")
print(f"Y-Intercept: {intercept:.3f}")
print(f"R-square: {r_squared}")

# Adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1), with n observations and p features.
n_obs, n_features = len(y), X.shape[1]
adjusted_r_squared = 1 - (1 - r_squared) * (n_obs - 1) / (n_obs - n_features - 1)
print(f'Adjusted R-Square {adjusted_r_squared}')

Ozone coefficient: 0.151
PM 2.5 coefficient: 0.148
Diesel PM coefficient: 0.251
Pesticides coefficient: -0.003
Tox. Release coefficient: -0.016
Traffic coefficient: 0.051
Y-Intercept: 4.981
R-square: 0.055828417853251655
Adjusted R-Square 0.05509317716631068


**Cardiovascular Disease** Linear Regression Model

In [9]:
# Linear regression of the Cardiovascular Disease target on the six scaled air-quality features.
X = X_air_data_scaled
y = y_Cardiovascular_disease

# fit() returns the estimator itself, so construction and fitting can be chained.
model = LinearRegression().fit(X, y)

# Fitted parameters and in-sample R^2 (scored on the same data used for fitting).
coefficients, intercept = model.coef_, model.intercept_
r_squared = model.score(X, y)

# Display labels in the same order as the columns of X.
feature_labels = ("Ozone", "PM 2.5", "Diesel PM", "Pesticides", "Tox. Release", "Traffic")
for label, coefficient in zip(feature_labels, coefficients):
    print(f"{label} coefficient: {coefficient:.3f}")
print(f"Y-Intercept: {intercept:.3f}")
print(f"R-square: {r_squared}")

# Adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1), with n observations and p features.
n_obs, n_features = len(y), X.shape[1]
adjusted_r_squared = 1 - (1 - r_squared) * (n_obs - 1) / (n_obs - n_features - 1)
print(f'Adjusted R-Square {adjusted_r_squared}')

Ozone coefficient: 1.277
PM 2.5 coefficient: -0.111
Diesel PM coefficient: 0.234
Pesticides coefficient: 0.011
Tox. Release coefficient: -0.047
Traffic coefficient: -0.141
Y-Intercept: 8.320
R-square: 0.17137509930493577
Adjusted R-Square 0.170729836565913


**Poverty** Linear Regression Model

In [10]:
# Linear regression of the Poverty target on the six scaled air-quality features.
X = X_air_data_scaled
y = y_Poverty

# fit() returns the estimator itself, so construction and fitting can be chained.
model = LinearRegression().fit(X, y)

# Fitted parameters and in-sample R^2 (scored on the same data used for fitting).
coefficients, intercept = model.coef_, model.intercept_
r_squared = model.score(X, y)

# Display labels in the same order as the columns of X.
feature_labels = ("Ozone", "PM 2.5", "Diesel PM", "Pesticides", "Tox. Release", "Traffic")
for label, coefficient in zip(feature_labels, coefficients):
    print(f"{label} coefficient: {coefficient:.3f}")
print(f"Y-Intercept: {intercept:.3f}")
print(f"R-square: {r_squared}")

# Adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1), with n observations and p features.
n_obs, n_features = len(y), X.shape[1]
adjusted_r_squared = 1 - (1 - r_squared) * (n_obs - 1) / (n_obs - n_features - 1)
print(f'Adjusted R-Square {adjusted_r_squared}')

Ozone coefficient: 5.093
PM 2.5 coefficient: 1.647
Diesel PM coefficient: 4.642
Pesticides coefficient: 1.103
Tox. Release coefficient: 0.400
Traffic coefficient: 0.283
Y-Intercept: 36.412
R-square: 0.12523889570487912
Adjusted R-Square 0.12455770600653115


**Unemployment** Linear Regression Model

In [11]:
# Linear regression of the Unemployment target on the six scaled air-quality features.
X = X_air_data_scaled
y = y_unemployment

# fit() returns the estimator itself, so construction and fitting can be chained.
model = LinearRegression().fit(X, y)

# Fitted parameters and in-sample R^2 (scored on the same data used for fitting).
coefficients, intercept = model.coef_, model.intercept_
r_squared = model.score(X, y)

# Display labels in the same order as the columns of X.
feature_labels = ("Ozone", "PM 2.5", "Diesel PM", "Pesticides", "Tox. Release", "Traffic")
for label, coefficient in zip(feature_labels, coefficients):
    print(f"{label} coefficient: {coefficient:.3f}")
print(f"Y-Intercept: {intercept:.3f}")
print(f"R-square: {r_squared}")

# Adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1), with n observations and p features.
n_obs, n_features = len(y), X.shape[1]
adjusted_r_squared = 1 - (1 - r_squared) * (n_obs - 1) / (n_obs - n_features - 1)
print(f'Adjusted R-Square {adjusted_r_squared}')

Ozone coefficient: 1.522
PM 2.5 coefficient: 0.145
Diesel PM coefficient: 0.353
Pesticides coefficient: 0.021
Tox. Release coefficient: -0.129
Traffic coefficient: -0.146
Y-Intercept: 10.183
R-square: 0.09980972892536788
Adjusted R-Square 0.0991087371503584
