In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, accuracy_score

In [2]:
# Read in the data
df = pd.read_csv(Path('merged_air_dia.csv'))
# drop rows with null values
df = df.dropna()

# print out the dataframe
df

Unnamed: 0,city,pm25,pm10,population,color_pm10,color_pm25,uniquezip,state,data_value_type,data_value,low_confidence_limit,high_confidence_limit
0,Anchorage,5.650000,11.781883,380821,green,green,0203000-02020002501,Alaska,Crude prevalence,7.4,6.9,7.8
1,Tempe,7.350000,15.326874,161780,green,green,0473000-04013319907,Arizona,Crude prevalence,7.0,6.4,7.9
2,Yuma,8.300000,17.307898,195751,green,green,0485540,Arizona,Crude prevalence,11.1,11.0,11.3
3,Folsom,6.250000,13.033056,72199,green,green,0624638,California,Crude prevalence,7.1,7.0,7.3
4,San Francisco,7.500000,15.639667,870887,green,green,0667000,California,Crude prevalence,8.6,8.5,8.6
...,...,...,...,...,...,...,...,...,...,...,...,...
12249,Charleston,7.300000,15.222609,227078,green,green,5414600-54039002000,West Virginia,Crude prevalence,10.3,9.3,11.3
12250,Yakima,8.550000,17.829220,243231,green,green,5380010-53077002802,Washington,Crude prevalence,9.0,8.4,9.5
12251,Yakima,8.550000,17.829220,243231,green,green,5380010-53077001000,Washington,Crude prevalence,8.9,8.4,9.4
12252,Tacoma,7.066667,14.736042,198397,green,green,5370000-53053061400,Washington,Crude prevalence,13.0,12.5,13.6


In [3]:
# check types for each column
df.dtypes

city                      object
pm25                     float64
pm10                     float64
population                 int64
color_pm10                object
color_pm25                object
uniquezip                 object
state                     object
data_value_type           object
data_value               float64
low_confidence_limit     float64
high_confidence_limit    float64
dtype: object

In [4]:
# df['color_pm25'].nunique() # 2 unique colors
# df['color_pm10'].nunique() # 3 unique colors
# df['data_value_type'].nunique() # 2 unique types

# drop high and low confidence limits for machine learning model so there is no bias
df.drop(['low_confidence_limit','high_confidence_limit'],inplace=True)
df.head()

2

In [5]:
# Convert columns that are objects to integers using onehotencoder
enc = OneHotEncoder(sparse=False)

# Fit and transform the onehotencoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(df[['color_pm25','color_pm10','data_value_type']]))

# Add the encoded variable names to the dataframe
encode_df.columns = enc.get_feature_names(['color_pm25','color_pm10','data_value_type'])
encode_df.head()

Unnamed: 0,color_pm25_green,color_pm25_yellow,color_pm10_darkred,color_pm10_green,color_pm10_yellow,data_value_type_Age-adjusted prevalence,data_value_type_Crude prevalence
0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
1,1.0,0.0,0.0,1.0,0.0,0.0,1.0
2,1.0,0.0,0.0,1.0,0.0,0.0,1.0
3,1.0,0.0,0.0,1.0,0.0,0.0,1.0
4,1.0,0.0,0.0,1.0,0.0,0.0,1.0


In [6]:
# Use pandas.get_dummies to convert columns with a large amount of unique entries

# use the line below to drop the dash from unique zip
# df["uniquezip"] = df["uniquezip"].str.replace('-','')

dummies_df = pd.get_dummies(df[['city','uniquezip','state']], columns=['city', 'uniquezip', 'state'])
dummies_df.drop(['color_pm10','color_pm25','data_value_type'], axis=1, inplace=True)
dummies_df.head()

Unnamed: 0,pm25,pm10,population,data_value,low_confidence_limit,high_confidence_limit,city_Albany,city_Albuquerque,city_Allentown,city_Anaheim,...,state_Rhode Island,state_South Carolin,state_South Dakota,state_Tennessee,state_Texas,state_Utah,state_Vermont,state_Virginia,state_Washington,state_West Virginia
0,5.65,11.781883,380821,7.4,6.9,7.8,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,7.35,15.326874,161780,7.0,6.4,7.9,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,8.3,17.307898,195751,11.1,11.0,11.3,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,6.25,13.033056,72199,7.1,7.0,7.3,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,7.5,15.639667,870887,8.6,8.5,8.6,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# Merge encoded dataframes with the remaining columns
ml_df = df['pm10','pm25','population','data_value']
ml_df = pd.concat([encode_df, dummies_df],axis=1)
ml_df.head()

Unnamed: 0,color_pm25_green,color_pm25_yellow,color_pm10_darkred,color_pm10_green,color_pm10_yellow,data_value_type_Age-adjusted prevalence,data_value_type_Crude prevalence,pm25,pm10,population,...,state_Rhode Island,state_South Carolin,state_South Dakota,state_Tennessee,state_Texas,state_Utah,state_Vermont,state_Virginia,state_Washington,state_West Virginia
0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,5.65,11.781883,380821,...,0,0,0,0,0,0,0,0,0,0
1,1.0,0.0,0.0,1.0,0.0,0.0,1.0,7.35,15.326874,161780,...,0,0,0,0,0,0,0,0,0,0
2,1.0,0.0,0.0,1.0,0.0,0.0,1.0,8.3,17.307898,195751,...,0,0,0,0,0,0,0,0,0,0
3,1.0,0.0,0.0,1.0,0.0,0.0,1.0,6.25,13.033056,72199,...,0,0,0,0,0,0,0,0,0,0
4,1.0,0.0,0.0,1.0,0.0,0.0,1.0,7.5,15.639667,870887,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# check to make sure all of the data types are correct for ML
ml_df.dtypes

color_pm25_green       float64
color_pm25_yellow      float64
color_pm10_darkred     float64
color_pm10_green       float64
color_pm10_yellow      float64
                        ...   
state_Utah               uint8
state_Vermont            uint8
state_Virginia           uint8
state_Washington         uint8
state_West Virginia      uint8
Length: 12295, dtype: object

In [9]:
# Split preprocessed data into features and target arrays
y = ml_df['data_value'].values
X = ml_df.drop(['data_value'],1).values

  after removing the cwd from sys.path.


(12254, 12294)

In [None]:
# check to see the shape of X
X.shape

In [10]:
y.shape # this is one column so that is what we want

(12254,)

In [None]:
# Use standard scaler to help scale the data to train the model
scaler = StandardScaler()

# Want to scale the columns from data that are originally integers
cols_to_scale = ['pm10','pm25','population']
X_scaled = scaler.fit_transform(df[cols_to_scale])
X_scaled

In [None]:
# Now do we add the scaled data back into the ml_df or do we use the scaled data in the train_test_split
new_X = X.drop([cols_to_scale])
new_X = pd.concat([X_scaled],axis=1)
new_X # do we use this for the train_test_split

# do we need to use PCA next? or why did we do this in the crypto analysis

# for clustering see crypto assignment

In [11]:
# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [12]:
# Create an object for linear regression
model = LinearRegression()

# time how long it takes to fit the model
start = datetime.now()

# Fit the linear regression model to the training set
model.fit(X_train,y_train)
end = datetime.now()

# print total time to fit and the RAM and CPU for machine
time_to_fit = end - start
print('Time to fit the model:',time_to_fit)
print('This code was run on a computer with memory: 4 GB 1600 MHz DDR3 and processor: 1.6 GHz Dual-Core Intel Core i5.')

Time to fit the model: 0:20:04.934723
This code was run on a computer with memory: 4 GB 1600 MHz DDR3 and processor: 1.6 GHz Dual-Core Intel Core i5.


In [None]:
# Predict the test set results and time how long it takes to predict the model
start = datetime.now()
y_pred= model.predict(X_test)
end = datetime.now()

# print total time to predict and machine information
time_to_predict = end - start
print('Time to predict:',time_to_predict)
print('This code was run on a computer with memory: 4 GB 1600 MHz DDR3 and processor: 1.6 GHz Dual-Core Intel Core i5.')

In [None]:
# reshaped data? X = df.pm25.values.reshape(-1, 1)
# get the R-squared value
model.score(y_test,y_pred)

In [None]:
# calculate the accuracy score
acc_score = balanced_accuracy_score(y_test,y_pred)

In [None]:
# calculate mean squared error using y_test and y_pred
mse = mean_squared_error(y_test, y_pred)

In [None]:
#scatter plot of data comparing percent of people with diabetes to the particle matter
plt.scatter( ml_df.pm10,ml_df.data_value)
plt.xlabel('Data Value')
plt.ylabel('PM10')
plt.show()

In [None]:
#scatter plot of data comparing percent of people with diabetes to the particle matter
plt.scatter( ml_df.pm25,ml_df.data_value)
plt.xlabel('Data Value')
plt.ylabel('PM25')
plt.show()

In [None]:
#plot best fit line with data
plt.scatter(X, y)
plt.plot(X, y_pred, color='red')
plt.show()

In [None]:
#coef = slope
#intercept = y intercept
print(model.coef_)
print(model.intercept_)

In [None]:
#Multiple Regression 
X2 = df[['pm10', 'pm25']]
y2 = df['Data_Value']
# setting up model
regr = LinearRegression()
regr.fit(X2, y2)
# makes y predict
y_pred2 = regr.predict(X2)
print(y_pred2.shape)

In [None]:
#predict the % diabetes of a population where pm10 is 18 and pm2.5 is 14
predictedDiabetes = regr.predict([[18, 14]])
# predicted % of population with diabetes, if the pm10 is 18 and pm 2.5 is 14
print(predictedDiabetes)

In [None]:
#coef = slope
print(regr.coef_)

In [None]:
#intercept = y intercept
print(regr.intercept_)