In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
siteDF = pd.read_csv("/content/drive/MyDrive/FA21 CMPE 255 Term Project/aqs_sites.csv", encoding="utf8")
siteDF = siteDF.drop_duplicates(subset=['Address'])

**Read training data, years 2000-2017**

In [4]:
dfToTrain = pd.concat(
    map(pd.read_csv, ['/content/drive/MyDrive/FA21 CMPE 255 Term Project/CO/daily_42101_2000.csv',
                      '/content/drive/MyDrive/FA21 CMPE 255 Term Project/CO/daily_42101_2001.csv',
                      '/content/drive/MyDrive/FA21 CMPE 255 Term Project/CO/daily_42101_2002.csv',
                      '/content/drive/MyDrive/FA21 CMPE 255 Term Project/CO/daily_42101_2003.csv',
                      '/content/drive/MyDrive/FA21 CMPE 255 Term Project/CO/daily_42101_2004.csv',
                      '/content/drive/MyDrive/FA21 CMPE 255 Term Project/CO/daily_42101_2005.csv',
                      '/content/drive/MyDrive/FA21 CMPE 255 Term Project/CO/daily_42101_2006.csv',
                      '/content/drive/MyDrive/FA21 CMPE 255 Term Project/CO/daily_42101_2007.csv',
                      '/content/drive/MyDrive/FA21 CMPE 255 Term Project/CO/daily_42101_2008.csv',
                      '/content/drive/MyDrive/FA21 CMPE 255 Term Project/CO/daily_42101_2009.csv',
                      '/content/drive/MyDrive/FA21 CMPE 255 Term Project/CO/daily_42101_2010.csv',
                      '/content/drive/MyDrive/FA21 CMPE 255 Term Project/CO/daily_42101_2011.csv',
                      '/content/drive/MyDrive/FA21 CMPE 255 Term Project/CO/daily_42101_2012.csv',
                      '/content/drive/MyDrive/FA21 CMPE 255 Term Project/CO/daily_42101_2013.csv',
                      '/content/drive/MyDrive/FA21 CMPE 255 Term Project/CO/daily_42101_2014.csv',
                      '/content/drive/MyDrive/FA21 CMPE 255 Term Project/CO/daily_42101_2015.csv',
                      '/content/drive/MyDrive/FA21 CMPE 255 Term Project/CO/daily_42101_2016.csv',
                      '/content/drive/MyDrive/FA21 CMPE 255 Term Project/CO/daily_42101_2017.csv'
                      ]), ignore_index=True)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [5]:
dfToTrain.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4670368 entries, 0 to 4670367
Data columns (total 29 columns):
 #   Column               Dtype  
---  ------               -----  
 0   State Code           int64  
 1   County Code          int64  
 2   Site Num             int64  
 3   Parameter Code       int64  
 4   POC                  int64  
 5   Latitude             float64
 6   Longitude            float64
 7   Datum                object 
 8   Parameter Name       object 
 9   Sample Duration      object 
 10  Pollutant Standard   object 
 11  Date Local           object 
 12  Units of Measure     object 
 13  Event Type           object 
 14  Observation Count    int64  
 15  Observation Percent  float64
 16  Arithmetic Mean      float64
 17  1st Max Value        float64
 18  1st Max Hour         int64  
 19  AQI                  float64
 20  Method Code          float64
 21  Method Name          object 
 22  Local Site Name      object 
 23  Address              object 
 24

**Creating a function to preprocess the train data**

In [6]:
def preprocess(dataFile):
  # extract day, month, and year from date local
  dataFile['Date Local'] = pd.to_datetime(dataFile['Date Local'])
  dataFile['Day'] = dataFile['Date Local'].dt.day
  dataFile['Month'] = dataFile['Date Local'].dt.month
  dataFile['Year'] = dataFile['Date Local'].dt.year

  dfSpcMethod = pd.DataFrame()
  # select the most relevant sample duration
  dfSpcMethod = dataFile.loc[dataFile['Sample Duration'] == "8-HR RUN AVG END HOUR"]

  # drop irrelevant columns
  dfSpcMethodNoNa = dfSpcMethod.drop(columns=['Method Code', 'Date Local', 'Observation Count', 'Observation Percent'])
  dfSpcMethodNoNa = dfSpcMethodNoNa.select_dtypes(exclude=['object'])
  
  return dfSpcMethodNoNa


In [7]:
dfSpcMethodNoNa = pd.DataFrame()
dfSpcMethodNoNa = preprocess(dfToTrain)

**Select the x_train and y_train data to fit the regression models with**

In [8]:
y_train = dfSpcMethodNoNa['AQI']
x_train = dfSpcMethodNoNa.drop(columns=['AQI'])

In [9]:
x_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2335396 entries, 366 to 4670367
Data columns (total 13 columns):
 #   Column           Dtype  
---  ------           -----  
 0   State Code       int64  
 1   County Code      int64  
 2   Site Num         int64  
 3   Parameter Code   int64  
 4   POC              int64  
 5   Latitude         float64
 6   Longitude        float64
 7   Arithmetic Mean  float64
 8   1st Max Value    float64
 9   1st Max Hour     int64  
 10  Day              int64  
 11  Month            int64  
 12  Year             int64  
dtypes: float64(4), int64(9)
memory usage: 249.4 MB


In [10]:
x_train.head()

Unnamed: 0,State Code,County Code,Site Num,Parameter Code,POC,Latitude,Longitude,Arithmetic Mean,1st Max Value,1st Max Hour,Day,Month,Year
366,1,73,28,42101,1,33.529444,-86.850278,0.994737,1.1,5,1,1,2000
367,1,73,28,42101,1,33.529444,-86.850278,0.891667,1.4,19,2,1,2000
368,1,73,28,42101,1,33.529444,-86.850278,0.7,1.3,0,3,1,2000
369,1,73,28,42101,1,33.529444,-86.850278,1.120833,1.5,17,4,1,2000
370,1,73,28,42101,1,33.529444,-86.850278,1.095833,2.5,23,5,1,2000


In [11]:
y_train.isnull().sum()

0

**Read the test data, years 2018-2020**

In [12]:
# df2018 = pd.read_csv("/content/drive/MyDrive/FA21 CMPE 255 Term Project/CO/daily_42101_2018.csv", encoding="utf8")

dfToTest = pd.concat(
    map(pd.read_csv, ['/content/drive/MyDrive/FA21 CMPE 255 Term Project/CO/daily_42101_2018.csv',
                      '/content/drive/MyDrive/FA21 CMPE 255 Term Project/CO/daily_42101_2019.csv',
                      '/content/drive/MyDrive/FA21 CMPE 255 Term Project/CO/daily_42101_2020.csv',
                      ]), ignore_index=True)

In [13]:
nameOfParameter = dfToTest['Parameter Name'][0]
nameOfParameter

'Carbon monoxide'

In [14]:
df2018NoNa = pd.DataFrame()
df2018NoNa = preprocess(dfToTest)

**Select x_test and y_test data to predict AQI values and verify accuracy of model**

In [15]:
y_test = df2018NoNa['AQI']
x_test = df2018NoNa.drop(columns = ['AQI'])

In [16]:
x_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 282107 entries, 266 to 564208
Data columns (total 13 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   State Code       282107 non-null  int64  
 1   County Code      282107 non-null  int64  
 2   Site Num         282107 non-null  int64  
 3   Parameter Code   282107 non-null  int64  
 4   POC              282107 non-null  int64  
 5   Latitude         282107 non-null  float64
 6   Longitude        282107 non-null  float64
 7   Arithmetic Mean  282107 non-null  float64
 8   1st Max Value    282107 non-null  float64
 9   1st Max Hour     282107 non-null  int64  
 10  Day              282107 non-null  int64  
 11  Month            282107 non-null  int64  
 12  Year             282107 non-null  int64  
dtypes: float64(4), int64(9)
memory usage: 30.1 MB


**Random Forest Regression performs well but accuracy is low in comaprison to other models**

In [17]:
rfReg = RandomForestRegressor(max_depth=2)
rfReg.fit(x_train, y_train)
predictedValsRf = rfReg.predict(x_test)
print(rfReg.score(x_train, y_train))
print(rfReg.score(x_test, y_test))

0.795958466900497
0.7566475121281864


**Bayesian Regression performs well with a high accuracy score for test data**

In [18]:
bayesReg = linear_model.BayesianRidge()
bayesReg.fit(x_train, y_train)
predictedValsBayes = bayesReg.predict(x_test)
print(bayesReg.score(x_train, y_train))
print(bayesReg.score(x_test, y_test))

0.9958590943858727
0.9904037681761942


**Linear Regression performs well with a high accuracy score for test data**

In [19]:
linreg = LinearRegression()
linreg.fit(x_train, y_train)
predictedVals = linreg.predict(x_test)
print(linreg.score(x_train, y_train))
print(linreg.score(x_test, y_test))

0.9958590943858764
0.9904037676772358


**N estimators set to 30 for Gradient Boost gives high accuracy and takes relatively lesser time**

In [20]:
gbr = GradientBoostingRegressor(n_estimators=30)
gbr.fit(x_train, y_train)
values = gbr.predict(x_test)
print(gbr.score(x_test, y_test))

0.99409439028416


In [21]:
graphDataFrame = x_test[['State Code','County Code','Site Num','Latitude','Longitude','Month','Day','Year']]

In [22]:
graphDataFrame['AQI predicted'] = values
graphDataFrame['Parameter'] = "CO"

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [23]:
graphDataFrame.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 282107 entries, 266 to 564208
Data columns (total 10 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   State Code     282107 non-null  int64  
 1   County Code    282107 non-null  int64  
 2   Site Num       282107 non-null  int64  
 3   Latitude       282107 non-null  float64
 4   Longitude      282107 non-null  float64
 5   Month          282107 non-null  int64  
 6   Day            282107 non-null  int64  
 7   Year           282107 non-null  int64  
 8   AQI predicted  282107 non-null  float64
 9   Parameter      282107 non-null  object 
dtypes: float64(3), int64(6), object(1)
memory usage: 23.7+ MB


**Gradient Boost Regressor performs the best and is therefore chosen to predict AQI values**

In [24]:
graphDataFrame.to_csv('/content/drive/MyDrive/FA21 CMPE 255 Term Project/CO/CO_AQI_Predicted', index=False)