In [122]:
#import the libraries
import pandas as pd
import numpy as np
from datetime import datetime as dt
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn import metrics

In [2]:
# IAQ Data Exploration

#import the data
fileStr = 'Mantes la jolie Inside (inside) (48.990902 1.71929) Primary 60_minute_average 3_11_2019 10_25_2020.csv'

custom_parser = lambda date: dt.strptime(date, '%Y-%m-%d %H:%M:%S UTC')

df_data = pd.read_csv(fileStr, parse_dates=['created_at'], date_parser=custom_parser)

In [3]:
df_data.describe()

Unnamed: 0,PM1.0_CF1_ug/m3,PM2.5_CF1_ug/m3,PM10.0_CF1_ug/m3,UptimeMinutes,RSSI_dbm,Temperature_F,Humidity_%,PM2.5_ATM_ug/m3,Unnamed: 9
count,9983.0,9983.0,9983.0,9983.0,9983.0,9983.0,9983.0,9983.0,0.0
mean,4.103229,6.169224,7.055959,25006.658975,-60.19733,82.579285,31.105014,6.031272,
std,5.136841,7.329539,8.010919,19749.664939,3.771609,4.915265,5.346126,6.361829,
min,0.0,0.0,0.03,1.9,-74.17,66.1,16.0,0.0,
25%,1.15,2.16,2.75,8143.0,-62.03,78.97,27.37,2.16,
50%,2.43,3.83,4.55,19623.0,-59.2,82.23,31.0,3.83,
75%,5.13,7.51,8.59,39147.0,-57.47,85.8,35.0,7.5,
max,75.21,113.18,150.69,71503.0,-37.0,101.3,46.27,75.99,


In [4]:
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10054 entries, 0 to 10053
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   created_at        10054 non-null  datetime64[ns]
 1   PM1.0_CF1_ug/m3   9983 non-null   float64       
 2   PM2.5_CF1_ug/m3   9983 non-null   float64       
 3   PM10.0_CF1_ug/m3  9983 non-null   float64       
 4   UptimeMinutes     9983 non-null   float64       
 5   RSSI_dbm          9983 non-null   float64       
 6   Temperature_F     9983 non-null   float64       
 7   Humidity_%        9983 non-null   float64       
 8   PM2.5_ATM_ug/m3   9983 non-null   float64       
 9   Unnamed: 9        0 non-null      float64       
dtypes: datetime64[ns](1), float64(9)
memory usage: 785.6 KB


In [5]:
df_data.head(5)

Unnamed: 0,created_at,PM1.0_CF1_ug/m3,PM2.5_CF1_ug/m3,PM10.0_CF1_ug/m3,UptimeMinutes,RSSI_dbm,Temperature_F,Humidity_%,PM2.5_ATM_ug/m3,Unnamed: 9
0,2019-03-11 15:00:00,1.72,4.36,4.59,1.9,-41.9,66.1,26.5,4.36,
1,2019-03-11 16:00:00,28.74,78.08,122.29,20.12,-37.0,74.15,21.45,53.16,
2,2019-03-12 14:00:00,31.89,93.42,150.69,15.03,-50.92,71.83,22.94,64.02,
3,2019-03-12 15:00:00,0.13,0.26,0.26,42.0,-51.88,75.62,21.0,0.26,
4,2020-03-28 09:00:00,34.77,52.74,57.56,11.22,-53.39,73.13,30.74,40.19,


In [6]:
df_data['day'] = df_data['created_at'].dt.day
df_data['month'] = df_data['created_at'].dt.month
df_data['year'] = df_data['created_at'].dt.year

In [7]:
df_data['day'].head() 

0    11
1    11
2    12
3    12
4    28
Name: day, dtype: int64

In [8]:
#drop unnamed column
df_data.drop('Unnamed: 9', axis=1, inplace=True)
# df_data.sample()

In [9]:
#is it weekend or not?
df_data['dayOfWeek'] = (df_data['created_at'].dt.dayofweek // 5 ==1).astype(float)

In [10]:
#is it weekend or not?
df_data['hour'] = df_data['created_at'].dt.hour

In [11]:
df_data['hour'].sample(5)

4283    21
6021     7
9541    23
3651    13
7316     6
Name: hour, dtype: int64

In [12]:
#create time of the day
timeOfDay = {'earlyMorning':0, 'morning':1, 'afternoon':2, 'night':3}
df_data['timeOfDay'] = 0
df_data.loc[(df_data['hour'] > 0) & (df_data['hour'] <= 6), 'timeOfDay'] = 0;
df_data.loc[(df_data['hour'] > 6) & (df_data['hour'] <= 12), 'timeOfDay'] = 1;
df_data.loc[(df_data['hour'] > 13) & (df_data['hour'] <= 18), 'timeOfDay'] = 2;
df_data.loc[(df_data['hour'] > 18) & (df_data['hour'] <= 23), 'timeOfDay'] = 3;

In [13]:
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10054 entries, 0 to 10053
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   created_at        10054 non-null  datetime64[ns]
 1   PM1.0_CF1_ug/m3   9983 non-null   float64       
 2   PM2.5_CF1_ug/m3   9983 non-null   float64       
 3   PM10.0_CF1_ug/m3  9983 non-null   float64       
 4   UptimeMinutes     9983 non-null   float64       
 5   RSSI_dbm          9983 non-null   float64       
 6   Temperature_F     9983 non-null   float64       
 7   Humidity_%        9983 non-null   float64       
 8   PM2.5_ATM_ug/m3   9983 non-null   float64       
 9   day               10054 non-null  int64         
 10  month             10054 non-null  int64         
 11  year              10054 non-null  int64         
 12  dayOfWeek         10054 non-null  float64       
 13  hour              10054 non-null  int64         
 14  timeOfDay         1005

In [16]:
df_data['dayOfWeek'].head()

0    0.0
1    0.0
2    0.0
3    0.0
4    1.0
Name: dayOfWeek, dtype: float64

In [39]:
df_PM = df_data['PM2.5_CF1_ug/m3']

In [64]:
#concatenate data 
dataFrame_PM = pd.concat([df_data['dayOfWeek'], df_data['timeOfDay'], df_PM.shift(2), df_PM.shift(1), df_PM], axis=1)

dataFrame_PM.columns = ['dayOfWeek','timeOfDay', 'PM_lag2', 'PM_lag1', 'PM2.5' ]

In [65]:
#drop rows
dataFrame_PM = dataFrame_PM.dropna()
dataFrame_PM.head()

Unnamed: 0,dayOfWeek,timeOfDay,PM_lag2,PM_lag1,PM2.5
2,0.0,2,4.36,78.08,93.42
3,0.0,2,78.08,93.42,0.26
4,1.0,1,93.42,0.26,52.74
5,1.0,1,0.26,52.74,80.49
6,1.0,1,52.74,80.49,76.0


In [94]:
#extract the train and test part
trainSize = int(len(dataFrame_PM)* 0.75)

trainSet, testSet = dataFrame_PM.iloc[0: trainSize], dataFrame_PM.iloc[trainSize:len(dataFrame_PM)]

In [100]:
#print sizes
print('Total Observations: {}'.format(len(dataFrame_PM)))

print('Training Observations: {}'.format(len(trainSet)))

print('Testing Observations: {}'.format(len(testSet)))

Total Observations: 9969
Training Observations: 7476
Testing Observations: 2493


In [117]:
predictors = ['dayOfWeek', 'timeOfDay', 'PM_lag2', 'PM_lag1']
target = ['PM2.5']

X_train, X_test, y_train, y_test = trainSet[predictors].values, testSet[predictors].values, trainSet[target].values, testSet[target].values

In [118]:
#use random forest
regressor = RandomForestRegressor(n_estimators=20, random_state=0)
regressor.fit(X_train, y_train.ravel())
y_pred = regressor.predict(X_test)

In [121]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

Mean Absolute Error: 1.9523499763623864
Mean Squared Error: 38.03315313742697
Root Mean Squared Error: 6.167102491237435
