# Model Development

### Prepping data for analysis

In [225]:
!pip3 install tensorflow scikit-learn pandas matplotlib seaborn numpy 



In [226]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.preprocessing import MinMaxScaler

In [227]:
# Read all the CSV files in the directory
data_dir = '../data'
all_files = [os.path.join(data_dir, f) for f in os.listdir(data_dir) if f.endswith('.csv')]
df_list = [pd.read_csv(f) for f in all_files]
data = pd.concat(df_list, ignore_index=True)

# Sort by datatimestamp
data = data.sort_values(by='Date').reset_index(drop=True)

# Display the first few rows of the combined dataframe
data.head()

Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH,year_month
0,2004-01-04,0:00:00,1.6,1143.0,106.0,6.3,825.0,96.0,986.0,86.0,1477.0,978.0,12.0,61.6,0.8593,2004-01
1,2004-01-04,2:00:00,1.1,1034.0,71.0,4.1,716.0,50.0,1085.0,55.0,1405.0,891.0,10.7,67.2,0.863,2004-01
2,2004-01-04,3:00:00,0.9,956.0,72.0,4.0,713.0,,1099.0,,1422.0,849.0,9.0,73.1,0.8394,2004-01
3,2004-01-04,4:00:00,0.7,909.0,44.0,2.4,615.0,57.0,1237.0,49.0,1322.0,790.0,10.2,66.6,0.8299,2004-01
4,2004-01-04,5:00:00,0.9,996.0,45.0,2.9,648.0,64.0,1176.0,50.0,1340.0,852.0,11.0,63.7,0.8325,2004-01


In [228]:
data.isna().sum()

Date                0
Time                0
CO(GT)            700
PT08.S1(CO)       206
NMHC(GT)         3538
C6H6(GT)          206
PT08.S2(NMHC)     206
NOx(GT)           810
PT08.S3(NOx)      206
NO2(GT)           812
PT08.S4(NO2)      206
PT08.S5(O3)       206
T                 206
RH                206
AH                206
year_month          0
dtype: int64

### Feature Engineering

In [229]:
# Creating Date related columns from 'Date' column
data['Date'] = pd.to_datetime(data['Date'], format='%Y-%m-%d')
data['Year'] = data['Date'].dt.year
data['Month'] = data['Date'].dt.month
data['Day'] = data['Date'].dt.day
data['DayOfWeek'] = data['Date'].dt.dayofweek
data['IsWeekend'] = data['DayOfWeek'].apply(lambda x: 1 if x >= 5 else 0)

# Convert time column to datetime
data['Time'] = pd.to_datetime(data['Time'], format='%H:%M:%S')
data['Hour'] = data['Time'].dt.hour

data = data.drop(columns=['Time', 'year_month', 'Date'])

data.head()

Unnamed: 0,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH,Year,Month,Day,DayOfWeek,IsWeekend,Hour
0,1.6,1143.0,106.0,6.3,825.0,96.0,986.0,86.0,1477.0,978.0,12.0,61.6,0.8593,2004,1,4,6,1,0
1,1.1,1034.0,71.0,4.1,716.0,50.0,1085.0,55.0,1405.0,891.0,10.7,67.2,0.863,2004,1,4,6,1,2
2,0.9,956.0,72.0,4.0,713.0,,1099.0,,1422.0,849.0,9.0,73.1,0.8394,2004,1,4,6,1,3
3,0.7,909.0,44.0,2.4,615.0,57.0,1237.0,49.0,1322.0,790.0,10.2,66.6,0.8299,2004,1,4,6,1,4
4,0.9,996.0,45.0,2.9,648.0,64.0,1176.0,50.0,1340.0,852.0,11.0,63.7,0.8325,2004,1,4,6,1,5


In [230]:
# Creating Season column
def get_season(month):
    if month in [12, 1, 2]:
        return 'winter'
    elif month in [3, 4, 5]:
        return 'spring'
    elif month in [6, 7, 8]:
        return 'summer'
    else:
        return 'fall'

data['Season'] = data['Month'].apply(get_season)

# One-hot encoding for Season
data = pd.get_dummies(data, columns=['Season'], drop_first=True)

In [231]:
# One hot encoding for Year
data = pd.get_dummies(data, columns=['Year'], drop_first=True)

In [232]:
# Cyclical Encoding for hour
data['Hour_sin'] = np.sin(2 * np.pi * data['Hour'] / 24)
data['Hour_cos'] = np.cos(2 * np.pi * data['Hour'] / 24)
data = data.drop(columns=['Hour'])

In [233]:
# Cycilical Encoding for Month
data['Month_sin'] = np.sin(2 * np.pi * data['Month'] / 12)
data['Month_cos'] = np.cos(2 * np.pi * data['Month'] / 12)
data = data.drop(columns=['Month'])

In [234]:
# Handle NaN values
data.fillna(data.mean(), inplace=True)

In [235]:
data.columns

Index(['CO(GT)', 'PT08.S1(CO)', 'NMHC(GT)', 'C6H6(GT)', 'PT08.S2(NMHC)',
       'NOx(GT)', 'PT08.S3(NOx)', 'NO2(GT)', 'PT08.S4(NO2)', 'PT08.S5(O3)',
       'T', 'RH', 'AH', 'Day', 'DayOfWeek', 'IsWeekend', 'Season_spring',
       'Season_summer', 'Season_winter', 'Year_2005', 'Hour_sin', 'Hour_cos',
       'Month_sin', 'Month_cos'],
      dtype='object')

### Baseline Prediction

In [236]:
# Naive pred using prev value
targets = ['CO(GT)', 'C6H6(GT)', 'NMHC(GT)', 'NOx(GT)', 'NO2(GT)']  # add others as needed
baseline_results = {}

for target in targets:
    y = data[target].fillna(method='ffill')  
    y_pred = y.shift(1).iloc[1:]            
    y_true = y.iloc[1:]

    mae = mean_absolute_error(y_true, y_pred)
    rmse = mean_squared_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    baseline_results[target] = {'MAE': mae, 'RMSE': rmse, 'R2': r2}
    
baseline_results_df = pd.DataFrame(baseline_results).T
baseline_results_df

  y = data[target].fillna(method='ffill')
  y = data[target].fillna(method='ffill')
  y = data[target].fillna(method='ffill')
  y = data[target].fillna(method='ffill')
  y = data[target].fillna(method='ffill')


Unnamed: 0,MAE,RMSE,R2
CO(GT),0.715873,1.249437,0.163001
C6H6(GT),3.880327,36.774404,0.23917
NMHC(GT),15.160911,4176.09314,0.272041
NOx(GT),79.901954,18134.807847,0.407981
NO2(GT),21.90123,1090.107564,0.391304


### Define Features and Target Variable

In [237]:
# Define features and target variable (CO)
targetCO = data['CO(GT)']
otherSensorsCO = ['PT08.S2(NMHC)', 'PT08.S3(NOx)', 'PT08.S4(NO2)', 'PT08.S5(O3)']
featuresCO = data[['PT08.S1(CO)', 'T', 'RH', 'AH', 'Year_2005', 'Day', 'DayOfWeek', 'IsWeekend', 'Hour_sin', 'Hour_cos', 'Month_sin', 'Month_cos', 'Season_spring', 'Season_summer'] + otherSensorsCO]

In [238]:
# Define features and target variable (NOx)
targetNOx = data['NOx(GT)']
otherSensorsNOx = ['PT08.S2(NMHC)', 'PT08.S1(CO)', 'PT08.S4(NO2)', 'PT08.S5(O3)']
featuresNOx = data[['PT08.S3(NOx)', 'T', 'RH', 'AH', 'Year_2005', 'Day', 'DayOfWeek', 'IsWeekend', 'Hour_sin', 'Hour_cos', 'Month_sin', 'Month_cos', 'Season_spring', 'Season_summer'] + otherSensorsNOx]

In [239]:
# Define features and target variable (NMHC)
targetNMHC = data['NMHC(GT)']
otherSensorsNMHC = ['PT08.S1(CO)', 'PT08.S3(NOx)', 'PT08.S4(NO2)', 'PT08.S5(O3)']
featuresNMHC = data[['PT08.S2(NMHC)', 'T', 'RH', 'AH', 'Year_2005', 'Day', 'DayOfWeek', 'IsWeekend', 'Hour_sin', 'Hour_cos', 'Month_sin', 'Month_cos', 'Season_spring', 'Season_summer'] + otherSensorsNMHC]

In [240]:
# Define features and target variable (NO2)
targetNO2 = data['NO2(GT)']
otherSensorsNO2 = ['PT08.S1(CO)', 'PT08.S2(NMHC)', 'PT08.S3(NOx)', 'PT08.S5(O3)']
featuresNO2 = data[['PT08.S4(NO2)', 'T', 'RH', 'AH', 'Year_2005', 'Day', 'DayOfWeek', 'IsWeekend', 'Hour_sin', 'Hour_cos', 'Month_sin', 'Month_cos', 'Season_spring', 'Season_summer'] + otherSensorsNO2]

In [241]:
# Define features and target variable (C6H6)
targetC6H6 = data['C6H6(GT)']
otherSensorsC6H6 = ['PT08.S1(CO)', 'PT08.S2(NMHC)', 'PT08.S3(NOx)', 'PT08.S4(NO2)']
featuresC6H6 = data[['PT08.S5(O3)', 'T', 'RH', 'AH', 'Year_2005', 'Day', 'DayOfWeek', 'IsWeekend', 'Hour_sin', 'Hour_cos', 'Month_sin', 'Month_cos', 'Season_spring', 'Season_summer'] + otherSensorsC6H6]

### Random Forest for Forecasting

In [242]:
# Random forest for CO prediction
# Split the data into train and test sets
# Using a time-based split to respect temporal order
split_idx = int(len(featuresCO) * 0.8)
X_train, X_test = featuresCO.iloc[:split_idx], featuresCO.iloc[split_idx:]
y_train, y_test = targetCO.iloc[:split_idx], targetCO.iloc[split_idx:]

# Initialize and train the Random Forest model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Predict on the test set
y_predCO = rf.predict(X_test)

# Evaluate the model
mseCO = mean_squared_error(y_test, y_predCO)
r2CO = r2_score(y_test, y_predCO)
maeCO = np.mean(np.abs(y_test - y_predCO))

print(f"Random Forest MSE: {mseCO:.4f}")
print(f"Random Forest R2: {r2CO:.4f}")
print(f"Random Forest MAE: {maeCO:.4f}")

Random Forest MSE: 0.6567
Random Forest R2: 0.6531
Random Forest MAE: 0.5339


In [243]:
print(y_train.describe())

count    3300.000000
mean        2.102068
std         1.177624
min         0.100000
25%         1.300000
50%         2.136398
75%         2.500000
max         9.400000
Name: CO(GT), dtype: float64


In [244]:
# Random forest for NOx prediction
# Split the data into train and test sets
# Using a time-based split to respect temporal order
split_idx = int(len(featuresNOx) * 0.8)
X_train, X_test = featuresNOx.iloc[:split_idx], featuresNOx.iloc[split_idx:]
y_train, y_test = targetNOx.iloc[:split_idx], targetNOx.iloc[split_idx:]

# Initialize and train the Random Forest model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Predict on the test set
y_predNOx = rf.predict(X_test)

# Evaluate the model
mseNOx = mean_squared_error(y_test, y_predNOx)
r2NOx = r2_score(y_test, y_predNOx)
maeNOx = np.mean(np.abs(y_test - y_predNOx))

print(f"Random Forest NOx MSE: {mseNOx:.4f}")
print(f"Random Forest NOx R2: {r2NOx:.4f}")
print(f"Random Forest NOx MAE: {maeNOx:.4f}")

Random Forest NOx MSE: 35739.4641
Random Forest NOx R2: 0.2711
Random Forest NOx MAE: 129.4883


In [245]:
print(y_train.describe())

count    3300.000000
mean      199.650957
std       144.916482
min         2.000000
25%        95.000000
50%       190.000000
75%       231.379071
max      1247.000000
Name: NOx(GT), dtype: float64


In [246]:
# Random forest for NMHC prediction
# Split the data into train and test sets
# Using a time-based split to respect temporal order
split_idx = int(len(featuresNMHC) * 0.8)
X_train, X_test = featuresNMHC.iloc[:split_idx], featuresNMHC.iloc[split_idx:]
y_train, y_test = targetNMHC.iloc[:split_idx], targetNMHC.iloc[split_idx:]

# Initialize and train the Random Forest model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Predict on the test set
y_predNMHC = rf.predict(X_test)

# Evaluate the model
mseNMHC = mean_squared_error(y_test, y_predNMHC)
r2NMHC = r2_score(y_test, y_predNMHC)
maeNMHC = np.mean(np.abs(y_test - y_predNMHC))

print(f"Random Forest NMHC MSE: {mseNMHC:.4f}")
print(f"Random Forest NMHC R2: {r2NMHC:.4f}")
print(f"Random Forest NMHC MAE: {maeNMHC:.4f}")

Random Forest NMHC MSE: 25550.3967
Random Forest NMHC R2: -31629859089887759990535747534848.0000
Random Forest NMHC MAE: 102.2592


In [247]:
print(y_train.describe())

count    3300.000000
mean      205.751701
std        84.711793
min         7.000000
25%       205.751701
50%       205.751701
75%       205.751701
max      1084.000000
Name: NMHC(GT), dtype: float64


In [248]:
# Random forest for NO2 prediction
# Split the data into train and test sets
# Using a time-based split to respect temporal order
split_index = int(0.8 * len(data))
X_train, X_test = featuresNO2.iloc[:split_index], featuresNO2.iloc[split_index:]
y_train, y_test = targetNO2.iloc[:split_index], targetNO2.iloc[split_index:]

# Initialize and train the Random Forest model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Predict on the test set
y_predNO2 = rf.predict(X_test)

# Evaluate the model
mseNO2 = mean_squared_error(y_test, y_predNO2)
r2NO2 = r2_score(y_test, y_predNO2)
maeNO2 = np.mean(np.abs(y_test - y_predNO2))

print(f"Random Forest NO2 MSE: {mseNO2:.4f}")
print(f"Random Forest NO2 R2: {r2NO2:.4f}")
print(f"Random Forest NO2 MAE: {maeNO2:.4f}")

Random Forest NO2 MSE: 2985.9885
Random Forest NO2 R2: -0.1669
Random Forest NO2 MAE: 40.9113


In [249]:
print(y_train.describe())

count    3300.000000
mean       98.776701
std        32.651140
min         5.000000
25%        76.000000
50%       109.101690
75%       112.000000
max       233.000000
Name: NO2(GT), dtype: float64


In [250]:
# Random forest for C6H6 prediction
# Split the data into train and test sets
# Using a time-based split to respect temporal order
split_idx = int(len(featuresC6H6) * 0.8)
X_train, X_test = featuresC6H6.iloc[:split_idx], featuresC6H6.iloc[split_idx:]
y_train, y_test = targetC6H6.iloc[:split_idx], targetC6H6.iloc[split_idx:]

# Initialize and train the Random Forest model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Predict on the test set
y_predC6H6 = rf.predict(X_test)

# Evaluate the model
mseC6H6 = mean_squared_error(y_test, y_predC6H6)
r2C6H6 = r2_score(y_test, y_predC6H6)
maeC6H6 = np.mean(np.abs(y_test - y_predC6H6))

print(f"Random Forest C6H6 MSE: {mseC6H6:.4f}")
print(f"Random Forest C6H6 R2: {r2C6H6:.4f}")
print(f"Random Forest C6H6 MAE: {maeC6H6:.4f}")

Random Forest C6H6 MSE: 0.0011
Random Forest C6H6 R2: 1.0000
Random Forest C6H6 MAE: 0.0145


In [251]:
print(y_train.describe())

count    3300.000000
mean       10.344737
std         7.139922
min         0.300000
25%         5.100000
50%         8.900000
75%        13.700000
max        48.200000
Name: C6H6(GT), dtype: float64


### LSTM  

In [252]:
featuresCO

Unnamed: 0,PT08.S1(CO),T,RH,AH,Year_2005,Day,DayOfWeek,IsWeekend,Hour_sin,Hour_cos,Month_sin,Month_cos,Season_spring,Season_summer,PT08.S2(NMHC),PT08.S3(NOx),PT08.S4(NO2),PT08.S5(O3)
0,1143.0,12.0,61.6,0.8593,False,4,6,1,0.000000,1.000000e+00,5.000000e-01,0.866025,False,False,825.0,986.0,1477.0,978.0
1,1034.0,10.7,67.2,0.8630,False,4,6,1,0.500000,8.660254e-01,5.000000e-01,0.866025,False,False,716.0,1085.0,1405.0,891.0
2,956.0,9.0,73.1,0.8394,False,4,6,1,0.707107,7.071068e-01,5.000000e-01,0.866025,False,False,713.0,1099.0,1422.0,849.0
3,909.0,10.2,66.6,0.8299,False,4,6,1,0.866025,5.000000e-01,5.000000e-01,0.866025,False,False,615.0,1237.0,1322.0,790.0
4,996.0,11.0,63.7,0.8325,False,4,6,1,0.965926,2.588190e-01,5.000000e-01,0.866025,False,False,648.0,1176.0,1340.0,852.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4121,972.0,9.2,75.3,0.8764,True,3,5,1,1.000000,6.123234e-17,-2.449294e-16,1.000000,False,False,542.0,1029.0,1056.0,443.0
4122,992.0,9.4,73.1,0.8621,True,3,5,1,0.965926,-2.588190e-01,-2.449294e-16,1.000000,False,False,565.0,912.0,1074.0,547.0
4123,1053.0,10.6,66.1,0.8444,True,3,5,1,0.866025,-5.000000e-01,-2.449294e-16,1.000000,False,False,671.0,807.0,1137.0,570.0
4124,1140.0,12.6,58.5,0.8517,True,3,5,1,0.707107,-7.071068e-01,-2.449294e-16,1.000000,False,False,800.0,679.0,1209.0,782.0


In [253]:
#LSTM pred for CO

# Features (all numeric columns)
X = featuresCO.values  # shape: (num_samples, num_features)

# Target
y = targetCO.values.reshape(-1, 1)  # make it 2D for scaler

# Scale features
scaler_X = MinMaxScaler()
X_scaled = scaler_X.fit_transform(X)

scaler_y = MinMaxScaler()
y_scaled = scaler_y.fit_transform(y)

# Function to create sequences
def create_sequences(X, y, seq_length):
    X_seq, y_seq = [], []
    for i in range(len(X) - seq_length):
        X_seq.append(X[i:i+seq_length])
        y_seq.append(y[i+seq_length])
    return np.array(X_seq), np.array(y_seq)

seq_length = 10  # number of past timesteps to use
X_seq, y_seq = create_sequences(X_scaled, y_scaled, seq_length)

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_seq, y_seq, test_size=0.2, shuffle=False)

# Build LSTM model
model = Sequential()
model.add(LSTM(50, activation='relu', input_shape=(seq_length, X_seq.shape[2])))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse')

# Train the model
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.1)

# Predict
y_pred = model.predict(X_test)

# Inverse scale to original CO values
y_test_inv = scaler_y.inverse_transform(y_test)
y_pred_inv = scaler_y.inverse_transform(y_pred)

# Calculate RMSE
rmseLSTMCO = np.sqrt(mean_squared_error(y_test_inv, y_pred_inv))
# Calculate R²
r2LSTMCO = r2_score(y_test_inv, y_pred_inv)
# Calculate MAE
maeLSTMCO = mean_absolute_error(y_test_inv, y_pred_inv)

print(f"LSTM CO RMSE: {rmseLSTMCO:.4f}")
print(f"LSTM CO R2: {r2LSTMCO:.4f}")
print(f"LSTM CO MAE: {maeLSTMCO:.4f}")


Epoch 1/50


  super().__init__(**kwargs)


[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 0.0148 - val_loss: 0.0118
Epoch 2/50
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0125 - val_loss: 0.0107
Epoch 3/50
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0120 - val_loss: 0.0102
Epoch 4/50
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0118 - val_loss: 0.0109
Epoch 5/50
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0117 - val_loss: 0.0099
Epoch 6/50
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0114 - val_loss: 0.0099
Epoch 7/50
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0111 - val_loss: 0.0099
Epoch 8/50
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0111 - val_loss: 0.0103
Epoch 9/50
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1

In [254]:
# LSTM pred for NOx

# Features (all numeric columns)
X = featuresNOx.values  # shape: (num_samples, num_features)

# Target
y = targetNOx.values.reshape(-1, 1)  # make it 2D for scaler

# Scale features
scaler_X = MinMaxScaler()
X_scaled = scaler_X.fit_transform(X)

# Scale target
scaler_y = MinMaxScaler()
y_scaled = scaler_y.fit_transform(y)

# Function to create sequences
def create_sequences(X, y, seq_length):
    X_seq, y_seq = [], []
    for i in range(len(X) - seq_length):
        X_seq.append(X[i:i+seq_length])
        y_seq.append(y[i+seq_length])
    return np.array(X_seq), np.array(y_seq)
seq_length = 10  # number of past timesteps to use
X_seq, y_seq = create_sequences(X_scaled, y_scaled, seq_length)

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_seq, y_seq, test_size=0.2, shuffle=False)

# Build LSTM model
model = Sequential()
model.add(LSTM(50, activation='relu', input_shape=(seq_length, X_seq.shape[2])))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse')

# Train the model
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split  =0.1)

# Predict 
y_pred = model.predict(X_test)
y_pred_inv = scaler_y.inverse_transform(y_pred)
y_test_inv = scaler_y.inverse_transform(y_test)

# Calculate RMSE
rmseLSTMNOx = np.sqrt(mean_squared_error(y_test_inv, y_pred_inv))
# Calculate R²
r2LSTMNOx = r2_score(y_test_inv, y_pred_inv)   
# Calculate MAE
maeLSTMNOx = mean_absolute_error(y_test_inv, y_pred_inv)

print(f"LSTM NOx RMSE: {rmseLSTMNOx:.4f}")
print(f"LSTM NOx R2: {r2LSTMNOx:.4f}")
print(f"LSTM NOx MAE: {maeLSTMNOx:.4f}")

Epoch 1/50


  super().__init__(**kwargs)


[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.0133 - val_loss: 0.0071
Epoch 2/50
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0089 - val_loss: 0.0085
Epoch 3/50
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0083 - val_loss: 0.0057
Epoch 4/50
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0081 - val_loss: 0.0059
Epoch 5/50
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0078 - val_loss: 0.0059
Epoch 6/50
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0075 - val_loss: 0.0067
Epoch 7/50
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0074 - val_loss: 0.0066
Epoch 8/50
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0073 - val_loss: 0.0063
Epoch 9/50
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1

In [255]:
# LSTM pred for NMHC

# Features (all numeric columns)
X = featuresNMHC.values  # shape: (num_samples, num_features)

# Target
y = targetNMHC.values.reshape(-1, 1)  # make it 2D for scaler

# Scale features
scaler_X = MinMaxScaler()
X = scaler_X.fit_transform(X)

# Scale target
scaler_y = MinMaxScaler()
y = scaler_y.fit_transform(y)

# Function to create sequences
def create_sequences(X, y, seq_length):
    X_seq, y_seq = [], []
    for i in range(len(X) - seq_length):
        X_seq.append(X[i:i+seq_length])
        y_seq.append(y[i+seq_length])
    return np.array(X_seq), np.array(y_seq)
seq_length = 10  # number of past timesteps to use
X_seq, y_seq = create_sequences(X, y, seq_length)

# Split into train and test sets

X_train, X_test, y_train, y_test = train_test_split(X_seq, y_seq, test_size=0.2, shuffle=False)
# Build LSTM model
model = Sequential()
model.add(LSTM(50, activation='relu', input_shape=(seq_length, X_seq.shape[2])))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse')
# Train the model
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.1)

# Predict
y_pred = model.predict(X_test)
y_pred_inv = scaler_y.inverse_transform(y_pred)
y_test_inv = scaler_y.inverse_transform(y_test)

# Calculate RMSE
rmseLSTMNMHC = np.sqrt(mean_squared_error(y_test_inv, y_pred_inv))
# Calculate R²
r2LSTMNMHC = r2_score(y_test_inv, y_pred_inv)   
# Calculate MAE
maeLSTMNMHC = mean_absolute_error(y_test_inv, y_pred_inv)

print(f"LSTM NMHC RMSE: {rmseLSTMNMHC:.4f}")
print(f"LSTM NMHC R2: {r2LSTMNMHC:.4f}")
print(f"LSTM NMHC MAE: {maeLSTMNMHC:.4f}")

Epoch 1/50


  super().__init__(**kwargs)


[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 6ms/step - loss: 0.0078 - val_loss: 0.0102
Epoch 2/50
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0060 - val_loss: 0.0047
Epoch 3/50
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0054 - val_loss: 0.0036
Epoch 4/50
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0052 - val_loss: 0.0085
Epoch 5/50
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0050 - val_loss: 0.0035
Epoch 6/50
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0050 - val_loss: 0.0094
Epoch 7/50
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0049 - val_loss: 0.0042
Epoch 8/50
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0047 - val_loss: 0.0032
Epoch 9/50
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1

In [256]:
# LSTM pred for NO2
# Features (all numeric columns)
X = featuresNO2.values  # shape: (num_samples, num_features)

# Target
y = targetNO2.values.reshape(-1, 1)  # make it 2D for scaler

# Scale features
scaler_X = MinMaxScaler()
X = scaler_X.fit_transform(X)

# Scale target
scaler_y = MinMaxScaler()
y = scaler_y.fit_transform(y)

# Function to create sequences
def create_sequences(X, y, seq_length):
    X_seq, y_seq = [], []
    for i in range(len(X) - seq_length):
        X_seq.append(X[i:i+seq_length])
        y_seq.append(y[i+seq_length])
    return np.array(X_seq), np.array(y_seq)
seq_length = 10  # number of past timesteps to use
X_seq, y_seq = create_sequences(X, y, seq_length)

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_seq, y_seq, test_size=0.2, shuffle=False)

# Build LSTM model
model = Sequential()
model.add(LSTM(50, activation='relu', input_shape=(seq_length, X_seq.shape[2])))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse')

# Train the model
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.1)

# Predict
y_pred = model.predict(X_test)
y_pred_inv = scaler_y.inverse_transform(y_pred)
y_test_inv = scaler_y.inverse_transform(y_test)

# Calculate RMSE
rmseLSTMNO2 = np.sqrt(mean_squared_error(y_test_inv, y_pred_inv))
# Calculate R²
r2LSTMNO2 = r2_score(y_test_inv, y_pred_inv)   
# Calculate MAE
maeLSTMNO2 = mean_absolute_error(y_test_inv, y_pred_inv)

print(f"LSTM NO2 RMSE: {rmseLSTMNO2:.4f}")
print(f"LSTM NO2 R2: {r2LSTMNO2:.4f}")
print(f"LSTM NO2 MAE: {maeLSTMNO2:.4f}")

Epoch 1/50


  super().__init__(**kwargs)


[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - loss: 0.0116 - val_loss: 0.0068
Epoch 2/50
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0079 - val_loss: 0.0066
Epoch 3/50
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0073 - val_loss: 0.0065
Epoch 4/50
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0072 - val_loss: 0.0086
Epoch 5/50
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0071 - val_loss: 0.0063
Epoch 6/50
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0066 - val_loss: 0.0058
Epoch 7/50
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0066 - val_loss: 0.0063
Epoch 8/50
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0064 - val_loss: 0.0084
Epoch 9/50
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1

In [257]:
# LSTM pred for C6H6
# Features (all numeric columns)
X = featuresC6H6.values  # shape: (num_samples, num_features)

# Target
y = targetC6H6.values.reshape(-1, 1)  # make it 2D for scaler

# Scale features
scaler_X = MinMaxScaler()
X = scaler_X.fit_transform(X)

# Scale target
scaler_y = MinMaxScaler()
y = scaler_y.fit_transform(y)

# Function to create sequences
def create_sequences(X, y, seq_length):
    X_seq, y_seq = [], []
    for i in range(len(X) - seq_length):
        X_seq.append(X[i:i+seq_length])
        y_seq.append(y[i+seq_length])
    return np.array(X_seq), np.array(y_seq)
seq_length = 10  # number of past timesteps to use
X_seq, y_seq = create_sequences(X, y, seq_length)

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_seq, y_seq, test_size=0.2, shuffle=False)
# Build LSTM model
model = Sequential()
model.add(LSTM(50, activation='relu', input_shape=(seq_length, X_seq.shape[2])))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse')
# Train the model
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.1) 

# Predict
y_pred = model.predict(X_test)
y_pred_inv = scaler_y.inverse_transform(y_pred)
y_test_inv = scaler_y.inverse_transform(y_test)

# Calculate RMSE
rmseLSTMC6H6 = np.sqrt(mean_squared_error(y_test_inv, y_pred_inv))
# Calculate R²
r2LSTMC6H6 = r2_score(y_test_inv, y_pred_inv)   
# Calculate MAE
maeLSTMC6H6 = mean_absolute_error(y_test_inv, y_pred_inv)

print(f"LSTM C6H6 RMSE: {rmseLSTMC6H6:.4f}")
print(f"LSTM C6H6 R2: {r2LSTMC6H6:.4f}")
print(f"LSTM C6H6 MAE: {maeLSTMC6H6:.4f}")

Epoch 1/50


  super().__init__(**kwargs)


[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - loss: 0.0169 - val_loss: 0.0133
Epoch 2/50
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0152 - val_loss: 0.0127
Epoch 3/50
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0149 - val_loss: 0.0112
Epoch 4/50
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0140 - val_loss: 0.0112
Epoch 5/50
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0144 - val_loss: 0.0118
Epoch 6/50
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0138 - val_loss: 0.0109
Epoch 7/50
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0135 - val_loss: 0.0117
Epoch 8/50
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0135 - val_loss: 0.0114
Epoch 9/50
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1

### Compare Results

In [258]:
# Collect metrics for each chemical and method

# Baseline
baseline_metrics = baseline_results_df[['RMSE', 'R2', 'MAE']].copy()
baseline_metrics.columns = ['Baseline_RMSE', 'Baseline_R2', 'Baseline_MAE']

# Random Forest
rf_metrics = pd.DataFrame({
    'RandomForest_RMSE': [
        mseCO**0.5, mseC6H6**0.5, mseNMHC**0.5, mseNOx**0.5, mseNO2**0.5
    ],
    'RandomForest_R2': [
        r2CO, r2C6H6, r2NMHC, r2NOx, r2NO2
    ],
    'RandomForest_MAE': [
        maeCO, maeC6H6, maeNMHC, maeNOx, maeNO2
    ]
}, index=baseline_metrics.index)

# LSTM
lstm_metrics = pd.DataFrame({
    'LSTM_RMSE': [
        rmseLSTMCO, rmseLSTMC6H6, rmseLSTMNMHC, rmseLSTMNOx, rmseLSTMNO2
    ],
    'LSTM_R2': [
        r2LSTMCO, r2LSTMC6H6, r2LSTMNMHC, r2LSTMNOx, r2LSTMNO2
    ],
    'LSTM_MAE': [
        maeLSTMCO, maeLSTMC6H6, maeLSTMNMHC, maeLSTMNOx, maeLSTMNO2
    ]
}, index=baseline_metrics.index)

# Combine all metrics
results_df = pd.concat([baseline_metrics, rf_metrics, lstm_metrics], axis=1)
results_df

Unnamed: 0,Baseline_RMSE,Baseline_R2,Baseline_MAE,RandomForest_RMSE,RandomForest_R2,RandomForest_MAE,LSTM_RMSE,LSTM_R2,LSTM_MAE
CO(GT),1.249437,0.163001,0.715873,0.810346,0.6530707,0.533871,1.299601,0.1092395,1.042072
C6H6(GT),36.774404,0.23917,3.880327,0.03289,0.9999706,0.014473,7.270918,-0.4371704,5.933194
NMHC(GT),4176.09314,0.272041,15.160911,159.844915,-3.162986e+31,102.259191,112.719224,-3.9322e+30,81.689935
NOx(GT),18134.807847,0.407981,79.901954,189.04884,0.2710955,129.488256,287.497348,-0.6834827,210.750107
NO2(GT),1090.107564,0.391304,21.90123,54.644199,-0.1669251,40.911344,55.044288,-0.1813188,40.495548
