In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.metrics import MeanAbsoluteError, RootMeanSquaredError
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam
import pandas as pd
import numpy as np

In [None]:
# Load the training CSV and prepare it for time-series modelling:
#   * skip the first 24 rows (presumably rows with no usable pm2.5 reading,
#     which interpolation cannot fill because nothing precedes them -- TODO confirm),
#   * parse the 'datetime' column and use it as the index,
#   * drop the 'No' column since it is not a useful feature.
# The steps are chained so that `data` is a fresh frame rather than a slice of
# the raw frame; assigning a column on the bare `.iloc[24:]` slice raises
# pandas' SettingWithCopyWarning.
data = (
    pd.read_csv('pm.csv')
    .iloc[24:]
    .assign(datetime=lambda frame: pd.to_datetime(frame['datetime']))
    .set_index('datetime')
    .drop(columns=['No'])
)

# Fill gaps in the target using time-weighted interpolation
# (method='time' relies on the DatetimeIndex set above).
data['pm2.5'] = data['pm2.5'].interpolate(method='time')

In [None]:
# Splitting the data chronologically:
# the last 6100 rows form the validation set, everything before them is training data.
holdout_rows = 6100
train_set, validation_set = data.iloc[:-holdout_rows], data.iloc[-holdout_rows:]

# Separate the feature columns from the pm2.5 target for both splits.
# Targets are reshaped to a single column because the scaler expects 2-D input.
train_features, val_features = (
    part.drop(columns=['pm2.5']) for part in (train_set, validation_set)
)
train_target, val_target = (
    part['pm2.5'].to_numpy().reshape(-1, 1) for part in (train_set, validation_set)
)

# Defining the scalers; both are fitted on the training split only so that no
# validation statistics leak into the scaling.
features_scaler = MinMaxScaler().fit(train_features.to_numpy())
target_scaler = MinMaxScaler().fit(train_target)

# Scale features and target (the training split ends up in the 0-1 range).
train_features_scaled = features_scaler.transform(train_features.to_numpy())
train_target_scaled = target_scaler.transform(train_target)
val_features_scaled = features_scaler.transform(val_features.to_numpy())
val_target_scaled = target_scaler.transform(val_target)

# Time-series generators for the train and validation sets:
# each sample is a window of 96 consecutive feature rows, served in batches of 32.
window_kwargs = dict(length=96, sampling_rate=1, batch_size=32)
train_series = TimeseriesGenerator(train_features_scaled, train_target_scaled, **window_kwargs)
val_series = TimeseriesGenerator(val_features_scaled, val_target_scaled, **window_kwargs)

In [None]:
# Build model:
# two stacked LSTM layers (the first returns the full sequence so the second
# can consume it), followed by an L2-regularised dense layer and a single
# linear output unit for the regression target. Dropout follows each hidden layer.
# The input is a window of 96 time steps with one value per scaled feature column.
model = Sequential([
    LSTM(128, activation='tanh', return_sequences=True, input_shape=(96, train_features_scaled.shape[1])),
    Dropout(0.2),
    LSTM(64, activation='tanh', return_sequences=False),
    Dropout(0.2),
    Dense(64, activation='relu', kernel_regularizer=l2(0.02)),
    Dropout(0.2),
    Dense(1, activation='linear')
])

# Compile and fit the model.
# Note: the loss, MAE and RMSE are all computed on the scaled targets that
# train_series / val_series yield, not on the original pm2.5 scale.
# RootMeanSquaredError is tracked alongside MAE so the RMSE referred to in this
# note is actually reported; metrics do not influence the optimisation itself.
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='mse',
    metrics=[MeanAbsoluteError(), RootMeanSquaredError()],
)
model.summary()
history = model.fit(train_series, validation_data=val_series, epochs=35, verbose=2)



In [None]:
# Read the test file and give it the same treatment as the training data:
# parse 'datetime', use it as the index, and drop the 'No' column.
# The parsed timestamps are kept in df_dates (original row order) for later use.
test_raw = pd.read_csv('test.csv')
df_dates = pd.to_datetime(test_raw['datetime'])
test_features = (
    test_raw
    .assign(datetime=df_dates)
    .set_index('datetime')
    .drop(columns=['No'])
)

# The model needs 96 preceding rows to make its first prediction, so take the
# last 96 rows of `data` (without the pm2.5 target) as context.
last_rows = data.iloc[-96:].drop(columns=['pm2.5'])

# Final test frame = 96 context rows followed by the rows from test.csv.
df_test = pd.concat([last_rows, test_features])

In [None]:
# Scale the combined test frame with the scaler that was fitted on the
# training features, then window it the same way as the training data.
df_test_scaled = features_scaler.transform(df_test.to_numpy())

# The generator requires a targets array; the scaled features are passed as a
# stand-in since only the input windows matter when predicting.
test_series = TimeseriesGenerator(
    data=df_test_scaled,
    targets=df_test_scaled,
    length=96,
    sampling_rate=1,
    batch_size=320,
)

# Predictions are still on the scaled target range at this point.
predictions = model.predict(test_series)

In [None]:
# Map the predictions from the 0-1 scale back to the original pm2.5 scale,
# round them to whole numbers, and create the submission file.
actual_predictions = target_scaler.inverse_transform(predictions)
actual_predictions_int = np.round(actual_predictions).astype(int)

# Row IDs use an hour without a leading zero (hour 4 is written "4", not "04").
# The '%#H' strftime directive only produces that on Windows; other platforms
# keep the zero padding or do not honour the flag. Rendering the hour from
# .dt.hour gives the same string on every platform.
df_dates = (
    df_dates.dt.strftime('%Y-%m-%d ')
    + df_dates.dt.hour.astype(str)
    + df_dates.dt.strftime(':%M:%S')
)

# One row per test timestamp: the formatted timestamp and its predicted pm2.5.
final_cvs = pd.DataFrame({
    'row ID': df_dates,
    'pm2.5': actual_predictions_int.flatten()
})

final_cvs.to_csv('joel-predictions.csv', index=False)
print(final_cvs.head())