In [None]:
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow.keras import layers as tfkl

def predict(inputData):
    """Baseline NO2 forecast: repeat the last observed value for the next day.

    This is the function that will be used to test the predictive model.

    Args:
        inputData: One week of hourly observations, i.e. 168 rows
            (1 week == 24h * 7 days), each holding the date, the hour and the
            NO2 value. Something like:
            ``[['2023-01-01', 1, 62.0], ...,  ['2023-01-07', 24, 60.0]]``.
            Both a plain list of lists and a 2-D NumPy array are accepted.

    Returns:
        A list of 24 values with the NO2 prediction for each hour of the next
        day. Every entry is the NO2 value (column 2) of the last input row.
    """
    # Data curation
    # A list of lists cannot be indexed with [-1, 2]; wrap it in an object
    # array so the mixed (str, int, float) columns are kept as they are.
    # Arrays are used untouched so the existing evaluation loop is unaffected.
    if isinstance(inputData, np.ndarray):
        window = inputData
    else:
        window = np.asarray(inputData, dtype=object)

    # Persistence baseline: the most recent NO2 reading, repeated 24 times.
    return [window[-1, 2]] * 24

# Evaluation protocol: WINDOW_SIZE hourly rows are given to predict(), which
# must forecast the PREDICTION_HORIZON rows that follow.
WINDOW_SIZE = 24 * 7
PREDICTION_HORIZON = 24

file_path = "/content/sample_data/validationData.csv"
# .values converts the DataFrame to a NumPy array so rows and columns can be
# sliced positionally below.
inputData = pd.read_csv(file_path).values
# The final evaluation will be done just by changing the input data:
# inputData = pd.read_csv("testData.csv").values

# Flat lists of targets and forecasts; each window appends
# PREDICTION_HORIZON entries to both.
realValues = []
predictionValues = []
# Slide the input window forward one row at a time.
# NOTE(review): the range stops one start position before the last one at
# which a full target slice still fits, so the final row is never used as a
# target. Confirm whether that is intended before changing it, since this loop
# defines the reported score.
for start in range(0, len(inputData) - WINDOW_SIZE - PREDICTION_HORIZON):
    end = start + WINDOW_SIZE
    windowToPredict = inputData[start:end]
    # Targets: column 2 of the PREDICTION_HORIZON rows right after the window.
    realValues.extend(inputData[:, 2][end : end + PREDICTION_HORIZON])
    predictionValues.extend(predict(windowToPredict))

# Compute the RMSE (root mean squared error).
# np.nanmean ignores entries whose squared error is NaN (a NaN target or a
# NaN prediction), so those hours do not contribute to the score.
rmse = np.nanmean(np.subtract(realValues, predictionValues) ** 2) ** 0.5

print("Final prediction mean RMSE score:", rmse)

Final prediction mean RMSE score: 21.656732460443422


In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers as tfkl

# Read the train and validation data.
train_data_processed = pd.read_csv('/content/trainData.csv')
validation_data_processed = pd.read_csv('/content/validationData.csv')

# First rows of each dataset, used below as a quick sanity check of the
# columns that were loaded.
train_head = train_data_processed.head()
validation_head = validation_data_processed.head()

# Preview the train dataset.
print("Updated Train Dataset:")
print(train_head)
print("\nShape of the Train Dataset:", train_data_processed.shape)

# Preview the validation dataset.
print("Updated Validation Dataset:")
print(validation_head)
print("\nShape of the Val Dataset:", validation_data_processed.shape)

Updated Train Dataset:
  nom_estacio        data  hour        NO2
0  Ciutadella  2013-01-01     1  41.968854
1  Ciutadella  2013-01-01     2  60.968854
2  Ciutadella  2013-01-01     3  74.968854
3  Ciutadella  2013-01-01     4  76.968854
4  Ciutadella  2013-01-01     5  54.968854

Shape of the Train Dataset: (436104, 4)
Updated Validation Dataset:
         data  hour        NO2
0  2023-01-01     1  92.347199
1  2023-01-01     2  91.347199
2  2023-01-01     3  92.347199
3  2023-01-01     4  88.347199
4  2023-01-01     5  81.347199

Shape of the Val Dataset: (8760, 3)


In [None]:
def create_sliding_windows(
    series: np.ndarray,
    input_length: int = 168,
    forecast_length: int = 24
):
    """
    Split a time series into overlapping (input, target) windows.

    Given a 1D or 2D NumPy array `series` of shape (time, features),
    return X of shape (num_samples, input_length, features) and
    y of shape (num_samples, forecast_length, features). y[i] holds the
    `forecast_length` steps that immediately follow X[i], and consecutive
    samples are shifted by one time step.

    If the series is 1D, it will be reshaped to (time, 1). Array-likes
    (lists, pandas Series) are converted with `np.asarray`.

    If the series is shorter than `input_length + forecast_length`, empty
    arrays with the shapes above (num_samples == 0) are returned, so callers
    can still concatenate or inspect `.shape` without special-casing.

    Both returned arrays are independent copies of the data in `series`.

    Raises:
        ValueError: if `input_length` or `forecast_length` is smaller than 1.
    """
    if input_length < 1 or forecast_length < 1:
        raise ValueError("input_length and forecast_length must be >= 1")

    series = np.asarray(series)
    if series.ndim == 1:
        # Reshape to (time, 1)
        series = series.reshape(-1, 1)

    num_samples = len(series) - input_length - forecast_length + 1
    feature_shape = series.shape[1:]
    if num_samples <= 0:
        # Not enough rows for a single (input, target) pair.
        return (
            np.empty((0, input_length) + feature_shape, dtype=series.dtype),
            np.empty((0, forecast_length) + feature_shape, dtype=series.dtype),
        )

    sliding_window_view = np.lib.stride_tricks.sliding_window_view

    # The views have shape (windows, features, window_length). The trailing
    # input windows that have no complete target after them are cut off.
    X_view = sliding_window_view(series, input_length, axis=0)[:num_samples]
    # Targets start right after the first input window, one per sample.
    y_view = sliding_window_view(series[input_length:], forecast_length, axis=0)

    # Move the window axis next to the sample axis and materialise the
    # read-only strided views as regular, writable arrays.
    X = np.moveaxis(X_view, -1, 1).copy()
    y = np.moveaxis(y_view, -1, 1).copy()
    return X, y


In [None]:
train_series = train_data_processed['NO2'].values

# Choose window sizes
input_length = 168
forecast_length = 24

# Build the windows one station at a time: the train file stacks the rows of
# several stations ('nom_estacio'), and a window sliding over the whole column
# would mix the end of one station's series with the start of the next.
# NOTE(review): assumes the rows of each station are already in chronological
# order - confirm against the CSV.
if 'nom_estacio' in train_data_processed.columns:
    station_series = [
        station_rows['NO2'].values
        for _, station_rows in train_data_processed.groupby('nom_estacio', sort=False)
    ]
else:
    station_series = [train_series]

X_parts, y_parts = [], []
for one_series in station_series:
    if len(one_series) < input_length + forecast_length:
        # Too short to produce a single (input, target) pair.
        continue
    X_station, y_station = create_sliding_windows(
        series=one_series,
        input_length=input_length,
        forecast_length=forecast_length
    )
    # A single NaN/inf in a batch turns the MSE loss (and then the weights)
    # into NaN, so windows touching a missing reading are discarded.
    keep = (
        np.isfinite(X_station).all(axis=(1, 2))
        & np.isfinite(y_station).all(axis=(1, 2))
    )
    X_parts.append(X_station[keep])
    y_parts.append(y_station[keep])

if not X_parts:
    raise ValueError("No station has enough rows to build a training window")

X_train = np.concatenate(X_parts, axis=0)
y_train = np.concatenate(y_parts, axis=0)
del X_parts, y_parts  # free the per-station copies

print("X_train shape:", X_train.shape)  # (N, 168, 1) if only NO2
print("y_train shape:", y_train.shape)  # (N, 24, 1)


X_train shape: (435913, 168, 1)
y_train shape: (435913, 24, 1)


In [None]:
val_series = validation_data_processed['NO2'].values

# Same windowing as for training: `input_length` hours in, `forecast_length`
# hours out, shifted by one hour per sample.
X_val, y_val = create_sliding_windows(
    series=val_series,
    input_length=input_length,
    forecast_length=forecast_length
)

# Discard windows that contain a NaN/inf reading; otherwise a single missing
# value would make the reported validation loss NaN.
if len(X_val):
    finite_val = (
        np.isfinite(X_val).all(axis=(1, 2))
        & np.isfinite(y_val).all(axis=(1, 2))
    )
    X_val, y_val = X_val[finite_val], y_val[finite_val]

print("X_val shape:", X_val.shape)
print("y_val shape:", y_val.shape)

X_val shape: (8569, 168, 1)
y_val shape: (8569, 24, 1)


In [None]:
def build_CONV_LSTM_model(input_shape, output_shape):
    """Build and compile a stacked LSTM + Conv1D sequence-to-sequence model.

    The network runs two LSTM layers and three 1D convolutions over the whole
    input sequence (every layer keeps the time dimension), then crops the
    resulting sequence down to the requested number of output time steps.

    Args:
        input_shape: (time_steps, features) of one input window.
        output_shape: (time_steps, features) of one forecast window.

    Returns:
        A `tf.keras.Model` compiled with mean squared error loss and the
        AdamW optimizer, whose output has shape `output_shape` per sample.

    Raises:
        AssertionError: if the input has fewer time steps than the output.
    """
    # Ensure the input time steps are at least as many as the output time steps
    assert input_shape[0] >= output_shape[0], \
        "For this exercise we want input time steps >= output time steps"

    input_layer = tfkl.Input(shape=input_shape, name='input_layer')

    # 1st LSTM layer
    x = tfkl.LSTM(128, return_sequences=True, name='lstm1')(input_layer)
    x = tfkl.Dropout(0.3)(x)

    # 2nd LSTM layer
    x = tfkl.LSTM(128, return_sequences=True, name='lstm2')(x)
    x = tfkl.Dropout(0.3)(x)

    # 1D Convolution + ReLU
    x = tfkl.Conv1D(128, 3, padding='same', name='conv1')(x)
    x = tfkl.Activation('relu', name='relu_after_conv1')(x)
    x = tfkl.Dropout(0.3)(x)

    # 1D Convolution + ReLU
    x = tfkl.Conv1D(128, 3, padding='same', name='conv2')(x)
    x = tfkl.Activation('relu', name='relu_after_conv2')(x)
    x = tfkl.Dropout(0.3)(x)

    # Final Convolution => matches desired output's features
    output_layer = tfkl.Conv1D(output_shape[1], 3, padding='same', name='output_layer')(x)

    # Crop the time dimension to match output_shape[0], keeping the LAST
    # steps. The LSTMs are unidirectional, so the state at step t has only
    # seen inputs up to t; the first steps would forecast from the oldest
    # hours of the window alone, while the last steps have seen all of it.
    crop_size = output_layer.shape[1] - output_shape[0]
    output_layer = tfkl.Cropping1D((crop_size, 0), name='cropping')(output_layer)

    model = tf.keras.Model(inputs=input_layer, outputs=output_layer, name='CONV_LSTM_model')
    model.compile(
        loss=tf.keras.losses.MeanSquaredError(),
        optimizer=tf.keras.optimizers.AdamW()
    )
    return model

# Instantiate the network for the windows built above: one week of hourly
# readings in (168 steps x 1 feature, NO2 only) and one day of hourly
# forecasts out (24 steps x 1 feature).
input_shape, output_shape = (168, 1), (24, 1)

model = build_CONV_LSTM_model(input_shape=input_shape, output_shape=output_shape)
model.summary()


In [None]:
# Guard rails for a run that takes hours per epoch:
# - TerminateOnNaN aborts as soon as the loss becomes NaN instead of spending
#   the rest of the epoch on a model that can no longer learn.
# - EarlyStopping ends training once the validation loss stops improving and
#   restores the weights of the best epoch.
training_callbacks = [
    tf.keras.callbacks.TerminateOnNaN(),
    tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=3,
        restore_best_weights=True
    ),
]

history = model.fit(
    X_train, y_train,
    epochs=10,               # Increase as needed
    batch_size=32,           # Adjust to your system's memory
    validation_data=(X_val, y_val),
    callbacks=training_callbacks,
    verbose=1
)

Epoch 1/10
[1m  455/13623[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m2:04:44[0m 568ms/step - loss: nan

KeyboardInterrupt: 