<a href="https://colab.research.google.com/github/juanglondono-coder/deep-learning-final-ydl-air-pollution/blob/main/05_submission_kaggle.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

1. Imports y configuración

In [1]:
import numpy as np
import pandas as pd
import os

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

import tensorflow as tf
from tensorflow.keras import layers, models

print("TensorFlow:", tf.__version__)

TensorFlow: 2.19.0


In [2]:
# Make sure the output folders exist before anything is written to them.
for out_dir in ("results", "submissions"):
    os.makedirs(out_dir, exist_ok=True)

2. Carga de train.csv, test.csv y sample.csv (plantilla de envío)

In [3]:
# Read the training data, the test data and the submission template from the
# current working directory.
train, test, sample_sub = (
    pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv", "sample.csv")
)

# Quick sanity check of what was loaded.
print("train shape:", train.shape)
print("test shape :", test.shape)
print("sample_submission columns:", sample_sub.columns.tolist())

train shape: (6218, 14)
test shape : (2773, 14)
sample_submission columns: ['ID', 'Target']


3. Unificar estructura y preprocesar (train + test)

In [4]:
# Drop the stale index column that a previous to_csv() may have left behind.
for df in (train, test):
    df.drop(columns=['Unnamed: 0'], errors='ignore', inplace=True)

# The target column must exist in both frames so they can be concatenated;
# in test it is simply left empty.
target_col = "NMHC(GT)"
if target_col not in test.columns:
    test[target_col] = np.nan

# Bookkeeping columns: where each row came from, and (for test rows) its
# original position, so predictions can be mapped back after sorting.
train["dataset"] = "train"
train["row_in_test"] = np.nan
test["dataset"] = "test"
test["row_in_test"] = np.arange(len(test))

# Stack both frames into a single timeline ordered by timestamp.
full = (
    pd.concat([train, test], ignore_index=True)
    .assign(Datetime=lambda frame: pd.to_datetime(frame["Datetime"]))
    .sort_values("Datetime")
    .reset_index(drop=True)
)

full.head()

Unnamed: 0,Datetime,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH,dataset,row_in_test,index
0,2004-03-10 18:00:00,2.6,1360.0,150,11.881723,166.0,1056.25,113.0,1692.0,1267.5,13.6,48.875001,0.757754,train,,
1,2004-03-10 19:00:00,2.0,1292.25,112,9.397165,103.0,1173.75,92.0,1558.75,972.25,13.3,47.7,0.725487,train,,
2,2004-03-10 20:00:00,2.2,1402.0,88,8.997817,131.0,1140.0,114.0,1554.5,1074.0,11.9,53.975,0.750239,train,,
3,2004-03-10 21:00:00,2.2,1375.5,80,9.228796,172.0,1092.0,122.0,1583.75,1203.25,11.0,60.0,0.786713,train,,
4,2004-03-10 22:00:00,1.6,1272.25,51,6.518224,131.0,1205.0,116.0,1490.0,1110.0,11.15,59.575001,0.788794,train,,


In [5]:
# Columns in which the value -200 is converted to NaN
# (presumably the dataset's missing-value sentinel — confirm against the data description).
cols_with_minus200 = ['CO(GT)', 'NMHC(GT)', 'NOx(GT)', 'NO2(GT)']

for c in cols_with_minus200:
    if c not in full.columns:
        # Tolerate files that lack one of the listed columns.
        continue
    full[c] = full[c].replace(-200, np.nan)

In [6]:
# Model inputs are every column except the timestamp, the target, the
# bookkeeping columns and the leftover 'index' column.
aux_cols = ['dataset', 'row_in_test']
non_feature_cols = {'Datetime', target_col, 'index', *aux_cols}
feature_cols = [c for c in full.columns if c not in non_feature_cols]

print("Features:", feature_cols)

# Boolean masks selecting the rows that came from each source file.
idx_train = full['dataset'].eq('train')
idx_test = full['dataset'].eq('test')

X_train_raw = full.loc[idx_train, feature_cols]
X_test_raw = full.loc[idx_test, feature_cols]

# Observed target values for the training rows.
y_train_full = full.loc[idx_train, target_col]

# Both transformers are fitted on the training rows only and then applied
# unchanged to the test rows.
imputer = SimpleImputer(strategy='median')
X_train_imp = imputer.fit_transform(X_train_raw)
X_test_imp = imputer.transform(X_test_raw)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imp)
X_test_scaled = scaler.transform(X_test_imp)

# Write the transformed values back so `full` holds the model-ready features
# for train and test rows together.
full.loc[idx_train, feature_cols] = X_train_scaled
full.loc[idx_test, feature_cols] = X_test_scaled

Features: ['CO(GT)', 'PT08.S1(CO)', 'C6H6(GT)', 'NOx(GT)', 'PT08.S3(NOx)', 'NO2(GT)', 'PT08.S4(NO2)', 'PT08.S5(O3)', 'T', 'RH', 'AH']


4. Construir ventanas para train y test (multivariante)

In [7]:
def build_windows_full(df, feature_cols, target_col, window_size=24):
    """
    Slice a merged train+test frame into fixed-length feature windows.

    For every row ``i >= window_size`` the window consists of the feature
    values of rows ``i - window_size`` .. ``i - 1`` (the features of row ``i``
    itself are not part of its window). The window is routed by the
    ``dataset`` value of row ``i``:

    * ``'train'`` with a non-missing target -> training sample ``(window, target[i])``
    * ``'test'`` -> prediction sample, remembered under ``row_in_test[i]``
    * ``'train'`` with a missing target -> skipped

    The first ``window_size`` rows never produce a sample because they have no
    complete history; callers must handle test rows that end up without a window.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``feature_cols``, ``target_col``, a ``'dataset'``
        column and a ``'row_in_test'`` column. Rows are consumed in the order
        given (assumed to be already sorted by time by the caller).
    feature_cols : list of str
        Columns used as window features.
    target_col : str
        Column holding the target; may be missing (NaN) in some rows.
    window_size : int, default 24
        Number of preceding rows in each window. Must be at least 1.

    Returns
    -------
    X_train_win : ndarray, shape (n_train_samples, window_size, n_features)
    y_train : ndarray, shape (n_train_samples,)
    X_test_win : ndarray, shape (n_test_samples, window_size, n_features)
    test_row_ids : ndarray of int, shape (n_test_samples,)
        ``row_in_test`` value of the row each test window belongs to.

    Raises
    ------
    ValueError
        If ``window_size`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError(f"window_size must be >= 1, got {window_size}")

    values_feat = df[feature_cols].values
    target      = df[target_col].values
    dataset     = df['dataset'].values
    row_in_test = df['row_in_test'].values

    n_features = values_feat.shape[1]

    X_train_win, y_train, X_test_win, test_row_ids = [], [], [], []

    for i in range(window_size, len(df)):
        window = values_feat[i - window_size:i, :]  # (window_size, n_features)

        # pd.isna also copes with None / object-dtype targets, unlike np.isnan.
        if dataset[i] == 'train' and not pd.isna(target[i]):
            X_train_win.append(window)
            y_train.append(target[i])

        elif dataset[i] == 'test':
            X_test_win.append(window)
            test_row_ids.append(int(row_in_test[i]))

    def _stack(windows):
        # Keep the 3-D shape even when nothing was collected, so callers can
        # still read .shape[1] / .shape[2] or pass the array to a model.
        if windows:
            return np.array(windows)
        return np.empty((0, window_size, n_features), dtype=values_feat.dtype)

    X_train_win = _stack(X_train_win)
    y_train = np.array(y_train)
    X_test_win = _stack(X_test_win)
    # Explicit integer dtype: an empty list would otherwise become float64,
    # which cannot be used as an index array.
    test_row_ids = np.array(test_row_ids, dtype=int)

    return X_train_win, y_train, X_test_win, test_row_ids

# Number of preceding rows that make up one input sequence.
window_size = 24

# Cut the merged frame into training sequences (with targets) and test
# sequences (with the ids of the test rows they belong to).
X_train_seq, y_train_seq, X_test_seq, test_row_ids = build_windows_full(
    df=full,
    feature_cols=feature_cols,
    target_col=target_col,
    window_size=window_size,
)

print("X_train_seq:", X_train_seq.shape)
print("y_train_seq:", y_train_seq.shape)
print("X_test_seq :", X_test_seq.shape)
print("len(test_row_ids):", len(test_row_ids))

X_train_seq: (863, 24, 11)
y_train_seq: (863,)
X_test_seq : (2773, 24, 11)
len(test_row_ids): 2773


5. Entrenar el modelo GRU con target log (ventanas de train; el 15 % final se reserva para validación)

In [8]:
# Seed Python's `random`, NumPy and TensorFlow in one call so that weight
# initialisation and batch shuffling — and therefore the submission — are
# reproducible across "Restart & Run All".
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# Train on log1p(target); predictions are mapped back with expm1 later.
y_train_log = np.log1p(y_train_seq)

# Input shape is taken from the windowed training array: (samples, timesteps, features).
n_timesteps = X_train_seq.shape[1]
n_features  = X_train_seq.shape[2]

# Single GRU layer that returns only its final state, followed by a small
# dense head producing one regression value per window.
model_gru = models.Sequential([
    layers.Input(shape=(n_timesteps, n_features)),
    layers.GRU(64, return_sequences=False),
    layers.Dense(32, activation='relu'),
    layers.Dense(1)
])

# MSE is computed in log space because the target was log-transformed above;
# the reported MAE is therefore also in log units.
model_gru.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    loss='mse',
    metrics=['mae']
)

model_gru.summary()

In [9]:
# Hold out the tail of the training windows for validation (no shuffling of
# the split itself; the windows are assumed to be in time order).
n_train = len(X_train_seq)
val_fraction = 0.15

# Always keep at least one validation sample, and split with an explicit
# positive index: slicing with `-n_val` silently misbehaves when n_val == 0
# (`[:-0]` is empty and `[-0:]` is the whole array).
n_val = max(1, int(n_train * val_fraction))
split_at = n_train - n_val
if split_at < 1:
    raise ValueError(
        f"Not enough training windows ({n_train}) to create a train/validation split"
    )

X_train_gru = X_train_seq[:split_at]
y_train_gru = y_train_log[:split_at]

X_val_gru = X_train_seq[split_at:]
y_val_gru = y_train_log[split_at:]

# Stop once the validation loss has not improved for 5 epochs and roll the
# model back to the best epoch's weights.
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True
)

history_gru = model_gru.fit(
    X_train_gru, y_train_gru,
    validation_data=(X_val_gru, y_val_gru),
    epochs=50,
    batch_size=64,
    callbacks=[early_stop],
    verbose=1
)

Epoch 1/50
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 76ms/step - loss: 21.5986 - mae: 4.5284 - val_loss: 19.0105 - val_mae: 4.2489
Epoch 2/50
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - loss: 11.8524 - mae: 3.2434 - val_loss: 8.0380 - val_mae: 2.6436
Epoch 3/50
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - loss: 2.4428 - mae: 1.2404 - val_loss: 0.6505 - val_mae: 0.6851
Epoch 4/50
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step - loss: 1.2311 - mae: 0.9085 - val_loss: 1.8520 - val_mae: 1.1486
Epoch 5/50
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step - loss: 0.7127 - mae: 0.6693 - val_loss: 0.9920 - val_mae: 0.7945
Epoch 6/50
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step - loss: 0.5317 - mae: 0.5843 - val_loss: 0.5810 - val_mae: 0.5962
Epoch 7/50
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 41ms/step - loss

In [10]:
# The network outputs log1p(target); flatten its (n, 1) output to a 1-D vector.
log_space_output = model_gru.predict(X_test_seq, verbose=0)
y_test_log_pred = log_space_output.ravel()

# Invert the log1p transform to get predictions in the target's original units.
y_test_pred = np.expm1(y_test_log_pred)

print("Predicciones test:", y_test_pred.shape)
print("Ejemplo:", y_test_pred[:5])

Predicciones test: (2773,)
Ejemplo: [23.643162 32.284866 10.327839 11.99681  12.840307]


In [11]:
# One prediction slot per test row, in the original row order of `test`.
y_pred_submit = np.zeros(len(test))

# Scatter each prediction to the test row it belongs to. The ids are cast to
# int so that an empty id array (float by default) is still a valid index.
row_ids = np.asarray(test_row_ids, dtype=int)
y_pred_submit[row_ids] = y_test_pred

# Track explicitly which rows received a prediction. Using the value 0 as a
# "missing" marker would also overwrite genuine predictions that are exactly 0.
has_pred = np.zeros(len(test), dtype=bool)
has_pred[row_ids] = True

# Test rows without a full history window get no prediction; fall back to the
# median of the training targets for those.
if not has_pred.all():
    fallback = np.median(y_train_seq)
    y_pred_submit[~has_pred] = fallback

6. Crear submission.csv con el formato de Kaggle

In [12]:
# Preview the first rows of the submission template to see its expected layout.
sample_sub.head()

Unnamed: 0,ID,Target
0,6366,879.75
1,6367,698.5
2,6368,701.25
3,6369,653.75
4,6370,638.0


In [15]:
# Reload the template so this cell always starts from the untouched file.
sample_sub = pd.read_csv("sample.csv")
id_col = sample_sub.columns[0]           # identifier column
target_sub_col = sample_sub.columns[1]   # the single prediction column

# Working copy of the template, keyed by ID so values can be aligned on it.
submission = sample_sub.copy().set_index(id_col)

# Pair the predictions with the first len(y_pred_submit) template IDs.
# NOTE(review): this assumes the template rows are in the same order as the
# rows of the test file — confirm against the competition data.
leading_ids = sample_sub[id_col].to_numpy()[:len(y_pred_submit)]
predictions_with_ids = pd.DataFrame(
    {target_sub_col: y_pred_submit},
    index=pd.Index(leading_ids, name=id_col),
)

# Overwrite the template values wherever a prediction exists for that ID;
# IDs without a prediction keep the value they had in the template.
submission.update(predictions_with_ids)

# Turn the ID index back into a regular column for export.
submission = submission.reset_index()

submission.head()

Unnamed: 0,ID,Target
0,6366,23.643162
1,6367,32.284866
2,6368,10.327839
3,6369,11.99681
4,6370,12.840307


In [16]:
# Write the submission without the DataFrame index; the "submissions" folder
# must already exist.
sub_dir, sub_name = "submissions", "submission_gru_log_target.csv"
sub_path = os.path.join(sub_dir, sub_name)
submission.to_csv(sub_path, index=False)
print("Submission guardada en:", sub_path)

Submission guardada en: submissions/submission_gru_log_target.csv
