<a href="https://colab.research.google.com/github/juanglondono-coder/deep-learning-final-ydl-air-pollution/blob/main/02_preprocesado.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

1. Cargar datos

In [2]:
# Path is resolved relative to the notebook's working directory.
train_csv_path = 'train.csv'
df = pd.read_csv(train_csv_path)

2. Eliminar columna de índice viejo si existe

In [3]:
# Remove the old index column when present; errors='ignore' turns the drop
# into a no-op if the column does not exist.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

3. Convertir Datetime a tipo fecha-hora

In [4]:
# Convert the 'Datetime' column to pandas datetimes so rows can be ordered in time.
df = df.assign(Datetime=pd.to_datetime(df['Datetime']))

4. Ordenar por tiempo.

In [5]:
# Order rows chronologically and renumber the index 0..n-1 in one call.
df = df.sort_values(by='Datetime', ignore_index=True)

In [6]:
# Inspect column dtypes and non-null counts after parsing and sorting.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6218 entries, 0 to 6217
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   Datetime      6218 non-null   datetime64[ns]
 1   CO(GT)        6218 non-null   float64       
 2   PT08.S1(CO)   6218 non-null   float64       
 3   NMHC(GT)      6218 non-null   int64         
 4   C6H6(GT)      6218 non-null   float64       
 5   NOx(GT)       6218 non-null   float64       
 6   PT08.S3(NOx)  6218 non-null   float64       
 7   NO2(GT)       6218 non-null   float64       
 8   PT08.S4(NO2)  6218 non-null   float64       
 9   PT08.S5(O3)   6218 non-null   float64       
 10  T             6218 non-null   float64       
 11  RH            6218 non-null   float64       
 12  AH            6218 non-null   float64       
dtypes: datetime64[ns](1), float64(11), int64(1)
memory usage: 631.6 KB


In [7]:
# Preview the first rows of the chronologically sorted frame.
df.head()

Unnamed: 0,Datetime,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
0,2004-03-10 18:00:00,2.6,1360.0,150,11.881723,166.0,1056.25,113.0,1692.0,1267.5,13.6,48.875001,0.757754
1,2004-03-10 19:00:00,2.0,1292.25,112,9.397165,103.0,1173.75,92.0,1558.75,972.25,13.3,47.7,0.725487
2,2004-03-10 20:00:00,2.2,1402.0,88,8.997817,131.0,1140.0,114.0,1554.5,1074.0,11.9,53.975,0.750239
3,2004-03-10 21:00:00,2.2,1375.5,80,9.228796,172.0,1092.0,122.0,1583.75,1203.25,11.0,60.0,0.786713
4,2004-03-10 22:00:00,1.6,1272.25,51,6.518224,131.0,1205.0,116.0,1490.0,1110.0,11.15,59.575001,0.788794


5. Reemplazar el valor centinela -200 (dato faltante) por NaN

In [8]:
cols_with_minus200 = ['CO(GT)', 'NMHC(GT)', 'NOx(GT)', 'NO2(GT)']

# -200 is used as a missing-value marker in these columns; turn it into NaN so
# the gaps are visible to pandas and to the imputer. Columns that are not in
# the frame are skipped.
# NOTE(review): only these four columns are cleaned - confirm the remaining
# columns never contain -200.
sentinel_by_col = {col: -200 for col in cols_with_minus200 if col in df.columns}
df = df.replace(sentinel_by_col, np.nan)

In [9]:
# Fraction of missing values in each of the cleaned columns.
df[cols_with_minus200].isna().mean()

Unnamed: 0,0
CO(GT),0.232711
NMHC(GT),0.85735
NOx(GT),0.226118
NO2(GT),0.2266


6. Filtrar filas sin variable objetivo

In [10]:
# Name of the column to predict; rows where it is missing are removed below.
target_col = 'NMHC(GT)'

In [11]:
# Keep only the rows where the target is present.
df = df.dropna(subset=[target_col]).copy()
print("Shape después de filtrar filas sin target:", df.shape)

Shape después de filtrar filas sin target: (887, 13)


In [12]:
# Summary statistics of the target after dropping rows without it.
df[target_col].describe()

Unnamed: 0,NMHC(GT)
count,887.0
mean,218.607666
std,206.61513
min,7.0
25%,66.0
50%,145.0
75%,297.0
max,1189.0


7. Definir las variables predictoras (features)

In [13]:
# Every column except the timestamp and the target is used as a predictor.
non_feature_cols = {'Datetime', target_col}
feature_cols = [col for col in df.columns if col not in non_feature_cols]

print("Features:", feature_cols)

Features: ['CO(GT)', 'PT08.S1(CO)', 'C6H6(GT)', 'NOx(GT)', 'PT08.S3(NOx)', 'NO2(GT)', 'PT08.S4(NO2)', 'PT08.S5(O3)', 'T', 'RH', 'AH']


8. Imputar valores faltantes en features (mediana)

In [15]:
# Split the frame into the predictor matrix and the target vector.
X_raw, y = df[feature_cols], df[target_col]

In [16]:
# Learn the per-column medians first, then use them to fill the missing entries.
imputer = SimpleImputer(strategy='median').fit(X_raw)
X_imputed = imputer.transform(X_raw)

In [18]:
# Wrap the imputer output in a DataFrame again, restoring the feature names
# and the row index of df.
X_imputed_df = pd.DataFrame(X_imputed, columns=feature_cols, index=df.index)

In [19]:
# Sanity check: count the NaNs left in each feature after imputation.
print("NaNs en features después de imputar:")
print(X_imputed_df.isna().sum())

NaNs en features después de imputar:
CO(GT)          0
PT08.S1(CO)     0
C6H6(GT)        0
NOx(GT)         0
PT08.S3(NOx)    0
NO2(GT)         0
PT08.S4(NO2)    0
PT08.S5(O3)     0
T               0
RH              0
AH              0
dtype: int64


9. Escalar datos

In [21]:
# Estimate each feature's mean and standard deviation, then standardise with them.
scaler = StandardScaler().fit(X_imputed_df)
X_scaled = scaler.transform(X_imputed_df)

In [22]:
# Wrap the scaled values in a DataFrame with the feature names and df's row index.
X_scaled_df = pd.DataFrame(X_scaled, columns=feature_cols, index=df.index)

In [23]:
# Summary statistics of the first few scaled features (one row per feature).
X_scaled_df.describe().T.head()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
CO(GT),887.0,-1.041382e-16,1.000564,-1.419902,-0.775351,-0.202416,0.513752,4.166209
PT08.S1(CO),887.0,8.811691e-17,1.000564,-1.788307,-0.796834,-0.155112,0.699836,3.477393
C6H6(GT),887.0,4.8063770000000004e-17,1.000564,-1.303022,-0.831685,-0.210225,0.563735,3.892075
NOx(GT),887.0,1.682232e-16,1.000564,-1.613004,-0.745441,-0.175329,0.512524,4.187271
PT08.S3(NOx),887.0,2.883826e-16,1.000564,-1.885849,-0.75125,-0.177899,0.616903,3.397588


10. Salidas para modelos tabulares: X_tab, y_tab

In [24]:
# Tabular feature matrix: a copy of the imputed features (not the scaled ones).
X_tab = X_imputed_df.copy()

In [28]:
# Tabular target vector: an independent copy of y.
y_tab = y.copy()

In [29]:
# Confirm that features and target have the same number of rows.
print("X_tab shape:", X_tab.shape)
print("y_tab shape:", y_tab.shape)

X_tab shape: (887, 11)
y_tab shape: (887,)


11. Crear dataset secuencial para LSTM

In [30]:
def create_windows(df_features, df_target, feature_cols, target_col, window_size=24):
    """
    Build sliding-window samples for a sequence model (e.g. an LSTM).

    Sample k pairs the `window_size` rows at positions [k, k + window_size)
    with the target at position k + window_size, i.e. the row immediately
    AFTER the window. The target's own feature row is never part of its input.

    Parameters
    ----------
    df_features : pandas.DataFrame
        Feature table (already imputed/scaled by the caller). Rows are used
        in their existing order.
    df_target : pandas.Series
        Target values; they are aligned to `df_features` by index label.
    feature_cols : list of str
        Names of the feature columns to include, in output order.
    target_col : str
        Name under which the target is temporarily attached to the features.
    window_size : int, default 24
        Number of preceding rows in each window. Must be >= 1.

    Returns
    -------
    X_seq : numpy.ndarray, shape (n_samples, window_size, n_features)
    y_seq : numpy.ndarray, shape (n_samples,)
        With n_samples = max(len(df_features) - window_size, 0). When there
        are not enough rows for a single window, correctly shaped empty
        arrays are returned.

    Raises
    ------
    ValueError
        If `window_size` is smaller than 1.

    Notes
    -----
    Windows are formed from consecutive ROWS, not consecutive timestamps. If
    rows were removed upstream, a window may span irregular time gaps; the
    caller is responsible for passing evenly spaced data when that matters.
    """
    if window_size < 1:
        # A zero or negative window would yield empty or wrapped-around slices.
        raise ValueError(f"window_size must be >= 1, got {window_size}")

    feature_cols = list(feature_cols)
    n_features = len(feature_cols)

    # Attach the target to the features so both share one index-aligned array.
    data = df_features[feature_cols].copy()
    data[target_col] = df_target
    values = data[feature_cols + [target_col]].values

    n_rows = len(values)
    if n_rows <= window_size:
        # Not enough rows for a single window: keep the documented rank so
        # downstream code can rely on X_seq.shape[1:] even when empty.
        X_empty = np.empty((0, window_size, n_features), dtype=values.dtype)
        y_empty = np.empty((0,), dtype=values.dtype)
        return X_empty, y_empty

    X_list, y_list = [], []
    for i in range(window_size, n_rows):
        # Inputs: rows i-window_size .. i-1 (feature columns only).
        X_list.append(values[i - window_size:i, :n_features])
        # Label: target of row i, the step right after the window.
        y_list.append(values[i, -1])

    X_seq = np.array(X_list)
    y_seq = np.array(y_list)

    return X_seq, y_seq

In [31]:
# Build the sequential dataset from the scaled features.
# NOTE(review): rows without a target were dropped earlier, so consecutive rows
# may not be consecutive hours - confirm the windows are temporally contiguous.
window_size = 24  # number of past rows per sample; can be changed to 12, 48, etc.

X_seq, y_seq = create_windows(
    df_features=X_scaled_df,
    df_target=y,
    feature_cols=feature_cols,
    target_col=target_col,
    window_size=window_size
)

In [32]:
# The first window_size rows have no complete history, so the number of
# samples is the number of rows minus window_size.
print("X_seq shape:", X_seq.shape)  # (n_samples, window_size, n_features)
print("y_seq shape:", y_seq.shape)  # (n_samples,)

X_seq shape: (863, 24, 11)
y_seq shape: (863,)
