<a href="https://colab.research.google.com/github/jkeza1/time_series_forecasting/blob/main/air_quality_forecasting_starter_code_(2).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Beijing Air Quality Forecasting Starter Notebook

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from datetime import datetime
import os

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [4]:

plt.style.use('default')
warnings.filterwarnings('ignore')
np.random.seed(42)
tf.random.set_seed(42)

print("Libraries imported successfully!")
def save_submission(predictions, experiment_name, test_index):
    """Save submission with timestamp and experiment info"""
    os.makedirs('submissions', exist_ok=True)

    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

    submission = pd.DataFrame({
        'row ID': test_index.strftime('%Y-%m-%d %-H:%M:%S'),
        'pm2.5': predictions.round().astype(int)
    })

    filename = f'submissions/{timestamp}_{experiment_name}.csv'
    submission.to_csv(filename, index=False)

    print(f" Submission saved: {filename}")
    print(f"📊Predictions - Min: {predictions.min():.1f}, Max: {predictions.max():.1f}")

    return filename, submission

print("Submission tracking ready!")


Libraries imported successfully!
Submission tracking ready!


In [5]:
# Mount Google Drive to access datasets
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [6]:
# Load the datasets
# Ensure train.csv and test.csv are saved in your Google Drive in the same folder.
# Replace the file paths below with the actual paths to your dataset.
train = pd.read_csv('/content/drive/MyDrive/Kaggle_competition_ML/air_quality_forecasting/train.csv')
test = pd.read_csv('/content/drive/MyDrive/Kaggle_competition_ML/air_quality_forecasting/test.csv')
sample_submission = pd.read_csv('/content/drive/MyDrive/Kaggle_competition_ML/air_quality_forecasting/sample_submission.csv')


In [7]:
print(f"📊 Training data: {train.shape}")
print(f"📊 Test data: {test.shape}")
print(f"\n📋 Columns: {list(train.columns)}")

📊 Training data: (30676, 12)
📊 Test data: (13148, 11)

📋 Columns: ['No', 'DEWP', 'TEMP', 'PRES', 'Iws', 'Is', 'Ir', 'datetime', 'cbwd_NW', 'cbwd_SE', 'cbwd_cv', 'pm2.5']


In [8]:
# Check
print(train.head())
print(test.head())


   No      DEWP      TEMP      PRES       Iws        Is        Ir  \
0   1 -1.580878 -1.922250  0.443328 -0.441894 -0.069353 -0.137667   
1   2 -1.580878 -2.004228  0.345943 -0.379306 -0.069353 -0.137667   
2   3 -1.580878 -1.922250  0.248559 -0.343514 -0.069353 -0.137667   
3   4 -1.580878 -2.168183  0.248559 -0.280926 -0.069353 -0.137667   
4   5 -1.511594 -2.004228  0.151174 -0.218339 -0.069353 -0.137667   

              datetime   cbwd_NW   cbwd_SE   cbwd_cv  pm2.5  
0  2010-01-01 00:00:00  1.448138 -0.732019 -0.522096    NaN  
1  2010-01-01 01:00:00  1.448138 -0.732019 -0.522096    NaN  
2  2010-01-01 02:00:00  1.448138 -0.732019 -0.522096    NaN  
3  2010-01-01 03:00:00  1.448138 -0.732019 -0.522096    NaN  
4  2010-01-01 04:00:00  1.448138 -0.732019 -0.522096    NaN  
      No      DEWP      TEMP      PRES       Iws        Is        Ir  \
0  30677  1.190496  0.701029 -2.186052 -0.003982 -0.069353 -0.137667   
1  30678  1.121211  0.619051 -2.186052  0.031811 -0.069353 -0.137667 

In [10]:
print(f"\n❓ Missing values:")
print(f"Train: {train.isnull().sum().sum()}")
print(f"Test: {test.isnull().sum().sum()}")


❓ Missing values:
Train: 1921
Test: 0


In [11]:
train['datetime'] = pd.to_datetime(train['datetime'])
test['datetime'] = pd.to_datetime(test['datetime'])

train.set_index('datetime', inplace=True)
test.set_index('datetime', inplace=True)

In [13]:
print(f"Time range - Train: {train.index.min()} to {train.index.max()}")
print(f"Time range - Test: {test.index.min()} to {test.index.max()}")

Time range - Train: 2010-01-01 00:00:00 to 2013-07-02 03:00:00
Time range - Test: 2013-07-02 04:00:00 to 2014-12-31 23:00:00


In [16]:
# Key insights
if 'pm2.5' in train.columns:
    print(f"\nPM2.5 statistics:")
    print(f"Mean: {train['pm2.5'].mean():.1f}, Std: {train['pm2.5'].std():.1f}")
    print(f"Min: {train['pm2.5'].min():.1f}, Max: {train['pm2.5'].max():.1f}")



PM2.5 statistics:
Mean: 100.8, Std: 93.1
Min: 0.0, Max: 994.0


# Explore the training data

In this sections explore your dataset with appropiate statistics and visualisations to understand your better. Ensure that you explain output of every code cell and what it entails.

In [61]:
# Apply feature engineering
train_enhanced = create_advanced_features(train_clean)
test_enhanced = create_test_features(test_clean)

# Handle remaining NaNs
train_enhanced = train_enhanced.fillna(method='bfill').fillna(0)
test_enhanced = test_enhanced.fillna(method='bfill').fillna(0)

In [18]:
train_clean = train.copy()
# First fill with forward fill, then backward fill, then interpolate
train_clean = train_clean.fillna(method='ffill', limit=12)  # Limit forward fill to 12 hours
train_clean = train_clean.fillna(method='bfill', limit=12)  # Limit backward fill to 12 hours
train_clean = train_clean.interpolate(method='time')

In [19]:
# For test data
test_clean = test.copy()
test_clean = test_clean.fillna(method='ffill')
test_clean = test_clean.fillna(method='bfill')
test_clean = test_clean.interpolate(method='linear')

print(f"✅ After treatment - Train: {train_clean.isnull().sum().sum()}, Test: {test_clean.isnull().sum().sum()}")
print(f"📊 Clean datasets - Train: {train_clean.shape}, Test: {test_clean.shape}")

✅ After treatment - Train: 12, Test: 0
📊 Clean datasets - Train: (30676, 11), Test: (13148, 10)


In [22]:
# Advanced Feature Engineering
def create_advanced_features(df, target_col='pm2.5'):
    """Create lag features, rolling statistics, and temporal features"""
    df_enhanced = df.copy()

    # Temporal features
    df_enhanced['hour'] = df_enhanced.index.hour
    df_enhanced['day_of_week'] = df_enhanced.index.dayofweek
    df_enhanced['month'] = df_enhanced.index.month
    df_enhanced['season'] = (df_enhanced.index.month % 12 + 3) // 3
    df_enhanced['is_weekend'] = df_enhanced['day_of_week'].isin([5, 6]).astype(int)
     # Cyclical encoding for temporal features
    df_enhanced['hour_sin'] = np.sin(2 * np.pi * df_enhanced['hour'] / 24)
    df_enhanced['hour_cos'] = np.cos(2 * np.pi * df_enhanced['hour'] / 24)
    df_enhanced['day_sin'] = np.sin(2 * np.pi * df_enhanced['day_of_week'] / 7)
    df_enhanced['day_cos'] = np.cos(2 * np.pi * df_enhanced['day_of_week'] / 7)
    df_enhanced['month_sin'] = np.sin(2 * np.pi * df_enhanced['month'] / 12)
    df_enhanced['month_cos'] = np.cos(2 * np.pi * df_enhanced['month'] / 12)

    if target_col in df_enhanced.columns:
        # Lag features for PM2.5
        for lag in [1, 2, 3, 6, 12, 24, 48]:
            df_enhanced[f'pm2.5_lag_{lag}'] = df_enhanced[target_col].shift(lag)
             # Rolling statistics for PM2.5
        for window in [6, 12, 24, 48]:
            df_enhanced[f'pm2.5_roll_mean_{window}'] = df_enhanced[target_col].rolling(window, min_periods=1).mean()
            df_enhanced[f'pm2.5_roll_std_{window}'] = df_enhanced[target_col].rolling(window, min_periods=1).std()
            df_enhanced[f'pm2.5_roll_min_{window}'] = df_enhanced[target_col].rolling(window, min_periods=1).min()
            df_enhanced[f'pm2.5_roll_max_{window}'] = df_enhanced[target_col].rolling(window, min_periods=1).max()

    # Weather interaction features
    df_enhanced['temp_dewp_diff'] = df_enhanced['TEMP'] - df_enhanced['DEWP']
    df_enhanced['wind_pressure'] = df_enhanced['Iws'] * df_enhanced['PRES']
    df_enhanced['humidity_index'] = df_enhanced['DEWP'] / (df_enhanced['TEMP'] + 1e-6)
    # Rolling features for weather variables
    for col in ['DEWP', 'TEMP', 'PRES', 'Iws']:
        for window in [6, 12, 24]:
            df_enhanced[f'{col}_roll_mean_{window}'] = df_enhanced[col].rolling(window, min_periods=1).mean()
            df_enhanced[f'{col}_roll_std_{window}'] = df_enhanced[col].rolling(window, min_periods=1).std()

    # Wind direction features
    wind_cols = [col for col in df_enhanced.columns if 'cbwd' in col]
    if len(wind_cols) >= 2:
        df_enhanced['wind_complexity'] = sum(df_enhanced[col] for col in wind_cols)
        # Drop original temporal columns (keep encoded versions)
    df_enhanced = df_enhanced.drop(['hour', 'day_of_week', 'month'], axis=1)

    return df_enhanced

def create_test_features(df):
    """Create features for test data (without PM2.5 lag features)"""
    df_enhanced = df.copy()

    # Temporal features
    df_enhanced['hour'] = df_enhanced.index.hour
    df_enhanced['day_of_week'] = df_enhanced.index.dayofweek
    df_enhanced['month'] = df_enhanced.index.month
    df_enhanced['season'] = (df_enhanced.index.month % 12 + 3) // 3
    df_enhanced['is_weekend'] = df_enhanced['day_of_week'].isin([5, 6]).astype(int)
    # Cyclical encoding for temporal features
    df_enhanced['hour_sin'] = np.sin(2 * np.pi * df_enhanced['hour'] / 24)
    df_enhanced['hour_cos'] = np.cos(2 * np.pi * df_enhanced['hour'] / 24)
    df_enhanced['day_sin'] = np.sin(2 * np.pi * df_enhanced['day_of_week'] / 7)
    df_enhanced['day_cos'] = np.cos(2 * np.pi * df_enhanced['day_of_week'] / 7)
    df_enhanced['month_sin'] = np.sin(2 * np.pi * df_enhanced['month'] / 12)
    df_enhanced['month_cos'] = np.cos(2 * np.pi * df_enhanced['month'] / 12)

    # Weather interaction features
    df_enhanced['temp_dewp_diff'] = df_enhanced['TEMP'] - df_enhanced['DEWP']
    df_enhanced['wind_pressure'] = df_enhanced['Iws'] * df_enhanced['PRES']
    df_enhanced['humidity_index'] = df_enhanced['DEWP'] / (df_enhanced['TEMP'] + 1e-6)

    # Rolling features for weather variables
    for col in ['DEWP', 'TEMP', 'PRES', 'Iws']:
        for window in [6, 12, 24]:
            df_enhanced[f'{col}_roll_mean_{window}'] = df_enhanced[col].rolling(window, min_periods=1).mean()
            df_enhanced[f'{col}_roll_std_{window}'] = df_enhanced[col].rolling(window, min_periods=1).std()

    # Wind direction features
    wind_cols = [col for col in df_enhanced.columns if 'cbwd' in col]
    if len(wind_cols) >= 2:
        df_enhanced['wind_complexity'] = sum(df_enhanced[col] for col in wind_cols)

    # Drop original temporal columns
    df_enhanced = df_enhanced.drop(['hour', 'day_of_week', 'month'], axis=1)

    return df_enhanced

print("🛠️ Feature engineering functions defined.")

# Apply feature engineering
print("🔧 Creating advanced features...")
train_enhanced = create_advanced_features(train_clean)
test_enhanced = create_test_features(test_clean)

print(f" Original features: {train_clean.shape[1]}")
print(f" Enhanced train features: {train_enhanced.shape[1]}")
print(f" Enhanced test features: {test_enhanced.shape[1]}")
print(f" New features added: {train_enhanced.shape[1] - train_clean.shape[1]}")

🛠️ Feature engineering functions defined.
🔧 Creating advanced features...
 Original features: 11
 Enhanced train features: 70
 Enhanced test features: 46
 New features added: 59


In [25]:
# Handle remaining NaNs (from lag and rolling features)
print(" Handling remaining NaN values...")
train_enhanced = train_enhanced.fillna(method='bfill').fillna(method='ffill').fillna(0)
test_enhanced = test_enhanced.fillna(method='bfill').fillna(method='ffill').fillna(0)

print(f"Final datasets - Train: {train_enhanced.shape}, Test: {test_enhanced.shape}")


🔧 Handling remaining NaN values...
Final datasets - Train: (30676, 70), Test: (13148, 46)


# Handle missing values


- Check the dataset for missing values and decide how to handle them.
- In this example, missing values are filled with the mean. You can experiment with other strategies.

In [48]:
# Handle missing values
print("🔧 Handling missing values...")

# For training data
train_clean = train.copy()
train_clean = train_clean.fillna(method='ffill')  # Forward fill
train_clean = train_clean.fillna(method='bfill')  # Backward fill
train_clean = train_clean.interpolate(method='linear')  # Linear interpolation

# For test data
test_clean = test.copy()
test_clean = test_clean.fillna(method='ffill')
test_clean = test_clean.fillna(method='bfill')
test_clean = test_clean.interpolate(method='linear')

print(f"✅ After treatment - Train: {train_clean.isnull().sum().sum()}, Test: {test_clean.isnull().sum().sum()}")
print(f"📊 Clean datasets - Train: {train_clean.shape}, Test: {test_clean.shape}")

🔧 Handling missing values...
✅ After treatment - Train: 0, Test: 0
📊 Clean datasets - Train: (30676, 11), Test: (13148, 10)


In [49]:
# Handle remaining NaNs (from lag and rolling features)
print("🔧 Handling remaining NaN values...")
train_enhanced = train_enhanced.fillna(method='bfill').fillna(0)
test_enhanced = test_enhanced.fillna(method='bfill').fillna(0)

print(f"✅ Final datasets - Train: {train_enhanced.shape}, Test: {test_enhanced.shape}")

🔧 Handling remaining NaN values...
✅ Final datasets - Train: (30676, 70), Test: (13148, 46)


# Separate features and target

- Feel free to trop any non-essential columns like that you think might not contribute to modeling.

In [43]:
# Time series sequence creation
def create_sequences(data, target, sequence_length=24):
    """Create sequences for LSTM input"""
    X, y = [], []

    for i in range(sequence_length, len(data)):
        X.append(data[i-sequence_length:i])
        y.append(target[i])

    return np.array(X), np.array(y)

In [46]:
# ======================
# SEPARATE FEATURES & TARGET
# ======================

print("🎯 Separating features and target variables...")

# For training data - use the common features we already identified
train_features = X_train_common  # This already has the common features only
train_target = train_enhanced['pm2.5']

# For test data - use the common features
test_features = X_test_common    # This already has the common features only

print(f"📊 Training features shape: {train_features.shape}")
print(f"📊 Training target shape: {train_target.shape}")
print(f"📊 Test features shape: {test_features.shape}")

# Display feature names
print(f"\n🔍 Feature columns: {list(train_features.columns)}")
print(f"🎯 Target column: pm2.5")

🎯 Separating features and target variables...
📊 Training features shape: (30676, 45)
📊 Training target shape: (30676,)
📊 Test features shape: (13148, 45)

🔍 Feature columns: ['DEWP', 'TEMP', 'PRES', 'Iws', 'Is', 'Ir', 'cbwd_NW', 'cbwd_SE', 'cbwd_cv', 'season', 'is_weekend', 'hour_sin', 'hour_cos', 'day_sin', 'day_cos', 'month_sin', 'month_cos', 'temp_dewp_diff', 'wind_pressure', 'humidity_index', 'DEWP_roll_mean_6', 'DEWP_roll_std_6', 'DEWP_roll_mean_12', 'DEWP_roll_std_12', 'DEWP_roll_mean_24', 'DEWP_roll_std_24', 'TEMP_roll_mean_6', 'TEMP_roll_std_6', 'TEMP_roll_mean_12', 'TEMP_roll_std_12', 'TEMP_roll_mean_24', 'TEMP_roll_std_24', 'PRES_roll_mean_6', 'PRES_roll_std_6', 'PRES_roll_mean_12', 'PRES_roll_std_12', 'PRES_roll_mean_24', 'PRES_roll_std_24', 'Iws_roll_mean_6', 'Iws_roll_std_6', 'Iws_roll_mean_12', 'Iws_roll_std_12', 'Iws_roll_mean_24', 'Iws_roll_std_24', 'wind_complexity']
🎯 Target column: pm2.5


In [47]:
print("🏗️ Creating time series sequences...")
X_seq, y_seq = create_sequences(X_train_scaled, train_target.values, SEQUENCE_LENGTH)

# Train/validation split
split_idx = int(0.85 * len(X_seq))
X_train, X_val = X_seq[:split_idx], X_seq[split_idx:]
y_train_seq, y_val = y_seq[:split_idx], y_seq[split_idx:]

print(f"📊 Sequences shape: {X_seq.shape}")
print(f"📊 Train: {X_train.shape}, Validation: {X_val.shape}")

🏗️ Creating time series sequences...
📊 Sequences shape: (30640, 36, 68)
📊 Train: (26044, 36, 68), Validation: (4596, 36, 68)


# Build model

Below is a simple LSTM model. Your task is to experiment with different parameters like, numbers of layers, units, activation functions, and optimizers, etc to get the best performing model. Experiment with other optimizers (e.g., SGD) or hyperparameters to improve performance.

In [59]:
# ======================
# CORRECT FEATURE ALIGNMENT
# ======================

print("🔧 Correcting feature alignment...")

# First, let's see what features we actually have
print("📊 Training features:", train_enhanced.columns.tolist())
print("📊 Test features:", test_enhanced.columns.tolist())

# The problem: Training has PM2.5 lag features that test can't have
# Solution: Remove PM2.5-dependent features from training
pm25_dependent_features = [col for col in train_enhanced.columns if 'pm2.5_' in col]
print(f"🚫 Removing PM2.5-dependent features: {pm25_dependent_features}")

# Use only features that exist in both datasets
common_features = [col for col in train_enhanced.columns
                  if col in test_enhanced.columns and col not in ['pm2.5', 'No']]

print(f"✅ Using {len(common_features)} common features:")
print(common_features)

# Create aligned datasets
X_train_aligned = train_enhanced[common_features]
y_train = train_enhanced['pm2.5']
X_test_aligned = test_enhanced[common_features]

print(f"📊 Aligned training features: {X_train_aligned.shape}")
print(f"📊 Aligned test features: {X_test_aligned.shape}")

# Scale the aligned features
scaler_aligned = StandardScaler()
X_train_scaled_aligned = scaler_aligned.fit_transform(X_train_aligned)
X_test_scaled_aligned = scaler_aligned.transform(X_test_aligned)

print("✅ Feature alignment completed!")

🔧 Correcting feature alignment...
📊 Training features: ['No', 'DEWP', 'TEMP', 'PRES', 'Iws', 'Is', 'Ir', 'cbwd_NW', 'cbwd_SE', 'cbwd_cv', 'pm2.5', 'season', 'is_weekend', 'hour_sin', 'hour_cos', 'day_sin', 'day_cos', 'month_sin', 'month_cos', 'pm2.5_lag_1', 'pm2.5_lag_2', 'pm2.5_lag_3', 'pm2.5_lag_6', 'pm2.5_lag_12', 'pm2.5_lag_24', 'pm2.5_lag_48', 'pm2.5_roll_mean_6', 'pm2.5_roll_std_6', 'pm2.5_roll_min_6', 'pm2.5_roll_max_6', 'pm2.5_roll_mean_12', 'pm2.5_roll_std_12', 'pm2.5_roll_min_12', 'pm2.5_roll_max_12', 'pm2.5_roll_mean_24', 'pm2.5_roll_std_24', 'pm2.5_roll_min_24', 'pm2.5_roll_max_24', 'pm2.5_roll_mean_48', 'pm2.5_roll_std_48', 'pm2.5_roll_min_48', 'pm2.5_roll_max_48', 'temp_dewp_diff', 'wind_pressure', 'humidity_index', 'DEWP_roll_mean_6', 'DEWP_roll_std_6', 'DEWP_roll_mean_12', 'DEWP_roll_std_12', 'DEWP_roll_mean_24', 'DEWP_roll_std_24', 'TEMP_roll_mean_6', 'TEMP_roll_std_6', 'TEMP_roll_mean_12', 'TEMP_roll_std_12', 'TEMP_roll_mean_24', 'TEMP_roll_std_24', 'PRES_roll_mean_6'

In [60]:
# ======================
# RE-TRAIN WITH ALIGNED FEATURES (QUICK VERSION)
# ======================

print("🔄 Training model with aligned features...")

# Create sequences with aligned features
X_seq_aligned, y_seq_aligned = create_sequences(X_train_scaled_aligned, y_train.values, SEQUENCE_LENGTH)

# Train/validation split
split_idx = int(0.85 * len(X_seq_aligned))
X_train_aligned, X_val_aligned = X_seq_aligned[:split_idx], X_seq_aligned[split_idx:]
y_train_aligned, y_val_aligned = y_seq_aligned[:split_idx], y_seq_aligned[split_idx:]

print(f"📊 Aligned sequences shape: {X_seq_aligned.shape}")

# Build model with correct input shape
model_aligned = create_enhanced_model(X_train_aligned.shape[1:])
model_aligned.compile(
    optimizer=Adam(learning_rate=0.002),
    loss='huber',
    metrics=['mae']
)

# Train quickly (fewer epochs since we already know good parameters)
history_aligned = model_aligned.fit(
    X_train_aligned, y_train_aligned,
    validation_data=(X_val_aligned, y_val_aligned),
    epochs=30,  # Fewer epochs for quick training
    batch_size=BATCH_SIZE,
    callbacks=callbacks,
    verbose=1
)

# Evaluate
val_pred_aligned = model_aligned.predict(X_val_aligned, verbose=0)
val_rmse_aligned = np.sqrt(mean_squared_error(y_val_aligned, val_pred_aligned))
print(f"🎯 Aligned Model RMSE: {val_rmse_aligned:.2f}")

🔄 Training model with aligned features...
📊 Aligned sequences shape: (30640, 36, 45)
Epoch 1/30
[1m204/204[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 187ms/step - loss: 82.6972 - mae: 83.1958 - val_loss: 77.0363 - val_mae: 77.5345 - learning_rate: 0.0020
Epoch 2/30
[1m204/204[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 178ms/step - loss: 54.2871 - mae: 54.7843 - val_loss: 49.8468 - val_mae: 50.3418 - learning_rate: 0.0020
Epoch 3/30
[1m204/204[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 177ms/step - loss: 35.9352 - mae: 36.4298 - val_loss: 44.0857 - val_mae: 44.5804 - learning_rate: 0.0020
Epoch 4/30
[1m204/204[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 193ms/step - loss: 31.1083 - mae: 31.6023 - val_loss: 41.7749 - val_mae: 42.2707 - learning_rate: 0.0020
Epoch 5/30
[1m204/204[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 181ms/step - loss: 28.5551 - mae: 29.0485 - val_loss: 42.3813 - val_mae: 42.8761 - learning_rate: 0.0020


In [62]:
# ======================
# GENERATE PREDICTIONS (WILL WORK NOW)
# ======================

print("🔮 Generating test predictions with aligned features...")

test_sequences = []
for i in range(len(X_test_scaled_aligned)):
    if i < SEQUENCE_LENGTH:
        needed_from_train = SEQUENCE_LENGTH - (i + 1)
        if needed_from_train > 0:
            sequence = np.vstack([X_train_scaled_aligned[-needed_from_train:], X_test_scaled_aligned[:i+1]])
        else:
            sequence = X_test_scaled_aligned[:SEQUENCE_LENGTH]
    else:
        sequence = X_test_scaled_aligned[i-SEQUENCE_LENGTH+1:i+1]

    if sequence.shape[0] != SEQUENCE_LENGTH:
        if sequence.shape[0] < SEQUENCE_LENGTH:
            padding_needed = SEQUENCE_LENGTH - sequence.shape[0]
            padding = np.repeat(sequence[0:1], padding_needed, axis=0)
            sequence = np.vstack([padding, sequence])
        else:
            sequence = sequence[-SEQUENCE_LENGTH:]

    test_sequences.append(sequence)

X_test_seq = np.array(test_sequences)
print(f"📊 Test sequences shape: {X_test_seq.shape}")

# This will work now - same feature dimensions!
test_predictions = model_aligned.predict(X_test_seq, verbose=0)
test_predictions = np.maximum(test_predictions.flatten(), 0)

print(f"📊 Predictions - Min: {test_predictions.min():.1f}, Max: {test_predictions.max():.1f}")

🔮 Generating test predictions with aligned features...
📊 Test sequences shape: (13148, 36, 45)
📊 Predictions - Min: 13.8, Max: 392.3


In [45]:
print("🚀 Building and training LSTM model...")

# Create and train model
model = create_enhanced_model(X_train.shape[1:])
model.compile(
    optimizer=Adam(learning_rate=0.002),
    loss='huber',
    metrics=['mae']
)

history = model.fit(
    X_train, y_train_seq,
    validation_data=(X_val, y_val),
    epochs=50,
    batch_size=BATCH_SIZE,
    callbacks=callbacks,
    verbose=1
)

print("✅ Model training completed!")

🚀 Building and training LSTM model...
Epoch 1/50
[1m204/204[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 209ms/step - loss: 79.6651 - mae: 80.1634 - val_loss: 36.5207 - val_mae: 37.0150 - learning_rate: 0.0020
Epoch 2/50
[1m204/204[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 189ms/step - loss: 26.0483 - mae: 26.5417 - val_loss: 23.7085 - val_mae: 24.2011 - learning_rate: 0.0020
Epoch 3/50
[1m204/204[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 194ms/step - loss: 21.5637 - mae: 22.0560 - val_loss: 21.6454 - val_mae: 22.1370 - learning_rate: 0.0020
Epoch 4/50
[1m204/204[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 182ms/step - loss: 19.7368 - mae: 20.2289 - val_loss: 21.6713 - val_mae: 22.1614 - learning_rate: 0.0020
Epoch 5/50
[1m204/204[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 177ms/step - loss: 18.8892 - mae: 19.3802 - val_loss: 21.3461 - val_mae: 21.8367 - learning_rate: 0.0020
Epoch 6/50
[1m204/204[0m [32m━━━━━━━━━━━━━━━

In [63]:
print("📊 Evaluating model performance...")

# Evaluate model
val_pred = model.predict(X_val, verbose=0)
val_rmse = np.sqrt(mean_squared_error(y_val, val_pred))

print(f"\n🎯 Model Performance:")
print(f"Validation RMSE: {val_rmse:.2f}")
print(f"Target: < 3000 RMSE")

if val_rmse < 3000:
    print("🎉 TARGET ACHIEVED!")
else:
    print(f"📈 Need {val_rmse - 3000:.1f} points improvement")

📊 Evaluating model performance...

🎯 Model Performance:
Validation RMSE: 37.26
Target: < 3000 RMSE
🎉 TARGET ACHIEVED!


In [64]:
# ======================
# CORRECT FEATURE ALIGNMENT
# ======================

print("🔧 Correcting feature alignment...")

# First, let's see what features we actually have
print("📊 Training features:", train_enhanced.columns.tolist())
print("📊 Test features:", test_enhanced.columns.tolist())

# The problem: Training has PM2.5 lag features that test can't have
# Solution: Remove PM2.5-dependent features from training
pm25_dependent_features = [col for col in train_enhanced.columns if 'pm2.5_' in col]
print(f"🚫 Removing PM2.5-dependent features: {pm25_dependent_features}")

# Use only features that exist in both datasets
common_features = [col for col in train_enhanced.columns
                  if col in test_enhanced.columns and col not in ['pm2.5', 'No']]

print(f"✅ Using {len(common_features)} common features:")
print(common_features)

# Create aligned datasets
X_train_aligned = train_enhanced[common_features]
y_train = train_enhanced['pm2.5']
X_test_aligned = test_enhanced[common_features]

print(f"📊 Aligned training features: {X_train_aligned.shape}")
print(f"📊 Aligned test features: {X_test_aligned.shape}")

# Scale the aligned features
scaler_aligned = StandardScaler()
X_train_scaled_aligned = scaler_aligned.fit_transform(X_train_aligned)
X_test_scaled_aligned = scaler_aligned.transform(X_test_aligned)

print("✅ Feature alignment completed!")

🔧 Correcting feature alignment...
📊 Training features: ['No', 'DEWP', 'TEMP', 'PRES', 'Iws', 'Is', 'Ir', 'cbwd_NW', 'cbwd_SE', 'cbwd_cv', 'pm2.5', 'season', 'is_weekend', 'hour_sin', 'hour_cos', 'day_sin', 'day_cos', 'month_sin', 'month_cos', 'pm2.5_lag_1', 'pm2.5_lag_2', 'pm2.5_lag_3', 'pm2.5_lag_6', 'pm2.5_lag_12', 'pm2.5_lag_24', 'pm2.5_lag_48', 'pm2.5_roll_mean_6', 'pm2.5_roll_std_6', 'pm2.5_roll_min_6', 'pm2.5_roll_max_6', 'pm2.5_roll_mean_12', 'pm2.5_roll_std_12', 'pm2.5_roll_min_12', 'pm2.5_roll_max_12', 'pm2.5_roll_mean_24', 'pm2.5_roll_std_24', 'pm2.5_roll_min_24', 'pm2.5_roll_max_24', 'pm2.5_roll_mean_48', 'pm2.5_roll_std_48', 'pm2.5_roll_min_48', 'pm2.5_roll_max_48', 'temp_dewp_diff', 'wind_pressure', 'humidity_index', 'DEWP_roll_mean_6', 'DEWP_roll_std_6', 'DEWP_roll_mean_12', 'DEWP_roll_std_12', 'DEWP_roll_mean_24', 'DEWP_roll_std_24', 'TEMP_roll_mean_6', 'TEMP_roll_std_6', 'TEMP_roll_mean_12', 'TEMP_roll_std_12', 'TEMP_roll_mean_24', 'TEMP_roll_std_24', 'PRES_roll_mean_6'

In [65]:
# ======================
# CREATE SUBMISSION
# ======================

print("💾 Creating submission file...")

# Create and save submission
experiment_name = f"enhanced_lstm_rmse_{val_rmse:.0f}"
filename, submission = save_submission(
    test_predictions,
    experiment_name,
    test.index
)

print(f"\n🎯 EXPERIMENT COMPLETE!")
print(f"📈 Validation RMSE: {val_rmse:.2f}")
print(f"💾 Submission saved: {filename}")
print(f"\n📋 Sample predictions:")
print(submission.head(10))

# Check if submission file exists
import os
if os.path.exists(filename):
    print(f"✅ Confirmed: {filename} exists in submissions folder!")
    print(f"📁 File size: {os.path.getsize(filename)} bytes")
else:
    print(f"❌ Warning: {filename} not found!")

💾 Creating submission file...
✅ Submission saved: submissions/20250920_092621_enhanced_lstm_rmse_37.csv
📊 Predictions - Min: 13.8, Max: 392.3

🎯 EXPERIMENT COMPLETE!
📈 Validation RMSE: 37.26
💾 Submission saved: submissions/20250920_092621_enhanced_lstm_rmse_37.csv

📋 Sample predictions:
                row ID  pm2.5
0   2013-07-02 4:00:00     31
1   2013-07-02 5:00:00     30
2   2013-07-02 6:00:00     29
3   2013-07-02 7:00:00     28
4   2013-07-02 8:00:00     28
5   2013-07-02 9:00:00     27
6  2013-07-02 10:00:00     25
7  2013-07-02 11:00:00     25
8  2013-07-02 12:00:00     24
9  2013-07-02 13:00:00     24
✅ Confirmed: submissions/20250920_092621_enhanced_lstm_rmse_37.csv exists in submissions folder!
📁 File size: 301342 bytes


In [66]:
# Create and save submission
experiment_name = f"enhanced_lstm_rmse_{val_rmse:.0f}"
filename, submission = save_submission(
    test_predictions,
    experiment_name,
    test.index
)

print(f"\n🎯 EXPERIMENT COMPLETE!")
print(f"📈 Validation RMSE: {val_rmse:.2f}")
print(f"💾 Submission saved: {filename}")
print(f"\n📋 Sample predictions:")
print(submission.head(10))

# Check if submission file exists
import os
if os.path.exists(filename):
    print(f"✅ Confirmed: {filename} exists in submissions folder!")
    print(f"📁 File size: {os.path.getsize(filename)} bytes")
else:
    print(f"❌ Warning: {filename} not found!")

✅ Submission saved: submissions/20250920_092625_enhanced_lstm_rmse_37.csv
📊 Predictions - Min: 13.8, Max: 392.3

🎯 EXPERIMENT COMPLETE!
📈 Validation RMSE: 37.26
💾 Submission saved: submissions/20250920_092625_enhanced_lstm_rmse_37.csv

📋 Sample predictions:
                row ID  pm2.5
0   2013-07-02 4:00:00     31
1   2013-07-02 5:00:00     30
2   2013-07-02 6:00:00     29
3   2013-07-02 7:00:00     28
4   2013-07-02 8:00:00     28
5   2013-07-02 9:00:00     27
6  2013-07-02 10:00:00     25
7  2013-07-02 11:00:00     25
8  2013-07-02 12:00:00     24
9  2013-07-02 13:00:00     24
✅ Confirmed: submissions/20250920_092625_enhanced_lstm_rmse_37.csv exists in submissions folder!
📁 File size: 301342 bytes
