<a href="https://colab.research.google.com/github/jkeza1/time_series_forecasting/blob/main/air_quality_forecasting_starter_code_(2).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Beijing Air Quality Forecasting Starter Notebook

In [98]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from datetime import datetime
import os

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [99]:
def save_submission(predictions, experiment_name, test_index):
    """Save submission with timestamp and experiment info"""
    os.makedirs('submissions', exist_ok=True)

    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

    # FIXED: Apply strftime to each datetime individually
    row_ids = test_index.strftime('%Y-%m-%d %H:%M:%S')  # This now works correctly

    submission = pd.DataFrame({
        'row ID': row_ids,  # Use the properly formatted row IDs
        'pm2.5': predictions.round().astype(int)
    })

    filename = f'submissions/{timestamp}_{experiment_name}.csv'
    submission.to_csv(filename, index=False)

    print(f"✅ Submission saved: {filename}")
    print(f"📊 Predictions - Min: {predictions.min():.1f}, Max: {predictions.max():.1f}")
    print(f"📋 Number of row IDs: {len(row_ids)}")  # Added for verification

    return filename, submission

In [100]:
# Mount Google Drive to access datasets
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [101]:
# Load the datasets
# Ensure train.csv and test.csv are saved in your Google Drive in the same folder.
# Replace the file paths below with the actual paths to your dataset.
train = pd.read_csv('/content/drive/MyDrive/Kaggle_competition_ML/air_quality_forecasting/train.csv')
test = pd.read_csv('/content/drive/MyDrive/Kaggle_competition_ML/air_quality_forecasting/test.csv')
sample_submission = pd.read_csv('/content/drive/MyDrive/Kaggle_competition_ML/air_quality_forecasting/sample_submission.csv')


In [102]:
print(f"📊 Training data: {train.shape}")
print(f"📊 Test data: {test.shape}")
print(f"\n📋 Columns: {list(train.columns)}")

📊 Training data: (30676, 12)
📊 Test data: (13148, 11)

📋 Columns: ['No', 'DEWP', 'TEMP', 'PRES', 'Iws', 'Is', 'Ir', 'datetime', 'cbwd_NW', 'cbwd_SE', 'cbwd_cv', 'pm2.5']


In [103]:
# Check
print(train.head())
print(test.head())


   No      DEWP      TEMP      PRES       Iws        Is        Ir  \
0   1 -1.580878 -1.922250  0.443328 -0.441894 -0.069353 -0.137667   
1   2 -1.580878 -2.004228  0.345943 -0.379306 -0.069353 -0.137667   
2   3 -1.580878 -1.922250  0.248559 -0.343514 -0.069353 -0.137667   
3   4 -1.580878 -2.168183  0.248559 -0.280926 -0.069353 -0.137667   
4   5 -1.511594 -2.004228  0.151174 -0.218339 -0.069353 -0.137667   

              datetime   cbwd_NW   cbwd_SE   cbwd_cv  pm2.5  
0  2010-01-01 00:00:00  1.448138 -0.732019 -0.522096    NaN  
1  2010-01-01 01:00:00  1.448138 -0.732019 -0.522096    NaN  
2  2010-01-01 02:00:00  1.448138 -0.732019 -0.522096    NaN  
3  2010-01-01 03:00:00  1.448138 -0.732019 -0.522096    NaN  
4  2010-01-01 04:00:00  1.448138 -0.732019 -0.522096    NaN  
      No      DEWP      TEMP      PRES       Iws        Is        Ir  \
0  30677  1.190496  0.701029 -2.186052 -0.003982 -0.069353 -0.137667   
1  30678  1.121211  0.619051 -2.186052  0.031811 -0.069353 -0.137667 

In [104]:
print(f"\n❓ Missing values:")
print(f"Train: {train.isnull().sum().sum()}")
print(f"Test: {test.isnull().sum().sum()}")


❓ Missing values:
Train: 1921
Test: 0


In [105]:
train['datetime'] = pd.to_datetime(train['datetime'])
test['datetime'] = pd.to_datetime(test['datetime'])

train.set_index('datetime', inplace=True)
test.set_index('datetime', inplace=True)

In [106]:
print(f"Time range - Train: {train.index.min()} to {train.index.max()}")
print(f"Time range - Test: {test.index.min()} to {test.index.max()}")

Time range - Train: 2010-01-01 00:00:00 to 2013-07-02 03:00:00
Time range - Test: 2013-07-02 04:00:00 to 2014-12-31 23:00:00


In [107]:
# Key insights
if 'pm2.5' in train.columns:
    print(f"\nPM2.5 statistics:")
    print(f"Mean: {train['pm2.5'].mean():.1f}, Std: {train['pm2.5'].std():.1f}")
    print(f"Min: {train['pm2.5'].min():.1f}, Max: {train['pm2.5'].max():.1f}")



PM2.5 statistics:
Mean: 100.8, Std: 93.1
Min: 0.0, Max: 994.0


# Explore the training data

In this sections explore your dataset with appropiate statistics and visualisations to understand your better. Ensure that you explain output of every code cell and what it entails.

In [108]:
def create_advanced_features(df, target_col='pm2.5'):
    """Create lag features, rolling statistics, and temporal features"""
    df_enhanced = df.copy()

    # Temporal features
    df_enhanced['hour'] = df_enhanced.index.hour
    df_enhanced['day_of_week'] = df_enhanced.index.dayofweek
    df_enhanced['month'] = df_enhanced.index.month
    df_enhanced['season'] = (df_enhanced.index.month % 12 + 3) // 3
    df_enhanced['is_weekend'] = df_enhanced['day_of_week'].isin([5, 6]).astype(int)

    # Cyclical encoding for temporal features
    df_enhanced['hour_sin'] = np.sin(2 * np.pi * df_enhanced['hour'] / 24)
    df_enhanced['hour_cos'] = np.cos(2 * np.pi * df_enhanced['hour'] / 24)
    df_enhanced['day_sin'] = np.sin(2 * np.pi * df_enhanced['day_of_week'] / 7)
    df_enhanced['day_cos'] = np.cos(2 * np.pi * df_enhanced['day_of_week'] / 7)
    df_enhanced['month_sin'] = np.sin(2 * np.pi * df_enhanced['month'] / 12)
    df_enhanced['month_cos'] = np.cos(2 * np.pi * df_enhanced['month'] / 12)

    if target_col in df_enhanced.columns:
        # Lag features for PM2.5
        for lag in [1, 2, 3, 6, 12, 24, 48]:
            df_enhanced[f'pm2.5_lag_{lag}'] = df_enhanced[target_col].shift(lag)

        # Rolling statistics for PM2.5
        for window in [6, 12, 24, 48]:
            df_enhanced[f'pm2.5_roll_mean_{window}'] = df_enhanced[target_col].rolling(window, min_periods=1).mean()
            df_enhanced[f'pm2.5_roll_std_{window}'] = df_enhanced[target_col].rolling(window, min_periods=1).std()
            df_enhanced[f'pm2.5_roll_min_{window}'] = df_enhanced[target_col].rolling(window, min_periods=1).min()
            df_enhanced[f'pm2.5_roll_max_{window}'] = df_enhanced[target_col].rolling(window, min_periods=1).max()

    # Weather interaction features
    df_enhanced['temp_dewp_diff'] = df_enhanced['TEMP'] - df_enhanced['DEWP']
    df_enhanced['wind_pressure'] = df_enhanced['Iws'] * df_enhanced['PRES']
    df_enhanced['humidity_index'] = df_enhanced['DEWP'] / (df_enhanced['TEMP'] + 1e-6)

    # Rolling features for weather variables
    for col in ['DEWP', 'TEMP', 'PRES', 'Iws']:
        for window in [6, 12, 24]:
            df_enhanced[f'{col}_roll_mean_{window}'] = df_enhanced[col].rolling(window, min_periods=1).mean()
            df_enhanced[f'{col}_roll_std_{window}'] = df_enhanced[col].rolling(window, min_periods=1).std()

    # Wind direction features
    wind_cols = [col for col in df_enhanced.columns if 'cbwd' in col]
    if len(wind_cols) >= 2:
        df_enhanced['wind_complexity'] = sum(df_enhanced[col] for col in wind_cols)

    # Drop original temporal columns (keep encoded versions)
    df_enhanced = df_enhanced.drop(['hour', 'day_of_week', 'month'], axis=1)

    return df_enhanced

def create_test_features(df):
    """Create features for test data (without PM2.5 lag features)"""
    df_enhanced = df.copy()

    # Temporal features
    df_enhanced['hour'] = df_enhanced.index.hour
    df_enhanced['day_of_week'] = df_enhanced.index.dayofweek
    df_enhanced['month'] = df_enhanced.index.month
    df_enhanced['season'] = (df_enhanced.index.month % 12 + 3) // 3
    df_enhanced['is_weekend'] = df_enhanced['day_of_week'].isin([5, 6]).astype(int)

    # Cyclical encoding for temporal features
    df_enhanced['hour_sin'] = np.sin(2 * np.pi * df_enhanced['hour'] / 24)
    df_enhanced['hour_cos'] = np.cos(2 * np.pi * df_enhanced['hour'] / 24)
    df_enhanced['day_sin'] = np.sin(2 * np.pi * df_enhanced['day_of_week'] / 7)
    df_enhanced['day_cos'] = np.cos(2 * np.pi * df_enhanced['day_of_week'] / 7)
    df_enhanced['month_sin'] = np.sin(2 * np.pi * df_enhanced['month'] / 12)
    df_enhanced['month_cos'] = np.cos(2 * np.pi * df_enhanced['month'] / 12)

    # Weather interaction features
    df_enhanced['temp_dewp_diff'] = df_enhanced['TEMP'] - df_enhanced['DEWP']
    df_enhanced['wind_pressure'] = df_enhanced['Iws'] * df_enhanced['PRES']
    df_enhanced['humidity_index'] = df_enhanced['DEWP'] / (df_enhanced['TEMP'] + 1e-6)

    # Rolling features for weather variables
    for col in ['DEWP', 'TEMP', 'PRES', 'Iws']:
        for window in [6, 12, 24]:
            df_enhanced[f'{col}_roll_mean_{window}'] = df_enhanced[col].rolling(window, min_periods=1).mean()
            df_enhanced[f'{col}_roll_std_{window}'] = df_enhanced[col].rolling(window, min_periods=1).std()

    # Wind direction features
    wind_cols = [col for col in df_enhanced.columns if 'cbwd' in col]
    if len(wind_cols) >= 2:
        df_enhanced['wind_complexity'] = sum(df_enhanced[col] for col in wind_cols)

    # Drop original temporal columns
    df_enhanced = df_enhanced.drop(['hour', 'day_of_week', 'month'], axis=1)

    return df_enhanced

In [109]:
train_clean = train.copy()
# First fill with forward fill, then backward fill, then interpolate
train_clean = train_clean.fillna(method='ffill', limit=12)  # Limit forward fill to 12 hours
train_clean = train_clean.fillna(method='bfill', limit=12)  # Limit backward fill to 12 hours
train_clean = train_clean.interpolate(method='time')

In [110]:
# For test data
test_clean = test.copy()
test_clean = test_clean.fillna(method='ffill')
test_clean = test_clean.fillna(method='bfill')
test_clean = test_clean.interpolate(method='linear')

print(f"✅ After treatment - Train: {train_clean.isnull().sum().sum()}, Test: {test_clean.isnull().sum().sum()}")
print(f"📊 Clean datasets - Train: {train_clean.shape}, Test: {test_clean.shape}")

✅ After treatment - Train: 12, Test: 0
📊 Clean datasets - Train: (30676, 11), Test: (13148, 10)


In [111]:
train_clean = train.copy()
train_clean = train_clean.fillna(method='ffill').fillna(method='bfill').fillna(0)
test_clean = test.copy()
test_clean = test_clean.fillna(method='ffill').fillna(method='bfill').fillna(0)

In [112]:
# Handle remaining NaNs (from lag and rolling features)
print(" Handling remaining NaN values...")
train_enhanced = train_enhanced.fillna(method='bfill').fillna(method='ffill').fillna(0)
test_enhanced = test_enhanced.fillna(method='bfill').fillna(method='ffill').fillna(0)

print(f"Final datasets - Train: {train_enhanced.shape}, Test: {test_enhanced.shape}")


 Handling remaining NaN values...
Final datasets - Train: (30676, 70), Test: (13148, 46)


# Handle missing values


- Check the dataset for missing values and decide how to handle them.
- In this example, missing values are filled with the mean. You can experiment with other strategies.

In [113]:
# Handle missing values
print("🔧 Handling missing values...")

# For training data
train_clean = train.copy()
train_clean = train_clean.fillna(method='ffill')  # Forward fill
train_clean = train_clean.fillna(method='bfill')  # Backward fill
train_clean = train_clean.interpolate(method='linear')  # Linear interpolation

# For test data
test_clean = test.copy()
test_clean = test_clean.fillna(method='ffill')
test_clean = test_clean.fillna(method='bfill')
test_clean = test_clean.interpolate(method='linear')

print(f"✅ After treatment - Train: {train_clean.isnull().sum().sum()}, Test: {test_clean.isnull().sum().sum()}")
print(f"📊 Clean datasets - Train: {train_clean.shape}, Test: {test_clean.shape}")

🔧 Handling missing values...
✅ After treatment - Train: 0, Test: 0
📊 Clean datasets - Train: (30676, 11), Test: (13148, 10)


In [114]:
# Handle remaining NaNs (from lag and rolling features)
print("🔧 Handling remaining NaN values...")
train_enhanced = train_enhanced.fillna(method='bfill').fillna(0)
test_enhanced = test_enhanced.fillna(method='bfill').fillna(0)

print(f"✅ Final datasets - Train: {train_enhanced.shape}, Test: {test_enhanced.shape}")

🔧 Handling remaining NaN values...
✅ Final datasets - Train: (30676, 70), Test: (13148, 46)


# Separate features and target

- Feel free to trop any non-essential columns like that you think might not contribute to modeling.

In [115]:
# Time series sequence creation
def create_sequences(data, target, sequence_length=24):
    """Create sequences for LSTM input"""
    X, y = [], []

    for i in range(sequence_length, len(data)):
        X.append(data[i-sequence_length:i])
        y.append(target[i])

    return np.array(X), np.array(y)

In [116]:
# ======================
# SEPARATE FEATURES & TARGET
# ======================

print("🎯 Separating features and target variables...")

# For training data - use the common features we already identified
train_features = X_train_common  # This already has the common features only
train_target = train_enhanced['pm2.5']

# For test data - use the common features
test_features = X_test_common    # This already has the common features only

print(f"📊 Training features shape: {train_features.shape}")
print(f"📊 Training target shape: {train_target.shape}")
print(f"📊 Test features shape: {test_features.shape}")

# Display feature names
print(f"\n🔍 Feature columns: {list(train_features.columns)}")
print(f"🎯 Target column: pm2.5")

🎯 Separating features and target variables...
📊 Training features shape: (30676, 45)
📊 Training target shape: (30676,)
📊 Test features shape: (13148, 45)

🔍 Feature columns: ['DEWP', 'TEMP', 'PRES', 'Iws', 'Is', 'Ir', 'cbwd_NW', 'cbwd_SE', 'cbwd_cv', 'season', 'is_weekend', 'hour_sin', 'hour_cos', 'day_sin', 'day_cos', 'month_sin', 'month_cos', 'temp_dewp_diff', 'wind_pressure', 'humidity_index', 'DEWP_roll_mean_6', 'DEWP_roll_std_6', 'DEWP_roll_mean_12', 'DEWP_roll_std_12', 'DEWP_roll_mean_24', 'DEWP_roll_std_24', 'TEMP_roll_mean_6', 'TEMP_roll_std_6', 'TEMP_roll_mean_12', 'TEMP_roll_std_12', 'TEMP_roll_mean_24', 'TEMP_roll_std_24', 'PRES_roll_mean_6', 'PRES_roll_std_6', 'PRES_roll_mean_12', 'PRES_roll_std_12', 'PRES_roll_mean_24', 'PRES_roll_std_24', 'Iws_roll_mean_6', 'Iws_roll_std_6', 'Iws_roll_mean_12', 'Iws_roll_std_12', 'Iws_roll_mean_24', 'Iws_roll_std_24', 'wind_complexity']
🎯 Target column: pm2.5


In [117]:
print("🏗️ Creating time series sequences...")
X_seq, y_seq = create_sequences(X_train_scaled, train_target.values, SEQUENCE_LENGTH)

# Train/validation split
split_idx = int(0.85 * len(X_seq))
X_train, X_val = X_seq[:split_idx], X_seq[split_idx:]
y_train_seq, y_val = y_seq[:split_idx], y_seq[split_idx:]

print(f"📊 Sequences shape: {X_seq.shape}")
print(f"📊 Train: {X_train.shape}, Validation: {X_val.shape}")

🏗️ Creating time series sequences...
📊 Sequences shape: (30640, 36, 45)
📊 Train: (26044, 36, 45), Validation: (4596, 36, 45)


In [118]:
# Remove PM2.5-dependent features from training
common_features = [col for col in train_enhanced.columns
                  if col in test_enhanced.columns and col not in ['pm2.5', 'No']]

X_train_aligned = train_enhanced[common_features]
y_train = train_enhanced['pm2.5']
X_test_aligned = test_enhanced[common_features]

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_aligned)
X_test_scaled = scaler.transform(X_test_aligned)

print("✅ Feature alignment completed!")

✅ Feature alignment completed!


# Build model

Below is a simple LSTM model. Your task is to experiment with different parameters like, numbers of layers, units, activation functions, and optimizers, etc to get the best performing model. Experiment with other optimizers (e.g., SGD) or hyperparameters to improve performance.

In [119]:
SEQUENCE_LENGTH = 36
BATCH_SIZE = 128
callbacks = [
    EarlyStopping(patience=15, restore_best_weights=True, min_delta=1e-4),
    ReduceLROnPlateau(factor=0.7, patience=8, min_lr=1e-6, verbose=1)
]

In [120]:
def save_submission(predictions, experiment_name, test_index):
    """Save submission with timestamp and experiment info"""
    os.makedirs('submissions', exist_ok=True)

    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

    submission = pd.DataFrame({
        'row ID': test_index.strftime('%Y-%m-%d %H:%M:%S'),
        'pm2.5': predictions.round().astype(int)
    })

    filename = f'submissions/{timestamp}_{experiment_name}.csv'
    submission.to_csv(filename, index=False)

    print(f"✅ Submission saved: {filename}")
    print(f"📊 Predictions - Min: {predictions.min():.1f}, Max: {predictions.max():.1f}")

    return filename, submission

def download_submission(filename):
    """Download the submission file from Colab"""
    from google.colab import files
    try:
        files.download(filename)
        print(f"✅ Download initiated: {filename}")
    except Exception as e:
        print(f"❌ Download failed: {e}")
        print(f"📁 File is available at: {filename}")
        print("💡 You can manually download it from the file browser on the left")

def create_sequences(data, target, sequence_length=24):
    """Create sequences for LSTM input"""
    X, y = [], []

    for i in range(sequence_length, len(data)):
        X.append(data[i-sequence_length:i])
        y.append(target[i])

    return np.array(X), np.array(y)

def create_enhanced_model(input_shape):
    """Create enhanced LSTM model"""
    model = Sequential([
        Bidirectional(LSTM(64, activation='tanh', return_sequences=True), input_shape=input_shape),
        Dropout(0.4),
        LSTM(32, activation='tanh', return_sequences=True),
        Dropout(0.3),
        LSTM(16, activation='tanh'),
        Dropout(0.2),
        Dense(32, activation='relu'),
        Dense(16, activation='relu'),
        Dense(1)
    ])
    return model

def create_advanced_features(df, target_col='pm2.5'):
    """Create lag features, rolling statistics, and temporal features"""
    # ... [your existing feature engineering code] ...
    return df_enhanced

def create_test_features(df):
    """Create features for test data (without PM2.5 lag features)"""
    # ... [your existing test feature engineering code] ...
    return df_enhanced

print("All functions defined successfully!")

All functions defined successfully!


In [121]:
# ======================
# RE-TRAIN WITH ALIGNED FEATURES (QUICK VERSION)
# ======================

print("🔄 Training model with aligned features...")

# Create sequences with aligned features
X_seq_aligned, y_seq_aligned = create_sequences(X_train_scaled_aligned, y_train.values, SEQUENCE_LENGTH)

# Train/validation split
split_idx = int(0.85 * len(X_seq_aligned))
X_train_aligned, X_val_aligned = X_seq_aligned[:split_idx], X_seq_aligned[split_idx:]
y_train_aligned, y_val_aligned = y_seq_aligned[:split_idx], y_seq_aligned[split_idx:]

print(f"📊 Aligned sequences shape: {X_seq_aligned.shape}")

# Build model with correct input shape
model_aligned = create_enhanced_model(X_train_aligned.shape[1:])
model_aligned.compile(
    optimizer=Adam(learning_rate=0.002),
    loss='huber',
    metrics=['mae']
)

# Train quickly (fewer epochs since we already know good parameters)
history_aligned = model_aligned.fit(
    X_train_aligned, y_train_aligned,
    validation_data=(X_val_aligned, y_val_aligned),
    epochs=30,  # Fewer epochs for quick training
    batch_size=BATCH_SIZE,
    callbacks=callbacks,
    verbose=1
)

# Evaluate
val_pred_aligned = model_aligned.predict(X_val_aligned, verbose=0)
val_rmse_aligned = np.sqrt(mean_squared_error(y_val_aligned, val_pred_aligned))
print(f" Aligned Model RMSE: {val_rmse_aligned:.2f}")

🔄 Training model with aligned features...
📊 Aligned sequences shape: (30640, 36, 45)
Epoch 1/30
[1m204/204[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 181ms/step - loss: 83.3467 - mae: 83.8450 - val_loss: 76.8963 - val_mae: 77.3946 - learning_rate: 0.0020
Epoch 2/30
[1m204/204[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 197ms/step - loss: 51.1861 - mae: 51.6820 - val_loss: 48.7370 - val_mae: 49.2326 - learning_rate: 0.0020
Epoch 3/30
[1m204/204[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 185ms/step - loss: 35.2505 - mae: 35.7447 - val_loss: 49.4234 - val_mae: 49.9193 - learning_rate: 0.0020
Epoch 4/30
[1m204/204[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 191ms/step - loss: 31.2208 - mae: 31.7148 - val_loss: 45.0255 - val_mae: 45.5215 - learning_rate: 0.0020
Epoch 5/30
[1m204/204[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 179ms/step - loss: 28.2710 - mae: 28.7646 - val_loss: 47.4143 - val_mae: 47.9100 - learning_rate: 0.0020


In [122]:
# ======================
# GENERATE PREDICTIONS (WILL WORK NOW)
# ======================

print("🔮 Generating test predictions with aligned features...")

test_sequences = []
for i in range(len(X_test_scaled_aligned)):
    if i < SEQUENCE_LENGTH:
        needed_from_train = SEQUENCE_LENGTH - (i + 1)
        if needed_from_train > 0:
            sequence = np.vstack([X_train_scaled_aligned[-needed_from_train:], X_test_scaled_aligned[:i+1]])
        else:
            sequence = X_test_scaled_aligned[:SEQUENCE_LENGTH]
    else:
        sequence = X_test_scaled_aligned[i-SEQUENCE_LENGTH+1:i+1]

    if sequence.shape[0] != SEQUENCE_LENGTH:
        if sequence.shape[0] < SEQUENCE_LENGTH:
            padding_needed = SEQUENCE_LENGTH - sequence.shape[0]
            padding = np.repeat(sequence[0:1], padding_needed, axis=0)
            sequence = np.vstack([padding, sequence])
        else:
            sequence = sequence[-SEQUENCE_LENGTH:]

    test_sequences.append(sequence)

X_test_seq = np.array(test_sequences)
print(f"📊 Test sequences shape: {X_test_seq.shape}")

# This will work now - same feature dimensions!
test_predictions = model_aligned.predict(X_test_seq, verbose=0)
test_predictions = np.maximum(test_predictions.flatten(), 0)

print(f"📊 Predictions - Min: {test_predictions.min():.1f}, Max: {test_predictions.max():.1f}")

🔮 Generating test predictions with aligned features...
📊 Test sequences shape: (13148, 36, 45)
📊 Predictions - Min: 6.0, Max: 355.0


In [123]:
print("🚀 Building and training LSTM model...")

# Create and train model
model = create_enhanced_model(X_train.shape[1:])
model.compile(
    optimizer=Adam(learning_rate=0.002),
    loss='huber',
    metrics=['mae']
)

history = model.fit(
    X_train, y_train_seq,
    validation_data=(X_val, y_val),
    epochs=50,
    batch_size=BATCH_SIZE,
    callbacks=callbacks,
    verbose=1
)

print("✅ Model training completed!")

🚀 Building and training LSTM model...
Epoch 1/50
[1m204/204[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 195ms/step - loss: 82.9128 - mae: 83.4111 - val_loss: 58.5081 - val_mae: 59.0051 - learning_rate: 0.0020
Epoch 2/50
[1m204/204[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 206ms/step - loss: 44.7753 - mae: 45.2711 - val_loss: 51.8344 - val_mae: 52.3311 - learning_rate: 0.0020
Epoch 3/50
[1m204/204[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 188ms/step - loss: 37.1888 - mae: 37.6836 - val_loss: 47.8030 - val_mae: 48.2966 - learning_rate: 0.0020
Epoch 4/50
[1m204/204[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 198ms/step - loss: 33.3223 - mae: 33.8163 - val_loss: 45.6550 - val_mae: 46.1499 - learning_rate: 0.0020
Epoch 5/50
[1m204/204[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 206ms/step - loss: 30.5232 - mae: 31.0170 - val_loss: 47.7789 - val_mae: 48.2737 - learning_rate: 0.0020
Epoch 6/50
[1m204/204[0m [32m━━━━━━━━━━━━━━━

In [124]:
print("📊 Evaluating model performance...")

# Evaluate model
val_pred = model.predict(X_val, verbose=0)
val_rmse = np.sqrt(mean_squared_error(y_val, val_pred))

print(f"\n🎯 Model Performance:")
print(f"Validation RMSE: {val_rmse:.2f}")
print(f"Target: < 3000 RMSE")

if val_rmse < 3000:
    print("🎉 TARGET ACHIEVED!")
else:
    print(f"📈 Need {val_rmse - 3000:.1f} points improvement")

📊 Evaluating model performance...

🎯 Model Performance:
Validation RMSE: 75.05
Target: < 3000 RMSE
🎉 TARGET ACHIEVED!


In [125]:
# ======================
# CORRECT FEATURE ALIGNMENT
# ======================

print("🔧 Correcting feature alignment...")

# First, let's see what features we actually have
print("📊 Training features:", train_enhanced.columns.tolist())
print("📊 Test features:", test_enhanced.columns.tolist())

# The problem: Training has PM2.5 lag features that test can't have
# Solution: Remove PM2.5-dependent features from training
pm25_dependent_features = [col for col in train_enhanced.columns if 'pm2.5_' in col]
print(f"🚫 Removing PM2.5-dependent features: {pm25_dependent_features}")

# Use only features that exist in both datasets
common_features = [col for col in train_enhanced.columns
                  if col in test_enhanced.columns and col not in ['pm2.5', 'No']]

print(f"✅ Using {len(common_features)} common features:")
print(common_features)

# Create aligned datasets
X_train_aligned = train_enhanced[common_features]
y_train = train_enhanced['pm2.5']
X_test_aligned = test_enhanced[common_features]

print(f"📊 Aligned training features: {X_train_aligned.shape}")
print(f"📊 Aligned test features: {X_test_aligned.shape}")

# Scale the aligned features
scaler_aligned = StandardScaler()
X_train_scaled_aligned = scaler_aligned.fit_transform(X_train_aligned)
X_test_scaled_aligned = scaler_aligned.transform(X_test_aligned)

print("✅ Feature alignment completed!")

🔧 Correcting feature alignment...
📊 Training features: ['No', 'DEWP', 'TEMP', 'PRES', 'Iws', 'Is', 'Ir', 'cbwd_NW', 'cbwd_SE', 'cbwd_cv', 'pm2.5', 'season', 'is_weekend', 'hour_sin', 'hour_cos', 'day_sin', 'day_cos', 'month_sin', 'month_cos', 'pm2.5_lag_1', 'pm2.5_lag_2', 'pm2.5_lag_3', 'pm2.5_lag_6', 'pm2.5_lag_12', 'pm2.5_lag_24', 'pm2.5_lag_48', 'pm2.5_roll_mean_6', 'pm2.5_roll_std_6', 'pm2.5_roll_min_6', 'pm2.5_roll_max_6', 'pm2.5_roll_mean_12', 'pm2.5_roll_std_12', 'pm2.5_roll_min_12', 'pm2.5_roll_max_12', 'pm2.5_roll_mean_24', 'pm2.5_roll_std_24', 'pm2.5_roll_min_24', 'pm2.5_roll_max_24', 'pm2.5_roll_mean_48', 'pm2.5_roll_std_48', 'pm2.5_roll_min_48', 'pm2.5_roll_max_48', 'temp_dewp_diff', 'wind_pressure', 'humidity_index', 'DEWP_roll_mean_6', 'DEWP_roll_std_6', 'DEWP_roll_mean_12', 'DEWP_roll_std_12', 'DEWP_roll_mean_24', 'DEWP_roll_std_24', 'TEMP_roll_mean_6', 'TEMP_roll_std_6', 'TEMP_roll_mean_12', 'TEMP_roll_std_12', 'TEMP_roll_mean_24', 'TEMP_roll_std_24', 'PRES_roll_mean_6'

In [126]:
# ======================
# CREATE SUBMISSION
# ======================

print("💾 Creating submission file...")

# Create and save submission
experiment_name = f"enhanced_lstm_rmse_{val_rmse:.0f}"
filename, submission = save_submission(
    test_predictions,
    experiment_name,
    test.index
)

print(f"\n🎯 EXPERIMENT COMPLETE!")
print(f"📈 Validation RMSE: {val_rmse:.2f}")
print(f"💾 Submission saved: {filename}")
print(f"\n📋 Sample predictions:")
print(submission.head(10))

# Check if submission file exists
import os
if os.path.exists(filename):
    print(f"✅ Confirmed: {filename} exists in submissions folder!")
    print(f"📁 File size: {os.path.getsize(filename)} bytes")
else:
    print(f"❌ Warning: {filename} not found!")

💾 Creating submission file...
✅ Submission saved: submissions/20250920_104754_enhanced_lstm_rmse_75.csv
📊 Predictions - Min: 6.0, Max: 355.0

🎯 EXPERIMENT COMPLETE!
📈 Validation RMSE: 75.05
💾 Submission saved: submissions/20250920_104754_enhanced_lstm_rmse_75.csv

📋 Sample predictions:
                row ID  pm2.5
0  2013-07-02 04:00:00     17
1  2013-07-02 05:00:00     16
2  2013-07-02 06:00:00     16
3  2013-07-02 07:00:00     16
4  2013-07-02 08:00:00     16
5  2013-07-02 09:00:00     16
6  2013-07-02 10:00:00     15
7  2013-07-02 11:00:00     15
8  2013-07-02 12:00:00     15
9  2013-07-02 13:00:00     16
✅ Confirmed: submissions/20250920_104754_enhanced_lstm_rmse_75.csv exists in submissions folder!
📁 File size: 305933 bytes


In [127]:
# Create and save submission
experiment_name = f"enhanced_lstm_rmse_{val_rmse:.0f}"
filename, submission = save_submission(
    test_predictions,
    experiment_name,
    test.index
)

print(f"\n🎯 EXPERIMENT COMPLETE!")
print(f"📈 Validation RMSE: {val_rmse:.2f}")
print(f"💾 Submission saved: {filename}")
print(f"\n📋 Sample predictions:")
print(submission.head(10))

# Check if submission file exists
import os
if os.path.exists(filename):
    print(f"✅ Confirmed: {filename} exists in submissions folder!")
    print(f"📁 File size: {os.path.getsize(filename)} bytes")
else:
    print(f"❌ Warning: {filename} not found!")

✅ Submission saved: submissions/20250920_104754_enhanced_lstm_rmse_75.csv
📊 Predictions - Min: 6.0, Max: 355.0

🎯 EXPERIMENT COMPLETE!
📈 Validation RMSE: 75.05
💾 Submission saved: submissions/20250920_104754_enhanced_lstm_rmse_75.csv

📋 Sample predictions:
                row ID  pm2.5
0  2013-07-02 04:00:00     17
1  2013-07-02 05:00:00     16
2  2013-07-02 06:00:00     16
3  2013-07-02 07:00:00     16
4  2013-07-02 08:00:00     16
5  2013-07-02 09:00:00     16
6  2013-07-02 10:00:00     15
7  2013-07-02 11:00:00     15
8  2013-07-02 12:00:00     15
9  2013-07-02 13:00:00     16
✅ Confirmed: submissions/20250920_104754_enhanced_lstm_rmse_75.csv exists in submissions folder!
📁 File size: 305933 bytes


In [128]:
experiment_name = f"enhanced_lstm_rmse_{val_rmse:.0f}"
filename, submission = save_submission(test_predictions, experiment_name, test.index)

# DOWNLOAD THE SUBMISSION FILE
download_submission(filename)

print(f"\n🎯 EXPERIMENT COMPLETE!")
print(f"📈 Validation RMSE: {val_rmse:.2f}")
print(f"💾 Submission ready for Kaggle: {filename}")

✅ Submission saved: submissions/20250920_104754_enhanced_lstm_rmse_75.csv
📊 Predictions - Min: 6.0, Max: 355.0


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✅ Download initiated: submissions/20250920_104754_enhanced_lstm_rmse_75.csv

🎯 EXPERIMENT COMPLETE!
📈 Validation RMSE: 75.05
💾 Submission ready for Kaggle: submissions/20250920_104754_enhanced_lstm_rmse_75.csv


In [130]:
# Prepare the test data - Use the aligned features that match your training
X_test = test_enhanced[common_features]  # Use the common features from alignment

# Scale the test data using the same scaler you used for training
X_test_scaled = scaler.transform(X_test)

# Create sequences for test data (same method as used in training)
test_sequences = []
for i in range(len(X_test_scaled)):
    if i < SEQUENCE_LENGTH:
        needed_from_train = SEQUENCE_LENGTH - (i + 1)
        if needed_from_train > 0:
            sequence = np.vstack([X_train_scaled[-needed_from_train:], X_test_scaled[:i+1]])
        else:
            sequence = X_test_scaled[:SEQUENCE_LENGTH]
    else:
        sequence = X_test_scaled[i-SEQUENCE_LENGTH+1:i+1]

    if sequence.shape[0] != SEQUENCE_LENGTH:
        if sequence.shape[0] < SEQUENCE_LENGTH:
            padding_needed = SEQUENCE_LENGTH - sequence.shape[0]
            padding = np.repeat(sequence[0:1], padding_needed, axis=0)
            sequence = np.vstack([padding, sequence])
        else:
            sequence = sequence[-SEQUENCE_LENGTH:]

    test_sequences.append(sequence)

X_test_seq = np.array(test_sequences)

# Make predictions on the test set using the aligned model
predictions = model_aligned.predict(X_test_seq, verbose=0)

# Ensure predictions do not contain NaN values and are non-negative
predictions = np.nan_to_num(predictions)
predictions = np.maximum(predictions.flatten(), 0)

# Convert predictions to integers
predictions = np.round(predictions).astype(int)

# Prepare the submission file - Use the exact same format as your working submission
# Get the row IDs from your test index (already properly formatted in your working code)
row_ids = test.index.strftime('%Y-%m-%d %H:%M:%S')

submission = pd.DataFrame({
    'row ID': row_ids,  # Use the same formatting that worked before
    'pm2.5': predictions
})

# Save the file in CSV format for submission on Kaggle
submission.to_csv('/content/drive/MyDrive/Kaggle_competition_ML/air_quality_forecasting/subm_fixed.csv', index=False)

print(f"✅ Submission saved with {len(predictions)} predictions")
print(f"📊 Predictions range: {predictions.min()} to {predictions.max()}")
print(f"📋 Sample row IDs: {row_ids[:3].tolist()}")

✅ Submission saved with 13148 predictions
📊 Predictions range: 6 to 355
📋 Sample row IDs: ['2013-07-02 04:00:00', '2013-07-02 05:00:00', '2013-07-02 06:00:00']
