In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_squared_error
import warnings
from datetime import datetime, timedelta

warnings.filterwarnings('ignore')
tf.random.set_seed(42)
np.random.seed(42)

# --- 1. Load Data ---
try:
    df_train = pd.read_csv('training_data/train/train.csv')
    df_transactions = pd.read_csv('training_data/train/transactions.csv')
    df_test = pd.read_csv('testing data/test_8gqdJqH.csv')
    print("All data files loaded successfully.")
except FileNotFoundError as e:
    print(f"Error loading data files: {e}")
    print("Please ensure your folder structure and file names are correct.")
    exit()

# --- 2. Initial Data Cleaning & Type Conversion ---
print("\n--- Starting Data Cleaning and Preprocessing ---")

# Convert date columns to datetime objects
for df in [df_train, df_transactions, df_test]:
    df['doj'] = pd.to_datetime(df['doj'])
if 'doi' in df_transactions.columns:
    df_transactions['doi'] = pd.to_datetime(df_transactions['doi'])

print("Date columns converted to datetime objects.")

# --- 3. Advanced Time Series Feature Engineering ---
def create_time_series_features(df):
    """
    Creates comprehensive time-series features optimized for LSTM models.
    """
    # Basic time features
    df['month'] = df['doj'].dt.month
    df['year'] = df['doj'].dt.year
    df['day_of_week'] = df['doj'].dt.dayofweek
    df['day_of_year'] = df['doj'].dt.dayofyear
    df['week_of_year'] = df['doj'].dt.isocalendar().week.astype(int)
    df['quarter'] = df['doj'].dt.quarter
    df['is_weekend'] = (df['doj'].dt.dayofweek >= 5).astype(int)
    df['is_monday'] = (df['doj'].dt.dayofweek == 0).astype(int)
    df['is_friday'] = (df['doj'].dt.dayofweek == 4).astype(int)
    
    # Cyclical encoding for better time representation
    df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
    df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
    df['day_of_week_sin'] = np.sin(2 * np.pi * df['day_of_week'] / 7)
    df['day_of_week_cos'] = np.cos(2 * np.pi * df['day_of_week'] / 7)
    df['day_of_year_sin'] = np.sin(2 * np.pi * df['day_of_year'] / 365)
    df['day_of_year_cos'] = np.cos(2 * np.pi * df['day_of_year'] / 365)
    
    # Holiday indicators (approximate)
    df['is_holiday_season'] = ((df['month'] == 12) | (df['month'] == 1)).astype(int)
    df['is_summer'] = ((df['month'] >= 5) & (df['month'] <= 7)).astype(int)
    df['is_winter'] = ((df['month'] >= 11) | (df['month'] <= 2)).astype(int)
    
    # Create unique route identifier
    df['route'] = df['srcid'].astype(str) + '_' + df['destid'].astype(str)
    
    return df

def create_lag_features(df, target_col, lags=[1, 2, 3, 7, 14, 30]):
    """
    Creates lag features for time series analysis.
    """
    df = df.sort_values(['route', 'doj']).reset_index(drop=True)
    
    for lag in lags:
        df[f'{target_col}_lag_{lag}'] = df.groupby('route')[target_col].shift(lag)
    
    return df

def create_rolling_features(df, target_col, windows=[3, 7, 14, 30]):
    """
    Creates rolling statistics features.
    """
    df = df.sort_values(['route', 'doj']).reset_index(drop=True)
    
    for window in windows:
        # Rolling mean
        df[f'{target_col}_rolling_mean_{window}'] = (
            df.groupby('route')[target_col].rolling(window=window, min_periods=1).mean().reset_index(0, drop=True)
        )
        # Rolling std
        df[f'{target_col}_rolling_std_{window}'] = (
            df.groupby('route')[target_col].rolling(window=window, min_periods=1).std().reset_index(0, drop=True)
        )
        # Rolling max
        df[f'{target_col}_rolling_max_{window}'] = (
            df.groupby('route')[target_col].rolling(window=window, min_periods=1).max().reset_index(0, drop=True)
        )
        # Rolling min
        df[f'{target_col}_rolling_min_{window}'] = (
            df.groupby('route')[target_col].rolling(window=window, min_periods=1).min().reset_index(0, drop=True)
        )
    
    return df

print("\n--- Engineering Advanced Time Series Features ---")
# Apply feature engineering
df_transactions = create_time_series_features(df_transactions)
df_test = create_time_series_features(df_test)

# Add lag and rolling features to transactions
df_transactions = create_lag_features(df_transactions, 'cumsum_seatcount')
df_transactions = create_lag_features(df_transactions, 'cumsum_searchcount')
df_transactions = create_rolling_features(df_transactions, 'cumsum_seatcount')
df_transactions = create_rolling_features(df_transactions, 'cumsum_searchcount')

print("Advanced time series feature engineering complete.")

# --- 4. Prepare Time Series Data ---
print("\n--- Preparing Time Series Data ---")

# Filter transactions for dbd = 15
dbd_filter = 15
df_transactions_filtered = df_transactions[df_transactions['dbd'] == dbd_filter].copy()

print(f"Filtered transactions for dbd = {dbd_filter}. Shape: {df_transactions_filtered.shape}")

# Create training dataset
df_model_train = pd.merge(
    df_train,
    df_transactions_filtered,
    on=['doj', 'srcid', 'destid'],
    how='inner'
)

print(f"Training data shape after merge: {df_model_train.shape}")

# Fill missing values for lag and rolling features
lag_roll_cols = [col for col in df_model_train.columns if ('lag_' in col or 'rolling_' in col)]
df_model_train[lag_roll_cols] = df_model_train[lag_roll_cols].fillna(0)

# --- 5. Encode Categorical Variables ---
print("\n--- Encoding Categorical Variables ---")

# Label encode categorical variables
categorical_cols = ['srcid', 'destid', 'srcid_region', 'destid_region', 'srcid_tier', 'destid_tier', 'route']
label_encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    df_model_train[f'{col}_encoded'] = le.fit_transform(df_model_train[col].astype(str))
    label_encoders[col] = le

# --- 6. Create Sequences for LSTM ---
def create_sequences(data, target_col, sequence_length=30):
    """
    Creates sequences for LSTM training.
    """
    sequences = []
    targets = []
    
    # Sort by route and date
    data_sorted = data.sort_values(['route', 'doj']).reset_index(drop=True)
    
    # Group by route to create sequences
    for route in data_sorted['route'].unique():
        route_data = data_sorted[data_sorted['route'] == route].copy()
        
        if len(route_data) >= sequence_length:
            for i in range(len(route_data) - sequence_length + 1):
                sequence = route_data.iloc[i:i+sequence_length]
                target = route_data.iloc[i+sequence_length-1][target_col]
                
                # Select features for sequence
                feature_cols = [
                    'cumsum_seatcount', 'cumsum_searchcount',
                    'month_sin', 'month_cos', 'day_of_week_sin', 'day_of_week_cos',
                    'day_of_year_sin', 'day_of_year_cos', 'quarter',
                    'is_weekend', 'is_monday', 'is_friday', 'is_holiday_season',
                    'is_summer', 'is_winter',
                    'srcid_encoded', 'destid_encoded'
                ] + lag_roll_cols
                
                # Only include columns that exist in the data
                available_cols = [col for col in feature_cols if col in sequence.columns]
                sequence_features = sequence[available_cols].values
                
                sequences.append(sequence_features)
                targets.append(target)
    
    return np.array(sequences), np.array(targets)

print("\n--- Creating LSTM Sequences ---")
sequence_length = 10  # Reduced sequence length for better training
X_sequences, y_sequences = create_sequences(df_model_train, 'final_seatcount', sequence_length)

print(f"Created {len(X_sequences)} sequences with shape {X_sequences.shape}")

# --- 7. Scale Features ---
print("\n--- Scaling Features ---")

# Reshape for scaling
n_samples, n_timesteps, n_features = X_sequences.shape
X_reshaped = X_sequences.reshape(-1, n_features)

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_reshaped)
X_scaled = X_scaled.reshape(n_samples, n_timesteps, n_features)

# Scale target
target_scaler = StandardScaler()
y_scaled = target_scaler.fit_transform(y_sequences.reshape(-1, 1)).ravel()

print(f"Feature scaling complete. Final X shape: {X_scaled.shape}")

# --- 8. Split Data ---
print("\n--- Splitting Data for Training ---")

# Use time-based split (80-20)
split_idx = int(0.8 * len(X_scaled))
X_train, X_val = X_scaled[:split_idx], X_scaled[split_idx:]
y_train, y_val = y_scaled[:split_idx], y_scaled[split_idx:]

print(f"Training set: {X_train.shape}, Validation set: {X_val.shape}")

# --- 9. Build LSTM Model ---
print("\n--- Building LSTM Model ---")

def create_lstm_model(input_shape):
    """
    Creates an optimized LSTM model for time series forecasting.
    """
    model = Sequential([
        LSTM(128, return_sequences=True, input_shape=input_shape),
        Dropout(0.2),
        BatchNormalization(),
        
        LSTM(64, return_sequences=True),
        Dropout(0.2),
        BatchNormalization(),
        
        LSTM(32, return_sequences=False),
        Dropout(0.2),
        BatchNormalization(),
        
        Dense(64, activation='relu'),
        Dropout(0.3),
        Dense(32, activation='relu'),
        Dropout(0.2),
        Dense(1)
    ])
    
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        loss='mse',
        metrics=['mae']
    )
    
    return model

model = create_lstm_model((sequence_length, n_features))
print(model.summary())

# --- 10. Train Model ---
print("\n--- Training LSTM Model ---")

# Callbacks
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=15,
    restore_best_weights=True,
    verbose=1
)

reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=8,
    min_lr=1e-6,
    verbose=1
)

# Train the model
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=100,
    batch_size=32,
    callbacks=[early_stopping, reduce_lr],
    verbose=1
)

# --- 11. Evaluate Model ---
print("\n--- Evaluating Model ---")

# Make predictions
train_pred_scaled = model.predict(X_train)
val_pred_scaled = model.predict(X_val)

# Inverse transform predictions
train_pred = target_scaler.inverse_transform(train_pred_scaled)
val_pred = target_scaler.inverse_transform(val_pred_scaled)
y_train_orig = target_scaler.inverse_transform(y_train.reshape(-1, 1))
y_val_orig = target_scaler.inverse_transform(y_val.reshape(-1, 1))

# Calculate RMSE
train_rmse = np.sqrt(mean_squared_error(y_train_orig, train_pred))
val_rmse = np.sqrt(mean_squared_error(y_val_orig, val_pred))

print(f"Training RMSE: {train_rmse:.4f}")
print(f"Validation RMSE: {val_rmse:.4f}")

# --- 12. Prepare Test Data and Make Predictions ---
print("\n--- Preparing Test Data for Prediction ---")

# For test predictions, we need to create sequences similar to training
# First, encode test categorical variables
for col in categorical_cols:
    if col in df_test.columns:
        # Handle unseen categories
        df_test[f'{col}_encoded'] = df_test[col].astype(str).map(
            dict(zip(label_encoders[col].classes_, range(len(label_encoders[col].classes_))))
        ).fillna(0).astype(int)

# Create test dataset
df_model_test = pd.merge(
    df_test,
    df_transactions_filtered,
    on=['doj', 'srcid', 'destid', 'route'],
    how='left'
)

# Fill missing values
df_model_test = df_model_test.fillna(0)

# For test predictions, we'll use the last sequence from training data for each route
# and predict the next value
print("\n--- Generating Test Predictions ---")

test_predictions = []
route_keys = []

for _, test_row in df_model_test.iterrows():
    route = test_row['route']
    route_key = test_row['route_key']
    
    # Find historical data for this route
    route_history = df_model_train[df_model_train['route'] == route].copy()
    
    if len(route_history) >= sequence_length:
        # Use the last sequence_length records
        route_history = route_history.sort_values('doj').tail(sequence_length)
        
        # Prepare features
        feature_cols = [
            'cumsum_seatcount', 'cumsum_searchcount',
            'month_sin', 'month_cos', 'day_of_week_sin', 'day_of_week_cos',
            'day_of_year_sin', 'day_of_year_cos', 'quarter',
            'is_weekend', 'is_monday', 'is_friday', 'is_holiday_season',
            'is_summer', 'is_winter',
            'srcid_encoded', 'destid_encoded'
        ] + lag_roll_cols
        
        available_cols = [col for col in feature_cols if col in route_history.columns]
        sequence = route_history[available_cols].values
        
        # Reshape and scale
        sequence = sequence.reshape(1, sequence_length, len(available_cols))
        sequence_scaled = scaler.transform(sequence.reshape(-1, len(available_cols)))
        sequence_scaled = sequence_scaled.reshape(1, sequence_length, len(available_cols))
        
        # Predict
        pred_scaled = model.predict(sequence_scaled, verbose=0)
        pred = target_scaler.inverse_transform(pred_scaled)[0, 0]
        
        # Ensure non-negative
        pred = max(0, pred)
        test_predictions.append(int(round(pred)))
    else:
        # If no sufficient history, use mean of available data or 0
        if len(route_history) > 0:
            pred = route_history['final_seatcount'].mean()
        else:
            pred = 0
        test_predictions.append(int(round(pred)))
    
    route_keys.append(route_key)

# --- 13. Create Submission File ---
print("\n--- Creating Submission File ---")

submission_df = pd.DataFrame({
    'route_key': route_keys,
    'final_seatcount': test_predictions
})

submission_df.to_csv('lstm_submission_file.csv', index=False)

print("LSTM submission file 'lstm_submission_file.csv' created successfully.")
print(f"\nFinal Validation RMSE: {val_rmse:.4f}")
print("\nTop 5 rows of the submission file:")
print(submission_df.head())

print(f"\nPrediction statistics:")
print(f"Mean prediction: {np.mean(test_predictions):.2f}")
print(f"Std prediction: {np.std(test_predictions):.2f}")
print(f"Min prediction: {np.min(test_predictions)}")
print(f"Max prediction: {np.max(test_predictions)}")



All data files loaded successfully.

--- Starting Data Cleaning and Preprocessing ---
Date columns converted to datetime objects.

--- Engineering Advanced Time Series Features ---
Advanced time series feature engineering complete.

--- Preparing Time Series Data ---
Filtered transactions for dbd = 15. Shape: (73100, 74)
Training data shape after merge: (67200, 75)

--- Encoding Categorical Variables ---

--- Creating LSTM Sequences ---
Created 66300 sequences with shape (66300, 10, 61)

--- Scaling Features ---
Feature scaling complete. Final X shape: (66300, 10, 61)

--- Splitting Data for Training ---
Training set: (53040, 10, 61), Validation set: (13260, 10, 61)

--- Building LSTM Model ---


None

--- Training LSTM Model ---
Epoch 1/100
[1m1658/1658[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 6ms/step - loss: 0.5558 - mae: 0.5252 - val_loss: 0.2956 - val_mae: 0.3601 - learning_rate: 0.0010
Epoch 2/100
[1m1658/1658[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 5ms/step - loss: 0.2266 - mae: 0.3354 - val_loss: 0.2039 - val_mae: 0.2951 - learning_rate: 0.0010
Epoch 3/100
[1m1658/1658[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 5ms/step - loss: 0.1910 - mae: 0.3054 - val_loss: 0.1460 - val_mae: 0.2729 - learning_rate: 0.0010
Epoch 4/100
[1m1658/1658[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 5ms/step - loss: 0.1817 - mae: 0.2943 - val_loss: 0.1757 - val_mae: 0.2914 - learning_rate: 0.0010
Epoch 5/100
[1m1658/1658[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 5ms/step - loss: 0.1667 - mae: 0.2832 - val_loss: 0.1764 - val_mae: 0.2872 - learning_rate: 0.0010
Epoch 6/100
[1m1658/1658[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 