<a href="https://colab.research.google.com/github/kavinraam/Rail-Index-Prediction-Model/blob/main/Rail_Index_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%pip install optuna --quiet

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/395.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m389.1/395.9 kB[0m [31m16.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m395.9/395.9 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/246.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m246.9/246.9 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from xgboost import XGBRegressor
import lightgbm as lgb

# Load datasets
# The Drive folder is kept in a single constant so the location can be changed
# in one place when the notebook is run outside this Colab/Drive setup.
DATA_DIR = "/content/drive/MyDrive/Intern-CRIS/Dataset"
vri_df = pd.read_csv(f"{DATA_DIR}/training_data_vri.csv")
lri_df = pd.read_csv(f"{DATA_DIR}/training_data_lri.csv")

# Concatenate and clean data: stack both files, drop rows with any missing value
df = pd.concat([vri_df, lri_df], ignore_index=True).dropna()
# Keep only rows where RI2 > RI1. The explicit .copy() makes the filtered frame
# an independent object, so the column assignments below cannot trigger
# SettingWithCopyWarning or depend on pandas' view-vs-copy behaviour.
df = df[df['RI2'] > df['RI1']].copy()

# Convert dates
df['DATE1'] = pd.to_datetime(df['DATE1'])
df['DATE2'] = pd.to_datetime(df['DATE2'])

# Time difference between the two dates, in whole days
df['TIME_DIFF'] = (df['DATE2'] - df['DATE1']).dt.days

# Extract date features (year / month / day-of-month for each date column)
for date_col in ('DATE1', 'DATE2'):
    df[f'{date_col}_YEAR'] = df[date_col].dt.year
    df[f'{date_col}_MONTH'] = df[date_col].dt.month
    df[f'{date_col}_DAY'] = df[date_col].dt.day

# Interaction features
df['DATE2_YEAR_RI1_INTERACTION'] = df['DATE2_YEAR'] * df['RI1']
df['TIME_DIFF_RI1_INTERACTION'] = df['TIME_DIFF'] * df['RI1']
df['DATE2_YEAR_TIME_DIFF_INTERACTION'] = df['DATE2_YEAR'] * df['TIME_DIFF']
df['DATE2_MONTH_RI1_INTERACTION'] = df['DATE2_MONTH'] * df['RI1']

# Polynomial features
df['RI1_squared'] = df['RI1'] ** 2
df['GMT_squared'] = df['GMT'] ** 2
df['TIME_DIFF_squared'] = df['TIME_DIFF'] ** 2

# Step 1: Calculate moving average of 'RI1' and the difference
# The window covers up to 5 consecutive rows within each (LINECODE, SECCODE)
# group; min_periods=1 means the first rows of a group average fewer values.
# NOTE(review): rows are not sorted before the rolling window is applied, so the
# average follows the order in which rows appear in the concatenated files.
# Confirm that order is the intended one (e.g. chronological).
df['RI1_rolling_avg'] = df.groupby(['LINECODE', 'SECCODE'])['RI1'].transform(lambda x: x.rolling(window=5, min_periods=1).mean())
df['RI1_diff_from_avg'] = df['RI1'] - df['RI1_rolling_avg']

# Step 2: Create interaction features between 'GMT' and date-related features
df['GMT_DATE1_MONTH_INTERACTION'] = df['GMT'] * df['DATE1_MONTH']
df['GMT_DATE2_MONTH_INTERACTION'] = df['GMT'] * df['DATE2_MONTH']
df['GMT_DATE1_YEAR_INTERACTION'] = df['GMT'] * df['DATE1_YEAR']
df['GMT_DATE2_YEAR_INTERACTION'] = df['GMT'] * df['DATE2_YEAR']

# Step 3: Create a cubed polynomial feature for 'RI1'
df['RI1_cubed'] = df['RI1'] ** 3


# Model inputs: the source columns plus every engineered feature used for training
feature_columns = [
    "LINECODE", "SECCODE", "BLOCKNO", "KMFROM", "PARAM", "RI1", "GMT",
    "TIME_DIFF",
    "DATE1_YEAR", "DATE1_MONTH", "DATE1_DAY",
    "DATE2_YEAR", "DATE2_MONTH", "DATE2_DAY",
    "DATE2_YEAR_RI1_INTERACTION",
    "TIME_DIFF_RI1_INTERACTION",
    "DATE2_YEAR_TIME_DIFF_INTERACTION",
    "DATE2_MONTH_RI1_INTERACTION",
    "RI1_squared", "GMT_squared", "TIME_DIFF_squared",
    "RI1_diff_from_avg",
    "GMT_DATE1_MONTH_INTERACTION", "GMT_DATE2_MONTH_INTERACTION",
    "GMT_DATE1_YEAR_INTERACTION", "GMT_DATE2_YEAR_INTERACTION",
    "RI1_cubed",
]

# Regression target
y = df["RI2"]

# Select the inputs and one-hot encode whatever non-numeric columns they contain;
# drop_first=True drops the first level of each encoded column.
X = pd.get_dummies(df[feature_columns], drop_first=True)

# Hold out 30% of the rows for evaluation; the seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# XGBoost hyperparameters carried over from the earlier Optuna search
best_params = dict(
    n_estimators=1066,
    learning_rate=0.033844909001396695,
    max_depth=11,
    subsample=0.9833761615307697,
    colsample_bytree=0.8737870597207275,
    gamma=0.001008631417380501,
    reg_alpha=0.25412571901286984,
    reg_lambda=0.7077459432486992,
    min_child_weight=5,
    colsample_bylevel=0.8881074179470817,
    random_state=42,
    n_jobs=-1,
    verbosity=0,
)

# Fit the final XGBoost regressor on the training split
final_xgb_model = XGBRegressor(**best_params)
final_xgb_model.fit(X_train, y_train)

# Held-out predictions; xgb_preds is reused by the ensemble/stacking cells
xgb_preds = final_xgb_model.predict(X_test)

# Compute MSE once; RMSE is its square root
xgb_mse = mean_squared_error(y_test, xgb_preds)
xgb_metrics = (
    ("R² Score", r2_score(y_test, xgb_preds)),
    ("MAE", mean_absolute_error(y_test, xgb_preds)),
    ("MSE", xgb_mse),
    ("RMSE", np.sqrt(xgb_mse)),
)

print("\n✅ Final XGBoost Results with Initial Optuna Parameters and New Features:")
for metric_name, metric_value in xgb_metrics:
    # Labels are left-aligned to 8 characters so the colons line up
    print(f"{metric_name:<8} : {metric_value:.4f}")

# LightGBM hyperparameters carried over from the earlier Optuna search
best_lgbm_params = dict(
    n_estimators=1647,
    learning_rate=0.046186029765235975,
    num_leaves=239,
    max_depth=14,
    min_child_samples=22,
    subsample=0.8047364810078328,
    colsample_bytree=0.6571494020123966,
    reg_alpha=0.9907226421605784,
    reg_lambda=0.39465333866606117,
    objective='regression',
    random_state=42,
    n_jobs=-1,
    verbosity=-1,
)

# Fit the final LightGBM regressor on the training split
final_lgbm_model = lgb.LGBMRegressor(**best_lgbm_params)
final_lgbm_model.fit(X_train, y_train)

# Held-out predictions; lgbm_preds is reused by the ensemble/stacking cells
lgbm_preds = final_lgbm_model.predict(X_test)

# Compute MSE once; RMSE is its square root
lgbm_mse = mean_squared_error(y_test, lgbm_preds)
lgbm_metrics = (
    ("R² Score", r2_score(y_test, lgbm_preds)),
    ("MAE", mean_absolute_error(y_test, lgbm_preds)),
    ("MSE", lgbm_mse),
    ("RMSE", np.sqrt(lgbm_mse)),
)

print("\n✅ Final LightGBM Results with Initial Optuna Parameters and New Features:")
for metric_name, metric_value in lgbm_metrics:
    # Labels are left-aligned to 8 characters so the colons line up
    print(f"{metric_name:<8} : {metric_value:.4f}")


✅ Final XGBoost Results with Initial Optuna Parameters and New Features:
R² Score : 0.8074
MAE      : 0.0604
MSE      : 0.0078
RMSE     : 0.0882

✅ Final LightGBM Results with Initial Optuna Parameters and New Features:
R² Score : 0.8080
MAE      : 0.0605
MSE      : 0.0078
RMSE     : 0.0881


In [None]:
# Blend weights for the two base models. They are named constants so the
# printed label below is always derived from the weights actually applied
# (the label previously claimed 50/50 while the blend used 0.65/0.35).
LGBM_WEIGHT = 0.65
XGB_WEIGHT = 0.35

# Weighted average of the held-out predictions of both base models
ensemble_preds_weighted = LGBM_WEIGHT * lgbm_preds + XGB_WEIGHT * xgb_preds

# Evaluate
print(f"✅ Weighted Ensemble ({LGBM_WEIGHT:.0%} LGBM + {XGB_WEIGHT:.0%} XGB):")
print(f"R² Score : {r2_score(y_test, ensemble_preds_weighted):.4f}")
print(f"MAE      : {mean_absolute_error(y_test, ensemble_preds_weighted):.4f}")
print(f"MSE      : {mean_squared_error(y_test, ensemble_preds_weighted):.4f}")
print(f"RMSE     : {np.sqrt(mean_squared_error(y_test, ensemble_preds_weighted)):.4f}")

✅ Weighted Ensemble (50% LGBM + 50% XGB):
R² Score : 0.8097
MAE      : 0.0602
MSE      : 0.0077
RMSE     : 0.0877


In [None]:
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import KFold, cross_val_predict

# Meta-features: one column per base model, one row per held-out sample
meta_X = pd.DataFrame({
    "xgb": xgb_preds,
    "lgbm": lgbm_preds
})

# Fitting the meta-model on (meta_X, y_test) and then scoring it on those same
# rows measures training error, not generalisation. Out-of-fold predictions
# avoid that: every row is predicted by a meta-model fitted on the other folds,
# so it never sees that row's target.
meta_cv = KFold(n_splits=5, shuffle=True, random_state=42)
stacked_preds = cross_val_predict(RidgeCV(), meta_X, y_test, cv=meta_cv)

# Meta-model refitted on all meta-features, kept for later use. Do not score it
# on meta_X / y_test, since those are the rows it was fitted on.
meta_model = RidgeCV()
meta_model.fit(meta_X, y_test)

print("\n✅ Stacking Results (Meta-model: Ridge, out-of-fold):")
print(f"R² Score : {r2_score(y_test, stacked_preds):.4f}")
print(f"MAE      : {mean_absolute_error(y_test, stacked_preds):.4f}")
print(f"MSE      : {mean_squared_error(y_test, stacked_preds):.4f}")
print(f"RMSE     : {np.sqrt(mean_squared_error(y_test, stacked_preds)):.4f}")


✅ Stacking Results (Meta-model: Ridge):
R² Score : 0.8098
MAE      : 0.0601
MSE      : 0.0077
RMSE     : 0.0877


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, cross_val_predict

# Meta-features: one column per base model, one row per held-out sample
meta_X_rf = pd.DataFrame({
    "xgb": xgb_preds,
    "lgbm": lgbm_preds
})

# A random forest fitted on (meta_X_rf, y_test) and scored on the same rows
# largely reproduces the targets it was trained on, which inflates every metric.
# Out-of-fold predictions give each row a prediction from a forest that was
# fitted without that row, so the metrics below reflect unseen data.
rf_meta_cv = KFold(n_splits=5, shuffle=True, random_state=42)
stacked_preds_rf = cross_val_predict(
    RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1),
    meta_X_rf,
    y_test,
    cv=rf_meta_cv,
)

# Meta-model refitted on all meta-features, kept for later use. Do not score it
# on meta_X_rf / y_test, since those are the rows it was fitted on.
rf_meta_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
rf_meta_model.fit(meta_X_rf, y_test)

print("\nStacking Results (Meta-model: Random Forest Regressor, out-of-fold):")
print(f"R² Score : {r2_score(y_test, stacked_preds_rf):.4f}")
print(f"MAE      : {mean_absolute_error(y_test, stacked_preds_rf):.4f}")
print(f"MSE      : {mean_squared_error(y_test, stacked_preds_rf):.4f}")
print(f"RMSE     : {np.sqrt(mean_squared_error(y_test, stacked_preds_rf)):.4f}")


Stacking Results (Meta-model: Random Forest Regressor):
R² Score : 0.9689
MAE      : 0.0245
MSE      : 0.0013
RMSE     : 0.0355


In [None]:
# Pair each held-out target with the random-forest stacked prediction.
# The target is taken as a plain array, so the original row index is not
# carried into the table and rows are matched by position.
prediction_columns = {
    "Actual_RI2": y_test.to_numpy(),
    "Predicted_RI2": stacked_preds_rf,
}
results_df = pd.DataFrame(prediction_columns)

# Write the same table twice: once as CSV, once as an Excel workbook
results_df.to_csv(path_or_buf="RI2_Predictions.csv", index=False)
results_df.to_excel(excel_writer="RI2_Predictions.xlsx", index=False)

print("Prediction file saved successfully!")


Prediction file saved successfully!
