<a href="https://colab.research.google.com/github/kavinraam/Rail-Index-Prediction-Model/blob/main/Model_z.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# XGBoost regression pipeline tuned with Optuna; this code gave an R² of about 0.79.
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from xgboost import XGBRegressor
import optuna

# Read the two training CSVs (VRI and LRI) from the mounted Drive folder
vri_df = pd.read_csv(r"/content/drive/MyDrive/Intern-CRIS/Dataset/training_data_vri.csv")
lri_df = pd.read_csv(r"/content/drive/MyDrive/Intern-CRIS/Dataset/training_data_lri.csv")

# Stack both frames on a fresh index, then discard every row that has a
# missing value in any column
df = pd.concat([vri_df, lri_df], ignore_index=True)
df = df.dropna()

# Keep only the rows whose RI2 is strictly greater than RI1.
# NOTE(review): this filter uses the target column (RI2), so the metrics
# reported later describe only rows that satisfy it.
df = df[df['RI2'] > df['RI1']]

# Parse both date columns into datetimes
for date_col in ('DATE1', 'DATE2'):
    df[date_col] = pd.to_datetime(df[date_col])

# Whole days elapsed between the two dates
df['TIME_DIFF'] = (df['DATE2'] - df['DATE1']).dt.days

# Year / month / day components of each date (DATE1_YEAR ... DATE2_DAY)
for date_col in ('DATE1', 'DATE2'):
    for part in ('year', 'month', 'day'):
        df[f'{date_col}_{part.upper()}'] = getattr(df[date_col].dt, part)

# Pairwise interaction features: new column -> (left factor, right factor)
interaction_pairs = {
    'DATE2_YEAR_RI1_INTERACTION': ('DATE2_YEAR', 'RI1'),
    'TIME_DIFF_RI1_INTERACTION': ('TIME_DIFF', 'RI1'),
    'DATE2_YEAR_TIME_DIFF_INTERACTION': ('DATE2_YEAR', 'TIME_DIFF'),
    'DATE2_MONTH_RI1_INTERACTION': ('DATE2_MONTH', 'RI1'),
}
for new_col, (left, right) in interaction_pairs.items():
    df[new_col] = df[left] * df[right]

# Squared terms for the main numeric inputs
for base_col in ('RI1', 'GMT', 'TIME_DIFF'):
    df[f'{base_col}_squared'] = df[base_col] ** 2


# Model inputs (in this exact column order) and the regression target
feature_columns = [
    "LINECODE", "SECCODE", "BLOCKNO", "KMFROM", "PARAM", "RI1", "GMT",
    "TIME_DIFF",
    "DATE1_YEAR", "DATE1_MONTH", "DATE1_DAY",
    "DATE2_YEAR", "DATE2_MONTH", "DATE2_DAY",
    "DATE2_YEAR_RI1_INTERACTION",
    "TIME_DIFF_RI1_INTERACTION",
    "DATE2_YEAR_TIME_DIFF_INTERACTION",
    "DATE2_MONTH_RI1_INTERACTION",
    "RI1_squared", "GMT_squared", "TIME_DIFF_squared",
]
X = df[feature_columns]
y = df["RI2"]

# One-hot encode the non-numeric columns, dropping the first level of each
X = pd.get_dummies(X, drop_first=True)

# Hold out 30% of the rows as a test set (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

def objective(trial):
    """Optuna objective: score one sampled XGBoost configuration.

    Ten hyperparameters are drawn from ``trial``; ``random_state`` and
    ``n_jobs`` are held fixed. The resulting ``XGBRegressor`` is evaluated
    with 5-fold cross-validation on the module-level ``X_train`` / ``y_train``.

    Args:
        trial: The Optuna trial used to sample hyperparameter values.

    Returns:
        The mean R² across the five cross-validation folds.
    """
    # Sampled part of the configuration (suggest-call order is kept stable)
    sampled = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1500),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.5, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.5, 1.0),
    }
    # Settings that are the same for every trial
    fixed = {'random_state': 42, 'n_jobs': -1}

    regressor = XGBRegressor(**sampled, **fixed)

    # R² on each of the 5 folds of the training split
    fold_r2 = cross_val_score(
        regressor, X_train, y_train, cv=5, scoring='r2', n_jobs=-1
    )
    return fold_r2.mean()

# Create an Optuna study and optimize.
# The sampler is seeded so that repeated runs explore the same sequence of
# trials, in line with the fixed random_state=42 used for the data split and
# the model. TPESampler is Optuna's default sampler, so only the seed differs
# from an unseeded study.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=20)

print("\n✅ Optuna optimization finished.")
print("Best hyperparameters:", study.best_params)
print(f"Best R² score (cross-validated): {study.best_value:.4f}")

# Train the final XGBoost model on the full training split with the best
# hyperparameters found by the study
best_xgb_model_optuna = XGBRegressor(**study.best_params, random_state=42, n_jobs=-1)
best_xgb_model_optuna.fit(X_train, y_train)

# Evaluate the model on the held-out test set
xgb_optuna_preds = best_xgb_model_optuna.predict(X_test)

# MSE is computed once and reused for RMSE
test_mse = mean_squared_error(y_test, xgb_optuna_preds)

print("\n✅ Optimized XGBoost Results (Optuna) on Test Set:")
print(f"R² Score: {r2_score(y_test, xgb_optuna_preds):.4f}")
print(f"MAE     : {mean_absolute_error(y_test, xgb_optuna_preds):.4f}")
print(f"MSE     : {test_mse:.4f}")
print(f"RMSE    : {np.sqrt(test_mse):.4f}")

[I 2025-07-02 06:06:37,951] A new study created in memory with name: no-name-dfd58d38-58b1-4e47-861d-d63608e6f805
[I 2025-07-02 06:07:08,467] Trial 0 finished with value: 0.7372814876112972 and parameters: {'n_estimators': 474, 'max_depth': 20, 'learning_rate': 0.08724375979615565, 'subsample': 0.8561670495145319, 'colsample_bytree': 0.5544350435585677, 'gamma': 0.4941613681153929, 'reg_alpha': 0.906372481421203, 'reg_lambda': 0.5147125617680705, 'min_child_weight': 5, 'colsample_bylevel': 0.9552588212652988}. Best is trial 0 with value: 0.7372814876112972.
[I 2025-07-02 06:07:54,911] Trial 1 finished with value: 0.6380636030234095 and parameters: {'n_estimators': 126, 'max_depth': 12, 'learning_rate': 0.00840409222052695, 'subsample': 0.5679977516394104, 'colsample_bytree': 0.9790987771415862, 'gamma': 0.08293288030278712, 'reg_alpha': 0.47577233203637403, 'reg_lambda': 0.5114755511946248, 'min_child_weight': 10, 'colsample_bylevel': 0.7029093047985111}. Best is trial 0 with value: 0.


✅ Optuna optimization finished.
Best hyperparameters: {'n_estimators': 940, 'max_depth': 13, 'learning_rate': 0.03434789406524535, 'subsample': 0.7949641584860221, 'colsample_bytree': 0.7048270636408362, 'gamma': 0.10895672932204054, 'reg_alpha': 0.16732723475916206, 'reg_lambda': 0.759304402176893, 'min_child_weight': 7, 'colsample_bylevel': 0.8674512681171598}
Best R² score (cross-validated): 0.7694

✅ Optimized XGBoost Results (Optuna) on Test Set:
R² Score: 0.7762
MAE     : 0.0667
MSE     : 0.0091
RMSE    : 0.0951
