In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split # <--- Used for Validation Split
from sklearn.model_selection import train_test_split, GridSearchCV

# --- 1. Data Loading ---
# Reads the training and testing CSVs from the current working directory into
# df_train and df_test.
try:
    df_train = pd.read_csv("Training_TriGuard.csv")
    df_test = pd.read_csv("Testing_TriGuard.csv")
except FileNotFoundError as err:
    # Raise instead of print + exit(): the failure then shows up as a normal
    # traceback naming the missing file, and later cells cannot run against
    # undefined frames.
    raise FileNotFoundError(
        "Ensure 'Training_TriGuard.csv' and 'Testing_TriGuard.csv' are in the same folder."
    ) from err

In [None]:
# --- 2. Preprocessing and Feature Engineering ---
# Builds X_train, y_train and X_test from df_train / df_test: cleans the target,
# imputes missing values, derives driver_age and one-hot encodes categoricals.

# Drop rows with missing target and cast the target to int
df_train = (
    df_train
    .dropna(subset=['subrogation'])
    .assign(subrogation=lambda frame: frame['subrogation'].astype(int))
)

# Combine datasets for consistent preprocessing. Test rows carry the sentinel
# target -1 so they can be separated again further down.
df_test['subrogation'] = -1
df_combined = pd.concat([df_train, df_test], ignore_index=True)

# Impute remaining NAs.
# NOTE(review): fill values are computed over train + test combined, so test
# rows influence the statistics used for training rows — confirm this is OK.
median_fill_cols = ['annual_income', 'vehicle_price', 'claim_est_payout']
for col in df_combined.columns:
    if not df_combined[col].isna().any():
        continue  # nothing to fill

    if df_combined[col].dtype == 'object':
        modes = df_combined[col].mode()
        if modes.empty:
            continue  # all-NaN column: there is no mode to fill with
        fill_value = modes[0]
    elif df_combined[col].dtype in ['float64', 'int64']:
        if col in median_fill_cols:
            fill_value = df_combined[col].median()
        else:
            fill_value = df_combined[col].mean()
    else:
        continue  # other dtypes are left as they are

    # Assign back instead of chained `df[col].fillna(..., inplace=True)`, which
    # operates on an intermediate object and will stop working in pandas 3.0.
    df_combined[col] = df_combined[col].fillna(fill_value)

# Feature Engineering: Calculate Driver Age at Claim
# (unparseable dates become NaT, which propagates to claim_year / driver_age)
df_combined['claim_date'] = pd.to_datetime(df_combined['claim_date'], errors='coerce')
df_combined['claim_year'] = df_combined['claim_date'].dt.year
df_combined['driver_age'] = df_combined['claim_year'] - df_combined['year_of_born']

# Select features
exclude_cols = ['claim_number', 'claim_date', 'year_of_born', 'claim_year']
numerical_cols = df_combined.select_dtypes(include=['int64', 'float64']).columns.tolist()
numerical_cols = [c for c in numerical_cols if c not in exclude_cols and c != 'subrogation']
categorical_cols = df_combined.select_dtypes(include=['object']).columns.tolist()

# One-Hot Encode categorical features
columns_before_encoding = set(df_combined.columns)
df_combined = pd.get_dummies(df_combined, columns=categorical_cols, drop_first=True)

# Final Feature Set: numeric columns plus exactly the columns get_dummies added.
# (Matching by substring on the categorical names could also pick up unrelated
# or excluded columns whose names merely contain a categorical column name.)
dummy_cols = [col for col in df_combined.columns if col not in columns_before_encoding]
features = numerical_cols + dummy_cols

# Re-separate training and testing data using the -1 sentinel
is_train_row = df_combined['subrogation'] != -1
X_train = df_combined.loc[is_train_row, features]
y_train = df_combined.loc[is_train_row, 'subrogation']
X_test = df_combined.loc[~is_train_row, features]

# Align columns: same columns in the same order as X_train, with any column
# missing from X_test created and filled with 0.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_combined[col].fillna(df_combined[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_combined[col].fillna(df_combined[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate ob

In [None]:
# --- MODIFIED SECTION 3: Validation Split, Training, and F1 Optimization ---
# Splits the training data, grid-searches XGBoost hyperparameters with F1 as the
# CV metric, then tunes the decision threshold on the hold-out validation set.

## 3A. Split training data for internal F1 evaluation
X_train_split, X_validation, y_train_split, y_validation = train_test_split(
    X_train, y_train, test_size=0.3, random_state=42, stratify=y_train
)

# Recalculate imbalance ratio (negatives / positives) using the training split
n_negative = np.sum(y_train_split == 0)
n_positive = np.sum(y_train_split == 1)
if n_positive == 0:
    # Without positives the ratio is undefined and F1 cannot be optimized.
    raise ValueError("Training split contains no positive 'subrogation' samples.")
ratio = n_negative / n_positive

# 3B. Define base XGBoost model (parameters that WON'T be tuned go here)
# `use_label_encoder` is intentionally not passed: XGBoost reports it as an
# unused parameter, and the final model in the next section omits it as well.
# NOTE(review): both this estimator and GridSearchCV below request all cores
# (n_jobs=-1); consider n_jobs=1 here if the machine gets oversubscribed.
xgb_base = xgb.XGBClassifier(
    objective='binary:logistic',
    scale_pos_weight=ratio,   # CRUCIAL: Handles the class imbalance
    random_state=42,
    eval_metric='logloss',
    n_jobs=-1
)

# 3C. Define hyperparameter grid for GridSearchCV (3 * 3 * 3 * 2 * 2 = 108 candidates)
param_grid = {
    'n_estimators': [200, 400, 600],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.03, 0.1],
    # You can comment these in/out depending on runtime
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
}

# 3D. Grid search with cross-validation, using F1 as the scoring metric
grid_search = GridSearchCV(
    estimator=xgb_base,
    param_grid=param_grid,
    scoring='f1',
    cv=3,          # 3-fold CV
    n_jobs=-1,     # use all cores
    verbose=1
)

print("Running Grid Search over XGBoost hyperparameters...")
grid_search.fit(X_train_split, y_train_split)

print("\n--- Grid Search Results ---")
print("Best Parameters:", grid_search.best_params_)
print(f"Best CV F1 Score: {grid_search.best_score_:.4f}")

# Best model after grid search (already fit on X_train_split)
best_xgb_model = grid_search.best_estimator_

# 3E. Use best model to get positive-class probabilities on validation set
y_proba = best_xgb_model.predict_proba(X_validation)[:, 1]

# 3F. Tune threshold to maximize F1 score.
# Scans thresholds from 0.05 up to (but excluding) 0.96 in steps of 0.01 and
# keeps the first one with the highest F1; stays at 0.5 if no threshold scores
# above 0.
# NOTE(review): the threshold is selected on the same validation set the F1 is
# reported on, so the printed maximum is an optimistic estimate.
best_f1 = 0
best_threshold = 0.5

for threshold in np.arange(0.05, 0.96, 0.01):
    y_pred_tuned = (y_proba >= threshold).astype(int)
    current_f1 = f1_score(y_validation, y_pred_tuned)

    if current_f1 > best_f1:
        best_f1 = current_f1
        best_threshold = threshold

print("\n--- F1 Score Results on Hold-out Validation Set ---")
print(f"Maximum F1 Score: {best_f1:.4f}")
print(f"Optimal Prediction Threshold: {best_threshold:.2f}")

Running Grid Search over XGBoost hyperparameters...
Fitting 3 folds for each of 108 candidates, totalling 324 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



--- Grid Search Results ---
Best Parameters: {'colsample_bytree': 1.0, 'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 600, 'subsample': 0.8}
Best CV F1 Score: 0.5872

--- F1 Score Results on Hold-out Validation Set ---
Maximum F1 Score: 0.5785
Optimal Prediction Threshold: 0.57


In [None]:

# --- MODIFIED SECTION 4: Final Training and Submission ---
# Runs after best_threshold has been found and the F1 results printed:
# refits on the full training set, scores the official test set and collects
# the predictions in a DataFrame.

print("\nRetraining model on ALL training data with best hyperparameters...")

# Imbalance ratio (negatives / positives) over the FULL training set
ratio_full = (y_train == 0).sum() / (y_train == 1).sum()

# Settings that were not tuned; the tuned ones come from the grid search
fixed_params = dict(
    objective='binary:logistic',
    scale_pos_weight=ratio_full,
    random_state=42,
    eval_metric='logloss',
    n_jobs=-1,
)

# Fresh model built from fixed settings + best grid-search hyperparameters
final_xgb_model = xgb.XGBClassifier(**fixed_params, **grid_search.best_params_)
final_xgb_model.fit(X_train, y_train)

# Positive-class probabilities on the official test set
test_probabilities = final_xgb_model.predict_proba(X_test)
y_test_proba = test_probabilities[:, 1]

# Apply the threshold tuned earlier to get hard 0/1 predictions
above_threshold = y_test_proba >= best_threshold
y_test_pred = above_threshold.astype(int)

# Collect everything in a DataFrame for export
submission_columns = {
    "claim_number": df_test["claim_number"],
    "subrogation_pred": y_test_pred,
    "subrogation_proba": y_test_proba,
}
submission = pd.DataFrame(submission_columns)

submission.head()



Retraining model on ALL training data with best hyperparameters...


Unnamed: 0,claim_number,subrogation_pred,subrogation_proba
0,3126034,0,0.414453
1,7380142,1,0.619976
2,4655051,0,0.059248
3,6728725,1,0.733008
4,9848460,1,0.619321


In [None]:
# Export the submission: the probability column is dropped when present
# (errors='ignore' makes the drop a no-op if it is absent), then the frame is
# written to CSV without the index.
submission_file = "submission.csv"
submission = submission.drop(columns=['subrogation_proba'], errors='ignore')
submission.to_csv(submission_file, index=False)

print(f"\nSuccessfully created submission file: {submission_file}")


Successfully created submission file: submission.csv
