In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import joblib
import time
import logging

from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

In [6]:
# Read customer_data_cleaned.csv into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='customer_data_cleaned.csv')
df.head(n=5)

Unnamed: 0,credit_score,age,balance,estimated_salary,country,gender,tenure
0,619.0,42,93101.008,11348.88,3,1,2
1,68.0,41,8387.86,112542.58,2,1,1
2,52.0,42,15966.8,113931.57,3,1,8
3,699.0,39,38038.282,93826.63,3,1,1
4,85.0,42,12551.82,7984.1,2,1,2


In [7]:


# Send INFO-and-above records to a text file as "timestamp:LEVEL:message" lines.
# Note: logging.basicConfig does nothing if the root logger already has handlers.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s:%(levelname)s:%(message)s',
    filename='model_training_log.txt',
)

# Regression target is the credit_score column; every other column is a predictor.
y = df['credit_score']
X = df.drop('credit_score', axis=1)

# Hold out 20% of the rows for final evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# Baseline gradient-boosted regressor with fixed, hand-picked hyperparameters.
model = XGBRegressor(
    max_depth=4,  # 3 - 6 is typical for tree-based models
    learning_rate=0.05,
    n_estimators=200,
    random_state=42,
)

# Shuffled, seeded 5-fold splitter used to estimate out-of-sample R² on the training split.
kf = KFold(n_splits=5, random_state=42, shuffle=True)

# Start of the wall-clock timer that is read back at the end of the cell.
start_time = time.time()

# One R² score per fold.
cv_scores = cross_val_score(estimator=model, X=X_train, y=y_train, scoring='r2', cv=kf)

logging.info(f'Cross-Validation R² scores: {cv_scores}')
logging.info(f'Average CV R²: {cv_scores.mean():.3f} (+/- {cv_scores.std():.3f})')

# Train the baseline on the whole training split, then predict the held-out rows.
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Held-out metrics: coefficient of determination and mean squared error.
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

logging.info(f'Test Mean Squared Error: {mse:.2f}')
logging.info(f'Test R² Score: {r2:.3f}')

# Horizontal bar chart of the fitted model's feature importances, sorted ascending,
# written to feature_importance.png (the figure is closed rather than displayed).
importances = pd.Series(model.feature_importances_, index=X.columns).sort_values(ascending=True)

fig, ax = plt.subplots(figsize=(10, 6))
importances.plot(kind='barh', ax=ax)
ax.set_title('Feature Importance')
ax.set_xlabel('Importance')
fig.savefig('feature_importance.png')
plt.close(fig)

# Scatter of predicted against actual credit scores on the test split, with a dashed
# red y = x reference line; written to actual_vs_predicted.png and then closed.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_test, y_pred, alpha=0.6)

score_range = [y_test.min(), y_test.max()]
ax.plot(score_range, score_range, 'r--')

ax.set_xlabel('Actual Credit Score')
ax.set_ylabel('Predicted Credit Score')
ax.set_title('Actual vs. Predicted Credit Score')
fig.savefig('actual_vs_predicted.png')
plt.close(fig)

# Persist the fitted baseline model to disk and note it in the training log.
joblib.dump(value=model, filename='xgb_credit_score_model.joblib')
logging.info('Model saved as xgb_credit_score_model.joblib')

# Hyperparameter tuning: exhaustive grid search over tree count, learning rate and
# tree depth (3 x 3 x 4 = 36 candidates), each scored by cross-validated R².
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 4, 5, 6]
}

# Reuse the shuffled, seeded `kf` splitter from the baseline cross-validation.
# An integer cv=5 gives a regressor an *unshuffled* KFold, so the tuned CV score
# would be measured on different folds than the baseline 'Average CV R²' and the
# two numbers would not be directly comparable.
# NOTE(review): n_jobs=-1 here combined with XGBoost's own multithreading may
# oversubscribe CPU cores; consider n_jobs=1 on the estimator if the search is slow.
grid_search = GridSearchCV(
    estimator=XGBRegressor(random_state=42),
    param_grid=param_grid,
    cv=kf,
    scoring='r2',
    n_jobs=-1
)

grid_search.fit(X_train, y_train)

logging.info(f'Best hyperparameters: {grid_search.best_params_}')
logging.info(f'Best CV R²: {grid_search.best_score_:.3f}')

# With GridSearchCV's default refit=True, best_estimator_ has already been
# retrained on the full training split using the winning parameters.
best_model = grid_search.best_estimator_

# Score the tuned model on the held-out test split so it can be compared with the
# baseline's test metrics before it is saved.
best_pred = best_model.predict(X_test)
best_mse = mean_squared_error(y_test, best_pred)
best_r2 = r2_score(y_test, best_pred)
logging.info(f'Best model Test Mean Squared Error: {best_mse:.2f}')
logging.info(f'Best model Test R² Score: {best_r2:.3f}')

# Save best model
joblib.dump(best_model, 'xgb_credit_score_model_best.joblib')
logging.info('Best model saved as xgb_credit_score_model_best.joblib')

# Log the wall-clock time elapsed since start_time (set just before the baseline
# cross-validation), so it covers CV, fitting, plotting, saving and the grid search.
end_time = time.time()
elapsed = end_time - start_time
logging.info(f'Total training time: {elapsed:.2f} seconds')
