In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score,roc_curve
import matplotlib.pyplot as plt

In [2]:
train_data = pd.read_csv("../data/model_train_data.csv")
test_data = pd.read_csv("../data/model_test_data.csv")

In [3]:
train_data.describe()

Unnamed: 0,age,income_level,fico_score,delinquency_status,charge_off_status,number_of_credit_applications,debt_to_income_ratio,payment_methods_high_risk,max_balance,avg_balance_last_12months,...,number_of_defaulted_accounts,new_accounts_opened_last_12months,multiple_applications_short_time_period,unusual_submission_pattern,applications_submitted_during_odd_hours,watchlist_blacklist_flag,public_records_flag,account_age_days,earliest_credit_account_age_days,days_recent_trade_activity
count,5398.0,5398.0,5398.0,5398.0,5398.0,5398.0,5398.0,5398.0,5398.0,5398.0,...,5398.0,5398.0,5398.0,5398.0,5398.0,5398.0,5398.0,5398.0,5398.0,5398.0
mean,46.224528,23527.375324,615.116895,25.474991,0.263616,1.838273,1.474413,0.238977,50094.307788,33267.860916,...,0.506113,1.390515,0.25917,0.20767,0.312338,0.123935,0.201,962.302149,1855.394591,223.654687
std,16.412856,21878.658332,130.005913,42.910907,0.440634,1.984086,0.66977,0.426498,49722.417608,21750.712757,...,0.733954,1.596085,0.43822,0.405676,0.46349,0.329538,0.400785,524.591119,1048.202676,106.060833
min,18.0,2003.0,301.0,0.0,0.0,1.0,0.448678,0.0,930.516529,467.221957,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,42.0,42.0,42.0
25%,32.0,6859.75,520.0,0.0,0.0,1.0,1.05277,0.0,25272.67674,14179.244192,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,512.0,956.25,131.0
50%,46.0,14880.5,633.0,0.0,0.0,1.0,1.318397,0.0,44512.391347,30352.35306,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,968.0,1840.0,224.0
75%,61.0,37427.5,721.0,31.0,1.0,1.0,1.715266,0.0,69667.032476,50608.640512,...,1.0,3.0,1.0,0.0,1.0,0.0,0.0,1415.0,2772.0,314.0
max,74.0,99957.0,848.0,119.0,1.0,10.0,9.927906,1.0,999922.0833,92117.793445,...,2.0,4.0,1.0,1.0,1.0,1.0,1.0,1868.0,3694.0,407.0


In [4]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5398 entries, 0 to 5397
Data columns (total 24 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   age                                      5398 non-null   int64  
 1   location                                 5398 non-null   object 
 2   occupation                               5398 non-null   object 
 3   income_level                             5398 non-null   int64  
 4   fico_score                               5398 non-null   float64
 5   delinquency_status                       5398 non-null   int64  
 6   charge_off_status                        5398 non-null   int64  
 7   number_of_credit_applications            5398 non-null   int64  
 8   debt_to_income_ratio                     5398 non-null   float64
 9   payment_methods_high_risk                5398 non-null   int64  
 10  max_balance                              5398 no

## Seperate Features and Target

In [5]:
target = 'charge_off_status'

X_train = train_data.drop(columns=[target])  
y_train = train_data[target]  

X_test = test_data.drop(columns=[target])  
y_test = test_data[target]  

## One-hot encoding categorical features

In [6]:
categorical_features = ["occupation", "location"]

encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
X_train_encoded = encoder.fit_transform(X_train[categorical_features])
X_test_encoded = encoder.transform(X_test[categorical_features])

encoded_columns = encoder.get_feature_names_out(categorical_features)
X_train_encoded = pd.DataFrame(X_train_encoded, columns=encoded_columns)
X_test_encoded = pd.DataFrame(X_test_encoded, columns=encoded_columns)

X_train = X_train.drop(columns=categorical_features).reset_index(drop=True)
X_test = X_test.drop(columns=categorical_features).reset_index(drop=True)

X_train = pd.concat([X_train, X_train_encoded], axis=1)
X_test = pd.concat([X_test, X_test_encoded], axis=1)

In [7]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5398 entries, 0 to 5397
Data columns (total 42 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   age                                      5398 non-null   int64  
 1   income_level                             5398 non-null   int64  
 2   fico_score                               5398 non-null   float64
 3   delinquency_status                       5398 non-null   int64  
 4   number_of_credit_applications            5398 non-null   int64  
 5   debt_to_income_ratio                     5398 non-null   float64
 6   payment_methods_high_risk                5398 non-null   int64  
 7   max_balance                              5398 non-null   float64
 8   avg_balance_last_12months                5398 non-null   float64
 9   number_of_delinquent_accounts            5398 non-null   float64
 10  number_of_defaulted_accounts             5398 no

## Training XGBoost Model (Basic)

In [8]:
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',  
    eval_metric='logloss',    
    random_state=1  
)
xgb_model.fit(X_train, y_train)

ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, the experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:delinquency_group: object

In [None]:
y_pred = xgb_model.predict(X_test)
y_prob = xgb_model.predict_proba(X_test)[:, 1] 

In [None]:
# Accuracy
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")

# Confusion Matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Classification Report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# ROC-AUC Score
print(f"ROC-AUC Score: {roc_auc_score(y_test, y_prob)}")


In [None]:
# Get ROC curve data
fpr, tpr, thresholds = roc_curve(y_test, y_prob)

# Plot ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='blue', label=f'ROC curve (AUC = {roc_auc_score(y_test, y_prob):.2f})')
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()

In [None]:
# Plot feature importance
xgb.plot_importance(xgb_model, importance_type='weight', max_num_features=20)
plt.show()


In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 200, 300],
    'subsample': [0.8, 1.0]
}

grid_search = GridSearchCV(estimator=xgb.XGBClassifier(use_label_encoder=False), param_grid=param_grid, cv=3, n_jobs=-1, scoring='roc_auc')
grid_search.fit(X_train, y_train)

# Best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best ROC-AUC Score:", grid_search.best_score_)


In [None]:
scale_pos_weight = len(y_train[y_train == 0]) / len(y_train[y_train == 1])
xgb_model_2 = xgb.XGBClassifier(
    objective='binary:logistic',  
    eval_metric='logloss', 
    scale_pos_weight=scale_pos_weight,
    random_state=1  
)
xgb_model_2.fit(X_train, y_train)

In [None]:
y_pred = xgb_model_2.predict(X_test)
y_prob = xgb_model_2.predict_proba(X_test)[:, 1] 

In [None]:
# Accuracy
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")

# Confusion Matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Classification Report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# ROC-AUC Score
print(f"ROC-AUC Score: {roc_auc_score(y_test, y_prob)}")


In [None]:
# Plot feature importance
xgb.plot_importance(xgb_model_2, importance_type='weight', max_num_features=20)
plt.show()

In [None]:
from sklearn.inspection import PartialDependenceDisplay

In [None]:
PartialDependenceDisplay.from_estimator(xgb_model_2, X_train, ['delinquency_group_High'])