In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns 
%who

In [None]:
# Load the raw dataset from an Excel workbook.
# NOTE(review): r"path" looks like a placeholder — point it at the real file before running.
df = pd.read_excel(r"path")
# info() prints its report as a side effect; describe() is the cell's displayed result.
df.info()
df.describe()

In [None]:
from sklearn.model_selection import train_test_split

# Separate the predictors from the target column `into_default`.
# NOTE(review): `variables` is not defined in the visible cells — it must be set
# to the column name(s) to exclude before this cell can run.
X = df.drop(columns=[variables])
y = df["into_default"]

# Hold out 30% of the rows for testing. `stratify=y` keeps the share of each
# target class the same in both partitions, which matters because the target is
# later treated as imbalanced (scale_pos_weight, StratifiedKFold).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.3, random_state=42, stratify=y
)
y_test

In [None]:
# Sum of the target over the test split (the number of positives if the target is coded 0/1).
y_test.sum()


In [None]:
# Display the held-out feature frame for a visual sanity check.
X_test

In [None]:
# Sum of the target over the training split (the number of positives if the target is coded 0/1).
y_train.sum()

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# 1. Column labels, split by dtype: numeric columns are passed through,
#    object (string) columns are one-hot encoded.
num_labels = X_train.select_dtypes('number').columns
cat_labels = X_train.select_dtypes('object').columns

# 2. Instantiate the preprocessors. FunctionTransformer() with no function is
#    an identity transform; binary categoricals are collapsed to one column.
num_preprocessor = FunctionTransformer()
cat_preprocessor = OneHotEncoder(drop = 'if_binary')

# 3. Combine both preprocessors into one (categorical block first, then numeric).
preprocessor = ColumnTransformer([('cat',cat_preprocessor,cat_labels),
                                  ('num',num_preprocessor,num_labels)])

preprocessor.fit(X_train)

# Output column names in the same order the ColumnTransformer emits them:
# encoded categorical names first, then the untouched numeric names.
feature_labels = preprocessor.transformers_[0][1].get_feature_names_out().tolist()
feature_labels.extend(num_labels)

X_train_prep = preprocessor.transform(X_train)
X_test_prep = preprocessor.transform(X_test)

# ColumnTransformer only returns a scipy sparse matrix when the stacked output
# is sparse enough (see its `sparse_threshold`); otherwise it is already a
# dense ndarray, which has no `.toarray()`. Densify only when needed.
if hasattr(X_train_prep, 'toarray'):
    X_train_prep = X_train_prep.toarray()
if hasattr(X_test_prep, 'toarray'):
    X_test_prep = X_test_prep.toarray()

# Keep the original row index so the prepared frames stay aligned with
# y_train / y_test instead of silently switching to a fresh RangeIndex.
X_train_prep = pd.DataFrame(X_train_prep, columns=feature_labels, index=X_train.index)
X_test_prep = pd.DataFrame(X_test_prep, columns=feature_labels, index=X_test.index)

# Restrict both frames to the selected model inputs.
# NOTE(review): `risk_drivers` is not defined in the visible cells — it must be
# set to the feature name(s) to keep before this cell can run.
X_train_prep = X_train_prep.loc[:, [risk_drivers]]
X_test_prep = X_test_prep.loc[:, [risk_drivers]]

X_test_prep


In [None]:
import xgboost as xgb
from sklearn.metrics import root_mean_squared_error, accuracy_score, f1_score, auc, precision_score, recall_score, classification_report, confusion_matrix, roc_auc_score
# Class-imbalance weight for XGBoost: count of label 0 divided by count of
# label 1 in the training target.
class_counts = y_train.value_counts()
scale_pos_weight = class_counts[0] / class_counts[1]
scale_pos_weight

In [None]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold, RandomizedSearchCV, KFold

# Base classifier. `binary:logistic` is XGBoost's objective for two-class
# classification; the previous `reg:logistic` is the regression-flavoured
# variant and is not the intended setting for an XGBClassifier.
xgb_model = xgb.XGBClassifier(
    scale_pos_weight=scale_pos_weight,
    random_state=42,
    subsample=0.6,
    eval_metric='logloss',
    objective='binary:logistic',
)

# Preprocessing is part of the pipeline, so it is re-fit inside every CV fold
# on that fold's training rows only.
xgb_pipe = Pipeline([('pre', preprocessor),
                     ('xgb', xgb_model)])

# Search space. The `xgb__` prefix routes each parameter to the pipeline step
# named 'xgb'; only n_estimators, max_depth and learning_rate actually vary.
hyper_params = {'xgb__n_estimators': [ 500, 600],
                'xgb__max_depth':[5,6,7],
                'xgb__learning_rate':[.01,.03],
                'xgb__colsample_bylevel':[0.8],
                'xgb__colsample_bytree':[0.8],
                'xgb__gamma':[5],
                'xgb__reg_lambda':[6],
                'xgb__reg_alpha':[4],
                'xgb__max_delta_step':[2],
                'xgb__min_child_weight':[3]}

# Shuffled, seeded 5-fold split that preserves the class ratio in every fold.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Exhaustive search scored by F1; refits the best configuration on all of
# X_train afterwards (GridSearchCV's default behaviour).
grid = GridSearchCV(estimator=xgb_pipe,
                    param_grid=hyper_params,
                    scoring='f1',
                    n_jobs=-1,
                    cv=kfold,
                    verbose=2)
grid.fit(X_train,y_train)

In [None]:
# Hard class labels for the training rows. Despite the `_prob_` in the name
# these are labels, not probabilities; the name is kept because later cells
# reference it.
y_train_prob_xgb_tuned = grid.predict(X_train)

# ROC AUC is a ranking metric, so it is computed from the positive-class
# probability; feeding it thresholded labels collapses the curve to a single
# operating point. F1 / precision / recall use the hard labels.
print(
    "AUC: " + str(roc_auc_score(y_train, grid.predict_proba(X_train)[:, 1]))
    + ", F1Score: " + str(f1_score(y_train, y_train_prob_xgb_tuned))
    + ", Precision: " + str(precision_score(y_train, y_train_prob_xgb_tuned))
    + ", Recall: " + str(recall_score(y_train, y_train_prob_xgb_tuned))
)


In [None]:
# Confusion matrix on the training split (rows: true class, columns: predicted class).
# `y_train_prob_xgb_tuned` holds the output of grid.predict, i.e. class labels.
confusion_matrix(y_train,y_train_prob_xgb_tuned)

In [None]:
# Hard class labels for the held-out rows. Despite the `_prob_` in the name
# these are labels, not probabilities; the name is kept because later cells
# reference it.
y_test_prob_xgb_tuned = grid.predict(X_test)

# ROC AUC is a ranking metric, so it is computed from the positive-class
# probability; feeding it thresholded labels collapses the curve to a single
# operating point. F1 / precision / recall use the hard labels.
print(
    "AUC: " + str(roc_auc_score(y_test, grid.predict_proba(X_test)[:, 1]))
    + ", F1Score: " + str(f1_score(y_test, y_test_prob_xgb_tuned))
    + ", Precision: " + str(precision_score(y_test, y_test_prob_xgb_tuned))
    + ", Recall: " + str(recall_score(y_test, y_test_prob_xgb_tuned))
)


In [None]:
# Confusion matrix on the test split (rows: true class, columns: predicted class).
# `y_test_prob_xgb_tuned` holds the output of grid.predict, i.e. class labels.
confusion_matrix(y_test,y_test_prob_xgb_tuned)

In [None]:
# Hyper-parameter combination that achieved the best cross-validated score in the grid search.
grid.best_params_

In [None]:
# Per-class precision / recall / F1 on the test split; printed because the report is a preformatted string.
print(classification_report(y_test,y_test_prob_xgb_tuned))

In [None]:

# Predicted class labels for the training rows, wrapped in a one-column
# DataFrame so the notebook renders them as a table.
y_train_pred = grid.predict(X_train)
y_train_pred_df = pd.DataFrame(data=y_train_pred)
y_train_pred_df

In [None]:

import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, roc_auc_score

# Positive-class probabilities for the training rows.
y_proba = grid.predict_proba(X_train)[:, 1]

# Build the ROC curve and its AUC from the continuous scores. Using hard 0/1
# predictions here would yield a degenerate three-point curve and an AUC that
# only reflects the single default threshold.
fpr, tpr, thresholds = roc_curve(y_train, y_proba)
auc_score = roc_auc_score(y_train, y_proba)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f'AUC = {auc_score:.3f}')
# Diagonal = performance of a random classifier.
plt.plot([0, 1], [0, 1], 'k--', label='Baseline')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc='lower right')
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:

import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, roc_auc_score

# Positive-class probabilities for the held-out rows.
y_proba = grid.predict_proba(X_test)[:, 1]

# Build the ROC curve and its AUC from the continuous scores. Using hard 0/1
# predictions here would yield a degenerate three-point curve and an AUC that
# only reflects the single default threshold.
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
auc_score = roc_auc_score(y_test, y_proba)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f'AUC = {auc_score:.3f}')
# Diagonal = performance of a random classifier.
plt.plot([0, 1], [0, 1], 'k--', label='Baseline')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc='lower right')
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
import explainerdashboard


In [None]:
# The pipeline (preprocessor + XGBoost) refit with the best hyper-parameters found by the grid search.
model = grid.best_estimator_
model

In [None]:
# Explain the fitted best pipeline itself rather than the GridSearchCV wrapper
# around it: the wrapper is only a search harness, while `best_estimator_` is
# the actual model object the explainer can introspect.
# Only the first 500 test rows are used to keep the explainer computations
# affordable; `.iloc` makes the positional slicing explicit for both X and y.
explainer = explainerdashboard.ClassifierExplainer(model=grid.best_estimator_,
                                                   X=X_test.iloc[:500],
                                                   y=y_test.iloc[:500])

In [None]:
# Ask explainerdashboard to shut down a dashboard on port 8050 before starting a new one.
# NOTE(review): behaviour when nothing is running on that port is not visible here — confirm it is harmless.
explainerdashboard.ExplainerDashboard.terminate(8050)

In [None]:
# Assemble the dashboard for the pipeline explainer (with the decision-trees
# option switched on), then start serving it with the default run settings.
dashboard = explainerdashboard.ExplainerDashboard(explainer, decision_trees=True)
dashboard.run()


In [None]:
  !pip install -q dtreeviz
  !pip install -q xgboost

In [None]:
import dtreeviz
import graphviz
%who

In [None]:
import matplotlib.pyplot as plt
import xgboost as xgb

# Work with the fitted XGBClassifier itself (the final 'xgb' step), not the pipeline.
best_model = grid.best_estimator_.named_steps['xgb']
booster = best_model.get_booster()

# Render tree number 300 of the ensemble with a top-to-bottom layout, then
# enlarge the figure that plot_tree drew on so the nodes are legible.
xgb.plot_tree(best_model, tree_idx=300, rankdir='TB')
fig = plt.gcf()
fig.set_size_inches(30, 15)
plt.show()



In [None]:
def main():
    """Launch an explainer dashboard for the tuned XGBoost model on port 8051.

    Reads the notebook-level names ``grid``, ``X_test_prep`` and ``y_test``.
    Builds a ``ClassifierExplainer`` on the first 500 prepared test rows and
    serves the dashboard through waitress. Returns nothing; the call blocks
    while the server is running.
    """
    # Use the best estimator's XGB model (final pipeline step, without the
    # preprocessing step).
    model = grid.best_estimator_.named_steps['xgb']

    # Create explainer with a subset of test data.
    # NOTE(review): this model was fitted on the full output of the pipeline's
    # preprocessor, while X_test_prep was densified and reduced to
    # `risk_drivers` — confirm the columns (and their order) match what the
    # model was trained on.
    explainer = explainerdashboard.ClassifierExplainer(model=model,
                                   X=X_test_prep.iloc[:500],
                                   y=y_test.iloc[:500])

    # Run dashboard externally on port 8051. Both `port` and `use_waitress`
    # are options of run(); passed to the constructor, `use_waitress` is
    # swallowed as a generic keyword and has no effect, and without an
    # explicit port the dashboard would not use 8051.
    explainerdashboard.ExplainerDashboard(explainer).run(port=8051,
                                                         use_waitress=True)

if __name__ == "__main__":
    main()