In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import json
import seaborn as sns 
import warnings
import shap
import alibi
warnings.filterwarnings("ignore")

# Load the raw dataset from the Excel workbook in the working directory.
df = pd.read_excel("data_project.xlsx")
# Derived feature: credit amount divided by credit duration.
df["MonthRefunding"] = df["CreditAmount"]/df["CreditDuration"]

FileNotFoundError: [Errno 2] No such file or directory: 'data_project.xlsx'

## EDA 

In [None]:
# Show the dtype of every column of the loaded frame.
df.dtypes

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Parse config.json into a dictionary and use it as the replacement mapping.
with open('config.json', "r") as f:
    js = json.load(f)

# Replace the coded values with their text labels.
df = df.replace(js)

In [None]:
# Mean age within each value of the "CreditRisk (y)" column.
df.groupby("CreditRisk (y)").Age.mean()

In [None]:
#Age and credit risk

import seaborn as sns

# matplotlib histogram


# Age distribution (histogram + KDE) per credit-risk class, one figure per class.
# Titles keep the notebook's own wording: class 0 -> "risked", class 1 -> "not-risked".
age_panels = [
    (0, 'Histogram of Age for risked credit people'),
    (1, 'Histogram of Age for not-risked credit people'),
]

for risk_class, panel_title in age_panels:
    fig, ax = plt.subplots(figsize=(12,8), dpi=300)

    sns.distplot(df[df["CreditRisk (y)"]==risk_class]['Age'], hist=True, kde=True,
                 bins=int(180/5), color = 'blue',
                 hist_kws={'edgecolor':'black'}, ax=ax)
    # Add labels
    ax.set_title(panel_title)
    ax.set_xlabel('Age(years)')
    # With kde=True the histogram is normalised, so the y-axis is a density,
    # not a head count.
    ax.set_ylabel('Density')

In [None]:
# 2x2 overview of how the target relates to four features.
# The figure size is set once here instead of through per-plot figsize arguments
# (each of which resized the shared figure; (12, 13) was the last one applied).
fig, axes = plt.subplots(2, 2, figsize=(12, 13))

# Bottom-right: mean of the target per employment-duration category.
df.groupby("EmploymentDuration")["CreditRisk (y)"].mean().sort_values(ascending=False).plot(kind="bar", title="Risk credit rate according to employment duration", ax=axes[1,1])
# Top-right: mean credit duration per target class (the title now says so).
df.groupby("CreditRisk (y)")["CreditDuration"].mean().plot(kind="bar", title="Mean credit duration according to credit risk", rot=0, ax=axes[0,1])
# Bottom-left: mean of the target per credit-history category (the title previously mentioned "purpose").
df.groupby("CreditHistory")["CreditRisk (y)"].mean().sort_values(ascending=False).plot(kind="bar", title="Risk credit rate according to credit history", ax=axes[1, 0])
# Top-left: distribution of the credit amount per target class.
df.boxplot("CreditAmount", ax=axes[0,0], by="CreditRisk (y)");

## First modelling with a surrogate model (logistic regression)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder
from sklearn.svm import SVC
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

# Train and evaluate a logistic-regression surrogate model.
Oe = OneHotEncoder()  # instantiated but not used in this cell
le = OrdinalEncoder()

# Categorical columns that are turned into ordinal codes.
to_encode = ['CreditHistory', 'EmploymentDuration', 'Housing', 'Purpose', 'Savings']

df["MonthRefunding"] = df["CreditAmount"]/df["CreditDuration"]

# The encoder expects a 2-D input, hence the reshape to a single column.
for col in to_encode:
    df[col] = le.fit_transform(np.array(df[col]).reshape(-1, 1))

# Features are every column except the target and the "y_hat" column.
X, y = df.drop(["CreditRisk (y)", "y_hat"], axis=1), df["CreditRisk (y)"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

model = LogisticRegression()

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# scikit-learn metrics take (y_true, y_pred). Passing them the other way round
# swaps precision and recall, so the ground truth goes first.
print("The accuracy of the model is {:.2f}".format(model.score(X_test, y_test)))
print("The precision of the model is {:.2f}".format(precision_score(y_test, y_pred)))
print("The recall of the model is {:.2f}".format(recall_score(y_test, y_pred)))
print("The f1-score of the model is {:.2f}".format(f1_score(y_test, y_pred)))

print("\n")

# Constant score for every sample: the "no skill" reference for the ROC plot.
ns_probs = [0 for _ in range(len(y_test))]
# predict probabilities
lr_probs = model.predict_proba(X_test)
# keep probabilities for the positive outcome only
lr_probs = lr_probs[:, 1]
# calculate scores
ns_auc = roc_auc_score(y_test, ns_probs)
lr_auc = roc_auc_score(y_test, lr_probs)
# summarize scores
print('No Skill: ROC AUC=%.3f' % (ns_auc))
print('Logistic: ROC AUC=%.3f' % (lr_auc))
# calculate roc curves
ns_fpr, ns_tpr, _ = roc_curve(y_test, ns_probs)
lr_fpr, lr_tpr, _ = roc_curve(y_test, lr_probs)
# plot the roc curve for the model
plt.figure(figsize=(12, 12))
plt.title("ROC Curve of the Logistic Regression model", fontsize=15)
plt.plot(ns_fpr, ns_tpr, linestyle='--', label='No Skill')
plt.plot(lr_fpr, lr_tpr, label='Model', color="blue")
# axis labels
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
# show the legend
plt.legend()
# show the plot
plt.show()

In [None]:
# One row per feature holding the fitted coefficient, largest first.
coef_by_feature = pd.DataFrame(model.coef_, columns=X.columns).T
coef_sorted = coef_by_feature.sort_values(by=0, ascending=False)
ax = coef_sorted.plot(kind="bar", figsize=(12, 12), title="Feature importance of the surrogate model", legend=False)

# Write each bar's rounded height next to the bar.
for p in ax.patches:
    bar_height = p.get_height()
    ax.annotate(str(np.round(bar_height, 2)), (p.get_x() * 1.05, bar_height * 1.005));

In [None]:
from sklearn.inspection import plot_partial_dependence
from sklearn.inspection import partial_dependence

# Rebuild the feature matrix (all columns except the target and "y_hat") and the target.
X, y = df.drop(["CreditRisk (y)", "y_hat"], axis=1), df["CreditRisk (y)"]

# A fresh logistic regression replaces the previously bound `model`.
model = LogisticRegression()

# Fit on the whole dataset (no train/test split here).
model.fit(X, y)

In [None]:
#The partial_dependence function returns the dependencies and the grid
# NOTE(review): the two-name unpacking assumes a scikit-learn release in which
# partial_dependence returns a (values, grid) pair — confirm against the installed version.
# percentiles=[0, 1] makes the grid span the full observed range of the feature.
PDs, grid = partial_dependence(model, X, features = ['CreditDuration'], percentiles = [0,1])

#The plot_partial_dependence function returns a plot, but can also be unpacked into dependencies and grids
#plot_partial_dependence(model, X, features = ['CreditDuration'], percentiles = [0,1]);

def get_PDPvalues(col_name, data, model, grid_resolution = 100):
    """Compute a one-feature partial-dependence curve by brute force.

    Parameters
    ----------
    col_name : name of the column of ``data`` to vary.
    data : DataFrame of model inputs; it is copied, never modified.
    model : object exposing ``predict(frame)``.
    grid_resolution : number of evenly spaced grid points between the
        column's minimum and maximum.

    Returns
    -------
    DataFrame with two columns: ``col_name`` (the grid values) and ``'PDs'``
    (the mean of ``model.predict`` when the whole column is set to that value).
    """
    feature = data[col_name]
    grid_values = np.linspace(np.min(feature), np.max(feature), grid_resolution)
    frame = data.copy()

    def _mean_prediction(value):
        # Overwrite the column for every row, then average the predictions.
        frame[col_name] = value
        return np.mean(model.predict(frame))

    averages = [_mean_prediction(value) for value in grid_values]
    return pd.DataFrame({col_name: grid_values, 'PDs': averages})

def plot_PDP(col_name, data, model):
    """Draw the partial-dependence curve of ``col_name`` on a new figure.

    Updates the global matplotlib rcParams (font size 16, figure size 6x5),
    draws a rug of the observed feature values one unit below the lowest
    partial-dependence value, and returns the Axes.
    """
    pdp = get_PDPvalues(col_name, data, model)
    plt.rcParams.update({'font.size': 16})
    plt.rcParams["figure.figsize"] = (6,5)
    fig, ax = plt.subplots()
    observed = data[col_name]
    rug_height = np.zeros(observed.shape) + min(pdp['PDs']) - 1
    ax.plot(observed, rug_height, 'k|', ms=15)  # rug plot
    ax.plot(pdp[col_name], pdp['PDs'], lw = 2)
    ax.set_ylabel('Partial Dependence')
    return ax

# Partial-dependence panels for the current `model`, one subplot per feature.
fig, ((ax1, ax2), (ax3, ax4), (ax5, ax6), (ax7, ax8)) = plt.subplots(4, 2, figsize=(18, 20))

# (axes, feature, overlay the feature's density on a twin y-axis?)
pdp_panels = [
    (ax1, 'CreditAmount', False),
    (ax2, 'CreditDuration', False),
    (ax3, 'InstallmentRate', True),
    (ax4, 'Age', True),
    (ax5, 'NumberOfCredits', True),
    (ax6, 'CreditHistory', False),
    (ax7, 'MonthRefunding', False),
    (ax8, 'Group', False),
]

for panel_ax, feature, with_density in pdp_panels:
    # A dedicated name keeps the notebook-level `df` (the dataset) intact.
    pdp_values = get_PDPvalues(feature, X, model)
    # Rug of observed feature values, drawn one unit below the lowest PD value.
    panel_ax.plot(X[feature], np.zeros(X[feature].shape) + min(pdp_values['PDs']) - 1, 'k|', ms=15)
    # Always plot the grid against 'PDs'; the CreditHistory and MonthRefunding
    # panels previously plotted the grid against itself.
    panel_ax.plot(pdp_values[feature], pdp_values['PDs'], lw=2)
    # The feature is on the x-axis; the y-axis is the partial dependence.
    panel_ax.set_xlabel(feature)
    panel_ax.set_ylabel('Partial Dependence')
    if with_density:
        density_ax = panel_ax.twinx()
        # KDE only: same curve as distplot(hist=False), without the deprecated API.
        sns.kdeplot(X[feature], color='red', ax=density_ax)
        density_ax.set_ylabel(feature + " density", color="red", fontsize=14)

In [None]:
#Try with ALE 

from alibi.explainers import ALE, plot_ale

# Accumulated Local Effects of the current model, one panel per feature.
fig, ((ax1, ax2), (ax3, ax4), (ax5, ax6), (ax7, ax8)) = plt.subplots(4, 2, figsize=(18, 20))

ale = ALE(model.predict, feature_names=X_train.columns, target_names=["CreditRisk (y)"])
ale_exp = ale.explain(np.array(X_train))

# Feature shown on each axes, in ax1..ax8 order.
ale_layout = [
    ("CreditDuration", ax1),
    ("CreditAmount", ax2),
    ("InstallmentRate", ax3),
    ("Age", ax4),
    ("NumberOfCredits", ax5),
    ("MonthRefunding", ax6),
    ("Gender", ax7),
    ("Group", ax8),
]
for feature_name, panel in ale_layout:
    plot_ale(ale_exp, features=[feature_name], ax=panel)

In [None]:
#Now, let's try with ICE

from sklearn.inspection import PartialDependenceDisplay
# Refit the logistic regression on the full data and draw ICE curves for every feature.
model = LogisticRegression()
model.fit(X, y)
features = [0, 1]

fig, ax = plt.subplots(figsize=(18, 18))

column_names = [name for name in X_train.columns]  # labels on graphs
column_positions = list(range(len(column_names)))  # column numbers of plots we want to show

plot_partial_dependence(
    model,
    X=X,  # raw predictors data
    features=column_positions,
    feature_names=column_names,
    grid_resolution=10,
    kind="individual",
    ax=ax,
);

In [None]:
import shap  # package used to calculate Shap values

# Create object that can calculate shap values
explainer = shap.LinearExplainer(model, X_train)

# Compute SHAP values for every row of X_test rather than a single row,
# so the summary plot has more data.
shap_values = explainer.shap_values(X_test)

# Summary plot of the SHAP values over X_test (no per-class index is applied here).
shap.summary_plot(shap_values, X_test)

## Explainability with Random Forest

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier


# Reload the data, train a random forest and evaluate it on the held-out split.
df = pd.read_excel("data_project.xlsx")

with open('config.json', "r") as f:
    data = f.read()

# reconstructing the data as a dictionary
js = json.loads(data)

# Replace the coded values with their text labels.
df = df.replace(js)

Oe = OneHotEncoder()  # instantiated but not used in this cell
le = OrdinalEncoder()

# Categorical columns that are turned into ordinal codes.
to_encode = ['CreditHistory', 'EmploymentDuration', 'Housing', 'Purpose', 'Savings']

df["MonthRefunding"] = df["CreditAmount"]/df["CreditDuration"]

# The encoder expects a 2-D input, hence the reshape to a single column.
for col in to_encode:
    df[col] = le.fit_transform(np.array(df[col]).reshape(-1, 1))

X, y = df.drop(["CreditRisk (y)", "y_hat"], axis=1), df["CreditRisk (y)"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Seed the forest (same seed as the split) so metrics and plots are reproducible.
model = RandomForestClassifier(random_state=42)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# scikit-learn metrics take (y_true, y_pred). Passing them the other way round
# swaps precision and recall, so the ground truth goes first.
print("The accuracy of the model is {:.2f}".format(model.score(X_test, y_test)))
print("The precision of the model is {:.2f}".format(precision_score(y_test, y_pred)))
print("The recall of the model is {:.2f}".format(recall_score(y_test, y_pred)))
print("The f1-score of the model is {:.2f}".format(f1_score(y_test, y_pred)))

print("\n")

# Constant score for every sample: the "no skill" reference for the ROC plot.
ns_probs = [0 for _ in range(len(y_test))]
# predict probabilities (the lr_* names are kept from the logistic-regression cell)
lr_probs = model.predict_proba(X_test)
# keep probabilities for the positive outcome only
lr_probs = lr_probs[:, 1]
# calculate scores
ns_auc = roc_auc_score(y_test, ns_probs)
lr_auc = roc_auc_score(y_test, lr_probs)
# summarize scores (the label now names the model actually evaluated)
print('No Skill: ROC AUC=%.3f' % (ns_auc))
print('Random Forest: ROC AUC=%.3f' % (lr_auc))
# calculate roc curves
ns_fpr, ns_tpr, _ = roc_curve(y_test, ns_probs)
lr_fpr, lr_tpr, _ = roc_curve(y_test, lr_probs)
# plot the roc curve for the model
plt.figure(figsize=(12, 12))
plt.title("ROC Curve of the Random Forest model", fontsize=15)
plt.plot(ns_fpr, ns_tpr, linestyle='--', label='No Skill')
plt.plot(lr_fpr, lr_tpr, label='Model', color="blue")
# axis labels
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
# show the legend
plt.legend()
# show the plot
plt.show()

In [None]:
from sklearn.inspection import plot_partial_dependence
from sklearn.inspection import partial_dependence

# Rebuild the feature matrix (all columns except the target and "y_hat") and the target.
X, y = df.drop(["CreditRisk (y)", "y_hat"], axis=1), df["CreditRisk (y)"]

# Seeded so the partial-dependence and SHAP plots built from this model
# are the same on every run.
model = RandomForestClassifier(random_state=42)

# Fit on the whole dataset (no train/test split here).
model.fit(X, y)

In [None]:
#The partial_dependence function returns the dependencies and the grid
# NOTE(review): the two-name unpacking assumes a scikit-learn release in which
# partial_dependence returns a (values, grid) pair — confirm against the installed version.
# percentiles=[0, 1] makes the grid span the full observed range of the feature.
PDs, grid = partial_dependence(model, X, features = ['CreditDuration'], percentiles = [0,1])

#The plot_partial_dependence function returns a plot, but can also be unpacked into dependencies and grids
#plot_partial_dependence(model, X, features = ['CreditDuration'], percentiles = [0,1]);

def get_PDPvalues(col_name, data, model, grid_resolution = 100):
    """Compute a one-feature partial-dependence curve by brute force.

    For each of ``grid_resolution`` evenly spaced values between the minimum
    and maximum of ``data[col_name]``, the column is set to that value on a
    copy of ``data`` and the mean of ``model.predict`` is recorded.

    Returns a DataFrame with columns ``col_name`` (grid values) and ``'PDs'``
    (mean predictions). ``data`` itself is not modified.
    """
    column = data[col_name]
    lower, upper = np.min(column), np.max(column)
    grid_values = np.linspace(lower, upper, grid_resolution)

    scratch = data.copy()
    mean_predictions = []
    for grid_value in grid_values:
        scratch[col_name] = grid_value
        mean_predictions.append(np.mean(model.predict(scratch)))

    return pd.DataFrame({col_name: grid_values, 'PDs': mean_predictions})

def plot_PDP(col_name, data, model):
    """Plot the partial dependence of ``col_name`` on a new figure and return its Axes.

    Side effect: sets the global rcParams font size to 16 and the default
    figure size to (6, 5).
    """
    curve = get_PDPvalues(col_name, data, model)
    plt.rcParams.update({'font.size': 16})
    plt.rcParams["figure.figsize"] = (6,5)
    fig, ax = plt.subplots()
    # Rug of the observed values, one unit below the lowest point of the curve.
    rug_level = min(curve['PDs']) - 1
    ax.plot(data[col_name], np.zeros(data[col_name].shape) + rug_level, 'k|', ms=15)
    ax.plot(curve[col_name], curve['PDs'], lw = 2)
    ax.set_ylabel('Partial Dependence')
    return ax

# Partial-dependence panels for the current `model`, one subplot per feature.
fig, ((ax1, ax2), (ax3, ax4), (ax5, ax6), (ax7, ax8)) = plt.subplots(4, 2, figsize=(18, 20))

# (axes, feature, overlay the feature's density on a twin y-axis?)
pdp_panels = [
    (ax1, 'CreditAmount', False),
    (ax2, 'CreditDuration', False),
    (ax3, 'InstallmentRate', True),
    (ax4, 'Age', True),
    (ax5, 'NumberOfCredits', True),
    (ax6, 'CreditHistory', False),
    (ax7, 'MonthRefunding', False),
    (ax8, 'Group', False),
]

for panel_ax, feature, with_density in pdp_panels:
    # A dedicated name keeps the notebook-level `df` (the dataset) intact.
    pdp_values = get_PDPvalues(feature, X, model)
    # Rug of observed feature values, drawn one unit below the lowest PD value.
    panel_ax.plot(X[feature], np.zeros(X[feature].shape) + min(pdp_values['PDs']) - 1, 'k|', ms=15)
    # Always plot the grid against 'PDs'; the CreditHistory and MonthRefunding
    # panels previously plotted the grid against itself.
    panel_ax.plot(pdp_values[feature], pdp_values['PDs'], lw=2)
    # The feature is on the x-axis; the y-axis is the partial dependence.
    panel_ax.set_xlabel(feature)
    panel_ax.set_ylabel('Partial Dependence')
    if with_density:
        density_ax = panel_ax.twinx()
        # KDE only: same curve as distplot(hist=False), without the deprecated API.
        sns.kdeplot(X[feature], color='red', ax=density_ax)
        density_ax.set_ylabel(feature + " density", color="red", fontsize=14)

In [None]:
import shap  # package used to calculate Shap values

# Create object that can calculate shap values
explainer = shap.TreeExplainer(model, X_train)

# Compute SHAP values for every row of X_test rather than a single row,
# so the summary plot has more data.
shap_values = explainer.shap_values(X_test)

# NOTE(review): for a classifier some shap versions return one array per class here;
# no class index is selected before plotting — confirm this is the intended plot.
shap.summary_plot(shap_values, X_test)

## Black-box model: XGBoost

In [None]:
from xgboost import XGBClassifier

# Reload and decode the data, rebuild the features and train an XGBoost classifier.
df = pd.read_excel("data_project.xlsx")

# Parse config.json into the replacement mapping.
with open('config.json', "r") as f:
    js = json.load(f)

# Replace the coded values with their text labels.
df = df.replace(js)

Oe = OneHotEncoder()
le = OrdinalEncoder()

to_encode = ['CreditHistory', 'EmploymentDuration', 'Housing', 'Purpose', 'Savings']

df["MonthRefunding"] = df["CreditAmount"]/df["CreditDuration"]

# Encode one column at a time; the encoder needs a 2-D (n, 1) input.
for col in to_encode:
    column_2d = np.array(df[col]).reshape(-1, 1)
    df[col] = le.fit_transform(column_2d)

X = df.drop(["CreditRisk (y)", "y_hat"], axis=1)
y = df["CreditRisk (y)"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

model = XGBClassifier()
model.fit(X_train, y_train)

#The partial_dependence function returns the dependencies and the grid
# NOTE(review): the two-name unpacking assumes a scikit-learn release in which
# partial_dependence returns a (values, grid) pair — confirm against the installed version.
# percentiles=[0, 1] makes the grid span the full observed range of the feature.
PDs, grid = partial_dependence(model, X, features = ['CreditDuration'], percentiles = [0,1])

#The plot_partial_dependence function returns a plot, but can also be unpacked into dependencies and grids
#plot_partial_dependence(model, X, features = ['CreditDuration'], percentiles = [0,1]);

def get_PDPvalues(col_name, data, model, grid_resolution = 100):
    """Compute a one-feature partial-dependence curve by brute force.

    The column ``col_name`` is swept over ``grid_resolution`` evenly spaced
    values between its observed minimum and maximum. For every grid value the
    whole column of a copy of ``data`` is overwritten and the mean of
    ``model.predict`` is stored.

    Returns a DataFrame with columns ``col_name`` and ``'PDs'``.
    """
    grid_values = np.linspace(np.min(data[col_name]), np.max(data[col_name]), grid_resolution)
    working = data.copy()

    def _average_at(value):
        working[col_name] = value
        predictions = model.predict(working)
        return np.mean(predictions)

    return pd.DataFrame({col_name: grid_values, 'PDs': [_average_at(v) for v in grid_values]})

def plot_PDP(col_name, data, model):
    """Plot the partial dependence of ``col_name`` on a new figure and return its Axes.

    Side effect: changes the global rcParams (font size 16, figure size (6, 5)).
    """
    pd_curve = get_PDPvalues(col_name, data, model)
    plt.rcParams.update({'font.size': 16})
    plt.rcParams["figure.figsize"] = (6,5)
    fig, ax = plt.subplots()
    samples = data[col_name]
    # Rug plot of the observed values, one unit below the curve's minimum.
    ax.plot(samples, np.zeros(samples.shape) + min(pd_curve['PDs']) - 1, 'k|', ms=15)
    ax.plot(pd_curve[col_name], pd_curve['PDs'], lw = 2)
    ax.set_ylabel('Partial Dependence')
    return ax

# Partial-dependence panels for the current `model`, one subplot per feature.
fig, ((ax1, ax2), (ax3, ax4), (ax5, ax6), (ax7, ax8)) = plt.subplots(4, 2, figsize=(18, 20))

# (axes, feature, overlay the feature's density on a twin y-axis?)
pdp_panels = [
    (ax1, 'CreditAmount', False),
    (ax2, 'CreditDuration', False),
    (ax3, 'InstallmentRate', True),
    (ax4, 'Age', True),
    (ax5, 'NumberOfCredits', True),
    (ax6, 'CreditHistory', False),
    (ax7, 'MonthRefunding', False),
    (ax8, 'Group', False),
]

for panel_ax, feature, with_density in pdp_panels:
    # A dedicated name keeps the notebook-level `df` (the dataset) intact.
    pdp_values = get_PDPvalues(feature, X, model)
    # Rug of observed feature values, drawn one unit below the lowest PD value.
    panel_ax.plot(X[feature], np.zeros(X[feature].shape) + min(pdp_values['PDs']) - 1, 'k|', ms=15)
    # Always plot the grid against 'PDs'; the CreditHistory and MonthRefunding
    # panels previously plotted the grid against itself.
    panel_ax.plot(pdp_values[feature], pdp_values['PDs'], lw=2)
    # The feature is on the x-axis; the y-axis is the partial dependence.
    panel_ax.set_xlabel(feature)
    panel_ax.set_ylabel('Partial Dependence')
    if with_density:
        density_ax = panel_ax.twinx()
        # KDE only: same curve as distplot(hist=False), without the deprecated API.
        sns.kdeplot(X[feature], color='red', ax=density_ax)
        density_ax.set_ylabel(feature + " density", color="red", fontsize=14)

In [None]:
from alibi.explainers import ALE, plot_ale

fig, ((ax1, ax2), (ax3, ax4), (ax5, ax6), (ax7, ax8)) = plt.subplots(4, 2, figsize=(18, 20))

# Refit on the raw NumPy values of the full feature matrix, then explain with ALE.
model.fit(X.values, y)

ale = ALE(model.predict, feature_names=X.columns, target_names=["CreditRisk (y)"])
ale_exp = ale.explain(np.array(X.values))

# Feature shown on each axes, in ax1..ax8 order.
ale_layout = [
    ("CreditDuration", ax1),
    ("CreditAmount", ax2),
    ("InstallmentRate", ax3),
    ("Age", ax4),
    ("NumberOfCredits", ax5),
    ("MonthRefunding", ax6),
    ("Gender", ax7),
    ("Group", ax8),
]
for feature_name, panel in ale_layout:
    plot_ale(ale_exp, features=[feature_name], ax=panel)

In [None]:
# Fresh XGBoost model fitted on the full data, then ICE curves for every feature.
model = XGBClassifier()
model.fit(X, y)

fig, ax = plt.subplots(figsize=(18, 18))

column_names = [name for name in X_train.columns]  # labels on graphs
column_positions = list(range(len(column_names)))  # column numbers of plots we want to show

plot_partial_dependence(
    model,
    X=X,  # raw predictors data
    features=column_positions,
    feature_names=column_names,
    grid_resolution=10,
    kind="individual",
    ax=ax,
);

In [None]:
import shap  # package used to calculate Shap values
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

# Retrain XGBoost on the training split only, so X_test is unseen by the explained model.
model = XGBClassifier()
model.fit(X_train, y_train)
# Create object that can calculate shap values
explainer = shap.TreeExplainer(model, X_train)

# Compute SHAP values for every row of X_test rather than a single row,
# so the summary plot has more data.
shap_values = explainer.shap_values(X_test)

# Summary plot of the SHAP values over X_test (no per-class index is applied here).
shap.summary_plot(shap_values, X_test)

In [None]:
# Evaluate the current XGBoost `model` on the held-out split.
y_pred = model.predict(X_test)

# scikit-learn metrics take (y_true, y_pred). Passing them the other way round
# swaps precision and recall, so the ground truth goes first.
print("The accuracy of the model is {:.2f}".format(model.score(X_test, y_test)))
print("The precision of the model is {:.2f}".format(precision_score(y_test, y_pred)))
print("The recall of the model is {:.2f}".format(recall_score(y_test, y_pred)))
print("The f1-score of the model is {:.2f}".format(f1_score(y_test, y_pred)))

print("\n")

# Constant score for every sample: the "no skill" reference for the ROC plot.
ns_probs = [0 for _ in range(len(y_test))]
# predict probabilities (the lr_* names are kept from the logistic-regression cell)
lr_probs = model.predict_proba(X_test)
# keep probabilities for the positive outcome only
lr_probs = lr_probs[:, 1]
# calculate scores
ns_auc = roc_auc_score(y_test, ns_probs)
lr_auc = roc_auc_score(y_test, lr_probs)
# summarize scores (the label now names the model actually evaluated)
print('No Skill: ROC AUC=%.3f' % (ns_auc))
print('XGBoost: ROC AUC=%.3f' % (lr_auc))
# calculate roc curves
ns_fpr, ns_tpr, _ = roc_curve(y_test, ns_probs)
lr_fpr, lr_tpr, _ = roc_curve(y_test, lr_probs)
# plot the roc curve for the model
plt.figure(figsize=(12, 12))
plt.title("ROC Curve of the XGBOOST model", fontsize=15)
plt.plot(ns_fpr, ns_tpr, linestyle='--', label='No Skill')
plt.plot(lr_fpr, lr_tpr, label='Model', color="blue")
# axis labels
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
# show the legend
plt.legend()
# show the plot
plt.show()