In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns

In [None]:
# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

# Use seaborn's "whitegrid" style for the plots in this notebook.
sns.set_style(style="whitegrid")

In [None]:
# Load the credit dataset from the working directory and preview the first rows.
df = pd.read_csv(filepath_or_buffer="german_credit_data.csv")
df.head(n=5)

In [None]:
# Summary statistics for the Age column.
df["Age"].describe()

In [None]:
# Number of rows per target class.
df["Risk"].value_counts()
# Author's observation: the classes are imbalanced, so the modelling step
# must account for this imbalance.

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# (rows, columns) of the frame as loaded.
df.shape

In [None]:
# Summary statistics using pandas' default column selection.
df.describe()

In [None]:
# Summary statistics for every column, transposed so each column becomes a row.
df.describe(include="all").T

In [None]:
# Distinct values present in the Job column.
df["Job"].unique()

In [None]:
# Missing-value count per column.
df.isna().sum()

In [None]:
# Number of rows that exactly duplicate an earlier row.
df.duplicated().sum()

In [None]:
# Remove every row that has at least one missing value, then renumber the index from 0.
df = df.dropna(axis=0, how="any").reset_index(drop=True)

In [None]:
# Drop the unnamed leading CSV column, which pandas labels "Unnamed: 0".
# Reassigning (instead of inplace=True) together with errors="ignore" keeps
# this cell safe to re-run after the column has already been removed.
df = df.drop(columns="Unnamed: 0", errors="ignore")

In [None]:
# Column labels remaining after the cleaning steps above.
df.columns

In [None]:
# Histograms (20 bins each) of the three numeric features.
df.loc[:, ["Age", "Credit amount", "Duration"]].hist(edgecolor="black", bins=20)
plt.suptitle(t="Distribution of Numerical Features", fontsize=14)
plt.show()

In [None]:
# Box plots of the numeric features, one panel per feature.
# The keyword is `figsize`; `figuresize` is not accepted by plt.figure.
plt.figure(figsize=(10, 5))
for i, col in enumerate(["Age", "Credit amount", "Duration"]):
    plt.subplot(1, 3, i + 1)  # subplot positions are 1-based
    sns.boxplot(y=df[col], color="skyblue")
    plt.title(col)
plt.tight_layout()
plt.show()

In [None]:
# Inspect the longest loans. Only the last expression of a cell is displayed,
# and every row with Duration > 70 also satisfies Duration >= 60, so a single
# query shows all of them.
df.query("Duration >=60")

In [None]:
# Categorical columns plotted in the count-plot cells below.
categorical_cols = [
    "Sex",
    "Housing",
    "Saving accounts",
    "Checking account",
    "Purpose",
]

In [None]:
# Count plot for every categorical feature, bars ordered from the most to the
# least frequent category.
plt.figure(figsize=(10,5))
for i, col in enumerate(categorical_cols):
    plt.subplot(3,3,i+1)
    # `order` expects the category labels, i.e. the index of the value counts
    # (value_counts must be called; passing the bound method is an error).
    sns.countplot(data=df, x=col, palette="Set2", order=df[col].value_counts().index)
    plt.title(f"Distribution of {col}")
    plt.xticks(rotation=45)

plt.tight_layout()
plt.show()

In [None]:
# Pairwise Pearson correlation between the numeric columns.
corr = df.loc[:, ["Age", "Job", "Credit amount", "Duration"]].corr(method="pearson")
corr

In [None]:
# Annotated heat map of the correlation matrix (values shown with two decimals).
sns.heatmap(
    data=corr,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
)
plt.show()

In [None]:
# Mean credit amount for each Job value.
df.groupby("Job")["Credit amount"].mean()

In [None]:
# Mean credit amount for each Sex value.
df.groupby("Sex")["Credit amount"].mean()

In [None]:
# Credit amount aggregated per Housing (rows) x Purpose (columns) combination,
# using pivot_table's default aggregation.
pd.pivot_table(df, values="Credit amount", index="Housing", columns="Purpose")

In [None]:
# Credit amount against age: colour encodes Sex, marker size encodes Duration.
sns.scatterplot(
    data=df,
    x="Age",
    y="Credit amount",
    hue="Sex",
    size="Duration",
    palette="Set1",
    alpha=0.7,
)
plt.title("Credit amount vs Age coloured by sex and sized by Duration")

In [None]:
# Distribution of credit amount within each savings-account category.
# The column is spelled "Saving accounts" (no trailing "s" on "Saving"), as in
# the categorical column list used elsewhere in this notebook.
sns.violinplot(data=df, x="Saving accounts", y="Credit amount", palette="Pastel1")
plt.title("Credit Amount Distribution by Saving Accounts")

In [None]:
# Share of each target class, expressed as a percentage.
df["Risk"].value_counts(normalize=True)*100

In [None]:
# Box plots of each numeric feature split by Risk class, one panel per feature.
plt.figure(figsize=(10,5))
# Column labels are case-sensitive: the frame has "Credit amount".
for i, col in enumerate(["Age", "Credit amount", "Duration"]):
    # plt.subplot selects one panel of the 1x3 grid; plt.subplots would try to
    # create a whole new figure and does not accept a panel index.
    plt.subplot(1,3,i+1)
    sns.boxplot(data=df, x="Risk", y=col, palette="Pastel2")
    plt.title(f"{col} by Risk")
plt.tight_layout()
plt.show()

In [None]:
# Mean of each numeric feature within each Risk class.
df.groupby("Risk")[["Age","Credit amount", "Duration"]].mean()

In [None]:
# Count plot of every categorical feature, split by Risk class, with the
# categories ordered from most to least frequent.
plt.figure(figsize=(10,10))
for i, col in enumerate(categorical_cols):
    plt.subplot(3,3,i+1)
    # The keyword is `palette`; the misspelt `palatte` is not a countplot argument.
    sns.countplot(data=df, x=col, hue="Risk", palette="Set1", order=df[col].value_counts().index)
    plt.title(f"{col} by Risk")
    plt.xticks(rotation=45)

plt.tight_layout()
plt.show()

In [None]:
# Input columns used for modelling.
features = [
    "Age",
    "Sex",
    "Job",
    "Housing",
    "Saving accounts",
    "Checking account",
    "Credit amount",
    "Duration",
]

In [None]:
# Name of the label column to predict.
target = "Risk"

In [None]:
# Independent copy holding only the model inputs plus the label, so the
# encoding steps below do not modify `df`.
df_model = df[[*features, target]].copy()

In [None]:
# Preview the modelling frame before encoding.
df_model.head()

In [None]:
from sklearn.preprocessing import LabelEncoder 
import joblib

In [None]:
# Object-dtype feature columns, excluding the label column.
cat_cols = df_model.select_dtypes(include=["object"]).columns.drop(labels="Risk")

In [None]:
# Maps column name -> fitted LabelEncoder; filled by the encoding loop.
le_dict = dict()

In [None]:
# Show which columns will be label-encoded.
cat_cols

In [None]:
# Integer-encode each categorical feature with its own LabelEncoder. Every
# fitted encoder is kept in le_dict and also written to "<column>_encoder.pkl".
for col in cat_cols:
    le = LabelEncoder().fit(df_model[col])
    df_model[col] = le.transform(df_model[col])
    le_dict[col] = le
    joblib.dump(le, f"{col}_encoder.pkl")

In [None]:
# Separate encoder for the label column.
le_target=LabelEncoder()

In [None]:
# Replace the label strings with the integer codes assigned by le_target.
df_model[target]=le_target.fit_transform(df_model[target])

In [None]:
# Row counts per encoded label value.
df_model[target].value_counts()

In [None]:
# Write the fitted label encoder to "target_encoder.pkl".
joblib.dump(le_target,"target_encoder.pkl")

In [None]:
# Preview the modelling frame after encoding.
df_model.head()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Feature matrix (every column except the label) and label vector.
X = df_model.drop(columns=target)
y = df_model[target]

In [None]:
# 80/20 train/test split, stratified on the label so both parts keep the same
# class proportions; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=40,
    stratify=y,
)

In [None]:
# (rows, columns) of the training features.
X_train.shape

In [None]:
# (rows, columns) of the test features.
X_test.shape

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score 
from sklearn.model_selection import GridSearchCV

In [None]:
def train_model(model, param_grid, X_train, y_train, Xtest, y_test):
    """Tune ``model`` with a 5-fold grid search and score the best estimator.

    Parameters
    ----------
    model
        Unfitted estimator handed to ``GridSearchCV``.
    param_grid
        Hyper-parameter grid to search over.
    X_train, y_train
        Features and labels used for the cross-validated search.
    Xtest, y_test
        Held-out features and labels, used only for the final accuracy score.

    Returns
    -------
    tuple
        ``(best_model, acc, best_params)``: the refit best estimator, its
        accuracy on ``Xtest``/``y_test``, and the winning parameter dict.
    """
    grid = GridSearchCV(model, param_grid, cv=5, scoring="accuracy", n_jobs=-1)
    grid.fit(X_train, y_train)
    best_model = grid.best_estimator_
    # Predict on the function's own argument; the previous body read the
    # notebook-level global `X_test` and silently ignored what was passed in.
    y_pred = best_model.predict(Xtest)
    acc = accuracy_score(y_test, y_pred)
    return best_model, acc, grid.best_params_



In [None]:
# Decision tree baseline. The scikit-learn keyword is `class_weight`
# (singular); "balanced" weights classes inversely to their frequency.
dt = DecisionTreeClassifier(random_state=40, class_weight="balanced")
# NOTE(review): the other grids in this notebook use 10 as the largest
# min_samples_split value; confirm that 100 is intended here.
dt_param_grid = {
    "max_depth": [3, 5, 7, 10, None],
    "min_samples_split": [2, 5, 100],
    "min_samples_leaf": [1, 2, 4],
}

In [None]:
# Grid-search the decision tree and score the best estimator on the test split.
best_dt, acc_dt, params_dt = train_model(
    dt,
    dt_param_grid,
    X_train,
    y_train,
    X_test,
    y_test,
)

In [None]:
# Test-set accuracy of the tuned decision tree.
print("Decision Tree Accuracy", acc_dt)

In [None]:
# Hyper-parameters selected by the grid search for the decision tree.
print("Best parameters", params_dt)

In [None]:
# Random forest with balanced class weights, a fixed seed, and all CPU cores.
rf = RandomForestClassifier(
    n_jobs=-1,
    class_weight="balanced",
    random_state=1,
)

In [None]:
# Hyper-parameter grid searched for the random forest.
rf_param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[5, 7, 10, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

In [None]:
# Grid-search the random forest and score the best estimator on the test split.
best_rf, acc_rf, params_rf = train_model(
    rf,
    rf_param_grid,
    X_train,
    y_train,
    X_test,
    y_test,
)

In [None]:
# Test-set accuracy of the tuned random forest.
print("Random Forest Accuracy", acc_rf)

In [None]:
# Hyper-parameters selected by the grid search for the random forest.
print("Best params", params_rf)

In [None]:
# Extra-trees ensemble with balanced class weights, a fixed seed, and all CPU cores.
et = ExtraTreesClassifier(
    n_jobs=-1,
    class_weight="balanced",
    random_state=1,
)

In [None]:
# Hyper-parameter grid searched for the extra-trees ensemble.
et_param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[5, 7, 10, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

In [None]:
# Grid-search the extra-trees ensemble and score the best estimator on the test split.
best_et, acc_et, params_et = train_model(
    et,
    et_param_grid,
    X_train,
    y_train,
    X_test,
    y_test,
)

In [None]:
# Test-set accuracy of the tuned extra-trees ensemble.
print("Extra Trees Accuracy", acc_et)

In [None]:
# Hyper-parameters selected by the grid search for the extra-trees ensemble.
print("Best params", params_et)

In [None]:
# Gradient-boosted trees. scale_pos_weight is the ratio of class-0 to class-1
# training rows, which re-weights class 1 to offset the class imbalance.
# All keyword arguments must sit inside the constructor call; previously the
# parenthesis closed after scale_pos_weight, which is a syntax error.
# NOTE(review): `use_label_encoder` is deprecated in recent xgboost releases;
# confirm against the installed version and drop it if it only emits a warning.
xgb = XGBClassifier(
    random_state=1,
    scale_pos_weight=(y_train == 0).sum() / (y_train == 1).sum(),
    use_label_encoder=False,
    eval_metric="logloss",
)

In [None]:
# Hyper-parameter grid searched for the XGBoost classifier.
xgb_param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.2],
    subsample=[0.7, 1],
    colsample_bytree=[0.7, 1],
)

In [None]:
# Grid-search the XGBoost classifier and score the best estimator on the test split.
best_xgb, acc_xgb, params_xgb = train_model(
    xgb,
    xgb_param_grid,
    X_train,
    y_train,
    X_test,
    y_test,
)

In [None]:
# Test-set accuracy of the tuned XGBoost model. The label previously read
# "Extra Trees Accuracy", copied from the extra-trees cell.
print("XGBoost Accuracy", acc_xgb)

In [None]:
# Hyper-parameters selected by the grid search for XGBoost.
print("Best params", params_xgb)

In [None]:
# Write the tuned extra-trees model to "extra_trees_credit_model.pkl".
# NOTE(review): extra trees is saved unconditionally; confirm it actually
# scored best among the four models compared above.
joblib.dump(best_et, "extra_trees_credit_model.pkl")

In [None]:
#