In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Global plot styling: the matplotlib style sheet is applied first, then the seaborn style.
plt.style.use('fivethirtyeight')
sns.set_style('whitegrid')

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from scipy.stats import uniform
from scipy import interp
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
import warnings
# Silences every warning for the rest of the session; this also hides
# deprecation notices raised by the libraries used in later cells.
warnings.filterwarnings("ignore")
import plotly.io as pio
# Bare expression in the middle of a cell: it has no effect here
# (only the last expression of a cell is displayed).
pio.renderers
# Render plotly figures with the 'colab' renderer.
# NOTE(review): presumably this notebook is run on Google Colab — confirm before running elsewhere.
pio.renderers.default = 'colab'

In [2]:
# Read the dataset from a CSV file located in the current working directory.
# Later cells expect it to provide, among others, the columns 'Sex', 'Housing',
# 'Purpose', 'Saving accounts', 'Checking account', 'Risk', 'Age', 'Job',
# 'Credit amount' and 'Duration'.
df = pd.read_csv('german_credit_data.csv')

In [3]:
# Columns to one-hot encode, mapped to the prefix given to the generated dummy columns
# (e.g. "Sex" -> "sex_female", "sex_male").
one_hot = {
    "Sex": "sex",
    "Housing": "hous",
    "Purpose": "purp"
}
# Per-column lookup tables (original value -> integer code) passed to encode_ordinal.
# The None key is presumably meant to match missing entries in the column — verify
# against the raw data.
# NOTE(review): the two account scales start at different values (None=0 / "little"=1
# for "Saving accounts", None=1 / "little"=2 for "Checking account") — confirm this
# is intentional.
ordinal_encoding = {
    "Saving accounts": {
        None: 0,
        "little": 1,
        "moderate": 2,
        "quite rich": 3,
        "rich": 4,
    },
    "Checking account": {
        None: 1,
        "little": 2,
        "moderate": 3,
        "rich": 4,
    },
    # Binary target: "bad" is the positive class (1).
    "Risk": {
        "bad": 1,
        "good": 0,
    }
}
def one_hot_enconding(df, col_prefix: dict):
    """Return a copy of ``df`` with the given columns replaced by dummy columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is left untouched.
    col_prefix : dict
        ``{column name: prefix}``. Each listed column is dropped and replaced by
        integer 0/1 indicator columns named ``<prefix>_<value>``.

    Returns
    -------
    pandas.DataFrame
        New frame in which the indicator columns of each encoded column are
        appended at the end, in the iteration order of ``col_prefix``.
    """
    encoded = df.copy()
    # Encode one column per pass so the dummy groups are appended in dict order.
    for column_name in col_prefix:
        encoded = pd.get_dummies(
            data=encoded,
            columns=[column_name],
            prefix=col_prefix[column_name],
            dtype=int,
        )
    return encoded
def encode_ordinal(df, custom_ordinals: dict):
    """Return a copy of ``df`` with the listed columns recoded through lookup tables.

    Each value is looked up explicitly instead of going through
    ``Series.replace``: that avoids depending on how ``replace`` matches a
    ``None`` key against missing values and on its silent object-to-integer
    downcasting.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    custom_ordinals : dict
        ``{column name: {original value: code}}``. A ``None`` key in an inner
        mapping is applied to every missing entry (``None`` / ``NaN``) of that
        column.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df``. Values that have no entry in the mapping are kept as-is.

    Raises
    ------
    KeyError
        If a column named in ``custom_ordinals`` is not present in ``df``.
    """
    df = df.copy()
    for col, map_dict in custom_ordinals.items():
        # Bind the current table as a default argument so the helper does not
        # depend on the loop variable after this iteration.
        def recode(value, table=map_dict):
            # Normalise every flavour of "missing" to the None key.
            key = None if pd.isna(value) else value
            return table.get(key, value)

        df[col] = df[col].map(recode)
    return df

In [None]:
# Build the encoded frame: dummy-encode the nominal columns first, then recode
# the ordinal ones. Both helpers work on a copy, so `df` itself is not modified.
df_encode = (
    df
    .pipe(one_hot_enconding, one_hot)
    .pipe(encode_ordinal, ordinal_encoding)
)
df_encode

In [5]:
# Columns of the encoded frame that are kept as model features.
model_cols = [
    # numeric and ordinal-coded columns
    'Age', 'Job', 'Saving accounts', 'Checking account', 'Credit amount', 'Duration',
    # one-hot: Sex
    'sex_female', 'sex_male',
    # one-hot: Housing
    'hous_free', 'hous_own', 'hous_rent',
    # one-hot: Purpose
    'purp_business', 'purp_car', 'purp_domestic appliances', 'purp_education',
    'purp_furniture/equipment', 'purp_radio/TV', 'purp_repairs', 'purp_vacation/others',
]
df_new = df_encode.loc[:, model_cols]

In [6]:
# Natural log of the three continuous features. The column order
# (Age, Duration, Credit amount) is relied on by the positional indexing
# of the scaled array in later cells.
numeric_features = ['Age', 'Duration', 'Credit amount']
num_df = np.log(df_new.loc[:, numeric_features])

In [None]:
# Histogram + KDE of each log-transformed feature, one panel per feature.
# sns.distplot is deprecated in seaborn; histplot with kde=True and
# stat='density' (kde_kws cut=3) is its documented replacement.
fig, ax = plt.subplots(1,3,figsize=(20,5))
fig.suptitle('DISTRIBUTION PLOTS AFTER LOG TRANSFORMATION')
# (column, colour) per panel; None keeps seaborn's default colour.
panels = [('Credit amount', None), ('Duration', 'salmon'), ('Age', 'darkviolet')]
for axis, (col, color) in zip(ax, panels):
    sns.histplot(num_df[col], bins=40, kde=True, stat='density',
                 kde_kws=dict(cut=3), color=color, ax=axis)

In [8]:
# Standardise the log-transformed features; the fitted scaler is kept in
# `scaler` and the result is a NumPy array with one column per feature.
scaler = StandardScaler().fit(num_df)
num_df_scaled = scaler.transform(num_df)

In [None]:
# Elbow method: fit K-Means for every candidate k and record the inertia
# (within-cluster sum of squared distances) of each fitted model.
k_values = range(2, 16)  # defined once so the fits and the x-axis cannot drift apart
inertias = []

for i in k_values:
    kmeans = KMeans(n_clusters=i, random_state=0).fit(num_df_scaled)
    inertias.append(kmeans.inertia_)

plt.figure(figsize=(10,5))
plt.title('ELBOW METHOD')
plt.plot(list(k_values), inertias, marker='o', lw=2, color='steelblue')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.show()

In [None]:
# Silhouette analysis: average silhouette score of a K-Means fit for each candidate k.
results = []

for i in range(3, 20):
    # Same seed as the elbow scan and the final model, so the score shown for a
    # given k describes the clustering that is actually used afterwards.
    kmeans = KMeans(n_clusters=i, random_state=0)
    c_labels = kmeans.fit_predict(num_df_scaled)
    sil_ave = silhouette_score(num_df_scaled, c_labels)
    results.append([i, sil_ave])

res_df = pd.DataFrame(results, columns=['num_cluster', 'sil_score'])

# Every k is fitted exactly once, so there is nothing to average per k; the
# original name is kept (as an independent copy) for any later use.
mean_silhouette_scores = res_df.copy()

# Line plot of the silhouette score against the number of clusters
plt.figure(figsize=(10, 6))
plt.plot(mean_silhouette_scores['num_cluster'], mean_silhouette_scores['sil_score'], marker='o')
plt.title('Silhouette Score for Different Numbers of Clusters')
plt.xlabel('Number of Clusters')
plt.ylabel('Average Silhouette Score')
plt.grid(True)
plt.tight_layout()
plt.show()

In [11]:
# Final model: K-Means with 3 clusters on the scaled features.
# `clusters` holds one integer label per row of num_df_scaled.
km = KMeans(n_clusters=3, random_state=0).fit(num_df_scaled)
clusters = km.labels_

In [None]:
# 3-D scatter of the scaled features, one colour per cluster.
from mpl_toolkits.mplot3d import Axes3D  # needed to register the '3d' projection on older matplotlib releases
fig = plt.figure(figsize=(10,6))
ax = fig.add_subplot(111, projection='3d')

# Iterate over the labels actually present instead of hard-coding the cluster count.
for i in np.unique(clusters):
    in_cluster = clusters == i
    ax.scatter(num_df_scaled[in_cluster,0], num_df_scaled[in_cluster,1], num_df_scaled[in_cluster,2],
               label=f'cluster {i}')

# The columns of num_df_scaled follow the column order of num_df.
ax.set_xlabel(f'{num_df.columns[0]} (log, scaled)')
ax.set_ylabel(f'{num_df.columns[1]} (log, scaled)')
ax.set_zlabel(f'{num_df.columns[2]} (log, scaled)')
ax.legend();

In [None]:
# Cluster profile: mean of the target and of every model feature per cluster.
# 'Risk' followed by model_cols is the same column order as before; .assign
# returns a new frame, so the label column is never written onto a slice of
# df_encode (chained assignment).
df_clustered = df_encode[['Risk', *model_cols]].assign(cluster=clusters)
df_clustered.groupby('cluster').mean()