In [None]:
import pandas as pd
import numpy as np

In [None]:
def load_data(path="data/CC GENERAL.csv"):
    """Read the numeric credit-card feature columns from a CSV file.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the CSV file. Defaults to ``"data/CC GENERAL.csv"``,
        so calling ``load_data()`` with no argument reads the same file
        as before.

    Returns
    -------
    pandas.DataFrame
        Only the feature columns listed in ``cols`` are read; every one of
        them is parsed as ``float64``. Any other column present in the file
        is ignored.
    """
    cols = [
        'BALANCE',
        'BALANCE_FREQUENCY',
        'PURCHASES',
        'ONEOFF_PURCHASES',
        'INSTALLMENTS_PURCHASES',
        'CASH_ADVANCE',
        'PURCHASES_FREQUENCY',
        'ONEOFF_PURCHASES_FREQUENCY',
        'PURCHASES_INSTALLMENTS_FREQUENCY',
        'CASH_ADVANCE_FREQUENCY',
        'CASH_ADVANCE_TRX',
        'PURCHASES_TRX',
        'CREDIT_LIMIT',
        'PAYMENTS',
        'MINIMUM_PAYMENTS',
        'PRC_FULL_PAYMENT',
        'TENURE',
    ]
    # index_col=False stops pandas from promoting the first column to the index.
    return pd.read_csv(path, usecols=cols, index_col=False, dtype=np.float64)


In [None]:
# Read the feature columns through load_data() into the working frame.
df = load_data()

# Print column dtypes and non-null counts for a first look at the data.
df.info()

In [None]:
# Keep only complete rows; both rescaled frames below are derived from them.
df = df.dropna()
df_min = df.min()

# Z-score standardisation: centre every column on its mean, scale by its std.
df_s = df.sub(df.mean()).div(df.std())
# Min-max normalisation: shift by the column minimum, scale by the column range.
df_n = df.sub(df_min).div(df.max().sub(df_min))

df_n.describe()

In [None]:
# Summary statistics of df_s, the frame rescaled by column mean and std.
df_s.describe()

In [None]:
from sklearn.cluster import KMeans

In [None]:
def alpha_k(k, dim):
    """Return the weight factor alpha_k for ``k`` clusters in ``dim`` dimensions.

    The recurrence evaluated here is

        alpha_2 = 1 - 3 / (4 * dim)
        alpha_k = alpha_{k-1} + (1 - alpha_{k-1}) / 6      for k > 2

    which matches the weight used by the f(K) cluster-evaluation function
    of Pham, Dimov and Nguyen.

    Parameters
    ----------
    k : int
        Number of clusters. Expected to be integral.
    dim : int or float
        Number of data dimensions (features).

    Returns
    -------
    float
        ``1.0`` when ``k < 2`` or ``dim <= 1``; otherwise the recurrence
        value above.

    Notes
    -----
    The recurrence is unrolled into a loop rather than evaluated
    recursively, so large ``k`` cannot hit the interpreter recursion limit.
    The arithmetic is performed in the same order as the recursive form,
    so results are bit-for-bit identical.
    """
    if not (k >= 2 and dim > 1):
        return 1.0

    alpha = 1.0 - 3.0 / (4.0 * dim)
    # One update per step from k = 3 up to the requested k.
    for _ in range(int(k) - 2):
        alpha = alpha + (1.0 - alpha) / 6.0
    return alpha

In [None]:
def cluster_eval(s_k, k, dim):
    """Evaluate the cluster-quality ratio for ``k`` clusters.

    ``k`` is used both as the cluster count (for the ``k == 1`` shortcut and
    for ``alpha_k(k, dim)``) and as the index into ``s_k``, so ``s_k`` must be
    indexed by cluster count: ``s_k[k]`` and ``s_k[k - 1]`` are read as the
    distortions for ``k`` and ``k - 1`` clusters.

    Parameters
    ----------
    s_k : sequence of float
        Distortion values, indexable at ``k`` and ``k - 1``.
    k : int
        Number of clusters being evaluated.
    dim : int
        Number of data dimensions, forwarded to ``alpha_k``.

    Returns
    -------
    float
        ``1.0`` when ``k == 1`` or when ``s_k[k - 1]`` is zero; otherwise
        ``s_k[k] / (alpha_k(k, dim) * s_k[k - 1])``.
    """
    # A single cluster has no predecessor to compare against.
    if k == 1:
        return 1.0

    previous = s_k[k - 1]
    # Avoid dividing by a zero distortion.
    if previous == 0.0:
        return 1.0

    return s_k[k] / (alpha_k(k, dim) * previous)

In [None]:
def calculate_f_k(df, max_k):
    """Compute the evaluation function f(K) for K = 1 .. ``max_k``.

    A K-means model is fitted for every K from 1 to ``max_k`` and its
    inertia is taken as the distortion S_K. Each f(K) is then obtained from
    ``cluster_eval``, i.e. ``S_K / (alpha_K * S_{K-1})`` with f(1) = 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Samples in rows, features in columns; the number of columns is used
        as the data dimension.
    max_k : int
        Largest number of clusters to evaluate.

    Returns
    -------
    numpy.ndarray
        Array of length ``max_k`` in which element ``K - 1`` holds f(K).
    """
    n_dims = len(df.columns)

    # Index the distortions by cluster count (slot 0 is unused) because
    # cluster_eval reads s_k[k] and s_k[k - 1] for a cluster count k.
    distortion = np.zeros(max_k + 1)
    # Start at K = 1: S_1 is the denominator of f(2).
    for k in range(1, max_k + 1):
        distortion[k] = KMeans(n_clusters=k, random_state=42).fit(df).inertia_

    # f(1) is 1 by definition; the remaining slots are overwritten below.
    f_k = np.ones(max_k)
    for k in range(2, max_k + 1):
        # Pass the true cluster count so that alpha_K, not alpha_{K-1}, is used.
        f_k[k - 1] = cluster_eval(distortion, k, n_dims)

    return f_k

In [None]:
def estimate_k(df, max_k):
    """Pick the cluster count in 1 .. ``max_k`` with the smallest f(K).

    Parameters
    ----------
    df : pandas.DataFrame
        Data forwarded unchanged to ``calculate_f_k``.
    max_k : int
        Largest number of clusters to consider.

    Returns
    -------
    numpy integer
        Position of the first minimum of f(K), shifted by one because
        element 0 of the f(K) array corresponds to K = 1.
    """
    scores = calculate_f_k(df, max_k)
    best_position = np.asarray(scores).argmin()
    return best_position + 1

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

t = np.arange(1, 21)
f_k = calculate_f_k(df_s, 20)

f_k

In [None]:
# Display the candidate cluster counts that serve as x-axis values for f_k.
t

In [None]:
# Plot f(K) against the number of clusters; estimate_k picks the K at the minimum.
fig, ax1 = plt.subplots()
ax1.plot(t, f_k, marker="o")
# One tick per evaluated K so each point can be read off directly.
ax1.set_xticks(t)
ax1.set_xlabel("Number of clusters K")
ax1.set_ylabel("f(K)")
ax1.set_title("K-means cluster evaluation function f(K)");