In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
from scipy.stats import norm

import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib import pyplot
from sklearn.decomposition import PCA
from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans, DBSCAN
from sklearn.mixture import GaussianMixture

### Function Definitions


In [129]:
def feature_selection(data_df, variation_threshold):
    """Choose how many principal components are needed to pass a variance threshold.

    Fits a full PCA on ``data_df``, displays a per-component table (eigenvalue,
    percentage of variance explained and cumulative percentage, rounded to two
    decimals) and prints the selected component count.

    Parameters
    ----------
    data_df : array-like of shape (n_samples, n_features)
        Data passed unchanged to ``PCA.fit``.
    variation_threshold : float
        Fraction of cumulative explained variance (e.g. ``0.95``) that has to
        be strictly exceeded.

    Returns
    -------
    int
        1-based number of leading components whose cumulative explained
        variance ratio first exceeds ``variation_threshold``. If no prefix of
        components exceeds it, every component is kept.
    """
    pca = PCA()
    pca.fit(data_df)
    cum_variation = np.cumsum(pca.explained_variance_ratio_)

    exceeds = cum_variation > variation_threshold
    if exceeds.any():
        feature_number = int(exceeds.argmax()) + 1
    else:
        # argmax() of an all-False mask is 0, which would silently select a
        # single component; keep all of them when the threshold is never passed.
        feature_number = len(cum_variation)

    # Index by the number of fitted components rather than data_df.shape[1]:
    # PCA keeps min(n_samples, n_features) components, so the two can differ.
    result_df = pd.DataFrame(
        {
            "Eigenvalue": pca.explained_variance_,
            "Variation explained(%)": pca.explained_variance_ratio_ * 100,
            "Cumulative(%)": cum_variation * 100,
        },
        index=np.arange(1, len(cum_variation) + 1),
    ).round(2)
    display(result_df)
    print("Selected Feature number: ", feature_number)

    return feature_number

### Data Input



In [130]:
# Load the SME dataset; the first CSV column is used as the row index.
rawdata_df = pd.read_csv("../data/SME_dataset.csv", index_col=0)
# Preview the first rows.
rawdata_df.head()

Unnamed: 0_level_0,status,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24
X1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,1,3.3,0.62,1.3,0.96,0.7,0.94,-21.75,-0.12,-0.53,...,-0.01,1,1,1,1,7,25,8,36635,4635
3,1,-7.93,-0.19,0.87,1.35,1.34,225.95,-42.44,-0.59,-8.12,...,-0.1,1,1,1,1,1,42,65,29970,4791
7,1,1.24,0.41,1.81,1.14,0.65,1.1,-10.84,-0.08,-0.21,...,0.0,1,1,1,1,30,43,21,7052,4752
10,1,-1.16,-0.68,0.14,1.6,0.87,2.23,-16.29,-0.27,-0.05,...,-0.07,1,1,1,1,48,31,12,4276,4771
12,1,-1.9,-0.86,0.47,0.42,0.42,-1.78,-13.83,-0.24,-0.23,...,-0.17,1,1,1,1,0,53,156,3799,3212


In [131]:
# Feature engineering: derive the composite features V25, V26 and V27 from the
# raw columns (via the intermediate M1..M8 helpers), then drop the helpers and
# the raw columns that are no longer wanted.
adj_features_df = rawdata_df.copy()
adj_features_df["V25"] = adj_features_df["V23"] * adj_features_df["V15"]
adj_features_df["V26"] = (
    adj_features_df["V25"] / adj_features_df["V13"] * adj_features_df["V7"]
)
adj_features_df["M1"] = adj_features_df["V23"] / adj_features_df["V11"]
adj_features_df["M2"] = adj_features_df["V25"] / adj_features_df["V14"]
adj_features_df["M3"] = adj_features_df["M2"] * adj_features_df["V21"]
adj_features_df["M4"] = adj_features_df["M2"] * adj_features_df["V22"]
adj_features_df["M5"] = adj_features_df["M1"] / (adj_features_df["V1"] + 1)
adj_features_df["M6"] = adj_features_df["M5"] * adj_features_df["V2"]
adj_features_df["M7"] = adj_features_df["M1"] / adj_features_df["V3"]
adj_features_df["M8"] = adj_features_df["M7"] * adj_features_df["V4"]
adj_features_df["V27"] = (
    adj_features_df["M8"] - adj_features_df["M3"] - adj_features_df["M4"]
) / adj_features_df["M6"]
adj_features_df = adj_features_df.drop(
    columns=[
        "V1",
        "V7",
        "V15",
        "V21",
        "V23",
        "V24",
        "M1",
        "M2",
        "M3",
        "M4",
        "M5",
        "M6",
        "M7",
        "M8",
    ],
)
# Cap infinite V27 values at +1 / -1. The result is assigned back instead of
# calling replace(..., inplace=True) on the column selection: that chained
# in-place form does not write through to the frame under pandas Copy-on-Write,
# in which case the filter below would drop these rows instead of capping them.
adj_features_df["V27"] = adj_features_df["V27"].replace([np.inf, -np.inf], [1, -1])
# Drop every row that still holds +/-inf in any column after the first one
# (the first column, "status", is skipped exactly as before).
adj_features_df = adj_features_df[
    ~adj_features_df.iloc[:, 1:].isin([np.inf, -np.inf]).any(axis=1)
]

In [132]:
# Keep only rows with a non-zero V11 and no missing values, then summarise.
adj_features_df = adj_features_df.loc[adj_features_df["V11"].ne(0)].dropna()
adj_features_df.describe()

Unnamed: 0,status,V2,V3,V4,V5,V6,V8,V9,V10,V11,...,V14,V16,V17,V18,V19,V20,V22,V25,V26,V27
count,14730.0,14730.0,14730.0,14730.0,14730.0,14730.0,14730.0,14730.0,14730.0,14730.0,...,14730.0,14730.0,14730.0,14730.0,14730.0,14730.0,14730.0,14730.0,14730.0,14730.0
mean,0.107332,1.253775,1.467396,1.54628,1.192641,7.779691,0.029429,-0.057073,1.349719,1.312478,...,0.063851,0.167821,0.194705,0.93632,0.230821,105.11799,95.991989,243.970946,101.303006,-5969.63
std,0.309545,3.303331,0.803976,1.218004,1.012736,23.821837,0.147991,0.787363,1.033187,1.029186,...,0.198024,0.37372,0.395987,0.24419,0.421373,351.257203,132.318943,1126.28629,1396.06682,63220.59
min,0.0,-9.59,0.14,0.01,-0.02,-33.14,-1.34,-8.93,0.01,0.01,...,-2.08,0.0,0.0,0.0,0.0,0.0,0.0,-36548.28,-113010.176673,-3982522.0
25%,0.0,0.0,1.07,0.97,0.62,0.94,0.01,0.0,0.69,0.65,...,0.03,0.0,0.0,1.0,0.0,1.0,0.0,25.205,10.812667,-1280.792
50%,0.0,0.07,1.2,1.23,0.98,1.75,0.04,0.07,1.15,1.115,...,0.06,0.0,0.0,1.0,0.0,21.0,68.0,73.2,41.64048,-277.6837
75%,0.0,1.18,1.55,1.73,1.41,4.76,0.08,0.22,1.72,1.69,...,0.11,0.0,0.0,1.0,0.0,81.0,136.0,208.205,122.479875,-1.0
max,1.0,33.38,8.27,15.89,10.91,300.77,0.51,1.28,8.5,8.5,...,0.94,1.0,1.0,1.0,1.0,5569.0,1531.0,35312.16,15091.027302,186.2275


In [133]:
# Per-status mean of every feature, shown with features as rows.
adj_features_df.groupby(by="status").mean().transpose()

status,0,1
V2,1.247915,1.302505
V3,1.515448,1.067761
V4,1.605406,1.054535
V5,1.241081,0.789766
V6,8.008721,5.874877
V8,0.049565,-0.138039
V9,0.024447,-0.735066
V10,1.362423,1.244061
V11,1.322687,1.227571
V12,0.217967,0.092884


In [134]:
# Keep the status column aside before it is removed from the feature frame.
data_status = adj_features_df.loc[:, "status"]

In [135]:
# Remove the status column so only feature columns remain.
# Note: re-running this cell raises KeyError because "status" is already gone.
adj_features_df = adj_features_df.drop(columns=["status"])
adj_features_df.head()

Unnamed: 0_level_0,V2,V3,V4,V5,V6,V8,V9,V10,V11,V12,...,V14,V16,V17,V18,V19,V20,V22,V25,V26,V27
X1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,0.62,1.3,0.96,0.7,0.94,-0.12,-0.53,8.5,8.49,-0.05,...,-0.01,1,1,1,1,7,8,-366.35,-454.023504,-1937.992928
3,-0.19,0.87,1.35,1.34,225.95,-0.59,-8.12,5.13,5.47,-0.02,...,-0.1,1,1,1,1,1,65,-2997.0,-3340.143908,-21291.085535
10,-0.68,0.14,1.6,0.87,2.23,-0.27,-0.05,2.66,2.68,-0.06,...,-0.07,1,1,1,1,48,12,-299.32,-419.614699,-24.426218
12,-0.86,0.47,0.42,0.42,-1.78,-0.24,-0.23,1.32,1.29,-0.07,...,-0.16,1,1,1,1,0,156,-645.83,-730.918895,-298.849194
16,0.57,1.17,0.35,0.12,0.36,-0.1,-0.72,0.79,0.78,-0.01,...,-0.11,1,1,1,1,77,12,-272.25,-316.823394,-2990.27558


### Factor Network-Based Segmentation


In [136]:
# Standardise every column to zero mean and unit (population, ddof=0) variance.
# The reductions are spelled out per column: np.mean(df) / np.std(df) forward
# axis=None to pandas, which newer pandas versions treat as a reduction over
# the whole frame (a single scalar) rather than one value per column.
adj_features_df = (
    adj_features_df - adj_features_df.mean(axis=0)
) / adj_features_df.std(axis=0, ddof=0)

In [137]:
# Share of cumulative explained variance the retained components must exceed.
variation_threshold = 0.95
feature_number = feature_selection(
    data_df=adj_features_df,
    variation_threshold=variation_threshold,
)

Unnamed: 0,Eigenvalue,Variation explained(%),Cumulative(%)
1,4.64,22.08,22.08
2,2.5,11.92,34.0
3,2.16,10.3,44.3
4,1.6,7.62,51.92
5,1.2,5.72,57.65
6,1.08,5.12,62.77
7,1.05,5.0,67.77
8,0.96,4.59,72.36
9,0.91,4.35,76.71
10,0.85,4.05,80.76


Selected Feature number:  15


In [138]:
# Thin SVD of the standardised features. Note that the third value returned by
# np.linalg.svd is the transposed right-singular matrix; it is not used below.
U, sigma, V = np.linalg.svd(adj_features_df, full_matrices=False)
# Scale the leading `feature_number` left-singular vectors by their singular values.
Feature_vec = pd.DataFrame(U[:, :feature_number] @ np.diag(sigma[:feature_number]))
Feature_vec.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,2.755508,-5.266324,-7.384289,-0.492941,-2.455815,-0.175566,-0.160452,0.348044,-0.610207,0.902019,-1.761436,1.007912,-2.79876,-0.083571,-1.044335
1,7.804968,-0.767719,-6.361781,3.239193,-5.901896,-1.137347,3.120747,1.637762,1.569721,0.161955,5.801218,1.075587,-0.331649,8.015394,0.304522
2,3.801238,-1.035043,-2.002558,-0.430803,-0.440707,-0.420092,-0.128057,-0.234459,-0.127525,-0.211447,-0.23889,0.081471,-0.483045,-1.369066,0.868092
3,4.410054,-0.314603,-0.380676,-0.323672,0.593627,0.102833,0.043369,-0.321987,0.477654,-0.230502,0.032513,0.250323,-0.4323,-1.150557,0.204691
4,4.133253,0.101029,0.208535,-0.677096,0.209257,-0.600007,-0.487701,-0.445737,-0.540364,0.143305,0.590391,-0.448287,-0.637395,-0.637518,-0.233458


In [187]:
def gen_summary_df(data_status, pred_vec):
    """Cross-tabulate cluster assignments against the default status.

    Parameters
    ----------
    data_status : pandas.Series or array-like
        Default flag for each observation.
    pred_vec : array-like
        Cluster label for each observation, in the same order as
        ``data_status``. Labels may be any integers (they do not have to be
        ``0..k-1``).

    Returns
    -------
    pandas.DataFrame
        Indexed by ``(cluster, default)`` with columns ``frequency`` and
        ``perct%``. For ordinary rows ``perct%`` is the share of that default
        value inside its cluster; each cluster also gets a final
        ``default == "Total"`` row whose ``perct%`` is the cluster's share of
        all observations. Percentages are rounded to two decimals. Rows are
        ordered by cluster, then by default value, with "Total" last.
    """
    classified_df = pd.DataFrame({"default": data_status, "cluster": pred_vec})

    # Row count per (cluster, default) pair and its share within the cluster.
    detail = (
        classified_df.groupby(["cluster", "default"])
        .size()
        .reset_index(name="frequency")
    )
    cluster_sizes = detail.groupby("cluster")["frequency"].transform("sum")
    detail["perct%"] = (100 * detail["frequency"] / cluster_sizes).round(2)

    # One "Total" row per cluster: size and share of the whole sample.
    totals = detail.groupby("cluster", as_index=False)["frequency"].sum()
    totals["default"] = "Total"
    totals["perct%"] = (100 * totals["frequency"] / totals["frequency"].sum()).round(2)

    # Explicit numeric ordering key so the "Total" row lands after the detail
    # rows of its cluster without having to sort mixed int/str labels.
    detail["_order"] = detail.groupby("cluster").cumcount()
    totals["_order"] = len(detail)

    # Build the result with a single concat instead of DataFrame.append in a
    # loop (append was removed in pandas 2.0).
    summary = pd.concat([detail, totals], ignore_index=True, sort=False)
    summary = summary.sort_values(["cluster", "_order"]).drop(columns="_order")
    return summary.set_index(["cluster", "default"])[["frequency", "perct%"]]

In [125]:
# kMeans
# random_state is fixed so the cluster assignment (and which cluster is
# labelled 0 or 1) is reproducible across runs; n_init is pinned to the
# long-standing default of 10 so results do not shift with scikit-learn's
# changed default.
estimator = KMeans(n_clusters=2, n_init=10, random_state=42)
kmeans_pred = estimator.fit_predict(Feature_vec)
gen_summary_df(data_status, kmeans_pred)

Unnamed: 0_level_0,Unnamed: 1_level_0,frequency,perct%
cluster,default,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,11591,94.29
0,1,702,5.71
0,Total,12293,83.46
1,0,1558,63.93
1,1,879,36.07
1,Total,2437,16.54


In [126]:
# GMM
# Two-component Gaussian mixture with full covariance matrices, initialised
# from k-means. random_state is fixed (it was None) so the fitted mixture and
# the cluster labelling are reproducible across runs.
estimator = GaussianMixture(
    n_components=2, covariance_type="full", init_params="kmeans", random_state=42
)
gmm_pred = estimator.fit_predict(Feature_vec)
gen_summary_df(data_status, gmm_pred)

Unnamed: 0_level_0,Unnamed: 1_level_0,frequency,perct%
cluster,default,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,8810,95.16
0,1,448,4.84
0,Total,9258,62.85
1,0,4339,79.29
1,1,1133,20.71
1,Total,5472,37.15


In [50]:
# DEC neural network
# Pretrain an autoencoder on the standardised features, seed a clustering layer
# with k-means centres computed on the encoded data, then fine-tune the joint
# model (reconstruction + clustering loss) batch by batch.

# NOTE(review): the star import presumably supplies autoencoder(),
# ClusteringLayer, target_distribution and Model -- confirm against DEC.py.
from DEC import *
from keras.optimizers import SGD, Adam

input_x = adj_features_df

# hyper-params
dims = [input_x.shape[-1], 500, 500, 2000, 15]  # layer widths, input first
init = "glorot_uniform"
tol = 0.0001  # stop once the fraction of changed labels drops below this
maxiter = 1000  # maximum number of fine-tuning batches
update_interval = 100  # refresh the target distribution every N batches
n_clusters = 2
n_epochs = 100  # autoencoder pretraining epochs
batch_size = 128
pretrain_optimizer = SGD(lr=0.001, momentum=0.9, decay=0.001 / n_epochs)
# pretrain_optimizer = Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, amsgrad=False)
optimizer = SGD(lr=0.001, momentum=0.9, decay=0.001 / n_epochs)

# The name `autoencoder` is rebound here from the imported factory function to
# the returned model; the star import above restores the function on re-run.
autoencoder, encoder = autoencoder(dims, act="relu", init=init, dropout=0.2)
autoencoder.compile(optimizer=pretrain_optimizer, loss="mse")
autoencoder.fit(input_x, input_x, batch_size=batch_size, epochs=n_epochs)
# Initial cluster centres: k-means on the encoder output.
kmeans = KMeans(n_clusters=n_clusters)
y_init = kmeans.fit(encoder.predict(input_x)).cluster_centers_
loss = 1
index = 0  # position of the current mini-batch within the data
index_array = np.arange(input_x.shape[0])
clustering_layer = ClusteringLayer(
    n_clusters=n_clusters, name="clustering", weights=[y_init]
)(encoder.output)
model = Model(inputs=encoder.input, outputs=[autoencoder.output, clustering_layer])
model.compile(loss=["mse", "kld"], loss_weights=[0.1, 1], optimizer=optimizer)
y_pred_last = kmeans.predict(encoder.predict(input_x))
for ite in range(maxiter):
    if ite % update_interval == 0:
        _, q = model.predict(input_x, verbose=0)
        p = target_distribution(q)  # update the auxiliary target distribution p

        # evaluate the clustering performance
        y_pred = q.argmax(1)

        # check stop criterion
        delta_label = np.sum(y_pred != y_pred_last).astype(np.float32) / y_pred.shape[0]
        y_pred_last = np.copy(y_pred)
        print("the " + str(ite) + "th training." + "Loss: " + str(loss))
        if ite > 0 and delta_label < tol:
            # %s, not %d: tol is a small float and %d would print it as 0.
            print("delta_label = ", delta_label, " < %s" % tol)
            print("Reached tolerance threshold. Stopping training.")
            break
    # Rows of the current mini-batch (the last batch may be shorter).
    idx = index_array[
        index * batch_size : min((index + 1) * batch_size, input_x.shape[0])
    ]
    loss = model.train_on_batch(x=input_x.iloc[idx], y=[input_x.iloc[idx], p[idx]])
    # Advance to the next batch, wrapping to the start once the data is exhausted.
    index = index + 1 if (index + 1) * batch_size <= input_x.shape[0] else 0
# Final soft cluster assignments from the fine-tuned model.
_, q = model.predict(input_x, verbose=0)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [188]:
# Hard cluster label = column with the largest soft assignment in q.
gen_summary_df(data_status, q.argmax(axis=1))

Unnamed: 0_level_0,Unnamed: 1_level_0,frequency,perct%
cluster,default,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,11423,94.44
0,1,673,5.56
0,Total,12096,82.12
1,0,1726,65.53
1,1,908,34.47
1,Total,2634,17.88


In [139]:
# Write results to CSV files in the current working directory:
# the current adj_features_df, the SVD feature vectors and the k-means labels.
adj_features_df.to_csv('new_feature.csv')
Feature_vec.to_csv('new_svd.csv')
pd.DataFrame({"cluster": kmeans_pred}).to_csv('kmeans.csv')