In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns

# Move the working directory one level up so that the relative paths used below
# ("./data/...") and the `src` package import resolve from there.
# NOTE(review): this is not idempotent - every re-run of this cell climbs one
# more directory; restart the kernel before re-running.
os.chdir("../")

from sklearn.preprocessing import MinMaxScaler
from src.models import ModelBasedClustering
import statsmodels.api as sm

In [None]:
# Apply seaborn's default theme to every figure created below.
sns.set_theme()

# Show (not set) the TeX system matplotlib's PGF backend is configured to use.
plt.rcParams["pgf.texsystem"]

In [None]:
# Load the dataset and keep the five columns used in this notebook.
# `reset_index()` (without drop) keeps the previous row index as a leading
# "index" column, so `ale` ends up at column position 1.
data_ale = (
    pd.read_csv("./data/mcs_ds_edited_iter_shuffled.csv")
    .loc[:, ["ale", "anchor_ratio", "trans_range", "node_density", "iterations"]]
    .reset_index()
)

In [None]:
# Preview the first five rows of the loaded frame.
data_ale.head(n=5)

# Linear Regression

In [None]:
# Fit an ordinary least squares model of `ale` on the four remaining columns
# (plus an intercept added by `add_constant`) and draw its influence plot
# using the Cook's distance criterion.
#
# Columns are selected by name instead of by position: later cells insert the
# cluster-label columns at position 1 of `data_ale`, so positional slices
# would pick the wrong response/predictors if this cell were re-run afterwards.
# On a freshly loaded frame both selections are identical.
_ols_predictors = ["anchor_ratio", "trans_range", "node_density", "iterations"]
ale_model = sm.OLS(
    data_ale[["ale"]],
    sm.add_constant(data_ale[_ols_predictors]),
).fit()

fig, axe = plt.subplots(1, 1, figsize=(12, 7))
fig = sm.graphics.influence_plot(ale_model, criterion="cooks", ax=axe)
fig.tight_layout(pad=1.0)

# Clustering Analysis

In [None]:
# Number of iterations passed to the EM routine for every candidate K.
N_iterations = 100

# One (initially empty) result list per quantity recorded for each candidate K.
dict_clustering_data = {
    field: []
    for field in ("K", "init_Clusters", "MBC_Clusters", "BIC_val")
}

In [None]:
# Run the model-based clustering for K = 1..4 and record, for each K, the
# value returned by `initZ()` and the pair returned by `EM_algo(N_iterations)`.
#
# The result lists live in a dict created by another cell, so they are emptied
# first: without this, re-running the cell would append a second set of
# results and corrupt both the BIC curve and the index-based choice of K.
for _recorded in dict_clustering_data.values():
    _recorded.clear()

for K in range(1, 5):
    _mbc_model = ModelBasedClustering(data_ale, K)
    # initZ() is called before EM_algo(), exactly as in the original ordering.
    init_Clusters = _mbc_model.initZ()
    MBC_Clusters, BIC_val = _mbc_model.EM_algo(N_iterations)

    dict_clustering_data["K"].append(K)
    dict_clustering_data["init_Clusters"].append(init_Clusters)
    dict_clustering_data["MBC_Clusters"].append(MBC_Clusters)
    dict_clustering_data["BIC_val"].append(BIC_val)

In [None]:
# Plot the recorded BIC value against the number of clusters K.
fig, axe = plt.subplots(nrows=1, ncols=1, figsize=(9, 5.5))
axe.plot(dict_clustering_data["K"], dict_clustering_data["BIC_val"])
axe.set(ylabel="BIC", xlabel="Number of clusters")
# Optional PGF export, left disabled:
#plt.savefig("./figs/bic_ale.pgf", format='pgf')
plt.show()

In [None]:
# Flatten the recorded BIC values into a 1-D array (one entry per fitted K).
list_BIC = np.array(dict_clustering_data["BIC_val"]).ravel()

In [None]:
# Pick K as the 1-based position of the first occurrence of the largest BIC.
# NOTE(review): this equals the cluster count only while the results were
# recorded for K = 1, 2, ... in order - confirm if the K range changes.
_largest_bic = list_BIC.max()
K = list_BIC.tolist().index(_largest_bic) + 1

In [None]:
# Turn the initial assignment matrix recorded for the selected K into 1-based
# labels: for every row, the position of the first entry equal to 1, plus one.
# NOTE(review): the row count is read from the MBC matrix (as before), not
# from the initial matrix itself - presumably both have one row per sample.
_init_matrix = dict_clustering_data["init_Clusters"][K - 1]
_row_count = dict_clustering_data["MBC_Clusters"][K - 1].shape[0]

_init_labels = []
for _row in range(_row_count):
    _init_labels.append(list(_init_matrix[_row]).index(1) + 1)

# Inserted at column position 1; raises if the column already exists.
data_ale.insert(1, "Kmeans_clusters", _init_labels)

In [None]:
# Turn the MBC assignment matrix recorded for the selected K into 1-based
# labels: for every row, the position of the first entry equal to 1, plus one.
# NOTE(review): `.index(1)` raises ValueError for a row with no entry equal
# to 1 - assumes the matrix rows are 0/1 indicators; confirm.
_mbc_matrix = dict_clustering_data["MBC_Clusters"][K - 1]

_mbc_labels = []
for _row in range(_mbc_matrix.shape[0]):
    _mbc_labels.append(list(_mbc_matrix[_row]).index(1) + 1)

# Inserted at column position 1; raises if the column already exists.
data_ale.insert(1, "clusters", _mbc_labels)

In [None]:
# Rows labelled as cluster 1, renumbered from zero.
data_ale.loc[data_ale["clusters"] == 1].reset_index(drop=True)

In [None]:
# Rows labelled as cluster 2, renumbered from zero.
data_ale.loc[data_ale["clusters"] == 2].reset_index(drop=True)

In [None]:
# Rows labelled as cluster 3, renumbered from zero.
data_ale.loc[data_ale["clusters"] == 3].reset_index(drop=True)

In [None]:
# Rows labelled as cluster 4, renumbered from zero (empty when the selected K is below 4).
data_ale.loc[data_ale["clusters"] == 4].reset_index(drop=True)