In [3]:
import importlib
import funciones_modelos_ML as ml
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PowerTransformer, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.inspection import permutation_importance
from mango import Tuner, scheduler
from scipy.stats import uniform
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
%matplotlib qt5

In [4]:
# Re-import the local helper module so that edits made to it are picked up without restarting the kernel
importlib.reload(ml)

<module 'funciones_modelos_ML' from 'c:\\Users\\jhquiza\\OneDrive - Universidad de Medellin\\JupyterNotebooks\\IAT\\funciones_modelos_ML.py'>

In [5]:
# Preprocessing of the full dataset: load it, index it by subject, discard the columns that are
# not used in the analysis and every row with missing values, then store the questionnaire
# scores as integers
data_all = (
    pd.read_csv('medidas_conectividad_globales_comportamentales_demograficos_2.csv')
    .set_index('subject')
    .drop(columns=['type', 'IAT_score_ind', 'IAT_score_level', 'dscore_1', 'dscore_2', 'dscore_3', 'dscore_5',
                   'dscore_6', 'dscore_7', 'dscore_8', 'modo_0', 'modo_1', 'modo_2', 'group'])
    .dropna()
    .astype({col: 'int64' for col in ['IRI_PT', 'IRI_FS', 'IRI_EC', 'IRI_PD', 'IMA', 'RPQ AR', 'RPQ AP']})
)
# One-hot encode whatever categorical columns remain
data_all_cod = pd.get_dummies(data_all)

# Column groups: float64 columns are treated as continuous, int64 columns as discrete
continuas_cols = data_all_cod.select_dtypes(include='float64').columns.to_list()
discretas_cols = data_all_cod.select_dtypes(include='int64').columns.to_list()

# Normalization variant 1: StandardScaler on continuous columns, MinMaxScaler on discrete ones
preprocessor_sc = ColumnTransformer(
    [('scaler', StandardScaler(), continuas_cols),
     ('min_max', MinMaxScaler(), discretas_cols)],
    remainder='passthrough',
)
data_all_cod_sc = preprocessor_sc.fit_transform(data_all_cod)

# Normalization variant 2: PowerTransformer on continuous columns, MinMaxScaler on discrete ones
preprocessor_pt = ColumnTransformer(
    [('pt', PowerTransformer(), continuas_cols),
     ('min_max', MinMaxScaler(), discretas_cols)],
    remainder='passthrough',
)
data_all_cod_pt = preprocessor_pt.fit_transform(data_all_cod)

In [4]:
max_clusters = 10
# KMeans clustering metrics on the non-normalized data
inertias_np, sil_scores_np, cal_scores_np, dav_scores_np = ml.clusters_kmeans(data=data_all_cod, max_clusters=max_clusters)
# KMeans clustering metrics on the StandardScaler-normalized data
inertias_sc, sil_scores_sc, cal_scores_sc, dav_scores_sc = ml.clusters_kmeans(data=data_all_cod_sc, max_clusters=max_clusters)
# KMeans clustering metrics on the PowerTransformer-normalized data
inertias_pt, sil_scores_pt, cal_scores_pt, dav_scores_pt = ml.clusters_kmeans(data=data_all_cod_pt, max_clusters=max_clusters)
# DataFrame with the KMeans clustering metrics (one row per number of clusters after transposing).
# The cluster counts are derived from max_clusters rather than hard-coded, so changing
# max_clusters no longer breaks the table; this assumes ml.clusters_kmeans returns one score per
# k in 2..max_clusters, which is the same labelling the GMM metrics cell uses.
df_kmeans = pd.DataFrame(data=[sil_scores_np, cal_scores_np, dav_scores_np, sil_scores_sc, cal_scores_sc, dav_scores_sc, sil_scores_pt, cal_scores_pt, dav_scores_pt], index=['silhouette kmeans np', 'calinski kmeans np', 'davies kmeans np', 'silhouette kmeans sc', 'calinski kmeans sc', 'davies kmeans sc', 'silhouette kmeans pt', 'calinski kmeans pt', 'davies kmeans pt'], columns=range(2, max_clusters + 1))
df_kmeans = df_kmeans.transpose()
df_kmeans

Unnamed: 0,silhouette kmeans np,calinski kmeans np,davies kmeans np,silhouette kmeans sc,calinski kmeans sc,davies kmeans sc,silhouette kmeans pt,calinski kmeans pt,davies kmeans pt
2,0.148333,18.377479,2.098934,0.044061,5.404879,3.907277,0.04445,5.366939,3.958477
3,0.134153,15.652288,1.930229,0.036168,4.433789,3.520215,0.036445,4.602733,3.556833
4,0.082484,13.099763,2.255836,0.039828,4.132395,3.143827,0.034238,3.913809,3.279062
5,0.082142,10.854686,2.215896,0.029583,3.671521,3.249619,0.024571,3.561801,3.231135
6,0.075981,9.722721,1.979363,0.020106,3.35108,2.979471,0.028209,3.358123,3.147341
7,0.071339,8.94485,2.243502,0.021944,3.175393,2.926014,0.025328,3.168508,2.957298
8,0.06556,8.2024,2.068214,0.030403,3.114816,2.682936,0.032128,3.149527,2.705584
9,0.06755,7.607896,2.011934,0.028926,3.098301,2.651573,0.006505,2.826422,2.641254
10,0.063591,7.263858,1.941095,0.015256,2.708397,2.554264,0.027405,2.798667,2.640762


In [5]:
max_clusters = 10
# GMM clustering metrics on the non-normalized data
gauss_scores_np, sil_scores_np, cal_scores_np, dav_scores_np = ml.clusters_gaussian(
    data=data_all_cod, max_clusters=max_clusters)
# GMM clustering metrics on the StandardScaler-normalized data
gauss_scores_sc, sil_scores_sc, cal_scores_sc, dav_scores_sc = ml.clusters_gaussian(
    data=data_all_cod_sc, max_clusters=max_clusters)
# GMM clustering metrics on the PowerTransformer-normalized data
gauss_scores_pt, sil_scores_pt, cal_scores_pt, dav_scores_pt = ml.clusters_gaussian(
    data=data_all_cod_pt, max_clusters=max_clusters)
# DataFrame with the GMM clustering metrics: built with one row per metric/normalization pair
# and then transposed so that each row corresponds to a number of clusters
df_gauss = pd.DataFrame(
    data=[
        sil_scores_np, cal_scores_np, dav_scores_np,
        sil_scores_sc, cal_scores_sc, dav_scores_sc,
        sil_scores_pt, cal_scores_pt, dav_scores_pt,
    ],
    index=[f'{metric} gauss {norm}'
           for norm in ('np', 'sc', 'pt')
           for metric in ('silhouette', 'calinski', 'davies')],
    columns=range(2, max_clusters + 1),
).T
df_gauss

Unnamed: 0,silhouette gauss np,calinski gauss np,davies gauss np,silhouette gauss sc,calinski gauss sc,davies gauss sc,silhouette gauss pt,calinski gauss pt,davies gauss pt
2,0.141936,17.611031,2.119484,0.03472,3.494382,4.220111,0.034619,4.439385,4.110302
3,0.144911,15.546488,1.838193,0.04559,4.355674,3.50516,0.032931,4.101494,3.736777
4,0.105139,12.171231,2.236379,0.037136,3.643267,3.341581,0.024866,3.707036,3.427901
5,0.070958,10.564612,2.42489,0.024253,3.330049,3.282874,0.027538,3.421442,3.266518
6,0.060584,8.987212,2.483849,0.029674,3.254885,2.960626,0.028209,3.358123,3.147341
7,0.056423,8.223237,2.31614,0.026384,3.057859,2.601803,0.021681,3.121669,3.028148
8,0.050689,7.65157,2.065535,0.022039,2.77851,2.705418,0.011031,2.72732,3.00938
9,0.045655,6.906457,2.097543,0.015379,2.671518,2.517359,0.011755,2.658596,2.715557
10,0.04864,6.773798,1.973382,0.015867,2.623438,2.434403,0.012568,2.560854,2.608202


In [6]:
# Put the KMeans and GMM metric tables side by side (rows are aligned on the number of clusters)
df_todos = pd.concat(objs=[df_kmeans, df_gauss], axis='columns')
df_todos

Unnamed: 0,silhouette kmeans np,calinski kmeans np,davies kmeans np,silhouette kmeans sc,calinski kmeans sc,davies kmeans sc,silhouette kmeans pt,calinski kmeans pt,davies kmeans pt,silhouette gauss np,calinski gauss np,davies gauss np,silhouette gauss sc,calinski gauss sc,davies gauss sc,silhouette gauss pt,calinski gauss pt,davies gauss pt
2,0.148333,18.377479,2.098934,0.044061,5.404879,3.907277,0.04445,5.366939,3.958477,0.141936,17.611031,2.119484,0.03472,3.494382,4.220111,0.034619,4.439385,4.110302
3,0.134153,15.652288,1.930229,0.036168,4.433789,3.520215,0.036445,4.602733,3.556833,0.144911,15.546488,1.838193,0.04559,4.355674,3.50516,0.032931,4.101494,3.736777
4,0.082484,13.099763,2.255836,0.039828,4.132395,3.143827,0.034238,3.913809,3.279062,0.105139,12.171231,2.236379,0.037136,3.643267,3.341581,0.024866,3.707036,3.427901
5,0.082142,10.854686,2.215896,0.029583,3.671521,3.249619,0.024571,3.561801,3.231135,0.070958,10.564612,2.42489,0.024253,3.330049,3.282874,0.027538,3.421442,3.266518
6,0.075981,9.722721,1.979363,0.020106,3.35108,2.979471,0.028209,3.358123,3.147341,0.060584,8.987212,2.483849,0.029674,3.254885,2.960626,0.028209,3.358123,3.147341
7,0.071339,8.94485,2.243502,0.021944,3.175393,2.926014,0.025328,3.168508,2.957298,0.056423,8.223237,2.31614,0.026384,3.057859,2.601803,0.021681,3.121669,3.028148
8,0.06556,8.2024,2.068214,0.030403,3.114816,2.682936,0.032128,3.149527,2.705584,0.050689,7.65157,2.065535,0.022039,2.77851,2.705418,0.011031,2.72732,3.00938
9,0.06755,7.607896,2.011934,0.028926,3.098301,2.651573,0.006505,2.826422,2.641254,0.045655,6.906457,2.097543,0.015379,2.671518,2.517359,0.011755,2.658596,2.715557
10,0.063591,7.263858,1.941095,0.015256,2.708397,2.554264,0.027405,2.798667,2.640762,0.04864,6.773798,1.973382,0.015867,2.623438,2.434403,0.012568,2.560854,2.608202


In [11]:
# Plot of the clustering metrics (silhouette and Davies-Bouldin) against the number of clusters
fontsize_legend = 20
fontsize_labels = 20
fontsize_ticks = 15
# x positions are taken from the metrics table instead of hard-coded float ranges, so the plot
# keeps matching df_todos if the range of cluster counts changes. KMeans markers are shifted
# 0.1 to the left and GMM markers 0.1 to the right so that they do not overlap.
n_clusters = np.asarray(df_todos.index, dtype=float)
# (column suffix in df_todos, legend label, matplotlib format string, horizontal shift)
series_styles = [
    ('kmeans np', 'kmeans np', 'or', -0.1),
    ('kmeans sc', 'kmeans sc', '*b', -0.1),
    ('kmeans pt', 'kmeans pt', 'Dg', -0.1),
    ('gauss np', 'gmm np', 'sk', 0.1),
    ('gauss sc', 'gmm sc', 'vy', 0.1),
    ('gauss pt', 'gmm pt', 'Hm', 0.1),
]
# (metric prefix in df_todos, y-axis label) for the left and right panels
panels = [('silhouette', 'silhouette scores'), ('davies', 'davies bouldin scores')]
# A fresh figure per run avoids drawing on top of whichever figure happened to be current
fig, axes = plt.subplots(1, 2)
for ax, (metric, ylabel) in zip(axes, panels):
    ax.tick_params(labelsize=fontsize_ticks)
    for column_suffix, label, marker, shift in series_styles:
        ax.plot(n_clusters + shift, df_todos[f'{metric} {column_suffix}'], marker, label=label)
    ax.set_xlabel('number of clusters', fontsize=fontsize_labels)
    ax.set_ylabel(ylabel, fontsize=fontsize_labels)
    ax.legend(fontsize=fontsize_legend)

<matplotlib.legend.Legend at 0x2736ee2e9d0>

The PostScript backend does not support transparency; partially transparent artists will be rendered opaque.


In [28]:
# Save the combined KMeans/GMM metrics table; the index (cluster count) is written under the header 'number of clusters'
df_todos.to_csv('métricas clustering dataset completo.csv', index_label='number of clusters')

In [45]:
# The best models appear to be GMM with 3 clusters and KMeans with 2 clusters, both on unscaled data
kmeans = KMeans(n_clusters=2, random_state=72).fit(data_all_cod)
labels_k = kmeans.labels_
gauss = GaussianMixture(n_components=3, random_state=72).fit(data_all_cod)
labels_g = gauss.predict(data_all_cod)
# Attach both label sets to a copy of the un-encoded dataset
data_clusters = data_all.assign(**{'labels kmeans': labels_k, 'labels gmm': labels_g})
data_clusters

Unnamed: 0_level_0,diameter_ac,diameter_ai,diameter_ad,diameter_b1c,diameter_b1i,diameter_b1d,diameter_b2c,diameter_b2i,diameter_b2d,diameter_dc,...,IRI_PD,IMA,RPQ AR,RPQ AP,TD,AN,AL,IH,labels kmeans,labels gmm
subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21100,0.269841,0.222222,0.047619,0.333333,0.269841,0.063492,0.285714,0.222222,0.063492,0.365079,...,11,27,20,16,1.50,5.75,9.00,0.5,0,2
21101,0.222222,0.269841,-0.047619,0.269841,0.365079,-0.095238,0.333333,0.365079,-0.031746,0.317460,...,6,70,33,26,0.00,1.25,9.50,0.5,0,0
21102,0.301587,0.301587,0.000000,0.285714,0.285714,0.000000,0.253968,0.301587,-0.047619,0.380952,...,10,42,13,12,0.00,0.00,4.50,0.0,0,2
21103,0.333333,0.396825,-0.063492,0.269841,0.285714,-0.015873,0.349206,0.285714,0.063492,0.333333,...,12,32,16,15,0.00,0.00,6.25,0.0,1,1
21104,0.238095,0.238095,0.000000,0.317460,0.396825,-0.079365,0.269841,0.380952,-0.111111,0.285714,...,8,30,16,17,2.00,1.50,7.50,0.0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24053,0.222222,0.222222,0.000000,0.253968,0.269841,-0.015873,0.269841,0.190476,0.079365,0.365079,...,14,36,16,12,0.00,3.25,7.25,0.0,0,2
24065,0.285714,0.285714,0.000000,0.285714,0.301587,-0.015873,0.285714,0.222222,0.063492,0.380952,...,8,39,18,12,4.25,5.25,7.50,0.0,1,1
24069,0.253968,0.301587,-0.047619,0.380952,0.285714,0.095238,0.396825,0.301587,0.095238,0.301587,...,12,28,15,12,2.50,4.00,10.00,0.0,0,2
24073,0.412698,0.460317,-0.047619,0.253968,0.317460,-0.063492,0.222222,0.206349,0.015873,0.317460,...,13,27,16,12,0.00,0.00,9.50,0.0,1,1


In [46]:
# Save the dataset together with the KMeans (2 clusters) and GMM (3 clusters) labels
data_clusters.to_csv('clusters_gmm_3_kmeans_2_dataset_completo.csv')

## Validación con clasificador XGBoost

In [32]:
# Validation with the GMM labels as classification target
X = data_clusters.drop(['labels kmeans', 'labels gmm'], axis=1).copy()
X = pd.get_dummies(X)
y = data_clusters['labels gmm'].copy()

# Hold out a stratified test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=72, stratify=y)
# Fit the label encoder on the training labels only and reuse it for the test labels, so both
# splits share one class-to-integer mapping (two independently fitted encoders would disagree
# whenever a class is missing from one of the splits)
label_encoder = LabelEncoder().fit(y_train)
y_train_label = label_encoder.transform(y_train)
y_test_label = label_encoder.transform(y_test)

# Column groups for preprocessing: float64 columns as continuous, int64 columns as discrete
continuas_cols = X_train.select_dtypes(include=['float64']).columns.to_list()
discretas_cols = X_train.select_dtypes(include=['int64']).columns.to_list()

# Hyperparameter search space
param_space =dict(n_estimators=range(1,100), max_depth=range(3,10), subsample=uniform(0.1,0.9), eta=uniform(0,1), colsample_bytree=uniform(0.1,0.9))
# XGBoost model without preprocessing
# (the helper presumably returns the best parameters, the CV scores and the test score — see funciones_modelos_ML)
params_np, scores_np, test_score_np = ml.modelo_xgboost_np(param_space=param_space, X_train=X_train, y_train_label=y_train_label, X_test=X_test, y_test_label=y_test_label)

# XGBoost model with StandardScaler (continuous) and MinMaxScaler (discrete)
preprocessor = ColumnTransformer([('scaler', StandardScaler(), continuas_cols), ('min_max', MinMaxScaler(), discretas_cols)], remainder='passthrough')
params_sc, scores_sc, test_score_sc = ml.modelo_xgboost_sc(param_space=param_space, preprocessor=preprocessor, X_train=X_train, y_train_label=y_train_label, X_test=X_test, y_test_label=y_test_label)

# XGBoost model with PowerTransformer (continuous) and MinMaxScaler (discrete)
preprocessor = ColumnTransformer([('pt', PowerTransformer(), continuas_cols), ('min_max', MinMaxScaler(), discretas_cols)], remainder='passthrough')
params_pt, scores_pt, test_score_pt = ml.modelo_xgboost_sc(param_space=param_space, preprocessor=preprocessor, X_train=X_train, y_train_label=y_train_label, X_test=X_test, y_test_label=y_test_label)

In [47]:
# Validation with the KMeans labels as classification target
X = data_clusters.drop(['labels kmeans', 'labels gmm'], axis=1).copy()
X = pd.get_dummies(X)
y = data_clusters['labels kmeans'].copy()

# Hold out a stratified test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=72, stratify=y)
# Fit the label encoder on the training labels only and reuse it for the test labels, so both
# splits share one class-to-integer mapping (two independently fitted encoders would disagree
# whenever a class is missing from one of the splits)
label_encoder = LabelEncoder().fit(y_train)
y_train_label = label_encoder.transform(y_train)
y_test_label = label_encoder.transform(y_test)

# Column groups for preprocessing: float64 columns as continuous, int64 columns as discrete
continuas_cols = X_train.select_dtypes(include=['float64']).columns.to_list()
discretas_cols = X_train.select_dtypes(include=['int64']).columns.to_list()

# Hyperparameter search space
param_space =dict(n_estimators=range(1,100), max_depth=range(3,10), subsample=uniform(0.1,0.9), eta=uniform(0,1), colsample_bytree=uniform(0.1,0.9))

# XGBoost model without preprocessing
# (the helper presumably returns the best parameters, the CV scores and the test score — see funciones_modelos_ML)
params_np_k, scores_np_k, test_score_np_k = ml.modelo_xgboost_np(param_space=param_space, X_train=X_train, y_train_label=y_train_label, X_test=X_test, y_test_label=y_test_label)

# XGBoost model with StandardScaler (continuous) and MinMaxScaler (discrete)
preprocessor = ColumnTransformer([('scaler', StandardScaler(), continuas_cols), ('min_max', MinMaxScaler(), discretas_cols)], remainder='passthrough')
params_sc_k, scores_sc_k, test_score_sc_k = ml.modelo_xgboost_sc(param_space=param_space, preprocessor=preprocessor, X_train=X_train, y_train_label=y_train_label, X_test=X_test, y_test_label=y_test_label)

# XGBoost model with PowerTransformer (continuous) and MinMaxScaler (discrete)
preprocessor = ColumnTransformer([('pt', PowerTransformer(), continuas_cols), ('min_max', MinMaxScaler(), discretas_cols)], remainder='passthrough')
params_pt_k, scores_pt_k, test_score_pt_k = ml.modelo_xgboost_sc(param_space=param_space, preprocessor=preprocessor, X_train=X_train, y_train_label=y_train_label, X_test=X_test, y_test_label=y_test_label)

Best score: 0.9571428571428571: 100%|██████████| 20/20 [02:26<00:00,  7.32s/it]


best parameters np: {'colsample_bytree': 0.7393291219812286, 'eta': 0.4177461414282586, 'max_depth': 7, 'n_estimators': 83, 'subsample': 0.5179978411585439}
best accuracy np: 0.9571428571428571


Best score: 0.9714285714285715: 100%|██████████| 20/20 [02:19<00:00,  6.95s/it]


best parameters: {'colsample_bytree': 0.16691123589162116, 'eta': 0.024866935782773436, 'max_depth': 4, 'n_estimators': 21, 'subsample': 0.9008894645049547}
best accuracy: 0.9714285714285715


Best score: 0.9714285714285713: 100%|██████████| 20/20 [02:50<00:00,  8.53s/it]


best parameters: {'colsample_bytree': 0.23382483638679305, 'eta': 0.8079870426803983, 'max_depth': 3, 'n_estimators': 31, 'subsample': 0.9056342336232028}
best accuracy: 0.9714285714285713


In [48]:
# Summary table of the six classifiers: best hyperparameters, mean and standard deviation of the
# cross-validation scores, and the score on the held-out test set
clf_results = pd.DataFrame.from_dict(
    {
        'GMM np': [params_np, scores_np.mean(), scores_np.std(), test_score_np],
        'GMM sc': [params_sc, scores_sc.mean(), scores_sc.std(), test_score_sc],
        'GMM pt': [params_pt, scores_pt.mean(), scores_pt.std(), test_score_pt],
        'KMeans np': [params_np_k, scores_np_k.mean(), scores_np_k.std(), test_score_np_k],
        'KMeans sc': [params_sc_k, scores_sc_k.mean(), scores_sc_k.std(), test_score_sc_k],
        'KMeans pt': [params_pt_k, scores_pt_k.mean(), scores_pt_k.std(), test_score_pt_k],
    },
    orient='index',
    columns=['best parameters', 'mean cv score', 'sd cv score', 'test score'],
)
clf_results

Unnamed: 0,best parameters,mean cv score,sd cv score,test score
GMM np,"{'colsample_bytree': 0.6922013708725508, 'eta'...",0.885714,0.034993,0.777778
GMM sc,"{'colsample_bytree': 0.9536409154413579, 'eta'...",0.9,0.034993,0.833333
GMM pt,"{'colsample_bytree': 0.4316990886074268, 'eta'...",0.885714,0.057143,0.722222
KMeans np,"{'colsample_bytree': 0.7393291219812286, 'eta'...",0.957143,0.057143,0.888889
KMeans sc,"{'colsample_bytree': 0.16691123589162116, 'eta...",0.971429,0.034993,0.833333
KMeans pt,"{'colsample_bytree': 0.23382483638679305, 'eta...",0.971429,0.057143,0.944444


In [50]:
# Save the classifier summary table; the index (model name) is written under the header 'modelo'
clf_results.to_csv('resultados clasificadores GMM_3 y KMeans_2 dataset completo.csv', index_label='modelo')

In [49]:
# Best model: KMeans with 2 groups, preprocessed with PowerTransformer and MinMaxScaler.
# Its tuned hyperparameters are read back from the summary table with a single .loc lookup.
params = clf_results.loc['KMeans pt', 'best parameters']
y = data_clusters['labels kmeans'].copy()
# NOTE(review): X is whatever feature matrix the last executed validation cell left in the
# namespace — confirm it is the one intended here
model, model_fit = ml.mejor_modelo(params=params, X=X, y=y, pre_pipe='pt')

mean val score:  0.9571428571428571
std val score:  0.03499271061118824
test score:  0.9444444444444444


In [51]:
# Feature relevance analysis via permutation importance (20 shuffles per feature, accuracy as score)
y_label = LabelEncoder().fit_transform(y)
r = permutation_importance(
    model_fit, X, y_label,
    n_repeats=20, random_state=0, scoring='accuracy',
)
# One row per feature holding its mean importance over the repeats
importancia_atributos = pd.DataFrame({'mean importance': r.importances_mean}, index=X.columns)
# Displayed sorted from most to least important; the stored table keeps the original column order
importancia_atributos.sort_values(by='mean importance', ascending=False)

Unnamed: 0,mean importance
age,0.394318
diameter_td,0.011364
tree_hierarchy_b2i,0.006818
leaf_fraction_gi,0.006250
max_betweenness_gi,0.005682
...,...
AL,-0.001705
IMA,-0.001705
tree_hierarchy_dc,-0.005114
diameter_gi,-0.005682


In [52]:
# Save the (unsorted) mean permutation importances; the index (feature name) is written under the header 'feature'
importancia_atributos.to_csv('importancia atributos KMeans_2 dataset completo.csv', index_label='feature')

In [58]:
y = data_clusters['labels kmeans'].copy()
# Hold out a stratified test set (same seed and proportion as in the validation cells)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=72, stratify=y)
df_errados = ml.errores(model=model, label='labels kmeans',
                        X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)
# Join the rows returned by ml.errores with the full record of each subject; the label column
# exists on both sides, so keep the left copy under its original name and discard the right one
datos_errados = (
    pd.merge(left=df_errados, right=data_clusters, how='inner', left_index=True, right_index=True)
    .rename(columns={'labels kmeans_x': 'labels kmeans'})
    .drop(columns=['labels kmeans_y'])
)
datos_errados

Unnamed: 0_level_0,labels kmeans,predicted,diameter_ac,diameter_ai,diameter_ad,diameter_b1c,diameter_b1i,diameter_b1d,diameter_b2c,diameter_b2i,...,IRI_EC,IRI_PD,IMA,RPQ AR,RPQ AP,TD,AN,AL,IH,labels gmm
subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21101,0,1,0.222222,0.269841,-0.047619,0.269841,0.365079,-0.095238,0.333333,0.365079,...,13,6,70,33,26,0.0,1.25,9.5,0.5,0
21140,1,0,0.238095,0.301587,-0.063492,0.349206,0.269841,0.079365,0.285714,0.285714,...,14,13,26,13,12,3.75,5.75,1.25,1.75,1
22111,1,0,0.301587,0.253968,0.047619,0.285714,0.269841,0.015873,0.285714,0.301587,...,10,10,28,12,12,0.25,0.25,9.0,0.25,1
21113,1,0,0.333333,0.269841,0.063492,0.31746,0.285714,0.031746,0.269841,0.269841,...,11,9,43,12,13,3.0,3.0,9.5,0.0,1
22110,0,1,0.31746,0.269841,0.047619,0.222222,0.301587,-0.079365,0.269841,0.253968,...,13,8,31,18,12,0.0,2.0,8.0,0.0,2


In [59]:
# Save the table of subjects joined from ml.errores (true label, predicted label and full record)
datos_errados.to_csv('errores KMeans_2 dataset completo.csv')

## Análisis excluyendo las variables 'age' e 'IMA'

In [6]:
# Encoded dataset without 'age' and 'IMA'. It is rebuilt from data_all instead of dropping the
# columns from the previous data_all_cod, so re-running this cell does not fail with a KeyError
# once the columns are already gone.
data_all_cod = pd.get_dummies(data_all).drop(columns=['age', 'IMA'])

# Column groups: float64 columns are treated as continuous, int64 columns as discrete
continuas_cols = data_all_cod.select_dtypes(include=['float64']).columns.to_list()
discretas_cols = data_all_cod.select_dtypes(include=['int64']).columns.to_list()
# Normalization with StandardScaler (continuous) and MinMaxScaler (discrete)
preprocessor_sc = ColumnTransformer([('scaler', StandardScaler(), continuas_cols), ('min_max', MinMaxScaler(), discretas_cols)], remainder='passthrough')
data_all_cod_sc = preprocessor_sc.fit_transform(data_all_cod)
# Normalization with PowerTransformer (continuous) and MinMaxScaler (discrete)
preprocessor_pt = ColumnTransformer([('pt', PowerTransformer(), continuas_cols), ('min_max', MinMaxScaler(), discretas_cols)], remainder='passthrough')
data_all_cod_pt = preprocessor_pt.fit_transform(data_all_cod)

In [7]:
max_clusters = 10
# KMeans clustering metrics on the non-normalized data
inertias_np, sil_scores_np, cal_scores_np, dav_scores_np = ml.clusters_kmeans(data=data_all_cod, max_clusters=max_clusters)
# KMeans clustering metrics on the StandardScaler-normalized data
inertias_sc, sil_scores_sc, cal_scores_sc, dav_scores_sc = ml.clusters_kmeans(data=data_all_cod_sc, max_clusters=max_clusters)
# KMeans clustering metrics on the PowerTransformer-normalized data
inertias_pt, sil_scores_pt, cal_scores_pt, dav_scores_pt = ml.clusters_kmeans(data=data_all_cod_pt, max_clusters=max_clusters)
# DataFrame with the KMeans clustering metrics (one row per number of clusters after transposing).
# The cluster counts are derived from max_clusters rather than hard-coded, so changing
# max_clusters no longer breaks the table; this assumes ml.clusters_kmeans returns one score per
# k in 2..max_clusters, which is the same labelling the GMM metrics cell uses.
df_kmeans = pd.DataFrame(data=[sil_scores_np, cal_scores_np, dav_scores_np, sil_scores_sc, cal_scores_sc, dav_scores_sc, sil_scores_pt, cal_scores_pt, dav_scores_pt], index=['silhouette kmeans np', 'calinski kmeans np', 'davies kmeans np', 'silhouette kmeans sc', 'calinski kmeans sc', 'davies kmeans sc', 'silhouette kmeans pt', 'calinski kmeans pt', 'davies kmeans pt'], columns=range(2, max_clusters + 1))
df_kmeans = df_kmeans.transpose()
df_kmeans

Unnamed: 0,silhouette kmeans np,calinski kmeans np,davies kmeans np,silhouette kmeans sc,calinski kmeans sc,davies kmeans sc,silhouette kmeans pt,calinski kmeans pt,davies kmeans pt
2,0.084161,9.208664,2.904539,0.044104,5.408777,3.905837,0.04449,5.37069,3.957081
3,0.07154,7.700618,2.648572,0.036193,4.43688,3.518931,0.036504,4.606123,3.55557
4,0.070285,7.025267,2.532773,0.040733,4.11439,3.181768,0.034276,3.916446,3.277912
5,0.056016,6.00061,2.294767,0.030928,3.706736,3.161036,0.024613,3.56376,3.230337
6,0.058584,5.539915,2.335888,0.02018,3.353503,2.978467,0.028244,3.360237,3.146494
7,0.057119,5.229888,2.472307,0.021418,3.175996,2.929901,0.025343,3.17016,2.956522
8,0.051126,4.741064,2.307607,0.030433,3.116643,2.682212,0.032163,3.151245,2.704886
9,0.050731,4.684676,2.319303,0.032248,3.200656,2.606718,0.006027,2.768873,2.648311
10,0.055192,4.463295,2.121602,0.015319,2.710084,2.553381,0.027439,2.800095,2.640046


In [8]:
max_clusters = 10
# GMM clustering metrics on the non-normalized data
gauss_scores_np, sil_scores_np, cal_scores_np, dav_scores_np = ml.clusters_gaussian(
    data=data_all_cod, max_clusters=max_clusters)
# GMM clustering metrics on the StandardScaler-normalized data
gauss_scores_sc, sil_scores_sc, cal_scores_sc, dav_scores_sc = ml.clusters_gaussian(
    data=data_all_cod_sc, max_clusters=max_clusters)
# GMM clustering metrics on the PowerTransformer-normalized data
gauss_scores_pt, sil_scores_pt, cal_scores_pt, dav_scores_pt = ml.clusters_gaussian(
    data=data_all_cod_pt, max_clusters=max_clusters)
# DataFrame with the GMM clustering metrics: built with one row per metric/normalization pair
# and then transposed so that each row corresponds to a number of clusters
df_gauss = pd.DataFrame(
    data=[
        sil_scores_np, cal_scores_np, dav_scores_np,
        sil_scores_sc, cal_scores_sc, dav_scores_sc,
        sil_scores_pt, cal_scores_pt, dav_scores_pt,
    ],
    index=[f'{metric} gauss {norm}'
           for norm in ('np', 'sc', 'pt')
           for metric in ('silhouette', 'calinski', 'davies')],
    columns=range(2, max_clusters + 1),
).T
df_gauss

Unnamed: 0,silhouette gauss np,calinski gauss np,davies gauss np,silhouette gauss sc,calinski gauss sc,davies gauss sc,silhouette gauss pt,calinski gauss pt,davies gauss pt
2,0.092629,5.729366,2.795299,0.034767,3.496694,4.21877,0.03466,4.442307,4.108965
3,0.073328,6.926766,2.688644,0.045627,4.358535,3.50399,0.032971,4.104391,3.735444
4,0.070285,7.025267,2.532773,0.037166,3.64542,3.340636,0.024906,3.709495,3.426691
5,0.066662,5.928828,2.597368,0.024276,3.332257,3.281868,0.027558,3.423191,3.265503
6,0.060295,5.393163,2.375302,0.029706,3.257229,2.959615,0.028244,3.360237,3.146494
7,0.039715,4.661723,2.422581,0.026441,3.060117,2.600898,0.021705,3.123818,3.027117
8,0.032369,4.35574,2.425456,0.022098,2.780611,2.704535,0.011083,2.729165,3.008491
9,0.029499,4.183212,2.385524,0.015423,2.673422,2.516645,0.011813,2.660337,2.71471
10,0.033033,3.995337,2.327723,0.015936,2.625421,2.433562,0.012621,2.562421,2.607609


In [9]:
# Put the KMeans and GMM metric tables side by side (rows are aligned on the number of clusters)
df_todos = pd.concat(objs=[df_kmeans, df_gauss], axis='columns')
df_todos

Unnamed: 0,silhouette kmeans np,calinski kmeans np,davies kmeans np,silhouette kmeans sc,calinski kmeans sc,davies kmeans sc,silhouette kmeans pt,calinski kmeans pt,davies kmeans pt,silhouette gauss np,calinski gauss np,davies gauss np,silhouette gauss sc,calinski gauss sc,davies gauss sc,silhouette gauss pt,calinski gauss pt,davies gauss pt
2,0.084161,9.208664,2.904539,0.044104,5.408777,3.905837,0.04449,5.37069,3.957081,0.092629,5.729366,2.795299,0.034767,3.496694,4.21877,0.03466,4.442307,4.108965
3,0.07154,7.700618,2.648572,0.036193,4.43688,3.518931,0.036504,4.606123,3.55557,0.073328,6.926766,2.688644,0.045627,4.358535,3.50399,0.032971,4.104391,3.735444
4,0.070285,7.025267,2.532773,0.040733,4.11439,3.181768,0.034276,3.916446,3.277912,0.070285,7.025267,2.532773,0.037166,3.64542,3.340636,0.024906,3.709495,3.426691
5,0.056016,6.00061,2.294767,0.030928,3.706736,3.161036,0.024613,3.56376,3.230337,0.066662,5.928828,2.597368,0.024276,3.332257,3.281868,0.027558,3.423191,3.265503
6,0.058584,5.539915,2.335888,0.02018,3.353503,2.978467,0.028244,3.360237,3.146494,0.060295,5.393163,2.375302,0.029706,3.257229,2.959615,0.028244,3.360237,3.146494
7,0.057119,5.229888,2.472307,0.021418,3.175996,2.929901,0.025343,3.17016,2.956522,0.039715,4.661723,2.422581,0.026441,3.060117,2.600898,0.021705,3.123818,3.027117
8,0.051126,4.741064,2.307607,0.030433,3.116643,2.682212,0.032163,3.151245,2.704886,0.032369,4.35574,2.425456,0.022098,2.780611,2.704535,0.011083,2.729165,3.008491
9,0.050731,4.684676,2.319303,0.032248,3.200656,2.606718,0.006027,2.768873,2.648311,0.029499,4.183212,2.385524,0.015423,2.673422,2.516645,0.011813,2.660337,2.71471
10,0.055192,4.463295,2.121602,0.015319,2.710084,2.553381,0.027439,2.800095,2.640046,0.033033,3.995337,2.327723,0.015936,2.625421,2.433562,0.012621,2.562421,2.607609


In [10]:
# Plot of the clustering metrics (silhouette and Davies-Bouldin) against the number of clusters
fontsize_legend = 20
fontsize_labels = 20
fontsize_ticks = 15
# x positions are taken from the metrics table instead of hard-coded float ranges, so the plot
# keeps matching df_todos if the range of cluster counts changes. KMeans markers are shifted
# 0.1 to the left and GMM markers 0.1 to the right so that they do not overlap.
n_clusters = np.asarray(df_todos.index, dtype=float)
# (column suffix in df_todos, legend label, matplotlib format string, horizontal shift)
series_styles = [
    ('kmeans np', 'kmeans np', 'or', -0.1),
    ('kmeans sc', 'kmeans sc', '*b', -0.1),
    ('kmeans pt', 'kmeans pt', 'Dg', -0.1),
    ('gauss np', 'gmm np', 'sk', 0.1),
    ('gauss sc', 'gmm sc', 'vy', 0.1),
    ('gauss pt', 'gmm pt', 'Hm', 0.1),
]
# (metric prefix in df_todos, y-axis label) for the left and right panels
panels = [('silhouette', 'silhouette scores'), ('davies', 'davies bouldin scores')]
# A fresh figure per run avoids drawing on top of whichever figure happened to be current
fig, axes = plt.subplots(1, 2)
for ax, (metric, ylabel) in zip(axes, panels):
    ax.tick_params(labelsize=fontsize_ticks)
    for column_suffix, label, marker, shift in series_styles:
        ax.plot(n_clusters + shift, df_todos[f'{metric} {column_suffix}'], marker, label=label)
    ax.set_xlabel('number of clusters', fontsize=fontsize_labels)
    ax.set_ylabel(ylabel, fontsize=fontsize_labels)
    ax.legend(fontsize=fontsize_legend)

<matplotlib.legend.Legend at 0x2109f9ae070>

The PostScript backend does not support transparency; partially transparent artists will be rendered opaque.


In [66]:
# Save the combined KMeans/GMM metrics table; the index (cluster count) is written under the header 'number of clusters'
df_todos.to_csv('métricas clustering sin edad.csv', index_label='number of clusters')

In [30]:
# The best models appear to be GMM with 2 clusters and KMeans with 4 clusters, both on unscaled data
kmeans = KMeans(n_clusters=4, random_state=72).fit(data_all_cod)
labels_k = kmeans.labels_
gauss = GaussianMixture(n_components=2, random_state=72).fit(data_all_cod)
labels_g = gauss.predict(data_all_cod)
# Attach both label sets to a copy of the un-encoded dataset (data_all itself is left untouched)
data_clusters = data_all.assign(**{'labels kmeans': labels_k, 'labels gmm': labels_g})
data_clusters

Unnamed: 0_level_0,diameter_ac,diameter_ai,diameter_ad,diameter_b1c,diameter_b1i,diameter_b1d,diameter_b2c,diameter_b2i,diameter_b2d,diameter_dc,...,IRI_PD,IMA,RPQ AR,RPQ AP,TD,AN,AL,IH,labels kmeans,labels gmm
subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21100,0.269841,0.222222,0.047619,0.333333,0.269841,0.063492,0.285714,0.222222,0.063492,0.365079,...,11,27,20,16,1.50,5.75,9.00,0.5,3,1
21101,0.222222,0.269841,-0.047619,0.269841,0.365079,-0.095238,0.333333,0.365079,-0.031746,0.317460,...,6,70,33,26,0.00,1.25,9.50,0.5,0,0
21102,0.301587,0.301587,0.000000,0.285714,0.285714,0.000000,0.253968,0.301587,-0.047619,0.380952,...,10,42,13,12,0.00,0.00,4.50,0.0,2,1
21103,0.333333,0.396825,-0.063492,0.269841,0.285714,-0.015873,0.349206,0.285714,0.063492,0.333333,...,12,32,16,15,0.00,0.00,6.25,0.0,2,0
21104,0.238095,0.238095,0.000000,0.317460,0.396825,-0.079365,0.269841,0.380952,-0.111111,0.285714,...,8,30,16,17,2.00,1.50,7.50,0.0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24053,0.222222,0.222222,0.000000,0.253968,0.269841,-0.015873,0.269841,0.190476,0.079365,0.365079,...,14,36,16,12,0.00,3.25,7.25,0.0,3,1
24065,0.285714,0.285714,0.000000,0.285714,0.301587,-0.015873,0.285714,0.222222,0.063492,0.380952,...,8,39,18,12,4.25,5.25,7.50,0.0,3,1
24069,0.253968,0.301587,-0.047619,0.380952,0.285714,0.095238,0.396825,0.301587,0.095238,0.301587,...,12,28,15,12,2.50,4.00,10.00,0.0,2,1
24073,0.412698,0.460317,-0.047619,0.253968,0.317460,-0.063492,0.222222,0.206349,0.015873,0.317460,...,13,27,16,12,0.00,0.00,9.50,0.0,3,1


In [31]:
# Save the dataset together with the KMeans (4 clusters) and GMM (2 clusters) labels
data_clusters.to_csv('clusters_gmm_2_kmeans_4_sin_edad_IMA.csv')

## Validación con clasificador XGBoost

In [32]:
# Validation with the GMM labels as classification target
# NOTE(review): data_clusters is a copy of data_all, which still contains 'age' and 'IMA', so the
# classifier below sees both variables even though the clusters of this section were fitted
# without them — confirm whether they should also be dropped from X
X = data_clusters.drop(['labels kmeans', 'labels gmm'], axis=1).copy()
X = pd.get_dummies(X)
y = data_clusters['labels gmm'].copy()

# Hold out a stratified test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=72, stratify=y)
# Fit the label encoder on the training labels only and reuse it for the test labels, so both
# splits share one class-to-integer mapping (two independently fitted encoders would disagree
# whenever a class is missing from one of the splits)
label_encoder = LabelEncoder().fit(y_train)
y_train_label = label_encoder.transform(y_train)
y_test_label = label_encoder.transform(y_test)

# Column groups for preprocessing: float64 columns as continuous, int64 columns as discrete
continuas_cols = X_train.select_dtypes(include=['float64']).columns.to_list()
discretas_cols = X_train.select_dtypes(include=['int64']).columns.to_list()

# Hyperparameter search space
param_space =dict(n_estimators=range(1,100), max_depth=range(3,10), subsample=uniform(0.1,0.9), eta=uniform(0,1), colsample_bytree=uniform(0.1,0.9))
# XGBoost model without preprocessing
# (the helper presumably returns the best parameters, the CV scores and the test score — see funciones_modelos_ML)
params_np, scores_np, test_score_np = ml.modelo_xgboost_np(param_space=param_space, X_train=X_train, y_train_label=y_train_label, X_test=X_test, y_test_label=y_test_label)

# XGBoost model with StandardScaler (continuous) and MinMaxScaler (discrete)
preprocessor = ColumnTransformer([('scaler', StandardScaler(), continuas_cols), ('min_max', MinMaxScaler(), discretas_cols)], remainder='passthrough')
params_sc, scores_sc, test_score_sc = ml.modelo_xgboost_sc(param_space=param_space, preprocessor=preprocessor, X_train=X_train, y_train_label=y_train_label, X_test=X_test, y_test_label=y_test_label)

# XGBoost model with PowerTransformer (continuous) and MinMaxScaler (discrete)
preprocessor = ColumnTransformer([('pt', PowerTransformer(), continuas_cols), ('min_max', MinMaxScaler(), discretas_cols)], remainder='passthrough')
params_pt, scores_pt, test_score_pt = ml.modelo_xgboost_sc(param_space=param_space, preprocessor=preprocessor, X_train=X_train, y_train_label=y_train_label, X_test=X_test, y_test_label=y_test_label)

Best score: 0.9428571428571428: 100%|██████████| 20/20 [02:31<00:00,  7.55s/it]


best parameters np: {'colsample_bytree': 0.9775002864853314, 'eta': 0.3343641063579844, 'max_depth': 3, 'n_estimators': 58, 'subsample': 0.7336809684544185}
best accuracy np: 0.9428571428571428


Best score: 0.9285714285714286: 100%|██████████| 20/20 [02:07<00:00,  6.37s/it]


best parameters: {'colsample_bytree': 0.997238156107808, 'eta': 0.2429920556650048, 'max_depth': 6, 'n_estimators': 64, 'subsample': 0.7414074766815201}
best accuracy: 0.9285714285714286


Best score: 0.9428571428571428: 100%|██████████| 20/20 [02:36<00:00,  7.80s/it]


best parameters: {'colsample_bytree': 0.5920108339979494, 'eta': 0.9176669273778223, 'max_depth': 8, 'n_estimators': 90, 'subsample': 0.9795485389332887}
best accuracy: 0.9428571428571428


In [33]:
# Validation with the KMeans labels as the classification target
X = data_clusters.drop(['labels kmeans', 'labels gmm'], axis=1).copy()
X = pd.get_dummies(X)
y = data_clusters['labels kmeans'].copy()

# Hold out a stratified 20% test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=72, stratify=y)
# Fit a single encoder on the training labels and reuse it for the test labels so both
# splits are guaranteed to share the same class -> integer mapping (fitting a second
# encoder on y_test would silently remap classes if the test split missed one).
label_encoder = LabelEncoder().fit(y_train)
y_train_label = label_encoder.transform(y_train)
y_test_label = label_encoder.transform(y_test)

# Split feature columns by dtype so each group gets its own scaler
continuas_cols = X_train.select_dtypes(include=['float64']).columns.to_list()
discretas_cols = X_train.select_dtypes(include=['int64']).columns.to_list()

# Hyperparameter search space; scipy's uniform(loc, scale) samples from [loc, loc + scale],
# so uniform(0.1, 0.9) covers [0.1, 1.0]
param_space = dict(n_estimators=range(1,100), max_depth=range(3,10), subsample=uniform(0.1,0.9), eta=uniform(0,1), colsample_bytree=uniform(0.1,0.9))

# XGBoost model without preprocessing
# NOTE(review): the ml.modelo_xgboost_* helpers are project-local; they are assumed to return
# (best parameters, CV scores, test score) in that order — confirm in funciones_modelos_ML
params_np_k, scores_np_k, test_score_np_k = ml.modelo_xgboost_np(param_space=param_space, X_train=X_train, y_train_label=y_train_label, X_test=X_test, y_test_label=y_test_label)

# XGBoost model with standard scaler (float columns) and min-max scaler (int columns)
preprocessor = ColumnTransformer([('scaler', StandardScaler(), continuas_cols), ('min_max', MinMaxScaler(), discretas_cols)], remainder='passthrough')
params_sc_k, scores_sc_k, test_score_sc_k = ml.modelo_xgboost_sc(param_space=param_space, preprocessor=preprocessor, X_train=X_train, y_train_label=y_train_label, X_test=X_test, y_test_label=y_test_label)

# XGBoost model with power transformer (float columns) and min-max scaler (int columns);
# same helper as above, only the preprocessor differs
preprocessor = ColumnTransformer([('pt', PowerTransformer(), continuas_cols), ('min_max', MinMaxScaler(), discretas_cols)], remainder='passthrough')
params_pt_k, scores_pt_k, test_score_pt_k = ml.modelo_xgboost_sc(param_space=param_space, preprocessor=preprocessor, X_train=X_train, y_train_label=y_train_label, X_test=X_test, y_test_label=y_test_label)

Best score: 0.7428571428571429: 100%|██████████| 20/20 [02:59<00:00,  8.97s/it]


best parameters np: {'colsample_bytree': 0.6317744780302103, 'eta': 0.6846449479742808, 'max_depth': 9, 'n_estimators': 82, 'subsample': 0.9968517802466174}
best accuracy np: 0.7428571428571429


Best score: 0.7571428571428572: 100%|██████████| 20/20 [02:38<00:00,  7.94s/it]


best parameters: {'colsample_bytree': 0.7174899567009996, 'eta': 0.51894925790377, 'max_depth': 3, 'n_estimators': 61, 'subsample': 0.4303374420226631}
best accuracy: 0.7571428571428572


Best score: 0.7428571428571429: 100%|██████████| 20/20 [03:07<00:00,  9.35s/it]


best parameters: {'colsample_bytree': 0.7722062980274618, 'eta': 0.4913367825052837, 'max_depth': 4, 'n_estimators': 84, 'subsample': 0.4191836783709183}
best accuracy: 0.7428571428571429


In [34]:
# Summary table: one row per (clustering labels, preprocessing) combination.
# Each entry maps the row label to (best parameters, CV scores, test score).
resultados_por_modelo = {
    'GMM np': (params_np, scores_np, test_score_np),
    'GMM sc': (params_sc, scores_sc, test_score_sc),
    'GMM pt': (params_pt, scores_pt, test_score_pt),
    'KMeans np': (params_np_k, scores_np_k, test_score_np_k),
    'KMeans sc': (params_sc_k, scores_sc_k, test_score_sc_k),
    'KMeans pt': (params_pt_k, scores_pt_k, test_score_pt_k),
}
clf_results = pd.DataFrame(
    data=[
        [mejores_params, cv_scores.mean(), cv_scores.std(), test_score]
        for mejores_params, cv_scores, test_score in resultados_por_modelo.values()
    ],
    index=list(resultados_por_modelo),
    columns=['best parameters', 'mean cv score', 'sd cv score', 'test score'],
)
clf_results

Unnamed: 0,best parameters,mean cv score,sd cv score,test score
GMM np,"{'colsample_bytree': 0.9775002864853314, 'eta'...",0.942857,0.083299,0.833333
GMM sc,"{'colsample_bytree': 0.997238156107808, 'eta':...",0.928571,0.078246,0.888889
GMM pt,"{'colsample_bytree': 0.5920108339979494, 'eta'...",0.942857,0.053452,0.888889
KMeans np,"{'colsample_bytree': 0.6317744780302103, 'eta'...",0.742857,0.057143,0.611111
KMeans sc,"{'colsample_bytree': 0.7174899567009996, 'eta'...",0.757143,0.057143,0.555556
KMeans pt,"{'colsample_bytree': 0.7722062980274618, 'eta'...",0.742857,0.034993,0.388889


In [35]:
# Save the classifier comparison table; the index (model label) is written under the 'model' header
clf_results.to_csv('resultados clasificadores GMM_2 y KMeans_4 sin edad IMA.csv', index_label='model')

In [36]:
# Best model: GMM with 2 groups, using the power-transformer preprocessing ('pt')
# (the parameters come from the 'GMM pt' row and pre_pipe='pt' is passed below).
# Single .loc[row, column] lookup instead of chained indexing.
params = clf_results.loc['GMM pt', 'best parameters']
y = data_clusters['labels gmm'].copy()
# X is the one-hot-encoded feature matrix built in the preceding validation cells.
# NOTE(review): ml.mejor_modelo is project-local; it appears to return the model definition
# and the fitted model — confirm in funciones_modelos_ML
model, model_fit = ml.mejor_modelo(params=params, X=X, y=y, pre_pipe='pt')

mean val score:  0.8571428571428571
std val score:  0.04517539514526258
test score:  0.7777777777777778


In [37]:
# Feature relevance analysis via permutation importance (accuracy scoring, 20 repeats)
y_label = LabelEncoder().fit_transform(y)
r = permutation_importance(
    model_fit,
    X,
    y_label,
    scoring='accuracy',
    n_repeats=20,
    random_state=0,
)
# One row per feature, with a single column holding the mean importance across repeats
importancia_atributos = pd.DataFrame({'mean importance': r.importances_mean}, index=X.columns)
importancia_atributos.sort_values(by='mean importance', ascending=False)

Unnamed: 0,mean importance
RPQ AP,0.089205
RPQ AR,0.044318
dscore_4,0.005114
diameter_ac,0.000000
mean_mpli_b1i,0.000000
...,...
max_betweenness_gd,0.000000
max_betweenness_tc,0.000000
max_betweenness_ti,0.000000
max_betweenness_td,0.000000


In [38]:
# Save the per-feature mean importances; the index (feature name) is written under the 'feature' header
importancia_atributos.to_csv('importancia atributos GMM_2_pt sin edad IMA.csv', index_label='feature')

In [39]:
y = data_clusters['labels gmm'].copy()
# Re-create the train/test split with the same arguments used for the GMM validation
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=72, stratify=y
)
# NOTE(review): ml.errores is project-local; judging by the displayed result it returns the
# misclassified subjects with their true label and a 'predicted' column — confirm
df_errados = ml.errores(
    model=model, label='labels gmm',
    X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test,
)
# Attach the full feature rows by subject index. 'labels gmm' exists on both sides, so the
# merge suffixes it: keep the left copy under its original name and discard the right one.
datos_errados = (
    df_errados
    .merge(data_clusters, how='inner', left_index=True, right_index=True)
    .rename(columns={'labels gmm_x': 'labels gmm'})
    .drop(['labels gmm_y'], axis=1)
)
datos_errados

Unnamed: 0_level_0,labels gmm,predicted,diameter_ac,diameter_ai,diameter_ad,diameter_b1c,diameter_b1i,diameter_b1d,diameter_b2c,diameter_b2i,...,IRI_EC,IRI_PD,IMA,RPQ AR,RPQ AP,TD,AN,AL,IH,labels kmeans
subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21108,0,1,0.238095,0.333333,-0.095238,0.396825,0.253968,0.142857,0.31746,0.285714,...,15,7,35,15,14,0.0,2.5,6.75,0.0,0
21134,0,1,0.333333,0.301587,0.031746,0.31746,0.206349,0.111111,0.269841,0.380952,...,15,11,43,22,18,0.0,3.25,9.75,0.0,2
21116,0,1,0.238095,0.365079,-0.126984,0.285714,0.301587,-0.015873,0.222222,0.253968,...,13,7,28,18,17,0.0,1.5,6.25,0.0,0
23005,0,1,0.238095,0.285714,-0.047619,0.301587,0.301587,0.0,0.269841,0.269841,...,14,18,36,17,13,0.0,0.0,0.0,0.0,3
21122,0,1,0.222222,0.412698,-0.190476,0.253968,0.269841,-0.015873,0.31746,0.269841,...,11,6,40,20,26,0.0,3.0,5.75,0.0,0
21107,0,1,0.253968,0.285714,-0.031746,0.253968,0.285714,-0.031746,0.222222,0.253968,...,11,14,57,27,20,3.5,4.0,0.75,2.0,0
21104,1,0,0.238095,0.238095,0.0,0.31746,0.396825,-0.079365,0.269841,0.380952,...,18,8,30,16,17,2.0,1.5,7.5,0.0,2


In [40]:
# Save the misclassified subjects together with their feature values
datos_errados.to_csv('datos errados GMM_2_pt sin edad IMA.csv')