In [1]:
import importlib
import funciones_modelos_ML as ml
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PowerTransformer, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.inspection import permutation_importance
from mango import Tuner, scheduler
from scipy.stats import uniform
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
%matplotlib qt5

In [2]:
# Reload the local helper module so that edits to funciones_modelos_ML.py
# take effect without restarting the kernel.
importlib.reload(ml)

<module 'funciones_modelos_ML' from 'c:\\Users\\jhquiza\\OneDrive - Universidad de Medellin\\JupyterNotebooks\\IAT\\funciones_modelos_ML.py'>

In [3]:
# Features ranked most important by the classification models.
selected_cols = ['max_degree_b1d', 'max_degree_b2c', 'max_degree_b2i', 'max_degree_gi', 'mean_eccentricity_ac', 'mean_eccentricity_b2i', 'mean_eccentricity_b2d', 'mean_eccentricity_gc', 'mean_eccentricity_gd', 'exposure_level', 'EX2_score', 'school_years', 'gender', 'IRI_PT', 'IRI_EC', 'RPQ AP', 'TD', 'AN', 'AL', 'mean_eccentricity_ti', 'IRI_PD', 'RPQ AR', 'victims_self']

# Load the table indexed by subject, keep the selected features and drop
# subjects with incomplete data. The chain builds a new frame at every step
# instead of calling dropna(inplace=True) on a column-subset slice, which can
# raise SettingWithCopyWarning and makes the cell non-idempotent.
data = (
    pd.read_csv('medidas_conectividad_globales_comportamentales_demograficos_2.csv')
    .set_index('subject')
    .loc[:, selected_cols]
    .dropna()
)
# Snapshot of the cleaned data before categorical encoding.
df = data.copy()
# One-hot encode the categorical variables.
data = pd.get_dummies(data)
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 88 entries, 21100 to 24101
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   max_degree_b1d         88 non-null     int64  
 1   max_degree_b2c         88 non-null     int64  
 2   max_degree_b2i         88 non-null     int64  
 3   max_degree_gi          88 non-null     int64  
 4   mean_eccentricity_ac   88 non-null     float64
 5   mean_eccentricity_b2i  88 non-null     float64
 6   mean_eccentricity_b2d  88 non-null     float64
 7   mean_eccentricity_gc   88 non-null     float64
 8   mean_eccentricity_gd   88 non-null     float64
 9   EX2_score              88 non-null     int64  
 10  school_years           88 non-null     int64  
 11  IRI_PT                 88 non-null     float64
 12  IRI_EC                 88 non-null     float64
 13  RPQ AP                 88 non-null     float64
 14  TD                     88 non-null     float64
 15  A

In [14]:
# Split the encoded columns by dtype: float64 columns are treated as
# continuous and int64 columns as discrete. Any other dtype falls through
# the transformers untouched (remainder='passthrough').
continuas_cols = list(data.select_dtypes(include=['float64']).columns)
discretas_cols = list(data.select_dtypes(include=['int64']).columns)

# Variant 1: StandardScaler on continuous columns, MinMaxScaler on discrete ones.
preprocessor_sc = ColumnTransformer(
    transformers=[
        ('scaler', StandardScaler(), continuas_cols),
        ('min_max', MinMaxScaler(), discretas_cols),
    ],
    remainder='passthrough',
)
data_sc = preprocessor_sc.fit_transform(data)

# Variant 2: PowerTransformer on continuous columns, MinMaxScaler on discrete ones.
preprocessor_pt = ColumnTransformer(
    transformers=[
        ('pt', PowerTransformer(), continuas_cols),
        ('min_max', MinMaxScaler(), discretas_cols),
    ],
    remainder='passthrough',
)
data_pt = preprocessor_pt.fit_transform(data)

In [73]:
# Alternative: use every variable in the file.
# Columns removed before clustering.
excluded_cols = ['type', 'IAT_score_ind', 'IAT_score_level', 'dscore_1', 'dscore_2', 'dscore_3', 'dscore_5', 'dscore_6', 'dscore_7', 'dscore_8', 'modo_0', 'modo_1', 'modo_2', 'group']
# Columns cast to int64 so they are handled as discrete variables below.
int_cast_cols = ['IRI_PT', 'IRI_FS', 'IRI_EC', 'IRI_PD', 'IMA', 'RPQ AR', 'RPQ AP']

data_all = pd.read_csv('medidas_conectividad_globales_comportamentales_demograficos_2.csv')
data_all.set_index('subject', inplace=True)
data_all = data_all.drop(columns=excluded_cols)
data_all.dropna(inplace=True)
data_all[int_cast_cols] = data_all[int_cast_cols].astype('int64')
data_all_cod = pd.get_dummies(data_all)

# float64 columns are treated as continuous, int64 columns as discrete.
continuas_cols = list(data_all_cod.select_dtypes(include=['float64']).columns)
discretas_cols = list(data_all_cod.select_dtypes(include=['int64']).columns)

# StandardScaler (continuous) + MinMaxScaler (discrete).
preprocessor_sc = ColumnTransformer(
    transformers=[
        ('scaler', StandardScaler(), continuas_cols),
        ('min_max', MinMaxScaler(), discretas_cols),
    ],
    remainder='passthrough',
)
data_all_cod_sc = preprocessor_sc.fit_transform(data_all_cod)

# PowerTransformer (continuous) + MinMaxScaler (discrete).
preprocessor_pt = ColumnTransformer(
    transformers=[
        ('pt', PowerTransformer(), continuas_cols),
        ('min_max', MinMaxScaler(), discretas_cols),
    ],
    remainder='passthrough',
)
data_all_cod_pt = preprocessor_pt.fit_transform(data_all_cod)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 88 entries, 21100 to 24101
Columns: 149 entries, diameter_ac to laterality_I
dtypes: float64(113), int64(28), uint8(8)
memory usage: 98.3 KB


In [4]:
# Alternative: use every variable in the file except age.
# Columns removed before clustering ('age' included in this variant).
excluded_cols = ['type', 'IAT_score_ind', 'IAT_score_level', 'dscore_1', 'dscore_2', 'dscore_3', 'dscore_5', 'dscore_6', 'dscore_7', 'dscore_8', 'modo_0', 'modo_1', 'modo_2', 'group', 'age']
# Columns cast to int64 so they are handled as discrete variables below.
int_cast_cols = ['IRI_PT', 'IRI_FS', 'IRI_EC', 'IRI_PD', 'IMA', 'RPQ AR', 'RPQ AP']

data_all = pd.read_csv('medidas_conectividad_globales_comportamentales_demograficos_2.csv')
data_all.set_index('subject', inplace=True)
data_all = data_all.drop(columns=excluded_cols)
data_all.dropna(inplace=True)
data_all[int_cast_cols] = data_all[int_cast_cols].astype('int64')
data_all_cod = pd.get_dummies(data_all)

# float64 columns are treated as continuous, int64 columns as discrete.
continuas_cols = list(data_all_cod.select_dtypes(include=['float64']).columns)
discretas_cols = list(data_all_cod.select_dtypes(include=['int64']).columns)

# StandardScaler (continuous) + MinMaxScaler (discrete).
preprocessor_sc = ColumnTransformer(
    transformers=[
        ('scaler', StandardScaler(), continuas_cols),
        ('min_max', MinMaxScaler(), discretas_cols),
    ],
    remainder='passthrough',
)
data_all_cod_sc = preprocessor_sc.fit_transform(data_all_cod)

# PowerTransformer (continuous) + MinMaxScaler (discrete).
preprocessor_pt = ColumnTransformer(
    transformers=[
        ('pt', PowerTransformer(), continuas_cols),
        ('min_max', MinMaxScaler(), discretas_cols),
    ],
    remainder='passthrough',
)
data_all_cod_pt = preprocessor_pt.fit_transform(data_all_cod)

In [4]:
# KMeans clustering metrics on three versions of the data:
# np = not preprocessed, sc = StandardScaler, pt = PowerTransformer.
inertias_np, sil_scores_np, cal_scores_np, dav_scores_np = ml.clusters_kmeans(data=data_all_cod)
inertias_sc, sil_scores_sc, cal_scores_sc, dav_scores_sc = ml.clusters_kmeans(data=data_all_cod_sc)
inertias_pt, sil_scores_pt, cal_scores_pt, dav_scores_pt = ml.clusters_kmeans(data=data_all_cod_pt)

# One named row per metric/preprocessing pair; transposed so that each row
# of the final table is a number of clusters (2..10).
kmeans_metric_rows = {
    'silhouette kmeans np': sil_scores_np,
    'calinski kmeans np': cal_scores_np,
    'davies kmeans np': dav_scores_np,
    'silhouette kmeans sc': sil_scores_sc,
    'calinski kmeans sc': cal_scores_sc,
    'davies kmeans sc': dav_scores_sc,
    'silhouette kmeans pt': sil_scores_pt,
    'calinski kmeans pt': cal_scores_pt,
    'davies kmeans pt': dav_scores_pt,
}
df_kmeans = pd.DataFrame(
    data=list(kmeans_metric_rows.values()),
    index=list(kmeans_metric_rows.keys()),
    columns=range(2, 11),
).transpose()
df_kmeans.head()

Unnamed: 0,silhouette kmeans np,calinski kmeans np,davies kmeans np,silhouette kmeans sc,calinski kmeans sc,davies kmeans sc,silhouette kmeans pt,calinski kmeans pt,davies kmeans pt
2,0.179248,16.294256,1.912958,0.044086,5.407005,3.906467,0.044473,5.368955,3.957708
3,0.090813,12.70255,2.51756,0.036188,4.435521,3.519501,0.036471,4.604617,3.556109
4,0.087038,10.166513,2.373104,0.04072,4.112934,3.182334,0.034255,3.915305,3.278403
5,0.069441,8.627765,2.649309,0.029606,3.672968,3.248932,0.024582,3.562997,3.230534
6,0.069204,7.819516,2.007118,0.020156,3.352491,2.97884,0.028219,3.35907,3.146974


In [5]:
# KMeans metric plots (red circles = np, blue plus = sc, black squares = pt).
# A fresh figure is created explicitly: with the interactive qt5 backend,
# plt.subplot() would draw on whatever figure is still open, so running
# another plotting cell afterwards would overlay its curves on this one.
# NOTE: the *_scores_* variables are overwritten by the GMM and spectral
# cells, so this cell must run right after the KMeans metrics cell.
fig, axes = plt.subplots(2, 2)
cluster_counts = range(2, df_kmeans.index.max() + 1)
panels = [
    ('inertias', inertias_np, inertias_sc, inertias_pt),
    ('silhouette scores', sil_scores_np, sil_scores_sc, sil_scores_pt),
    ('calinski harabasz scores', cal_scores_np, cal_scores_sc, cal_scores_pt),
    ('davies bouldin scores', dav_scores_np, dav_scores_sc, dav_scores_pt),
]
for ax, (ylabel, values_np, values_sc, values_pt) in zip(axes.flat, panels):
    ax.plot(cluster_counts, values_np, 'or', label='np')
    ax.plot(cluster_counts, values_sc, 'b+', label='sc')
    ax.plot(cluster_counts, values_pt, 'ks', label='pt')
    ax.set_xlabel('number of clusters')
    ax.set_ylabel(ylabel)
# One legend is enough: every panel uses the same marker coding.
axes.flat[0].legend()
fig.tight_layout()

Text(0, 0.5, 'davies bouldin scores')

In [6]:
# Gaussian mixture clustering metrics on three versions of the data:
# np = not preprocessed, sc = StandardScaler, pt = PowerTransformer.
max_clusters = 10
gauss_scores_np, sil_scores_np, cal_scores_np, dav_scores_np = ml.clusters_gaussian(data=data_all_cod, max_clusters=max_clusters)
gauss_scores_sc, sil_scores_sc, cal_scores_sc, dav_scores_sc = ml.clusters_gaussian(data=data_all_cod_sc, max_clusters=max_clusters)
gauss_scores_pt, sil_scores_pt, cal_scores_pt, dav_scores_pt = ml.clusters_gaussian(data=data_all_cod_pt, max_clusters=max_clusters)

# GMM metrics table: one named row per metric/preprocessing pair, transposed
# so that each row of the final table is a number of clusters.
gauss_metric_rows = {
    'silhouette gauss np': sil_scores_np,
    'calinski gauss np': cal_scores_np,
    'davies gauss np': dav_scores_np,
    'silhouette gauss sc': sil_scores_sc,
    'calinski gauss sc': cal_scores_sc,
    'davies gauss sc': dav_scores_sc,
    'silhouette gauss pt': sil_scores_pt,
    'calinski gauss pt': cal_scores_pt,
    'davies gauss pt': dav_scores_pt,
}
df_gauss = pd.DataFrame(
    data=list(gauss_metric_rows.values()),
    index=list(gauss_metric_rows.keys()),
    columns=range(2, max_clusters + 1),
).transpose()
df_gauss.head()

Unnamed: 0,silhouette gauss np,calinski gauss np,davies gauss np,silhouette gauss sc,calinski gauss sc,davies gauss sc,silhouette gauss pt,calinski gauss pt,davies gauss pt
2,0.169738,16.263001,1.976148,0.034735,3.495575,4.219364,0.034631,4.440883,4.109559
3,0.095863,12.457332,2.477288,0.045606,4.357119,3.504549,0.032947,4.103121,3.736019
4,0.072063,9.604916,2.388518,0.037144,3.644169,3.341213,0.024885,3.70848,3.4272
5,0.068615,8.182749,2.318511,0.024264,3.331274,3.282261,0.027539,3.422144,3.266041
6,0.05701,7.200882,2.397189,0.029693,3.256066,2.960107,0.028219,3.35907,3.146974


In [7]:
# GMM metric plots (red circles = np, blue plus = sc, black squares = pt).
# A fresh figure is created explicitly: with the interactive qt5 backend,
# plt.subplot() would draw on whatever figure is still open, overlaying
# these curves on a previous plot.
# NOTE: the *_scores_* variables are shared with the KMeans and spectral
# cells, so this cell must run right after the GMM metrics cell.
fig, axes = plt.subplots(2, 2)
cluster_counts = range(2, df_gauss.index.max() + 1)
panels = [
    # The first panel was labelled 'inertias', but it plots the first value
    # returned by ml.clusters_gaussian, not a KMeans inertia.
    # NOTE(review): replace with the exact score name (BIC, log-likelihood, ...)
    # once confirmed against funciones_modelos_ML.
    ('GMM scores', gauss_scores_np, gauss_scores_sc, gauss_scores_pt),
    ('silhouette scores', sil_scores_np, sil_scores_sc, sil_scores_pt),
    ('calinski harabasz scores', cal_scores_np, cal_scores_sc, cal_scores_pt),
    ('davies bouldin scores', dav_scores_np, dav_scores_sc, dav_scores_pt),
]
for ax, (ylabel, values_np, values_sc, values_pt) in zip(axes.flat, panels):
    ax.plot(cluster_counts, values_np, 'or', label='np')
    ax.plot(cluster_counts, values_sc, 'b+', label='sc')
    ax.plot(cluster_counts, values_pt, 'ks', label='pt')
    ax.set_xlabel('number of clusters')
    ax.set_ylabel(ylabel)
# One legend is enough: every panel uses the same marker coding.
axes.flat[0].legend()
fig.tight_layout()

Text(0, 0.5, 'davies bouldin scores')

In [8]:
# Spectral clustering metrics on three versions of the data:
# np = not preprocessed, sc = StandardScaler, pt = PowerTransformer.
max_clusters = 10
sil_scores_np, cal_scores_np, dav_scores_np = ml.clusters_spectral(data=data_all_cod, max_clusters=max_clusters)
sil_scores_sc, cal_scores_sc, dav_scores_sc = ml.clusters_spectral(data=data_all_cod_sc, max_clusters=max_clusters)
sil_scores_pt, cal_scores_pt, dav_scores_pt = ml.clusters_spectral(data=data_all_cod_pt, max_clusters=max_clusters)
# NOTE(review): in the recorded output the 'np' metrics are identical for
# every number of clusters shown (2..6); verify that ml.clusters_spectral
# really varies the partition on unscaled data.

# Spectral clustering metrics table. The first label used to read
# 'silhouette spèctral np'; the typo leaked into the combined table and the
# exported CSV, so it is spelled like its sibling columns here.
df_spectral = pd.DataFrame(
    data=[sil_scores_np, cal_scores_np, dav_scores_np, sil_scores_sc, cal_scores_sc, dav_scores_sc, sil_scores_pt, cal_scores_pt, dav_scores_pt],
    index=['silhouette spectral np', 'calinski spectral np', 'davies spectral np', 'silhouette spectral sc', 'calinski spectral sc', 'davies spectral sc', 'silhouette spectral pt', 'calinski spectral pt', 'davies spectral pt'],
    columns=range(2, max_clusters + 1),
)
# One row per number of clusters.
df_spectral = df_spectral.transpose()
df_spectral.head()

Unnamed: 0,silhouette spèctral np,calinski spectral np,davies spectral np,silhouette spectral sc,calinski spectral sc,davies spectral sc,silhouette spectral pt,calinski spectral pt,davies spectral pt
2,0.268039,2.695965,0.586645,0.206637,2.197807,0.659929,0.15158,1.820085,0.727873
3,0.268039,2.695965,0.586645,0.169334,2.206564,0.660351,0.062695,1.616448,0.794792
4,0.268039,2.695965,0.586645,0.162149,2.200767,0.662241,-0.030283,2.108974,3.358815
5,0.268039,2.695965,0.586645,-0.004432,1.768858,2.08322,-0.072222,1.148797,1.530875
6,0.268039,2.695965,0.586645,-0.026035,1.922106,1.443869,-0.067318,1.300244,1.90392


In [9]:
# Spectral clustering metric plots (red circles = np, blue plus = sc,
# black squares = pt).
# A fresh figure is created explicitly: with the interactive qt5 backend,
# plt.subplot() would draw on whatever figure is still open, overlaying
# these curves on a previous plot.
# NOTE: the *_scores_* variables are shared with the KMeans and GMM cells,
# so this cell must run right after the spectral metrics cell.
fig, axes = plt.subplots(2, 2)
cluster_counts = range(2, df_spectral.index.max() + 1)
# Spectral clustering has no inertia-like score, so the top-left panel is
# unused; hide it instead of leaving an empty set of axes.
axes[0, 0].axis('off')
panels = [
    ('silhouette scores', sil_scores_np, sil_scores_sc, sil_scores_pt),
    ('calinski harabasz scores', cal_scores_np, cal_scores_sc, cal_scores_pt),
    ('davies bouldin scores', dav_scores_np, dav_scores_sc, dav_scores_pt),
]
for ax, (ylabel, values_np, values_sc, values_pt) in zip(axes.ravel()[1:], panels):
    ax.plot(cluster_counts, values_np, 'or', label='np')
    ax.plot(cluster_counts, values_sc, 'b+', label='sc')
    ax.plot(cluster_counts, values_pt, 'ks', label='pt')
    ax.set_xlabel('number of clusters')
    ax.set_ylabel(ylabel)
# One legend is enough: every panel uses the same marker coding.
axes[0, 1].legend()
fig.tight_layout()

Text(0, 0.5, 'davies bouldin scores')

In [22]:
# Side-by-side table of all clustering metrics, aligned on the number of clusters.
metric_tables = (df_kmeans, df_gauss, df_spectral)
df_todos = pd.concat(metric_tables, axis='columns')
df_todos.head()

Unnamed: 0,silhouette kmeans np,calinski kmeans np,davies kmeans np,silhouette kmeans sc,calinski kmeans sc,davies kmeans sc,silhouette kmeans pt,calinski kmeans pt,davies kmeans pt,silhouette gauss np,...,davies gauss pt,silhouette spèctral np,calinski spectral np,davies spectral np,silhouette spectral sc,calinski spectral sc,davies spectral sc,silhouette spectral pt,calinski spectral pt,davies spectral pt
2,0.179248,16.294256,1.912958,0.044086,5.407005,3.906467,0.044473,5.368955,3.957708,0.169738,...,4.109559,0.268039,2.695965,0.586645,0.206637,2.197807,0.659929,0.15158,1.820085,0.727873
3,0.090813,12.70255,2.51756,0.036188,4.435521,3.519501,0.036471,4.604617,3.556109,0.095863,...,3.736019,0.268039,2.695965,0.586645,0.169334,2.206564,0.660351,0.062695,1.616448,0.794792
4,0.087038,10.166513,2.373104,0.04072,4.112934,3.182334,0.034255,3.915305,3.278403,0.072063,...,3.4272,0.268039,2.695965,0.586645,0.162149,2.200767,0.662241,-0.030283,2.108974,3.358815
5,0.069441,8.627765,2.649309,0.029606,3.672968,3.248932,0.024582,3.562997,3.230534,0.068615,...,3.266041,0.268039,2.695965,0.586645,-0.004432,1.768858,2.08322,-0.072222,1.148797,1.530875
6,0.069204,7.819516,2.007118,0.020156,3.352491,2.97884,0.028219,3.35907,3.146974,0.05701,...,3.146974,0.268039,2.695965,0.586645,-0.026035,1.922106,1.443869,-0.067318,1.300244,1.90392


In [13]:
# Export the combined clustering metrics; the index (number of clusters)
# is written under the header 'number of clusters'.
df_todos.to_csv('métricas clustering sin edad.csv', index_label='number of clusters')

In [5]:
# The best-looking models appear to be KMeans and GMM with 2 clusters on
# the unscaled data, so both are fitted here with a fixed seed.
# NOTE(review): the permutation-importance output recorded later in this
# notebook attributes all importance to IMA, which suggests the unscaled
# partition may be driven by that single column. Confirm.
n = 2
kmeans = KMeans(n_clusters=n, random_state=72).fit(data_all_cod)
gauss = GaussianMixture(n_components=n, random_state=72).fit(data_all_cod)
labels_k = kmeans.labels_
labels_g = gauss.predict(data_all_cod)

# Original (non-encoded) data with one label column appended per model.
data_clusters = data_all.assign(**{'labels kmeans': labels_k, 'labels gmm': labels_g})
data_clusters

Unnamed: 0_level_0,diameter_ac,diameter_ai,diameter_ad,diameter_b1c,diameter_b1i,diameter_b1d,diameter_b2c,diameter_b2i,diameter_b2d,diameter_dc,...,IRI_PD,IMA,RPQ AR,RPQ AP,TD,AN,AL,IH,labels kmeans,labels gmm
subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21100,0.269841,0.222222,0.047619,0.333333,0.269841,0.063492,0.285714,0.222222,0.063492,0.365079,...,11,27,20,16,1.50,5.75,9.00,0.5,1,1
21101,0.222222,0.269841,-0.047619,0.269841,0.365079,-0.095238,0.333333,0.365079,-0.031746,0.317460,...,6,70,33,26,0.00,1.25,9.50,0.5,0,0
21102,0.301587,0.301587,0.000000,0.285714,0.285714,0.000000,0.253968,0.301587,-0.047619,0.380952,...,10,42,13,12,0.00,0.00,4.50,0.0,0,0
21103,0.333333,0.396825,-0.063492,0.269841,0.285714,-0.015873,0.349206,0.285714,0.063492,0.333333,...,12,32,16,15,0.00,0.00,6.25,0.0,1,1
21104,0.238095,0.238095,0.000000,0.317460,0.396825,-0.079365,0.269841,0.380952,-0.111111,0.285714,...,8,30,16,17,2.00,1.50,7.50,0.0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24053,0.222222,0.222222,0.000000,0.253968,0.269841,-0.015873,0.269841,0.190476,0.079365,0.365079,...,14,36,16,12,0.00,3.25,7.25,0.0,1,1
24065,0.285714,0.285714,0.000000,0.285714,0.301587,-0.015873,0.285714,0.222222,0.063492,0.380952,...,8,39,18,12,4.25,5.25,7.50,0.0,1,0
24069,0.253968,0.301587,-0.047619,0.380952,0.285714,0.095238,0.396825,0.301587,0.095238,0.301587,...,12,28,15,12,2.50,4.00,10.00,0.0,1,1
24073,0.412698,0.460317,-0.047619,0.253968,0.317460,-0.063492,0.222222,0.206349,0.015873,0.317460,...,13,27,16,12,0.00,0.00,9.50,0.0,1,1


In [29]:
# Export the data together with the KMeans and GMM cluster labels.
data_clusters.to_csv('clusters_kmeans_gmm_3_dataset_sin edad.csv')

# Validación clústeres con modelos de clasificación

In [6]:
def val_test_scores(model):
    """Cross-validate `model` on the training split and score it on the test split.

    Reads the module-level variables X_train, y_train_label, X_test and
    y_test_label, which must be defined before calling.

    Parameters
    ----------
    model : estimator exposing fit(X, y) and score(X, y).

    Returns
    -------
    tuple
        (5-fold cross-validation accuracy scores on the training split,
        score of the model on the test split after refitting on the whole
        training split).
    """
    cv_scores = cross_val_score(
        estimator=model,
        X=X_train,
        y=y_train_label,
        scoring='accuracy',
        cv=5,
    )
    # Refit on the full training split before evaluating on held-out data.
    model.fit(X_train, y_train_label)
    holdout_score = model.score(X_test, y_test_label)
    return cv_scores, holdout_score

def modelo_xgboost_np(param_space):
    """Tune an XGBoost classifier on the raw (non-preprocessed) features.

    The search maximises mean 5-fold cross-validation accuracy on the
    module-level X_train / y_train_label. The best configuration is then
    evaluated with val_test_scores.

    Parameters
    ----------
    param_space : dict
        Hyperparameter search space handed to mango's Tuner.

    Returns
    -------
    tuple
        (best parameters, cross-validation scores, test score).
    """
    @scheduler.parallel(n_jobs=-1)
    def objective(**params):
        # Mean cross-validated accuracy of one candidate configuration.
        candidate = XGBClassifier(**params)
        fold_scores = cross_val_score(estimator=candidate, X=X_train, y=y_train_label, scoring='accuracy', cv=5)
        return fold_scores.mean()

    tuner_config = {'num_iteration': 40, 'domain_size': 10000, 'initial_random': 3}
    best_results = Tuner(param_space, objective, conf_dict=tuner_config).maximize()
    params = best_results['best_params']
    print('best parameters np:', params)
    print('best accuracy np:', best_results['best_objective'])

    # Validation and test scores of the winning configuration.
    scores, test_score = val_test_scores(model=XGBClassifier(**params))
    return params, scores, test_score

def modelo_xgboost_sc(param_space, preprocessor):
    """Tune a preprocessing + XGBoost pipeline.

    The search maximises mean 5-fold cross-validation accuracy on the
    module-level X_train / y_train_label. The best configuration is then
    evaluated with val_test_scores.

    Parameters
    ----------
    param_space : dict
        Hyperparameter search space handed to mango's Tuner.
    preprocessor : transformer
        First step of the pipeline, used both during tuning and in the
        final model.

    Returns
    -------
    tuple
        (best parameters, cross-validation scores, test score).
    """
    @scheduler.parallel(n_jobs=-1)
    def objective(**params):
        # `preprocessor` is deliberately NOT declared global here. Doing so
        # made the search silently use whatever module-level variable had
        # that name instead of the argument passed to this function, so the
        # tuned pipeline and the final pipeline could differ.
        model = Pipeline([('preprocessing', preprocessor), ('xg', XGBClassifier(**params))])
        score = cross_val_score(estimator=model, X=X_train, y=y_train_label, scoring='accuracy', cv=5).mean()
        return score

    conf_dict = dict(num_iteration=40, domain_size=10000, initial_random=3)
    tuner = Tuner(param_space, objective, conf_dict=conf_dict)
    best_results = tuner.maximize()
    print('best parameters:', best_results['best_params'])
    print('best accuracy:', best_results['best_objective'])

    # Validation and test scores of the winning configuration.
    params = best_results['best_params']
    model = Pipeline([('preprocessing', preprocessor), ('xg', XGBClassifier(**params))])
    scores, test_score = val_test_scores(model=model)
    return params, scores, test_score

## Clasificadores con etiquetas GMM

In [7]:
# Features: every column except the two cluster-label columns, one-hot encoded.
X = data_clusters.drop(['labels kmeans', 'labels gmm'], axis=1).copy()
X = pd.get_dummies(X)
# Target: GMM cluster labels.
y = data_clusters['labels gmm'].copy()

# Stratified hold-out split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=77, stratify=y)
# Fit the encoder once, on the training labels, and reuse it for the test
# labels. Fitting a separate encoder per split can assign different codes to
# the same class whenever the two splits do not contain the same label set.
label_encoder = LabelEncoder().fit(y_train)
y_train_label = label_encoder.transform(y_train)
y_test_label = label_encoder.transform(y_test)

# Column groups for preprocessing: float64 -> continuous, int64 -> discrete.
continuas_cols = X_train.select_dtypes(include=['float64']).columns.to_list()
discretas_cols = X_train.select_dtypes(include=['int64']).columns.to_list()

# Hyperparameter search space.
param_space = dict(n_estimators=range(1,100), max_depth=range(3,10), subsample=uniform(0.1,0.9), eta=uniform(0,1), colsample_bytree=uniform(0.1,0.9))

In [8]:
# XGBoost on the raw features.
params_np, scores_np, test_score_np = modelo_xgboost_np(param_space=param_space)

# XGBoost after StandardScaler (continuous) + MinMaxScaler (discrete).
preprocessor = ColumnTransformer(
    transformers=[
        ('scaler', StandardScaler(), continuas_cols),
        ('min_max', MinMaxScaler(), discretas_cols),
    ],
    remainder='passthrough',
)
params_sc, scores_sc, test_score_sc = modelo_xgboost_sc(param_space=param_space, preprocessor=preprocessor)

# XGBoost after PowerTransformer (continuous) + MinMaxScaler (discrete).
preprocessor = ColumnTransformer(
    transformers=[
        ('pt', PowerTransformer(), continuas_cols),
        ('min_max', MinMaxScaler(), discretas_cols),
    ],
    remainder='passthrough',
)
params_pt, scores_pt, test_score_pt = modelo_xgboost_sc(param_space=param_space, preprocessor=preprocessor)

Best score: 1.0: 100%|██████████| 40/40 [00:44<00:00,  1.12s/it]               


best parameters np: {'colsample_bytree': 0.5376641235275361, 'eta': 0.6663638775704284, 'max_depth': 7, 'n_estimators': 64, 'subsample': 0.9126894832131548}
best accuracy np: 1.0


Best score: 1.0: 100%|██████████| 40/40 [00:40<00:00,  1.01s/it]               


best parameters: {'colsample_bytree': 0.7708280429101674, 'eta': 0.8741167545617391, 'max_depth': 6, 'n_estimators': 32, 'subsample': 0.6827071518781187}
best accuracy: 1.0


Best score: 0.9857142857142858: 100%|██████████| 40/40 [03:01<00:00,  4.53s/it]


best parameters: {'colsample_bytree': 0.9937474941539925, 'eta': 0.386224880515471, 'max_depth': 9, 'n_estimators': 46, 'subsample': 0.9106290254375463}
best accuracy: 0.9857142857142858


## Clasificadores con etiquetas KMeans

In [9]:
# Target: KMeans cluster labels (X is reused from the GMM section).
y = data_clusters['labels kmeans'].copy()

# Stratified hold-out split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=77, stratify=y)
# Fit the encoder once, on the training labels, and reuse it for the test
# labels. Fitting a separate encoder per split can assign different codes to
# the same class whenever the two splits do not contain the same label set.
label_encoder = LabelEncoder().fit(y_train)
y_train_label = label_encoder.transform(y_train)
y_test_label = label_encoder.transform(y_test)

# Column groups for preprocessing: float64 -> continuous, int64 -> discrete.
continuas_cols = X_train.select_dtypes(include=['float64']).columns.to_list()
discretas_cols = X_train.select_dtypes(include=['int64']).columns.to_list()

In [10]:
# XGBoost on the raw features.
params_np_k, scores_np_k, test_score_np_k = modelo_xgboost_np(param_space=param_space)

# XGBoost after StandardScaler (continuous) + MinMaxScaler (discrete).
preprocessor = ColumnTransformer(
    transformers=[
        ('scaler', StandardScaler(), continuas_cols),
        ('min_max', MinMaxScaler(), discretas_cols),
    ],
    remainder='passthrough',
)
params_sc_k, scores_sc_k, test_score_sc_k = modelo_xgboost_sc(param_space=param_space, preprocessor=preprocessor)

# XGBoost after PowerTransformer (continuous) + MinMaxScaler (discrete).
preprocessor = ColumnTransformer(
    transformers=[
        ('pt', PowerTransformer(), continuas_cols),
        ('min_max', MinMaxScaler(), discretas_cols),
    ],
    remainder='passthrough',
)
params_pt_k, scores_pt_k, test_score_pt_k = modelo_xgboost_sc(param_space=param_space, preprocessor=preprocessor)

Best score: 1.0: 100%|██████████| 40/40 [00:43<00:00,  1.09s/it]               


best parameters np: {'colsample_bytree': 0.8049327844690753, 'eta': 0.7767005778293112, 'max_depth': 5, 'n_estimators': 87, 'subsample': 0.3333727910687473}
best accuracy np: 1.0


Best score: 1.0: 100%|██████████| 40/40 [00:56<00:00,  1.41s/it]              


best parameters: {'colsample_bytree': 0.560861509573755, 'eta': 0.46348365141296843, 'max_depth': 7, 'n_estimators': 22, 'subsample': 0.6523351810230892}
best accuracy: 1.0


Best score: 1.0: 100%|██████████| 40/40 [03:32<00:00,  5.31s/it]               


best parameters: {'colsample_bytree': 0.662017858298335, 'eta': 0.5895545155155124, 'max_depth': 7, 'n_estimators': 74, 'subsample': 0.6742622602871143}
best accuracy: 1.0


In [11]:
# Summary table: one row per (label source, preprocessing) combination.
# Each entry holds (best parameters, cross-validation scores, test score).
classifier_runs = {
    'GMM np': (params_np, scores_np, test_score_np),
    'GMM sc': (params_sc, scores_sc, test_score_sc),
    'GMM pt': (params_pt, scores_pt, test_score_pt),
    'KMeans np': (params_np_k, scores_np_k, test_score_np_k),
    'KMeans sc': (params_sc_k, scores_sc_k, test_score_sc_k),
    'KMeans pt': (params_pt_k, scores_pt_k, test_score_pt_k),
}
clf_results = pd.DataFrame(
    data=[
        [best_params, cv_scores.mean(), cv_scores.std(), test_score]
        for best_params, cv_scores, test_score in classifier_runs.values()
    ],
    index=list(classifier_runs),
    columns=['best parameters', 'mean cv score', 'sd cv score', 'test score'],
)
clf_results

Unnamed: 0,best parameters,mean cv score,sd cv score,test score
GMM np,"{'colsample_bytree': 0.5376641235275361, 'eta'...",1.0,0.0,1.0
GMM sc,"{'colsample_bytree': 0.7708280429101674, 'eta'...",1.0,0.0,1.0
GMM pt,"{'colsample_bytree': 0.9937474941539925, 'eta'...",0.985714,0.028571,1.0
KMeans np,"{'colsample_bytree': 0.8049327844690753, 'eta'...",1.0,0.0,0.944444
KMeans sc,"{'colsample_bytree': 0.560861509573755, 'eta':...",1.0,0.0,0.944444
KMeans pt,"{'colsample_bytree': 0.662017858298335, 'eta':...",1.0,0.0,0.944444


In [12]:
# Best model: XGBoost on raw features with GMM labels.
y = data_clusters['labels gmm'].copy()
# Single .loc lookup (row, column) instead of chained indexing.
params = clf_results.loc['GMM np', 'best parameters']
model, model_fit = ml.mejor_modelo(params=params, X=X, y=y, pre_pipe='np')

mean val score:  1.0
std val score:  0.0
test score:  1.0


In [13]:
# Análisis de relevancia
y_label = LabelEncoder().fit_transform(y)
r = permutation_importance(model_fit, X, y_label,n_repeats=20,random_state=0, scoring='accuracy')
importancia_atributos = pd.DataFrame(data=[r.importances_mean], columns=X.columns, index=['mean importance']).transpose()
importancia_atributos.sort_values(by='mean importance', ascending=False)

Unnamed: 0,mean importance
IMA,0.375568
diameter_ac,0.000000
mean_mpli_di,0.000000
mean_mpli_b1i,0.000000
mean_mpli_b1d,0.000000
...,...
max_betweenness_gi,0.000000
max_betweenness_gd,0.000000
max_betweenness_tc,0.000000
max_betweenness_ti,0.000000


In [14]:
# Export the mean permutation importances; the index (feature names) is
# written under the header 'feature'.
importancia_atributos.to_csv('importancia atributos clustering 2 dataset sin edad.csv', index_label='feature')

In [15]:
# Rebuild the GMM-label hold-out split (same seed and stratification as in
# the classification section).
y = data_clusters['labels gmm'].copy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=77, stratify=y)

# Subjects reported by ml.errores, joined by subject index with their full
# record. The merge duplicates 'labels gmm' as '_x' / '_y'; keep the '_x'
# copy under the original name and discard the other.
df_errados = ml.errores(model=model, label='labels gmm', X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)
datos_errados = (
    pd.merge(df_errados, data_clusters, how='inner', left_index=True, right_index=True)
    .rename(columns={'labels gmm_x': 'labels gmm'})
    .drop(columns=['labels gmm_y'])
)
datos_errados

Unnamed: 0_level_0,labels gmm,predicted,diameter_ac,diameter_ai,diameter_ad,diameter_b1c,diameter_b1i,diameter_b1d,diameter_b2c,diameter_b2i,...,IRI_EC,IRI_PD,IMA,RPQ AR,RPQ AP,TD,AN,AL,IH,labels kmeans
subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [34]:
# Export the subjects reported as errors together with their full record.
datos_errados.to_csv('errores clustering 2 dataset sin edad gmm.csv')