In [60]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PowerTransformer, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans, SpectralClustering
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.inspection import permutation_importance
from mango import Tuner, scheduler
from scipy.stats import uniform
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
%matplotlib qt5

In [2]:
data = pd.read_csv('medidas_conectividad_globales_comportamentales_demograficos_2.csv')
data.set_index('subject', inplace=True)
# variables más importantes según modelos de clasificación
data = data[['max_degree_b1d', 'max_degree_b2c', 'max_degree_b2i', 'max_degree_gi', 'mean_eccentricity_ac', 'mean_eccentricity_b2i', 'mean_eccentricity_b2d', 'mean_eccentricity_gc', 'mean_eccentricity_gd', 'exposure_level', 'EX2_score', 'school_years', 'gender', 'IRI_PT', 'IRI_EC', 'RPQ AP', 'TD', 'AN', 'AL', 'mean_eccentricity_ti', 'IRI_PD', 'RPQ AR', 'victims_self']]
# eliminación de sujetos con datos incompletos
data.dropna(inplace=True)
df = data.copy()
# codificación de variables categóricas
data = pd.get_dummies(data)
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 88 entries, 21100 to 24101
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   max_degree_b1d         88 non-null     int64  
 1   max_degree_b2c         88 non-null     int64  
 2   max_degree_b2i         88 non-null     int64  
 3   max_degree_gi          88 non-null     int64  
 4   mean_eccentricity_ac   88 non-null     float64
 5   mean_eccentricity_b2i  88 non-null     float64
 6   mean_eccentricity_b2d  88 non-null     float64
 7   mean_eccentricity_gc   88 non-null     float64
 8   mean_eccentricity_gd   88 non-null     float64
 9   EX2_score              88 non-null     int64  
 10  school_years           88 non-null     int64  
 11  IRI_PT                 88 non-null     float64
 12  IRI_EC                 88 non-null     float64
 13  RPQ AP                 88 non-null     float64
 14  TD                     88 non-null     float64
 15  A

In [12]:
# normalización de datos con standard escaler
continuas_cols = data.select_dtypes(include=['float64']).columns.to_list()
discretas_cols = data.select_dtypes(include=['int64']).columns.to_list()
preprocessor_sc = ColumnTransformer([('scaler', StandardScaler(), continuas_cols), ('min_max', MinMaxScaler(), discretas_cols)], remainder='passthrough')
data_sc = preprocessor_sc.fit_transform(data)
# normalización de datos con power transformer
preprocessor_pt = ColumnTransformer([('pt', PowerTransformer(), continuas_cols), ('min_max', MinMaxScaler(), discretas_cols)], remainder='passthrough')
data_pt = preprocessor_pt.fit_transform(data)

In [20]:
def clusters_kmeans(data, max_clusters=10):
    inertias = []
    sil_scores = []
    cal_scores = []
    dav_scores = []
    for i in range(2,max_clusters+1):
        kmeans = KMeans(i, random_state=72).fit(data)
        inertia = kmeans.inertia_
        inertias.append(inertia)
        labels = kmeans.labels_
        sil = silhouette_score(X=data, labels=labels)
        sil_scores.append(sil)
        cal = calinski_harabasz_score(X=data, labels=labels)
        cal_scores.append(cal)
        dav = davies_bouldin_score(X=data, labels=labels)
        dav_scores.append(dav)
    return inertias, sil_scores, cal_scores, dav_scores

In [21]:
# clústeres por KMeans, datos sin normalizar
inertias_np, sil_scores_np, cal_scores_np, dav_scores_np = clusters_kmeans(data=data)
# clústeres por KMeans, Standard Scaler
inertias_sc, sil_scores_sc, cal_scores_sc, dav_scores_sc = clusters_kmeans(data=data_sc)
# clústeres por KMeans, PowerTransformer
inertias_pt, sil_scores_pt, cal_scores_pt, dav_scores_pt = clusters_kmeans(data=data_pt)
# Dataframe con métricas de clustering de KMeans
df_kmeans = pd.DataFrame(data=[sil_scores_np, cal_scores_np, dav_scores_np, sil_scores_sc, cal_scores_sc, dav_scores_sc, sil_scores_pt, cal_scores_pt, dav_scores_pt], index=['silhouette kmeans np', 'calinski kmeans np', 'davies kmeans np', 'silhouette kmeans sc', 'calinski kmeans sc', 'davies kmeans sc', 'silhouette kmeans pt', 'calinski kmeans pt', 'davies kmeans pt'], columns=range(2,11))
df_kmeans = df_kmeans.transpose()
df_kmeans.head()

Unnamed: 0,silhouette kmeans np,calinski kmeans np,davies kmeans np,silhouette kmeans sc,calinski kmeans sc,davies kmeans sc,silhouette kmeans pt,calinski kmeans pt,davies kmeans pt
2,0.14043,15.948231,2.258104,0.115895,10.520811,2.745652,0.103173,11.578432,2.689949
3,0.141865,13.54372,1.953296,0.096446,10.074285,2.42931,0.102325,10.640033,2.487609
4,0.104275,11.728366,2.177642,0.096957,8.989544,2.212793,0.088649,9.166728,2.556416
5,0.098943,10.313543,2.024446,0.088588,8.588638,2.195654,0.09174,8.539001,2.232042
6,0.103143,9.694866,1.749558,0.101929,8.109551,1.998565,0.084863,7.459235,2.188218


In [25]:
# Gráficas
plt.subplot(2,2,1)
plt.plot(range(2,df_kmeans.index.max()+1), inertias_np, 'or')
plt.plot(range(2,df_kmeans.index.max()+1), inertias_sc, 'b+')
plt.plot(range(2,df_kmeans.index.max()+1), inertias_pt, 'ks')
plt.xlabel('number of clusters')
plt.ylabel('inertias')
plt.subplot(2,2,2)
plt.plot(range(2,df_kmeans.index.max()+1), sil_scores_np, 'or')
plt.plot(range(2,df_kmeans.index.max()+1), sil_scores_sc, 'b+')
plt.plot(range(2,df_kmeans.index.max()+1), sil_scores_pt, 'ks')
plt.xlabel('number of clusters')
plt.ylabel('silhouette scores')
plt.subplot(2,2,3)
plt.plot(range(2,df_kmeans.index.max()+1), cal_scores_np, 'or')
plt.plot(range(2,df_kmeans.index.max()+1), cal_scores_sc, 'b+')
plt.plot(range(2,df_kmeans.index.max()+1), cal_scores_pt, 'ks')
plt.xlabel('number of clusters')
plt.ylabel('calinski harabasz scores')
plt.subplot(2,2,4)
plt.plot(range(2,df_kmeans.index.max()+1), dav_scores_np, 'or')
plt.plot(range(2,df_kmeans.index.max()+1), dav_scores_sc, 'b+')
plt.plot(range(2,df_kmeans.index.max()+1), dav_scores_pt, 'ks')
plt.xlabel('number of clusters')
plt.ylabel('davies bouldin scores')


Text(0, 0.5, 'davies bouldin scores')

In [26]:
def clusters_gaussian(data, max_clusters=10):
    gauss_scores = []
    sil_scores = []
    cal_scores = []
    dav_scores = []
    for i in range(2,max_clusters+1):
        gauss = GaussianMixture(i, random_state=72).fit(data)
        labels = gauss.predict(data)
        score = gauss.score(data)
        gauss_scores.append(score)
        sil = silhouette_score(X=data, labels=labels)
        sil_scores.append(sil)
        cal = calinski_harabasz_score(X=data, labels=labels)
        cal_scores.append(cal)
        dav = davies_bouldin_score(X=data, labels=labels)
        dav_scores.append(dav)
    return gauss_scores, sil_scores, cal_scores, dav_scores

In [27]:
# clústeres por GMM, datos sin normalizar
max_clusters = 10
gauss_scores_np, sil_scores_np, cal_scores_np, dav_scores_np = clusters_gaussian(data=data, max_clusters=max_clusters)
# clústeres por GMM, Standard Scaler
gauss_scores_sc, sil_scores_sc, cal_scores_sc, dav_scores_sc = clusters_gaussian(data=data_sc, max_clusters=max_clusters)
# clústeres por KMeans, PowerTransformer
gauss_scores_pt, sil_scores_pt, cal_scores_pt, dav_scores_pt = clusters_gaussian(data=data_pt, max_clusters=max_clusters)
# Dataframe con métricas de clustering de KMeans
df_gauss = pd.DataFrame(data=[sil_scores_np, cal_scores_np, dav_scores_np, sil_scores_sc, cal_scores_sc, dav_scores_sc, sil_scores_pt, cal_scores_pt, dav_scores_pt], index=['silhouette gauss np', 'calinski gauss np', 'davies gauss np', 'silhouette gauss sc', 'calinski gauss sc', 'davies gauss sc', 'silhouette gauss pt', 'calinski gauss pt', 'davies gauss pt'], columns=range(2,max_clusters+1))
df_gauss = df_gauss.transpose()
df_gauss.head()

Unnamed: 0,silhouette gauss np,calinski gauss np,davies gauss np,silhouette gauss sc,calinski gauss sc,davies gauss sc,silhouette gauss pt,calinski gauss pt,davies gauss pt
2,0.14043,15.948231,2.258104,0.141303,8.995966,2.301088,0.103173,11.578432,2.689949
3,0.137739,13.441898,1.974317,0.097107,9.079189,2.513739,0.08598,9.237497,2.797421
4,0.121609,11.223888,1.937475,0.099765,7.586436,2.041934,0.099326,9.161644,2.39628
5,0.08846,9.265846,2.221143,0.082408,7.393322,2.169239,0.093969,8.31739,2.302571
6,0.09278,9.026629,2.015649,0.083356,7.053722,2.172089,0.086713,7.297082,2.327979


In [28]:
# Gráficas
plt.subplot(2,2,1)
plt.plot(range(2,df_gauss.index.max()+1), gauss_scores_np, 'or')
plt.plot(range(2,df_gauss.index.max()+1), gauss_scores_sc, 'b+')
plt.plot(range(2,df_gauss.index.max()+1), gauss_scores_pt, 'ks')
plt.xlabel('number of clusters')
plt.ylabel('inertias')
plt.subplot(2,2,2)
plt.plot(range(2,df_gauss.index.max()+1), sil_scores_np, 'or')
plt.plot(range(2,df_gauss.index.max()+1), sil_scores_sc, 'b+')
plt.plot(range(2,df_gauss.index.max()+1), sil_scores_pt, 'ks')
plt.xlabel('number of clusters')
plt.ylabel('silhouette scores')
plt.subplot(2,2,3)
plt.plot(range(2,df_gauss.index.max()+1), cal_scores_np, 'or')
plt.plot(range(2,df_gauss.index.max()+1), cal_scores_sc, 'b+')
plt.plot(range(2,df_gauss.index.max()+1), cal_scores_pt, 'ks')
plt.xlabel('number of clusters')
plt.ylabel('calinski harabasz scores')
plt.subplot(2,2,4)
plt.plot(range(2,df_gauss.index.max()+1), dav_scores_np, 'or')
plt.plot(range(2,df_gauss.index.max()+1), dav_scores_sc, 'b+')
plt.plot(range(2,df_gauss.index.max()+1), dav_scores_pt, 'ks')
plt.xlabel('number of clusters')
plt.ylabel('davies bouldin scores')

Text(0, 0.5, 'davies bouldin scores')

In [29]:
def clusters_spectral(data, max_clusters=10):
    sil_scores = []
    cal_scores = []
    dav_scores = []
    for i in range(2,max_clusters+1):
        sc = SpectralClustering(i, random_state=72).fit(data)
        labels = sc.labels_
        sil = silhouette_score(X=data, labels=labels)
        sil_scores.append(sil)
        cal = calinski_harabasz_score(X=data, labels=labels)
        cal_scores.append(cal)
        dav = davies_bouldin_score(X=data, labels=labels)
        dav_scores.append(dav)
    return sil_scores, cal_scores, dav_scores

In [30]:
# clústeres por Spectral Clustering, datos sin normalizar
max_clusters=10
sil_scores_np, cal_scores_np, dav_scores_np = clusters_spectral(data=data, max_clusters=max_clusters)
# clústeres por Spectral Clustering, Standard Scaler
sil_scores_sc, cal_scores_sc, dav_scores_sc = clusters_spectral(data=data_sc, max_clusters=max_clusters)
# clústeres por Spectral Clustering, PowerTransformer
sil_scores_pt, cal_scores_pt, dav_scores_pt = clusters_spectral(data=data_pt, max_clusters=max_clusters)
# Dataframe con métricas de clustering de KMeans
df_spectral = pd.DataFrame(data=[sil_scores_np, cal_scores_np, dav_scores_np, sil_scores_sc, cal_scores_sc, dav_scores_sc, sil_scores_pt, cal_scores_pt, dav_scores_pt], index=['silhouette spèctral np', 'calinski spectral np', 'davies spectral np', 'silhouette spectral sc', 'calinski spectral sc', 'davies spectral sc', 'silhouette spectral pt', 'calinski spectral pt', 'davies spectral pt'], columns=range(2,max_clusters+1))
df_spectral = df_spectral.transpose()
df_spectral.head()

Unnamed: 0,silhouette spèctral np,calinski spectral np,davies spectral np,silhouette spectral sc,calinski spectral sc,davies spectral sc,silhouette spectral pt,calinski spectral pt,davies spectral pt
2,0.384709,4.323644,0.466401,0.072662,9.213499,2.592379,0.058295,5.064337,2.951708
3,0.384709,4.323644,0.466401,-0.023366,3.26742,2.116766,0.032183,3.873963,3.380819
4,0.312515,4.235392,0.479128,0.009084,5.097932,2.515847,0.033596,4.561816,2.944617
5,0.312515,4.235392,0.479128,0.016349,4.671844,3.27468,0.041663,5.055261,2.571906
6,0.384709,4.323644,0.466401,0.018762,4.734451,2.538218,0.018136,4.259066,2.469577


In [31]:
# Gráficas
plt.subplot(2,2,1)
plt.subplot(2,2,2)
plt.plot(range(2,df_spectral.index.max()+1), sil_scores_np, 'or')
plt.plot(range(2,df_spectral.index.max()+1), sil_scores_sc, 'b+')
plt.plot(range(2,df_spectral.index.max()+1), sil_scores_pt, 'ks')
plt.xlabel('number of clusters')
plt.ylabel('silhouette scores')
plt.subplot(2,2,3)
plt.plot(range(2,df_spectral.index.max()+1), cal_scores_np, 'or')
plt.plot(range(2,df_spectral.index.max()+1), cal_scores_sc, 'b+')
plt.plot(range(2,df_spectral.index.max()+1), cal_scores_pt, 'ks')
plt.xlabel('number of clusters')
plt.ylabel('calinski harabasz scores')
plt.subplot(2,2,4)
plt.plot(range(2,df_spectral.index.max()+1), dav_scores_np, 'or')
plt.plot(range(2,df_spectral.index.max()+1), dav_scores_sc, 'b+')
plt.plot(range(2,df_spectral.index.max()+1), dav_scores_pt, 'ks')
plt.xlabel('number of clusters')
plt.ylabel('davies bouldin scores')

Text(0, 0.5, 'davies bouldin scores')

In [36]:
df_todos = pd.concat([df_kmeans, df_gauss, df_spectral], axis=1)
df_todos.head()

Unnamed: 0,silhouette kmeans np,calinski kmeans np,davies kmeans np,silhouette kmeans sc,calinski kmeans sc,davies kmeans sc,silhouette kmeans pt,calinski kmeans pt,davies kmeans pt,silhouette gauss np,...,davies gauss pt,silhouette spèctral np,calinski spectral np,davies spectral np,silhouette spectral sc,calinski spectral sc,davies spectral sc,silhouette spectral pt,calinski spectral pt,davies spectral pt
2,0.14043,15.948231,2.258104,0.115895,10.520811,2.745652,0.103173,11.578432,2.689949,0.14043,...,2.689949,0.384709,4.323644,0.466401,0.072662,9.213499,2.592379,0.058295,5.064337,2.951708
3,0.141865,13.54372,1.953296,0.096446,10.074285,2.42931,0.102325,10.640033,2.487609,0.137739,...,2.797421,0.384709,4.323644,0.466401,-0.023366,3.26742,2.116766,0.032183,3.873963,3.380819
4,0.104275,11.728366,2.177642,0.096957,8.989544,2.212793,0.088649,9.166728,2.556416,0.121609,...,2.39628,0.312515,4.235392,0.479128,0.009084,5.097932,2.515847,0.033596,4.561816,2.944617
5,0.098943,10.313543,2.024446,0.088588,8.588638,2.195654,0.09174,8.539001,2.232042,0.08846,...,2.302571,0.312515,4.235392,0.479128,0.016349,4.671844,3.27468,0.041663,5.055261,2.571906
6,0.103143,9.694866,1.749558,0.101929,8.109551,1.998565,0.084863,7.459235,2.188218,0.09278,...,2.327979,0.384709,4.323644,0.466401,0.018762,4.734451,2.538218,0.018136,4.259066,2.469577


In [37]:
# parece que los mejores modelos son KMeans y GMM con 3 clústeres sin escalizar
data_clusters = data.copy()
kmeans = KMeans(3, random_state=72).fit(data)
labels_k = kmeans.labels_
data_clusters['labels kmeans'] = labels_k
gauss = GaussianMixture(3, random_state=72).fit(data)
labels_g = gauss.predict(data)
data_clusters['labels gmm'] = labels_g
data_clusters.head()

Unnamed: 0_level_0,max_degree_b1d,max_degree_b2c,max_degree_b2i,max_degree_gi,mean_eccentricity_ac,mean_eccentricity_b2i,mean_eccentricity_b2d,mean_eccentricity_gc,mean_eccentricity_gd,EX2_score,...,IRI_PD,RPQ AR,exposure_level_high,exposure_level_low,gender_F,gender_M,victims_self_no,victims_self_yes,labels kmeans,labels gmm
subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21100,0,5,6,10,13.546875,11.3125,2.84375,19.046875,6.671875,8,...,11.0,20.0,1,0,1,0,0,1,0,2
21101,0,5,5,5,10.71875,17.0625,-0.8125,13.15625,-7.34375,8,...,6.0,33.0,1,0,0,1,0,1,2,0
21102,-2,5,6,5,14.703125,14.53125,-1.921875,11.453125,-4.25,1,...,10.0,13.0,0,1,0,1,0,1,1,1
21103,1,4,5,5,16.25,14.078125,2.625,16.71875,0.484375,8,...,12.0,16.0,1,0,0,1,0,1,0,2
21104,-2,6,5,6,11.90625,18.0,-4.515625,19.546875,6.96875,5,...,8.0,16.0,1,0,0,1,0,1,1,0


In [38]:
data_clusters.to_csv('clusters_kmeans_gmm_3.csv')

# Validación clústeres con modelos de clasificación

In [43]:
X = data_clusters.drop(['labels kmeans', 'labels gmm'], axis=1).copy()
y = data_clusters['labels gmm'].copy()
# separación datos de prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=77, stratify=y)
y_train_label = LabelEncoder().fit_transform(y_train)
y_test_label = LabelEncoder().fit_transform(y_test)
X_train.shape

(70, 26)

In [48]:
# Modelo XGBoosting sin preprocesar datos
param_space =dict(n_estimators=range(1,100), max_depth=range(3,10), subsample=uniform(0.1,0.9), eta=uniform(0,1), colsample_bytree=uniform(0.1,0.9))
@scheduler.parallel(n_jobs=-1)
def objective(**params):
    global X_train, y_train_label
    model = XGBClassifier(**params)
    error= cross_val_score(estimator = model, X= X_train, y= y_train_label, scoring='accuracy', cv=5).mean()
    return error
tuner = Tuner(param_space, objective)
best_results = tuner.maximize()
print('best parameters:', best_results['best_params'])
print('best accuracy:', best_results['best_objective'])

  0%|          | 0/20 [00:00<?, ?it/s]

best parameters: {'colsample_bytree': 0.736213098005421, 'eta': 0.2480338929576088, 'max_depth': 3, 'n_estimators': 66, 'subsample': 0.819624579463738}
best accuracy: 0.9285714285714286


In [49]:
params = best_results['best_params']
model = XGBClassifier(**params)
scores = cross_val_score(estimator = model, X= X_train, y= y_train_label, scoring='accuracy', cv=5)
print(scores.mean(), scores.std())
model.fit(X_train, y_train_label)
print(model.score(X_test, y_test_label))

0.9285714285714286 0.04517539514526258
0.7222222222222222


In [50]:
# Modelo XGBoost con escalizador
param_space =dict(n_estimators=range(1,100), max_depth=range(3,10), subsample=uniform(0.1,0.9), eta=uniform(0,1), colsample_bytree=uniform(0.1,0.9))
@scheduler.parallel(n_jobs=-1)
def objective(**params):
    global X_train, y_train_label
    numeric_cols = X_train.select_dtypes(include=['float64','int64']).columns.to_list()
    preprocessor = ColumnTransformer([('scale', StandardScaler(), numeric_cols)], remainder='passthrough')
    pipe = Pipeline([('preprocessing', preprocessor),('model', XGBClassifier(**params))])
    error = cross_val_score(estimator = pipe, X= X_train, y= y_train_label, scoring='accuracy', cv=5).mean()
    return error
tuner = Tuner(param_space, objective)
best_results = tuner.maximize()
print('best parameters:', best_results['best_params'])
print('best accuracy:', best_results['best_objective'])

  0%|          | 0/20 [00:00<?, ?it/s]

best parameters: {'colsample_bytree': 0.7251311634844219, 'eta': 0.29095852050979, 'max_depth': 6, 'n_estimators': 49, 'subsample': 0.5483997033213193}
best accuracy: 0.9285714285714285


In [51]:
params = best_results['best_params']
numeric_cols = X_train.select_dtypes(include=['float64','int64']).columns.to_list()
preprocessor = ColumnTransformer([('scale', StandardScaler(), numeric_cols)], remainder='passthrough')
pipe = Pipeline([('preprocessing', preprocessor),('model', XGBClassifier(**params))])
scores = cross_val_score(estimator = pipe, X= X_train, y= y_train_label, scoring='accuracy', cv=5)
print(scores.mean(), scores.std())
pipe.fit(X_train, y_train_label)
print(pipe.score(X_test, y_test_label))

0.9285714285714285 0.06388765649999402
0.7777777777777778


In [52]:
# Modelo XGBoost con standard_scaler y min_max_scaler
param_space =dict(n_estimators=range(1,100), max_depth=range(3,10), subsample=uniform(0.1,0.9), eta=uniform(0,1), colsample_bytree=uniform(0.1,0.9))
@scheduler.parallel(n_jobs=-1)
def objective(**params):
    global X_train, y_train_label
    continuas_cols = X_train.select_dtypes(include=['float64']).columns.to_list()
    discretas_cols = X_train.select_dtypes(include=['int64']).columns.to_list()
    preprocessor = ColumnTransformer([('scaler', StandardScaler(), continuas_cols), ('min_max', MinMaxScaler(), discretas_cols)], remainder='passthrough')
    pipe = Pipeline([('preprocessing', preprocessor),('model', XGBClassifier(**params))])
    error = cross_val_score(estimator = pipe, X= X_train, y= y_train_label, scoring='accuracy', cv=5).mean()
    return error
tuner = Tuner(param_space, objective)
best_results = tuner.maximize()
print('best parameters:', best_results['best_params'])
print('best accuracy:', best_results['best_objective'])

  0%|          | 0/20 [00:00<?, ?it/s]

best parameters: {'colsample_bytree': 0.955614797985831, 'eta': 0.2761123520020293, 'max_depth': 6, 'n_estimators': 33, 'subsample': 0.3405732785725569}
best accuracy: 0.9428571428571428


In [53]:
params = best_results['best_params']
continuas_cols = X_train.select_dtypes(include=['float64']).columns.to_list()
discretas_cols = X_train.select_dtypes(include=['int64']).columns.to_list()
preprocessor = ColumnTransformer([('scaler', StandardScaler(), continuas_cols), ('min_max', MinMaxScaler(), discretas_cols)], remainder='passthrough')
pipe = Pipeline([('preprocessing', preprocessor),('model', XGBClassifier(**params))])
scores = cross_val_score(estimator = pipe, X= X_train, y= y_train_label, scoring='accuracy', cv=5)
print(scores.mean(), scores.std())
pipe.fit(X_train, y_train_label)
print(pipe.score(X_test, y_test_label))

0.9428571428571428 0.05345224838248489
0.7222222222222222


In [54]:
# Modelo XGBoost con power transformer y min_max_scaler
param_space =dict(n_estimators=range(1,100), max_depth=range(3,10), subsample=uniform(0.1,0.9), eta=uniform(0,1), colsample_bytree=uniform(0.1,0.9))
@scheduler.parallel(n_jobs=-1)
def objective(**params):
    global X_train, y_train_label
    continuas_cols = X_train.select_dtypes(include=['float64']).columns.to_list()
    discretas_cols = X_train.select_dtypes(include=['int64']).columns.to_list()
    preprocessor = ColumnTransformer([('pt', PowerTransformer(), continuas_cols), ('min_max', MinMaxScaler(), discretas_cols)], remainder='passthrough')
    pipe = Pipeline([('preprocessing', preprocessor),('model', XGBClassifier(**params))])
    error = cross_val_score(estimator = pipe, X= X_train, y= y_train_label, scoring='accuracy', cv=5).mean()
    return error
tuner = Tuner(param_space, objective)
best_results = tuner.maximize()
print('best parameters:', best_results['best_params'])
print('best accuracy:', best_results['best_objective'])

  0%|          | 0/20 [00:00<?, ?it/s]

best parameters: {'colsample_bytree': 0.5726964014056182, 'eta': 0.5942453661633827, 'max_depth': 8, 'n_estimators': 22, 'subsample': 0.8397693232911679}
best accuracy: 0.9285714285714285


In [55]:
params = best_results['best_params']
continuas_cols = X_train.select_dtypes(include=['float64']).columns.to_list()
discretas_cols = X_train.select_dtypes(include=['int64']).columns.to_list()
preprocessor = ColumnTransformer([('pt', PowerTransformer(), continuas_cols), ('min_max', MinMaxScaler(), discretas_cols)], remainder='passthrough')
pipe = Pipeline([('preprocessing', preprocessor),('model', XGBClassifier(**params))])
scores = cross_val_score(estimator = pipe, X= X_train, y= y_train_label, scoring='accuracy', cv=5)
print(scores.mean(), scores.std())
pipe.fit(X_train, y_train_label)
print(pipe.score(X_test, y_test_label))

0.9285714285714285 0.06388765649999402
0.7222222222222222


In [56]:
# Mejor modelo
params= {'colsample_bytree': 0.955614797985831, 'eta': 0.2761123520020293, 'max_depth': 6, 'n_estimators': 33, 'subsample': 0.3405732785725569}
continuas_cols = X_train.select_dtypes(include=['float64']).columns.to_list()
discretas_cols = X_train.select_dtypes(include=['int64']).columns.to_list()
preprocessor = ColumnTransformer([('scaler', StandardScaler(), continuas_cols), ('min_max', MinMaxScaler(), discretas_cols)], remainder='passthrough')
pipe = Pipeline([('preprocessing', preprocessor),('model', XGBClassifier(**params))])
scores = cross_val_score(estimator = pipe, X= X_train, y= y_train_label, scoring='accuracy', cv=5)
print(scores.mean(), scores.std())
pipe.fit(X_train, y_train_label)
print(pipe.score(X_test, y_test_label))

0.9428571428571428 0.05345224838248489
0.7222222222222222


In [58]:
# Análisis de relevancia
y_label = LabelEncoder().fit_transform(y)
model = pipe.fit(X, y_label)
r = permutation_importance(model, X, y_label,n_repeats=10,random_state=0, scoring='accuracy')
importancia_atributos = pd.DataFrame(data=[r.importances_mean], columns=X.columns, index=['mean importance']).transpose()
importancia_atributos.sort_values(by='mean importance', ascending=False)

Unnamed: 0,mean importance
IRI_PT,0.239773
RPQ AR,0.084091
AL,0.063636
school_years,0.031818
RPQ AP,0.017045
IRI_PD,0.011364
mean_eccentricity_b2i,0.010227
IRI_EC,0.005682
mean_eccentricity_ti,0.005682
EX2_score,0.004545


In [59]:
importancia_atributos.to_csv('importancia atributos clustering.csv', index_label='feature')

In [62]:
# Extracción sujetos mal clasificados
skf = StratifiedKFold(n_splits=5)

le = LabelEncoder()
le.fit(y_train)
y_train_label = le.fit_transform(y_train)
y_test_label = le.fit_transform(y_test)
model = pipe.fit(X_train, y_train_label)

df_errados = pd.DataFrame(columns=['predicted'])
# errores dataset de entrenamiento
for i, (train_index, test_index) in enumerate(skf.split(X_train, y_train_label)):
    model.fit(X_train.iloc[train_index], y_train_label[train_index])
    y_est = model.predict(X_train.iloc[test_index])
    errado = test_index[y_train_label[test_index] != y_est]
    y_pred = le.inverse_transform(y_est)
    y_p_df = pd.DataFrame(data=(y_pred), index=test_index, columns=['predicted'])
    errado_idx = pd.Index(errado)
    y_errados = y_p_df.loc[errado_idx].copy()
    df_errados = pd.concat([df_errados, y_errados], ignore_index=False)
y_t = y_train.reset_index().copy()
df_errados = pd.merge(y_t, df_errados, how='inner', left_index=True, right_index=True)
df_errados.set_index('subject', inplace=True)

# errores dataset de prueba
y_test_pred = model.predict(X_test)
y_test_pred = le.inverse_transform(y_test_pred)
y_test_pred_df = pd.DataFrame(data=y_test_pred, index=y_test.index, columns=['predicted'])
test_errados_df = pd.merge(y_test, y_test_pred_df, left_index=True, right_index=True)
test_errados_df = test_errados_df[test_errados_df['labels gmm'] != test_errados_df['predicted']]

df_errados = pd.concat([df_errados, test_errados_df], ignore_index=False)
df_errados

Unnamed: 0_level_0,labels gmm,predicted
subject,Unnamed: 1_level_1,Unnamed: 2_level_1
21147,0,1
21111,0,1
22103,2,1
21110,1,2
21109,0,1
21121,0,1
22108,0,1
24043,2,1
21123,2,1


In [63]:
datos_errados = pd.merge(df_errados, data_clusters, how='inner', left_index=True, right_index=True)
datos_errados.rename(columns={'labels gmm_x':'labels gmm'}, inplace=True)
datos_errados = datos_errados.drop(['labels gmm_y'], axis=1)
datos_errados.head()

Unnamed: 0_level_0,labels gmm_x,predicted,max_degree_b1d,max_degree_b2c,max_degree_b2i,max_degree_gi,mean_eccentricity_ac,mean_eccentricity_b2i,mean_eccentricity_b2d,mean_eccentricity_gc,...,IRI_PD,RPQ AR,exposure_level_high,exposure_level_low,gender_F,gender_M,victims_self_no,victims_self_yes,labels kmeans,labels gmm_y
subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21147,0,1,-1,6,5,6,11.8125,13.65625,3.359375,16.953125,...,6.0,16.0,1,0,0,1,0,1,1,0
21111,0,1,-1,5,5,5,12.71875,16.296875,0.984375,12.265625,...,10.0,24.0,1,0,1,0,0,1,2,0
22103,2,1,2,5,5,5,10.859375,13.765625,-2.140625,11.875,...,17.0,15.0,0,1,0,1,1,0,0,2
21110,1,2,-2,5,4,5,11.75,11.859375,2.40625,12.359375,...,11.0,14.0,1,0,0,1,0,1,1,1
21109,0,1,1,6,4,4,13.40625,15.046875,1.5625,16.640625,...,7.0,21.0,1,0,0,1,0,1,2,0


In [65]:
datos_errados.to_csv('errores clustering.csv')