In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PowerTransformer
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans, SpectralClustering
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from mango import Tuner, scheduler
import matplotlib.pyplot as plt
%matplotlib qt5

In [2]:
data = pd.read_csv('medidas_conectividad_globales_comportamentales_demograficos_2.csv')
data.set_index('subject', inplace=True)
# variables más importantes según modelos de clasificación
data = data[['EX2_score', 'IRI_PT', 'gender', 'IRI_EC', 'mean_eccentricity_ti', 'mean_eccentricity_b2i', 'school_years', 'mean_eccentricity_b2d', 'AL', 'victims_self']]
# eliminación de sujetos con datos incompletos
data.dropna(inplace=True)
df = data.copy()
# codificación de variables categóricas
data = pd.get_dummies(data)
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 88 entries, 21100 to 24101
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   EX2_score              88 non-null     int64  
 1   IRI_PT                 88 non-null     float64
 2   gender                 88 non-null     object 
 3   IRI_EC                 88 non-null     float64
 4   mean_eccentricity_ti   88 non-null     float64
 5   mean_eccentricity_b2i  88 non-null     float64
 6   school_years           88 non-null     int64  
 7   mean_eccentricity_b2d  88 non-null     float64
 8   AL                     88 non-null     float64
 9   victims_self           88 non-null     object 
dtypes: float64(6), int64(2), object(2)
memory usage: 7.6+ KB


In [4]:
# normalización de datos con standard escaler
continuas_cols = data.select_dtypes(include=['float64']).columns.to_list()
discretas_cols = data.select_dtypes(include=['int64']).columns.to_list()
preprocessor = ColumnTransformer([('scaler', StandardScaler(), continuas_cols), ('min_max', MinMaxScaler(), discretas_cols)], remainder='passthrough')
data_sc = preprocessor.fit_transform(data)

In [5]:
# normalización de datos con power transformer
continuas_cols = data.select_dtypes(include=['float64']).columns.to_list()
discretas_cols = data.select_dtypes(include=['int64']).columns.to_list()
preprocessor = ColumnTransformer([('pt', PowerTransformer(), continuas_cols), ('min_max', MinMaxScaler(), discretas_cols)], remainder='passthrough')
data_pt = preprocessor.fit_transform(data)

In [18]:
# clústeres por KMeans
inertias = []
sil_scores = []
cal_scores = []
dav_scores = []
for i in range(2,12):
    kmeans = KMeans(i, random_state=72).fit(data)
    inertia = kmeans.inertia_
    inertias.append(inertia)
    labels = kmeans.labels_
    sil = silhouette_score(X=data, labels=labels)
    sil_scores.append(sil)
    cal = calinski_harabasz_score(X=data, labels=labels)
    cal_scores.append(cal)
    dav = davies_bouldin_score(X=data, labels=labels)
    dav_scores.append(dav)
plt.subplot(2,2,1)
plt.plot(range(2,12), inertias, 'or')
plt.xlabel('number of clusters')
plt.ylabel('inertias')
plt.subplot(2,2,2)
plt.plot(range(2,12), sil_scores, 'or')
plt.xlabel('number of clusters')
plt.ylabel('silhouette scores')
plt.subplot(2,2,3)
plt.plot(range(2,12), cal_scores, 'or')
plt.xlabel('number of clusters')
plt.ylabel('calinski harabasz scores')
plt.subplot(2,2,4)
plt.plot(range(2,12), dav_scores, 'or')
plt.xlabel('number of clusters')
plt.ylabel('davies bouldin scores')

df = pd.DataFrame(data=[sil_scores, cal_scores, dav_scores], index=['silhouette kmeans', 'calinski kmeans', 'davies kmeans'], columns=range(2,12))
df = df.transpose()
df.head()

Unnamed: 0,silhouette kmeans,calinski kmeans,davies kmeans
2,0.187889,22.126945,1.876505
3,0.174086,20.44251,1.72644
4,0.156731,17.652485,1.792626
5,0.143442,15.422921,1.755334
6,0.127261,14.053908,1.648325


In [19]:
# clústeres por KMeans
inertias = []
sil_scores = []
cal_scores = []
dav_scores = []
for i in range(2,12):
    kmeans = KMeans(i, random_state=72).fit(data_sc)
    inertia = kmeans.inertia_
    inertias.append(inertia)
    labels = kmeans.labels_
    sil = silhouette_score(X=data_sc, labels=labels)
    sil_scores.append(sil)
    cal = calinski_harabasz_score(X=data_sc, labels=labels)
    cal_scores.append(cal)
    dav = davies_bouldin_score(X=data_sc, labels=labels)
    dav_scores.append(dav)
plt.subplot(2,2,1)
plt.plot(range(2,12), inertias, 'b+')
plt.xlabel('number of clusters')
plt.ylabel('inertias')
plt.subplot(2,2,2)
plt.plot(range(2,12), sil_scores, 'b+')
plt.xlabel('number of clusters')
plt.ylabel('silhouette scores')
plt.subplot(2,2,3)
plt.plot(range(2,12), cal_scores, 'b+')
plt.xlabel('number of clusters')
plt.ylabel('calinski harabasz scores')
plt.subplot(2,2,4)
plt.plot(range(2,12), dav_scores, 'b+')
plt.xlabel('number of clusters')
plt.ylabel('davies bouldin scores')

df['silhouette kmeans sc'] = sil_scores
df['calinski kmeans sc'] = cal_scores
df['davies kmeans sc'] = dav_scores
df.head()

Unnamed: 0,silhouette kmeans,calinski kmeans,davies kmeans,silhouette kmeans sc,calinski kmeans sc,davies kmeans sc
2,0.187889,22.126945,1.876505,0.152998,17.765369,2.11649
3,0.174086,20.44251,1.72644,0.1701,18.712719,1.747855
4,0.156731,17.652485,1.792626,0.160954,17.139713,1.711894
5,0.143442,15.422921,1.755334,0.145071,15.494967,1.706103
6,0.127261,14.053908,1.648325,0.149932,14.783664,1.586329


In [20]:
# clústeres por KMeans
inertias = []
sil_scores = []
cal_scores = []
dav_scores = []
for i in range(2,12):
    kmeans = KMeans(i, random_state=72).fit(data_pt)
    inertia = kmeans.inertia_
    inertias.append(inertia)
    labels = kmeans.labels_
    sil = silhouette_score(X=data_pt, labels=labels)
    sil_scores.append(sil)
    cal = calinski_harabasz_score(X=data_pt, labels=labels)
    cal_scores.append(cal)
    dav = davies_bouldin_score(X=data_pt, labels=labels)
    dav_scores.append(dav)

plt.subplot(2,2,1)
plt.plot(range(2,12), inertias, 'ks')
plt.xlabel('number of clusters')
plt.ylabel('inertias')
plt.subplot(2,2,2)
plt.plot(range(2,12), sil_scores, 'ks')
plt.xlabel('number of clusters')
plt.ylabel('silhouette scores')
plt.subplot(2,2,3)
plt.plot(range(2,12), cal_scores, 'ks')
plt.xlabel('number of clusters')
plt.ylabel('calinski harabasz scores')
plt.subplot(2,2,4)
plt.plot(range(2,12), dav_scores, 'ks')
plt.xlabel('number of clusters')
plt.ylabel('davies bouldin scores')

df['silhouette kmeans pt'] = sil_scores
df['calinski kmeans pt'] = cal_scores
df['davies kmeans pt'] = dav_scores
df.head()

Unnamed: 0,silhouette kmeans,calinski kmeans,davies kmeans,silhouette kmeans sc,calinski kmeans sc,davies kmeans sc,silhouette kmeans pt,calinski kmeans pt,davies kmeans pt
2,0.187889,22.126945,1.876505,0.152998,17.765369,2.11649,0.155111,18.477087,2.091533
3,0.174086,20.44251,1.72644,0.1701,18.712719,1.747855,0.160143,18.254031,1.751729
4,0.156731,17.652485,1.792626,0.160954,17.139713,1.711894,0.156644,16.722571,1.694557
5,0.143442,15.422921,1.755334,0.145071,15.494967,1.706103,0.140067,14.78986,1.779262
6,0.127261,14.053908,1.648325,0.149932,14.783664,1.586329,0.141026,13.759064,1.659389


In [21]:
scores = []
sil_scores = []
cal_scores = []
dav_scores = []
for i in range(2,12):
    gauss = GaussianMixture(i, random_state=72).fit(data)
    labels = gauss.predict(data)
    score = gauss.score(data)
    scores.append(score)
    sil = silhouette_score(X=data, labels=labels)
    sil_scores.append(sil)
    cal = calinski_harabasz_score(X=data, labels=labels)
    cal_scores.append(cal)
    dav = davies_bouldin_score(X=data, labels=labels)
    dav_scores.append(dav)

df['silhouette gauss'] = sil_scores
df['calinski gauss'] = cal_scores
df['davies gauss'] = dav_scores

df.head()
# plt.subplot(2,2,1)
# plt.plot(range(2,12), scores, 'or')
# plt.xlabel('number of clusters')
# plt.ylabel('logprob scores')
plt.subplot(2,2,2)
plt.plot(range(2,12), sil_scores, 'g*')
plt.xlabel('number of clusters')
plt.ylabel('silhouette scores')
plt.subplot(2,2,3)
plt.plot(range(2,12), cal_scores, 'g*')
plt.xlabel('number of clusters')
plt.ylabel('calinski harabasz scores')
plt.subplot(2,2,4)
plt.plot(range(2,12), dav_scores, 'g*')
plt.xlabel('number of clusters')
plt.ylabel('davies bouldin scores')

Text(710.7926767676767, 0.5, 'davies bouldin scores')

In [22]:
scores = []
sil_scores = []
cal_scores = []
dav_scores = []
for i in range(2,12):
    gauss = GaussianMixture(i, random_state=72).fit(data_sc)
    labels = gauss.predict(data_sc)
    score = gauss.score(data_sc)
    scores.append(score)
    sil = silhouette_score(X=data_sc, labels=labels)
    sil_scores.append(sil)
    cal = calinski_harabasz_score(X=data_sc, labels=labels)
    cal_scores.append(cal)
    dav = davies_bouldin_score(X=data_sc, labels=labels)
    dav_scores.append(dav)

df['silhouette gauss sc'] = sil_scores
df['calinski gauss sc'] = cal_scores
df['davies gauss sc'] = dav_scores
df.head()

# plt.subplot(2,2,1)
# plt.plot(range(2,12), scores, 'or')
# plt.xlabel('number of clusters')
# plt.ylabel('logprob scores')
plt.subplot(2,2,2)
plt.plot(range(2,12), sil_scores, 'y>')
plt.xlabel('number of clusters')
plt.ylabel('silhouette scores')
plt.subplot(2,2,3)
plt.plot(range(2,12), cal_scores, 'y>')
plt.xlabel('number of clusters')
plt.ylabel('calinski harabasz scores')
plt.subplot(2,2,4)
plt.plot(range(2,12), dav_scores, 'y>')
plt.xlabel('number of clusters')
plt.ylabel('davies bouldin scores')

Text(710.7926767676767, 0.5, 'davies bouldin scores')

In [23]:
scores = []
sil_scores = []
cal_scores = []
dav_scores = []
for i in range(2,12):
    gauss = GaussianMixture(i, random_state=72).fit(data_pt)
    labels = gauss.predict(data_pt)
    score = gauss.score(data_pt)
    scores.append(score)
    sil = silhouette_score(X=data_pt, labels=labels)
    sil_scores.append(sil)
    cal = calinski_harabasz_score(X=data_pt, labels=labels)
    cal_scores.append(cal)
    dav = davies_bouldin_score(X=data_pt, labels=labels)
    dav_scores.append(dav)

df['silhouette gauss pt'] = sil_scores
df['calinski gauss pt'] = cal_scores
df['davies gauss pt'] = dav_scores
df.head()

# plt.subplot(2,2,1)
# plt.plot(range(2,12), scores, 'or')
# plt.xlabel('number of clusters')
# plt.ylabel('logprob scores')
plt.subplot(2,2,2)
plt.plot(range(2,12), sil_scores, 'md')
plt.xlabel('number of clusters')
plt.ylabel('silhouette scores')
plt.subplot(2,2,3)
plt.plot(range(2,12), cal_scores, 'md')
plt.xlabel('number of clusters')
plt.ylabel('calinski harabasz scores')
plt.subplot(2,2,4)
plt.plot(range(2,12), dav_scores, 'md')
plt.xlabel('number of clusters')
plt.ylabel('davies bouldin scores')

Text(710.7926767676767, 0.5, 'davies bouldin scores')

In [26]:
from sklearn.cluster import SpectralClustering
sil_scores = []
cal_scores = []
dav_scores = []
for i in range(2,12):
    sc = SpectralClustering(i, random_state=72).fit(data)
    labels = sc.labels_
    sil = silhouette_score(X=data, labels=labels)
    sil_scores.append(sil)
    cal = calinski_harabasz_score(X=data, labels=labels)
    cal_scores.append(cal)
    dav = davies_bouldin_score(X=data, labels=labels)
    dav_scores.append(dav)

df['silhouette spectral'] = sil_scores
df['calinski spectral'] = cal_scores
df['davies spectral'] = dav_scores
df.head()

# plt.subplot(2,2,1)
# plt.plot(range(2,12), scores, 'cx')
# plt.xlabel('number of clusters')
# plt.ylabel('logprob scores')
plt.subplot(2,2,2)
plt.plot(range(2,12), sil_scores, 'cx')
plt.xlabel('number of clusters')
plt.ylabel('silhouette scores')
plt.subplot(2,2,3)
plt.plot(range(2,12), cal_scores, 'cx')
plt.xlabel('number of clusters')
plt.ylabel('calinski harabasz scores')
plt.subplot(2,2,4)
plt.plot(range(2,12), dav_scores, 'cx')
plt.xlabel('number of clusters')
plt.ylabel('davies bouldin scores')

Text(710.9176767676767, 0.5, 'davies bouldin scores')

In [25]:
from sklearn.cluster import SpectralClustering
sil_scores = []
cal_scores = []
dav_scores = []
for i in range(2,12):
    sc = SpectralClustering(i, random_state=72).fit(data_sc)
    labels = sc.labels_
    sil = silhouette_score(X=data_sc, labels=labels)
    sil_scores.append(sil)
    cal = calinski_harabasz_score(X=data_sc, labels=labels)
    cal_scores.append(cal)
    dav = davies_bouldin_score(X=data_sc, labels=labels)
    dav_scores.append(dav)

df['silhouette spectral sc'] = sil_scores
df['calinski spectral sc'] = cal_scores
df['davies spectral sc'] = dav_scores
df.head()

# plt.subplot(2,2,1)
# plt.plot(range(2,12), scores, 'rx')
# plt.xlabel('number of clusters')
# plt.ylabel('logprob scores')
plt.subplot(2,2,2)
plt.plot(range(2,12), sil_scores, 'rx')
plt.xlabel('number of clusters')
plt.ylabel('silhouette scores')
plt.subplot(2,2,3)
plt.plot(range(2,12), cal_scores, 'rx')
plt.xlabel('number of clusters')
plt.ylabel('calinski harabasz scores')
plt.subplot(2,2,4)
plt.plot(range(2,12), dav_scores, 'rx')
plt.xlabel('number of clusters')
plt.ylabel('davies bouldin scores')

Text(710.9176767676767, 0.5, 'davies bouldin scores')

In [27]:
from sklearn.cluster import SpectralClustering
sil_scores = []
cal_scores = []
dav_scores = []
for i in range(2,12):
    sc = SpectralClustering(i, random_state=72).fit(data_pt)
    labels = sc.labels_
    sil = silhouette_score(X=data_pt, labels=labels)
    sil_scores.append(sil)
    cal = calinski_harabasz_score(X=data_pt, labels=labels)
    cal_scores.append(cal)
    dav = davies_bouldin_score(X=data_pt, labels=labels)
    dav_scores.append(dav)

df['silhouette spectral pt'] = sil_scores
df['calinski spectral pt'] = cal_scores
df['davies spectral pt'] = dav_scores
df.head()

# plt.subplot(2,2,1)
# plt.plot(range(2,12), scores, 'rx')
# plt.xlabel('number of clusters')
# plt.ylabel('logprob scores')
plt.subplot(2,2,2)
plt.plot(range(2,12), sil_scores, 'kx')
plt.xlabel('number of clusters')
plt.ylabel('silhouette scores')
plt.subplot(2,2,3)
plt.plot(range(2,12), cal_scores, 'kx')
plt.xlabel('number of clusters')
plt.ylabel('calinski harabasz scores')
plt.subplot(2,2,4)
plt.plot(range(2,12), dav_scores, 'kx')
plt.xlabel('number of clusters')
plt.ylabel('davies bouldin scores')

Text(710.9176767676767, 0.5, 'davies bouldin scores')

In [29]:
# parece que el mejor modelo es KMeans sin escalizar con 3 clústeres
kmeans = KMeans(3, random_state=72).fit(data)
inertia = kmeans.inertia_
labels = kmeans.labels_
data['labels'] = labels
data.head()

Unnamed: 0_level_0,EX2_score,IRI_PT,IRI_EC,mean_eccentricity_ti,mean_eccentricity_b2i,school_years,mean_eccentricity_b2d,AL,gender_F,gender_M,victims_self_no,victims_self_yes,labels
subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
21100,8,12.0,11.0,15.203125,11.3125,11,2.84375,9.0,1,0,0,1,1
21101,8,25.0,13.0,18.015625,17.0625,11,-0.8125,9.5,0,1,0,1,0
21102,1,20.0,15.0,14.703125,14.53125,11,-1.921875,4.5,0,1,0,1,0
21103,8,16.0,16.0,13.34375,14.078125,18,2.625,6.25,0,1,0,1,0
21104,5,23.0,18.0,16.59375,18.0,11,-4.515625,7.5,0,1,0,1,0


In [31]:
data.to_csv('clusters_kmeans_3.csv')