In [1]:
import importlib
import funciones_modelos_ML as ml
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PowerTransformer, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.inspection import permutation_importance
from mango import Tuner, scheduler
from scipy.stats import uniform
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

In [2]:
importlib.reload(ml)

<module 'funciones_modelos_ML' from 'c:\\Users\\jhquiza\\OneDrive - Universidad de Medellin\\JupyterNotebooks\\IAT\\funciones_modelos_ML.py'>

# Modelo eliminando sujetos con datos faltantes y usando todas las variables

In [3]:
# otra opción usando todas las variables y sin hacer imputación de datos
data_all = pd.read_csv('medidas_conectividad_globales_comportamentales_demograficos_2.csv')
data_all.set_index('subject', inplace=True)
data_all= data_all.drop(columns=['IAT_score_ind','IAT_score_level','dscore_1','dscore_2','dscore_3','dscore_5','dscore_6','dscore_7','dscore_8','modo_0','modo_1','modo_2'], axis=1)
data_all.dropna(inplace=True)
data_all[['IRI_PT', 'IRI_FS', 'IRI_EC', 'IRI_PD', 'IMA', 'RPQ AR', 'RPQ AP']] = data_all[['IRI_PT', 'IRI_FS', 'IRI_EC', 'IRI_PD', 'IMA', 'RPQ AR', 'RPQ AP']].astype('int64')

In [4]:
def val_test_scores(model):
    global X_train, y_train_label, X_test, y_test_label
    scores = cross_val_score(estimator = model, X= X_train, y= y_train_label, scoring='accuracy', cv=5)
    model.fit(X_train, y_train_label)
    test_score = model.score(X_test, y_test_label)
    return scores, test_score

def modelo_xgboost_np(param_space):
    global X_train, y_train_label, X_test, y_test_label
    # Modelo XGBoosting sin preprocesar datos
    @scheduler.parallel(n_jobs=-1)
    def objective(**params):
        global X_train, y_train_label
        model = XGBClassifier(**params)
        score= cross_val_score(estimator = model, X= X_train, y= y_train_label, scoring='accuracy', cv=5).mean()
        return score
    conf_dict = dict(num_iteration=40, domain_size=10000, initial_random=3)
    tuner = Tuner(param_space, objective, conf_dict)
    best_results = tuner.maximize()
    print('best parameters np:', best_results['best_params'])
    print('best accuracy np:', best_results['best_objective'])
    # Scores de validación y prueba
    params = best_results['best_params']
    model = XGBClassifier(**params)
    scores, test_score = val_test_scores(model=model)
    return params, scores, test_score

def modelo_xgboost_sc(param_space, preprocessor):
    global X_train, y_train_label, X_test, y_test_label
    @scheduler.parallel(n_jobs=-1)
    def objective(**params):
        global X_train, y_train_label, preprocessor
        model = Pipeline([('preprocessing', preprocessor),('xg', XGBClassifier(**params))])
        score = cross_val_score(estimator = model, X= X_train, y= y_train_label, scoring='accuracy', cv=5).mean()
        return score
    conf_dict = dict(num_iteration=40, domain_size=10000, initial_random=3)
    tuner = Tuner(param_space, objective, conf_dict)
    best_results = tuner.maximize()
    print('best parameters:', best_results['best_params'])
    print('best accuracy:', best_results['best_objective'])
    params = best_results['best_params']
    model = Pipeline([('preprocessing', preprocessor),('xg', XGBClassifier(**params))])
    scores, test_score = val_test_scores(model=model)
    return params, scores, test_score

In [5]:
X = data_all.drop(['type', 'group'], axis=1).copy()
X = pd.get_dummies(X)
y = data_all['type'].copy()

# separación datos de prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=77, stratify=y)
y_train_label = LabelEncoder().fit_transform(y_train)
y_test_label = LabelEncoder().fit_transform(y_test)

# separación de variables para preprocesar
continuas_cols = X_train.select_dtypes(include=['float64']).columns.to_list()
discretas_cols = X_train.select_dtypes(include=['int64']).columns.to_list()

# espacio de hiperparámetros a optimizar
param_space =dict(n_estimators=range(1,100), max_depth=range(3,10), subsample=uniform(0.1,0.9), eta=uniform(0,1), colsample_bytree=uniform(0.1,0.9))

In [6]:
# Modelo XGBoost sin preprocesar
params_np, scores_np, test_score_np = modelo_xgboost_np(param_space=param_space)

# Modelo XGBoost con standard scaler y min_max scaler
preprocessor = ColumnTransformer([('scaler', StandardScaler(), continuas_cols), ('min_max', MinMaxScaler(), discretas_cols)], remainder='passthrough')
params_sc, scores_sc, test_score_sc = modelo_xgboost_sc(param_space=param_space, preprocessor=preprocessor)

# Modelo XGBoost con power transformer y min_max scaler
preprocessor = ColumnTransformer([('pt', PowerTransformer(), continuas_cols), ('min_max', MinMaxScaler(), discretas_cols)], remainder='passthrough')
params_pt, scores_pt, test_score_pt = modelo_xgboost_sc(param_space=param_space, preprocessor=preprocessor)

Best score: 0.8714285714285713: 100%|██████████| 40/40 [01:03<00:00,  1.59s/it]


best parameters np: {'colsample_bytree': 0.5401688217543137, 'eta': 0.22832081815771366, 'max_depth': 6, 'n_estimators': 8, 'subsample': 0.7360669603490515}
best accuracy np: 0.8714285714285713


Best score: 0.8714285714285716: 100%|██████████| 40/40 [00:47<00:00,  1.19s/it]


best parameters: {'colsample_bytree': 0.8343068446432506, 'eta': 0.06495756616418458, 'max_depth': 8, 'n_estimators': 63, 'subsample': 0.691229636293653}
best accuracy: 0.8714285714285716


Best score: 0.8714285714285713: 100%|██████████| 40/40 [02:25<00:00,  3.63s/it]


best parameters: {'colsample_bytree': 0.44574481444537406, 'eta': 0.05690982566394587, 'max_depth': 4, 'n_estimators': 62, 'subsample': 0.7529975275055638}
best accuracy: 0.8714285714285713


In [7]:
clf_results = pd.DataFrame(data=[[params_np, scores_np.mean(), scores_np.std(), test_score_np],[params_sc, scores_sc.mean(), scores_sc.std(), test_score_sc],[params_pt, scores_pt.mean(), scores_pt.std(), test_score_pt]], index=['np', 'sc', 'pt'], columns=['best parameters', 'mean cv score', 'sd cv score', 'test score'])
clf_results

Unnamed: 0,best parameters,mean cv score,sd cv score,test score
np,"{'colsample_bytree': 0.5401688217543137, 'eta'...",0.871429,0.028571,0.666667
sc,"{'colsample_bytree': 0.8343068446432506, 'eta'...",0.871429,0.053452,0.722222
pt,"{'colsample_bytree': 0.44574481444537406, 'eta...",0.871429,0.053452,0.722222


In [8]:
# Mejor modelo
params= clf_results.loc['sc']['best parameters']
y = data_all['type'].copy()
model, model_fit = ml.mejor_modelo(params=params, X=X, y=y, pre_pipe='sc')

mean val score:  0.8714285714285716
std val score:  0.0534522483824849
test score:  0.7222222222222222


In [9]:
# Análisis de relevancia
y_label = LabelEncoder().fit_transform(y)
r = permutation_importance(model_fit, X, y_label,n_repeats=20,random_state=0, scoring='accuracy')
importancia_atributos = pd.DataFrame(data=[r.importances_mean], columns=X.columns, index=['mean importance']).transpose()
importancia_atributos.sort_values(by='mean importance', ascending=False)

Unnamed: 0,mean importance
gender_F,0.069318
EX2_score,0.047727
exposure_level_high,0.010795
AL,0.007955
victims_self_no,0.007386
...,...
IMA,-0.004545
max_betweenness_dd,-0.005682
gender_M,-0.007386
max_betweenness_ac,-0.007386


In [12]:
importancia_atributos.to_csv('importancia atributos clasificación dataset completo sc.csv', index_label='feature')

In [10]:
df_errados = ml.errores(model=model, label='type', X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)
datos_errados = pd.merge(df_errados, data_all, how='inner', left_index=True, right_index=True)
datos_errados.rename(columns={'type_x':'type'}, inplace=True)
datos_errados = datos_errados.drop(['type_y'], axis=1)
datos_errados

Unnamed: 0_level_0,type,predicted,diameter_ac,diameter_ai,diameter_ad,diameter_b1c,diameter_b1i,diameter_b1d,diameter_b2c,diameter_b2i,...,IRI_FS,IRI_EC,IRI_PD,IMA,RPQ AR,RPQ AP,TD,AN,AL,IH
subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
22114,controls,ex-combatant,0.396825,0.333333,0.063492,0.222222,0.269841,-0.047619,0.285714,0.269841,...,9,6,13,29,11,12,0.0,0.0,6.0,0.0
21100,ex-combatant,victims,0.269841,0.222222,0.047619,0.333333,0.269841,0.063492,0.285714,0.222222,...,9,11,11,27,20,16,1.5,5.75,9.0,0.5
24043,victims,ex-combatant,0.31746,0.269841,0.047619,0.31746,0.238095,0.079365,0.285714,0.238095,...,18,15,15,36,11,12,0.0,2.25,5.25,0.0
21137,ex-combatant,victims,0.380952,0.285714,0.095238,0.333333,0.269841,0.063492,0.396825,0.269841,...,5,17,5,33,17,14,0.0,2.0,8.0,0.0
22108,controls,ex-combatant,0.285714,0.253968,0.031746,0.238095,0.269841,-0.031746,0.285714,0.349206,...,15,19,9,33,20,14,0.0,1.5,9.5,0.0


In [11]:
datos_errados

Unnamed: 0_level_0,type,predicted,diameter_ac,diameter_ai,diameter_ad,diameter_b1c,diameter_b1i,diameter_b1d,diameter_b2c,diameter_b2i,...,IRI_FS,IRI_EC,IRI_PD,IMA,RPQ AR,RPQ AP,TD,AN,AL,IH
subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
22114,controls,ex-combatant,0.396825,0.333333,0.063492,0.222222,0.269841,-0.047619,0.285714,0.269841,...,9,6,13,29,11,12,0.0,0.0,6.0,0.0
21100,ex-combatant,victims,0.269841,0.222222,0.047619,0.333333,0.269841,0.063492,0.285714,0.222222,...,9,11,11,27,20,16,1.5,5.75,9.0,0.5
24043,victims,ex-combatant,0.31746,0.269841,0.047619,0.31746,0.238095,0.079365,0.285714,0.238095,...,18,15,15,36,11,12,0.0,2.25,5.25,0.0
21137,ex-combatant,victims,0.380952,0.285714,0.095238,0.333333,0.269841,0.063492,0.396825,0.269841,...,5,17,5,33,17,14,0.0,2.0,8.0,0.0
22108,controls,ex-combatant,0.285714,0.253968,0.031746,0.238095,0.269841,-0.031746,0.285714,0.349206,...,15,19,9,33,20,14,0.0,1.5,9.5,0.0
24027,victims,ex-combatant,0.222222,0.238095,-0.015873,0.380952,0.238095,0.142857,0.285714,0.269841,...,8,14,11,32,19,13,0.0,0.0,9.0,0.0
21131,ex-combatant,victims,0.285714,0.301587,-0.015873,0.238095,0.333333,-0.095238,0.222222,0.269841,...,7,8,15,28,12,12,0.0,2.0,4.25,0.0
24003,victims,ex-combatant,0.253968,0.253968,0.0,0.31746,0.285714,0.031746,0.285714,0.285714,...,16,15,14,28,12,12,0.0,2.25,5.25,0.0
24069,victims,ex-combatant,0.253968,0.301587,-0.047619,0.380952,0.285714,0.095238,0.396825,0.301587,...,20,19,12,28,15,12,2.5,4.0,10.0,0.0
24015,victims,ex-combatant,0.253968,0.253968,0.0,0.380952,0.238095,0.142857,0.333333,0.269841,...,14,13,14,29,14,13,1.75,3.5,8.25,2.0


In [13]:
datos_errados.to_csv('errores clasificación dataset completo sc.csv')

## Modelo con selección de atributos

In [14]:
# Por selección por modelos
# SVC
X_train_df = X_train.copy()
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel
lsvc = LinearSVC(random_state=72).fit(X_train_df, y_train)
model = SelectFromModel(lsvc, threshold='1.5*mean', prefit=True)
X_new = model.transform(X_train_df)
features_lsvc = model.get_feature_names_out(input_features=X_train_df.columns)
features_svc = pd.DataFrame(data=np.ones_like(features_lsvc), columns=['features_lsvc'], index=features_lsvc)

# regresión logística l2
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
lr = LogisticRegression(penalty="l2", solver='saga', max_iter=10000, random_state=72).fit(X_train_df, y_train)
model = SelectFromModel(lr, threshold='1.5*mean', prefit=True)
X_new = model.transform(X_train_df)
features_lrl2 = model.get_feature_names_out(input_features=X_train_df.columns)
features_l2 = pd.DataFrame(data=np.ones_like(features_lrl2), columns=['features_lrl2'], index=features_lrl2)

# regresión logística l1
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
lr = LogisticRegression(penalty="l1", solver='saga', max_iter=10000, random_state=72).fit(X_train_df, y_train)
model = SelectFromModel(lr, threshold='1.5*mean', prefit=True)
X_new = model.transform(X_train_df)
features_lrl1 = model.get_feature_names_out(input_features=X_train_df.columns)
features_l1 = pd.DataFrame(data=np.ones_like(features_lrl1), columns=['features_lrl1'], index=features_lrl1)

# random forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
rf = RandomForestClassifier(random_state=72).fit(X_train_df, y_train)
model = SelectFromModel(rf, threshold='1.5*mean', prefit=True)
X_new = model.transform(X_train_df)
features_rf = model.get_feature_names_out(input_features=X_train_df.columns)
features_rfo = pd.DataFrame(data=np.ones_like(features_rf), columns=['features_rf'], index=features_rf)

# anova
from sklearn.feature_selection import f_classif
__, p_values = f_classif(X_train_df, y_train)
features_anova = pd.DataFrame(p_values, columns=['p_values'], index=X_train_df.columns)
features_anova = features_anova[features_anova['p_values']<0.05]
features_anova['features_an'] = 1

# información mutua
from sklearn.feature_selection import mutual_info_classif
mi = mutual_info_classif(X_train_df, y_train)
features_mi = pd.DataFrame(mi, columns=['mutual information'], index=X_train_df.columns)
features_mi = features_mi[features_mi['mutual information']>0.1]
features_mi['features_im'] = 1

# atributos seleccionados
features_sel = features_svc.join([features_l2, features_l1, features_rfo, features_anova, features_mi], how='outer')
features_sel.drop(['p_values','mutual information'], axis=1, inplace=True)
features_sel['total'] = features_sel.sum(axis=1)
features_sel = features_sel[features_sel['total']>=3]
lista_atributos = list(features_sel.index)
print(lista_atributos)

['max_degree_ac', 'max_degree_b2i', 'max_degree_b2d', 'max_degree_gi', 'mean_eccentricity_ai', 'mean_eccentricity_b1c', 'mean_eccentricity_b2i', 'mean_eccentricity_b2d', 'mean_eccentricity_dd', 'mean_eccentricity_gi', 'mean_eccentricity_gd', 'mean_eccentricity_td', 'EX2_score', 'age', 'school_years', 'IRI_PT', 'IRI_EC', 'RPQ AR', 'RPQ AP', 'AN', 'AL', 'exposure_level_high', 'exposure_level_low', 'victims_self_no', 'victims_self_yes', 'gender_F', 'gender_M', 'IRI_PD', 'diameter_b2i']


In [15]:
data_all_cod = pd.get_dummies(data_all)
X_sel = data_all_cod[lista_atributos].copy()

# separación datos de prueba
X_train, X_test, y_train, y_test = train_test_split(X_sel, y, test_size=0.2, random_state=77, stratify=y)
y_train_label = LabelEncoder().fit_transform(y_train)
y_test_label = LabelEncoder().fit_transform(y_test)

# separación de variables para preprocesar
continuas_cols = X_train.select_dtypes(include=['float64']).columns.to_list()
discretas_cols = X_train.select_dtypes(include=['int64']).columns.to_list()

# espacio de hiperparámetros a optimizar
param_space =dict(n_estimators=range(1,100), max_depth=range(3,10), subsample=uniform(0.1,0.9), eta=uniform(0,1), colsample_bytree=uniform(0.1,0.9))

In [16]:
# Modelo XGBoost sin preprocesar
params_np, scores_np, test_score_np = modelo_xgboost_np(param_space=param_space)

# Modelo XGBoost con standard scaler y min_max scaler
preprocessor = ColumnTransformer([('scaler', StandardScaler(), continuas_cols), ('min_max', MinMaxScaler(), discretas_cols)], remainder='passthrough')
params_sc, scores_sc, test_score_sc = modelo_xgboost_sc(param_space=param_space, preprocessor=preprocessor)

# Modelo XGBoost con power transformer y min_max scaler
preprocessor = ColumnTransformer([('pt', PowerTransformer(), continuas_cols), ('min_max', MinMaxScaler(), discretas_cols)], remainder='passthrough')
params_pt, scores_pt, test_score_pt = modelo_xgboost_sc(param_space=param_space, preprocessor=preprocessor)

Best score: 0.9428571428571428: 100%|██████████| 40/40 [00:46<00:00,  1.17s/it]


best parameters np: {'colsample_bytree': 0.517466795342206, 'eta': 0.1191499330549105, 'max_depth': 6, 'n_estimators': 56, 'subsample': 0.568746761525535}
best accuracy np: 0.9428571428571428


Best score: 0.9428571428571428: 100%|██████████| 40/40 [00:38<00:00,  1.04it/s]


best parameters: {'colsample_bytree': 0.4822211406205049, 'eta': 0.6804145765715574, 'max_depth': 5, 'n_estimators': 21, 'subsample': 0.8052037779431528}
best accuracy: 0.9428571428571428


Best score: 0.9285714285714286: 100%|██████████| 40/40 [01:15<00:00,  1.90s/it]


best parameters: {'colsample_bytree': 0.3269621301631981, 'eta': 0.6086467388616723, 'max_depth': 4, 'n_estimators': 74, 'subsample': 0.7923655824539855}
best accuracy: 0.9285714285714286


In [17]:
clf_results = pd.DataFrame(data=[[params_np, scores_np.mean(), scores_np.std(), test_score_np],[params_sc, scores_sc.mean(), scores_sc.std(), test_score_sc],[params_pt, scores_pt.mean(), scores_pt.std(), test_score_pt]], index=['np', 'sc', 'pt'], columns=['best parameters', 'mean cv score', 'sd cv score', 'test score'])
clf_results

Unnamed: 0,best parameters,mean cv score,sd cv score,test score
np,"{'colsample_bytree': 0.517466795342206, 'eta':...",0.942857,0.028571,0.777778
sc,"{'colsample_bytree': 0.4822211406205049, 'eta'...",0.942857,0.028571,0.777778
pt,"{'colsample_bytree': 0.3269621301631981, 'eta'...",0.928571,0.045175,0.666667


In [18]:
# Mejor modelo
params= clf_results.loc['np']['best parameters']
y = data_all['type'].copy()
model, model_fit = ml.mejor_modelo(params=params, X=X_sel, y=y, pre_pipe='np')

mean val score:  0.9428571428571428
std val score:  0.02857142857142856
test score:  0.7777777777777778


In [19]:
# Análisis de relevancia
y_label = LabelEncoder().fit_transform(y)
r = permutation_importance(model_fit, X_sel, y_label,n_repeats=20,random_state=0, scoring='accuracy')
importancia_atributos = pd.DataFrame(data=[r.importances_mean], columns=X_sel.columns, index=['mean importance']).transpose()
importancia_atributos.sort_values(by='mean importance', ascending=False)

Unnamed: 0,mean importance
IRI_PT,0.026136
AL,0.025568
IRI_PD,0.011932
mean_eccentricity_b1c,0.010227
gender_F,0.010227
EX2_score,0.008523
RPQ AR,0.006818
age,0.003977
victims_self_yes,0.001705
RPQ AP,0.0


In [20]:
importancia_atributos.to_csv('importancia atributos clasificación atributos seleccionados 2.csv', index_label='feature')

In [21]:
df_errados = ml.errores(model=model, label='type', X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)
datos_errados = pd.merge(df_errados, data_all, how='inner', left_index=True, right_index=True)
datos_errados.rename(columns={'type_x':'type'}, inplace=True)
datos_errados = datos_errados.drop(['type_y'], axis=1)
datos_errados

Unnamed: 0_level_0,type,predicted,diameter_ac,diameter_ai,diameter_ad,diameter_b1c,diameter_b1i,diameter_b1d,diameter_b2c,diameter_b2i,...,IRI_FS,IRI_EC,IRI_PD,IMA,RPQ AR,RPQ AP,TD,AN,AL,IH
subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21100,ex-combatant,victims,0.269841,0.222222,0.047619,0.333333,0.269841,0.063492,0.285714,0.222222,...,9,11,11,27,20,16,1.5,5.75,9.0,0.5
21137,ex-combatant,controls,0.380952,0.285714,0.095238,0.333333,0.269841,0.063492,0.396825,0.269841,...,5,17,5,33,17,14,0.0,2.0,8.0,0.0
22108,controls,ex-combatant,0.285714,0.253968,0.031746,0.238095,0.269841,-0.031746,0.285714,0.349206,...,15,19,9,33,20,14,0.0,1.5,9.5,0.0
24027,victims,ex-combatant,0.222222,0.238095,-0.015873,0.380952,0.238095,0.142857,0.285714,0.269841,...,8,14,11,32,19,13,0.0,0.0,9.0,0.0
22106,controls,victims,0.31746,0.31746,0.0,0.31746,0.285714,0.031746,0.301587,0.365079,...,16,17,15,57,21,14,0.0,3.25,7.5,0.0
22110,controls,ex-combatant,0.31746,0.269841,0.047619,0.222222,0.301587,-0.079365,0.269841,0.253968,...,14,13,8,31,18,12,0.0,2.0,8.0,0.0
24045,victims,ex-combatant,0.31746,0.31746,0.0,0.285714,0.31746,-0.031746,0.269841,0.206349,...,13,13,11,49,19,18,0.0,0.5,3.75,0.25
21111,ex-combatant,victims,0.253968,0.269841,-0.015873,0.253968,0.238095,0.015873,0.349206,0.31746,...,15,13,10,44,24,15,4.5,2.25,3.0,0.25
21114,ex-combatant,victims,0.269841,0.238095,0.031746,0.31746,0.269841,0.047619,0.301587,0.301587,...,8,16,10,31,13,13,0.0,2.25,8.25,0.75


In [22]:
datos_errados.to_csv('errores clasificación atributos seleccionados 2.csv')

## Prueba de conjuntos de atributos

In [142]:
lista_atributos_ss = ['diameter_b2i', 'leaf_fraction_b2i', 'mean_mpli_b1c', 'mean_mpli_b1i', 'dscore_4', 'AN', 'AL', 'IH', 'exposure_level', 'victims_self', 'gender', 'IRI_PT', 'mean_eccentricity_b2i', 'tree_hierarchy_ti', 'EX2_score', 'IRI_PD']

lista_atributos_pt = ['mean_mpli_b1c', 'mean_mpli_b1i', 'mean_mpli_dc', 'dscore_4', 'AN', 'AL', 'IH', 'exposure_level', 'victims_self', 'gender', 'IRI_PT', 'diameter_b2i', 'mean_eccentricity_b2i', 'mean_eccentricity_di', 'mean_mpli_gd', 'tree_hierarchy_ti', 'mean_mpli_ai', 'EX2_score', 'IRI_PD']

lista_atributos_3b = ['max_degree_b2c', 'max_degree_b2i', 'max_degree_dc', 'max_degree_gc', 'max_degree_gi', 'mean_eccentricity_b2i', 'mean_eccentricity_b2d', 'mean_eccentricity_ti', 'EX2_score', 'school_years', 'dscore_4', 'IRI_PT', 'IRI_EC', 'RPQ AP', 'TD', 'AL', 'victims_self', 'gender', 'mean_eccentricity_ac', 'AN', 'mean_mpli_b1c', 'IRI_PD']

lista_atributos_4b = ['mean_eccentricity_b2i', 'mean_eccentricity_b2d', 'mean_eccentricity_ti', 'EX2_score', 'school_years', 'dscore_4', 'IRI_PT', 'IRI_EC', 'TD', 'AL', 'victims_self', 'gender', 'AN']

lista_atributos_3a = ['max_degree_b2c', 'max_degree_b2i', 'max_degree_dc', 'max_degree_gc', 'max_degree_gi', 'mean_eccentricity_b2i', 'mean_eccentricity_b2d', 'mean_eccentricity_gc', 'mean_eccentricity_ti', 'EX2_score', 'school_years', 'dscore_4', 'IRI_PT', 'IRI_EC', 'RPQ AP', 'TD', 'AN', 'AL', 'exposure_level', 'victims_self', 'gender', 'mean_eccentricity_ac', 'diameter_b2i', 'mean_mpli_b1c', 'IRI_PD']

lista_atributos_4a = ['mean_eccentricity_b2i', 'mean_eccentricity_b2d', 'mean_eccentricity_ti', 'EX2_score', 'school_years', 'dscore_4', 'IRI_PT', 'IRI_EC', 'AN', 'AL', 'exposure_level', 'victims_self', 'gender']

In [34]:
from sklearn.preprocessing import LabelEncoder
y_train_label = LabelEncoder().fit_transform(y_train)
y_test_label = LabelEncoder().fit_transform(y_test)
listados_atributos = [lista_atributos_ss, lista_atributos_pt, lista_atributos_3a, lista_atributos_3b, lista_atributos_4a, lista_atributos_4b]
resultados = pd.DataFrame(columns=['features list','model','parameters','score'])
for lista in listados_atributos:
    X_train_sel = X_train[lista]
    param_space =dict(n_estimators=range(1,100), max_depth=range(3,10), subsample=uniform(0.1,0.9), eta=uniform(0,1), colsample_bytree=uniform(0.1,0.9))
    @scheduler.parallel(n_jobs=-1)
    def objective(**params):
        global X_train_sel, y_train_label
        model = XGBClassifier(**params)
        error= cross_val_score(estimator = model, X= X_train_sel, y= y_train_label, scoring='accuracy', cv=5).mean()
        return error
    tuner = Tuner(param_space, objective)
    best_results = tuner.maximize()
    print('best parameters:', best_results['best_params'])
    print('best accuracy:', best_results['best_objective'])
    temp = pd.DataFrame([[lista,'xg_boosting',list(best_results['best_params'].items()),best_results['best_objective']]],columns=['features list','model','parameters','score'])
    resultados = pd.concat([resultados,temp],ignore_index=True)
resultados

  0%|          | 0/20 [00:00<?, ?it/s]

best parameters: {'colsample_bytree': 0.3738293909292353, 'eta': 0.12183648974944172, 'max_depth': 5, 'n_estimators': 82, 'subsample': 0.7075383822586274}
best accuracy: 0.7923809523809523


  0%|          | 0/20 [00:00<?, ?it/s]

best parameters: {'colsample_bytree': 0.5414439137673309, 'eta': 0.12175578549511856, 'max_depth': 5, 'n_estimators': 48, 'subsample': 0.7334854432051042}
best accuracy: 0.819047619047619


  0%|          | 0/20 [00:00<?, ?it/s]

best parameters: {'colsample_bytree': 0.5768197432805084, 'eta': 0.29328990931426424, 'max_depth': 4, 'n_estimators': 96, 'subsample': 0.5066068270827632}
best accuracy: 0.8457142857142858


  0%|          | 0/20 [00:00<?, ?it/s]

best parameters: {'colsample_bytree': 0.9913324675866748, 'eta': 0.2955294349132507, 'max_depth': 5, 'n_estimators': 88, 'subsample': 0.9028792673128314}
best accuracy: 0.7895238095238095


  0%|          | 0/20 [00:00<?, ?it/s]

best parameters: {'colsample_bytree': 0.8361803140246551, 'eta': 0.23097091542603665, 'max_depth': 8, 'n_estimators': 36, 'subsample': 0.6635975725407189}
best accuracy: 0.8457142857142858


  0%|          | 0/20 [00:00<?, ?it/s]

best parameters: {'colsample_bytree': 0.8600782212733878, 'eta': 0.04742174960433265, 'max_depth': 5, 'n_estimators': 34, 'subsample': 0.6710795775113861}
best accuracy: 0.8323809523809522


Unnamed: 0,features list,model,parameters,score
0,"[diameter_b2i, leaf_fraction_b2i, mean_mpli_b1...",xg_boosting,"[(colsample_bytree, 0.3738293909292353), (eta,...",0.792381
1,"[mean_mpli_b1c, mean_mpli_b1i, mean_mpli_dc, d...",xg_boosting,"[(colsample_bytree, 0.5414439137673309), (eta,...",0.819048
2,"[max_degree_b2c, max_degree_b2i, max_degree_dc...",xg_boosting,"[(colsample_bytree, 0.5768197432805084), (eta,...",0.845714
3,"[max_degree_b2c, max_degree_b2i, max_degree_dc...",xg_boosting,"[(colsample_bytree, 0.9913324675866748), (eta,...",0.789524
4,"[mean_eccentricity_b2i, mean_eccentricity_b2d,...",xg_boosting,"[(colsample_bytree, 0.8361803140246551), (eta,...",0.845714
5,"[mean_eccentricity_b2i, mean_eccentricity_b2d,...",xg_boosting,"[(colsample_bytree, 0.8600782212733878), (eta,...",0.832381


In [35]:
resultados.to_csv('prueba conjuntos de atributos clasificación grupos conectividad.csv', index=False)

In [40]:
X_sel = X[lista_atributos_4a].copy()
X_train2, X_test2, y_train2, y_test2 = train_test_split(X_sel, y, test_size=0.2, random_state=77, stratify=data.group)
y_train2_label = LabelEncoder().fit_transform(y_train2)
y_test2_label = LabelEncoder().fit_transform(y_test2)
params = {'colsample_bytree':0.8361803140246551, 'eta':0.23097091542603665, 'max_depth':8, 'n_estimators':36, 'subsample':0.6635975725407189}
model = XGBClassifier(**params)
scores = cross_val_score(estimator = model, X= X_train2, y= y_train2_label, scoring='accuracy', cv=5)
print(scores.mean(), scores.std())
model.fit(X_train2, y_train2_label)
print(model.score(X_test2, y_test2_label))

0.8457142857142858 0.12168879679455344
0.6842105263157895


In [41]:
X_sel = X[lista_atributos_3a].copy()
X_train2, X_test2, y_train2, y_test2 = train_test_split(X_sel, y, test_size=0.2, random_state=77, stratify=data.group)
y_train2_label = LabelEncoder().fit_transform(y_train2)
y_test2_label = LabelEncoder().fit_transform(y_test2)
params = {'colsample_bytree':0.5768197432805084, 'eta':0.29328990931426424, 'max_depth':4, 'n_estimators':96, 'subsample':0.5066068270827632}
model = XGBClassifier(**params)
scores = cross_val_score(estimator = model, X= X_train2, y= y_train2_label, scoring='accuracy', cv=5)
print(scores.mean(), scores.std())
model.fit(X_train2, y_train2_label)
print(model.score(X_test2, y_test2_label))

0.8457142857142858 0.0716251263884311
0.6842105263157895


In [42]:
X_sel = X[lista_atributos_4b].copy()
X_train2, X_test2, y_train2, y_test2 = train_test_split(X_sel, y, test_size=0.2, random_state=77, stratify=data.group)
y_train2_label = LabelEncoder().fit_transform(y_train2)
y_test2_label = LabelEncoder().fit_transform(y_test2)
params = {'colsample_bytree':0.8600782212733878, 'eta':0.04742174960433265, 'max_depth':5, 'n_estimators':34, 'subsample':0.6710795775113861}
model = XGBClassifier(**params)
scores = cross_val_score(estimator = model, X= X_train2, y= y_train2_label, scoring='accuracy', cv=5)
print(scores.mean(), scores.std())
model.fit(X_train2, y_train2_label)
print(model.score(X_test2, y_test2_label))

0.8323809523809522 0.03826154176495366
0.5789473684210527


Mejor el modelo 3, porque tiene menos varianza.

## Análisis de relevancia de atributos

In [46]:
from sklearn.inspection import permutation_importance
X_sel = X[lista_atributos_4b].copy()
#X_train2, X_test2, y_train2, y_test2 = train_test_split(X_sel, y, test_size=0.2, random_state=77, stratify=data.group)
#y_train2_label = LabelEncoder().fit_transform(y_train2)
#y_test2_label = LabelEncoder().fit_transform(y_test2)
y_label = LabelEncoder().fit_transform(y)
params = {'colsample_bytree':0.8600782212733878, 'eta':0.04742174960433265, 'max_depth':5, 'n_estimators':34, 'subsample':0.6710795775113861}
model = XGBClassifier(**params).fit(X_sel, y_label)
r = permutation_importance(model, X_sel, y_label,n_repeats=10,random_state=0, scoring='accuracy')
importancia_atributos = pd.DataFrame(data=[r.importances_mean], columns=X_sel.columns, index=['mean importance']).transpose()
importancia_atributos.sort_values(by='mean importance', ascending=False)

Unnamed: 0,mean importance
EX2_score,0.206593
gender,0.134066
mean_eccentricity_b2i,0.083516
AL,0.059341
mean_eccentricity_ti,0.048352
IRI_PT,0.043407
IRI_EC,0.037912
school_years,0.022527
dscore_4,0.018132
victims_self,0.00989


In [45]:
importancia_atributos.to_csv('importancia atributos clasificación grupos conectividad.csv', index_label='feature')

## Análisis errores

In [219]:
from sklearn.model_selection import StratifiedKFold
X_sel = X[lista_atributos_4b].copy()
X_train2, X_test2, y_train2, y_test2 = train_test_split(X_sel, y, test_size=0.2, random_state=77, stratify=data.group)
le = LabelEncoder()
le.fit(y_train2)
y_train2_label = le.fit_transform(y_train2)
y_test2_label = le.fit_transform(y_test2)
y_real = pd.DataFrame(data=y_train2_label, index=y_train2.index, columns=['group'])
y_real.reset_index(inplace=True)

params = {'colsample_bytree':0.8600782212733878, 'eta':0.04742174960433265, 'max_depth':5, 'n_estimators':34, 'subsample':0.6710795775113861}
model = XGBClassifier(**params).fit(X_train2, y_train2_label)
skf = StratifiedKFold(n_splits=5)
X_val = X_train2.values
df_errados = pd.DataFrame(columns=['predicted'])

# errores dataset de entrenamiento
for i, (train_index, test_index) in enumerate(skf.split(X_val, y_train2_label)):
    model.fit(X_val[train_index], y_train2_label[train_index])
    y_est = model.predict(X_val[test_index])
    errado = test_index[y_train2_label[test_index] != y_est]
    #y_r = le.inverse_transform(y_train2_label[test_index])
    y_pred = le.inverse_transform(y_est)
    #y_r_df = pd.DataFrame(data=(y_r), index=test_index, columns=['real'])
    y_p_df = pd.DataFrame(data=(y_pred), index=test_index, columns=['predicted'])
    #y_r_p = pd.merge(y_r_df, y_p_df, left_index=True, right_index=True)
    errado_idx = pd.Index(errado)
    y_errados = y_p_df.loc[errado_idx].copy()
    df_errados = pd.concat([df_errados, y_errados], ignore_index=False)
y_t = y_train2.reset_index().copy()
df_errados = pd.merge(y_t, df_errados, how='inner', left_index=True, right_index=True)
df_errados.set_index('subject', inplace=True)

# errores dataset de prueba
y_test_pred = model.predict(X_test2)
y_test_pred = le.inverse_transform(y_test_pred)
y_test_pred_df = pd.DataFrame(data=y_test_pred, index=y_test2.index, columns=['predicted'])
test_errados_df = pd.merge(y_test2, y_test_pred_df, left_index=True, right_index=True)
test_errados_df = test_errados_df[test_errados_df['group'] != test_errados_df['predicted']]

df_errados = pd.concat([df_errados, test_errados_df], ignore_index=False)
df_errados

Unnamed: 0_level_0,group,predicted
subject,Unnamed: 1_level_1,Unnamed: 2_level_1
21116,ex-guerrillas,ex-paramilitaries
21140,ex-guerrillas,ex-paramilitaries
24027,victims,ex-paramilitaries
22108,controls,ex-paramilitaries
21127,ex-paramilitaries,ex-guerrillas
24057,victims,ex-paramilitaries
21125,ex-guerrillas,ex-paramilitaries
23012,ex-guerrillas,ex-paramilitaries
21100,ex-guerrillas,victims
24014,victims,ex-guerrillas


In [227]:
datos_errados = pd.merge(df_errados, data, how='inner', left_index=True, right_index=True)
datos_errados.rename(columns={'group_x':'group'}, inplace=True)
variables = lista_atributos_4b
variables.append('group')
variables.append('predicted')
datos_errados = datos_errados[variables]
datos_errados
datos_errados.to_csv('sujetos errados clasificación conectividad grupos.csv', index_label='subject')

# Prueba excluyendo variables demográficas

In [5]:
data.head()

Unnamed: 0_level_0,diameter_ac,diameter_ai,diameter_ad,diameter_b1c,diameter_b1i,diameter_b1d,diameter_b2c,diameter_b2i,diameter_b2d,diameter_dc,...,IRI_FS,IRI_EC,IRI_PD,IMA,RPQ AR,RPQ AP,TD,AN,AL,IH
subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21100,0.269841,0.222222,0.047619,0.333333,0.269841,0.063492,0.285714,0.222222,0.063492,0.365079,...,9,11,11,27,20,16,1.5,5.75,9.0,0.5
21101,0.222222,0.269841,-0.047619,0.269841,0.365079,-0.095238,0.333333,0.365079,-0.031746,0.31746,...,22,13,6,70,33,26,0.0,1.25,9.5,0.5
21102,0.301587,0.301587,0.0,0.285714,0.285714,0.0,0.253968,0.301587,-0.047619,0.380952,...,16,15,10,42,13,12,0.0,0.0,4.5,0.0
21103,0.333333,0.396825,-0.063492,0.269841,0.285714,-0.015873,0.349206,0.285714,0.063492,0.333333,...,19,16,12,32,16,15,0.0,0.0,6.25,0.0
21104,0.238095,0.238095,0.0,0.31746,0.396825,-0.079365,0.269841,0.380952,-0.111111,0.285714,...,15,18,8,30,16,17,2.0,1.5,7.5,0.0


In [5]:
data2 = data.drop(columns=['gender','laterality', 'school_years', 'age'], axis=1).copy()
data2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 91 entries, 21100 to 24101
Columns: 142 entries, diameter_ac to IH
dtypes: float64(113), int64(26), object(3)
memory usage: 101.7+ KB


In [6]:
# La variable de salida son los grupos de actores
X= data2.drop(['group'], axis=1).copy()
y = data2['group'].copy()
# codificación variables binarias
X['victims_self'] = pd.factorize(X['victims_self'])[0]
X['exposure_level'] = pd.factorize(X['exposure_level'])[0]
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 91 entries, 21100 to 24101
Columns: 141 entries, diameter_ac to IH
dtypes: float64(113), int64(28)
memory usage: 101.0 KB


In [7]:
# Por selección por modelos
# SVC
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel
lsvc = LinearSVC(random_state=72).fit(X, y)
model = SelectFromModel(lsvc, threshold='2*mean', prefit=True)
X_new = model.transform(X)
features_lsvc = model.get_feature_names_out(input_features=X.columns)
features_svc = pd.DataFrame(data=np.ones_like(features_lsvc), columns=['features_lsvc'], index=features_lsvc)

# regresión logística l2
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
lr = LogisticRegression(penalty="l2", solver='saga', max_iter=10000, random_state=72).fit(X,y)
model = SelectFromModel(lr, threshold='2*mean', prefit=True)
X_new = model.transform(X)
features_lrl2 = model.get_feature_names_out(input_features=X.columns)
features_l2 = pd.DataFrame(data=np.ones_like(features_lrl2), columns=['features_lrl2'], index=features_lrl2)

# regresión logística l1
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
lr = LogisticRegression(penalty="l1", solver='saga', max_iter=10000, random_state=72).fit(X,y)
model = SelectFromModel(lr, threshold='2*mean', prefit=True)
X_new = model.transform(X)
features_lrl1 = model.get_feature_names_out(input_features=X.columns)
features_l1 = pd.DataFrame(data=np.ones_like(features_lrl1), columns=['features_lrl1'], index=features_lrl1)

# random forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
rf = RandomForestClassifier(random_state=72).fit(X,y)
model = SelectFromModel(rf, threshold='1.5*mean', prefit=True)
X_new = model.transform(X)
features_rf = model.get_feature_names_out(input_features=X.columns)
features_rfo = pd.DataFrame(data=np.ones_like(features_rf), columns=['features_rf'], index=features_rf)

# anova
from sklearn.feature_selection import f_classif
__, p_values = f_classif(X,y)
features_anova = pd.DataFrame(p_values, columns=['p_values'], index=X.columns)
features_anova = features_anova[features_anova['p_values']<0.05]
features_anova['features_an'] = 1

# información mutua
from sklearn.feature_selection import mutual_info_classif
mi = mutual_info_classif(X, y)
features_mi = pd.DataFrame(mi, columns=['mutual information'], index=X.columns)
features_mi = features_mi[features_mi['mutual information']>0.1]
features_mi['features_im'] = 1

# atributos seleccionados
features_sel = features_svc.join([features_l2, features_l1, features_rfo, features_anova, features_mi], how='outer')
features_sel.drop(['p_values','mutual information'], axis=1, inplace=True)
features_sel['total'] = features_sel.sum(axis=1)
features_sel = features_sel[features_sel['total']>=3]
lista_atributos = list(features_sel.index)
print(lista_atributos)

['max_degree_b1d', 'max_degree_b2c', 'max_degree_b2i', 'max_degree_dc', 'max_degree_gc', 'max_degree_gi', 'mean_eccentricity_b2i', 'mean_eccentricity_b2d', 'exposure_level', 'EX2_score', 'dscore_4', 'IRI_PT', 'IRI_EC', 'RPQ AP', 'TD', 'AN', 'AL', 'IH', 'mean_eccentricity_ti', 'IRI_PD', 'diameter_b2i', 'mean_mpli_b1c']


In [27]:
X_sel = X[lista_atributos].copy()
X_train, X_test, y_train, y_test = train_test_split(X_sel, y, test_size=0.2, random_state=77, stratify=data2.group)
X_train.shape

(72, 22)

In [28]:
X_sel = X_sel.drop(['IH'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X_sel, y, test_size=0.2, random_state=77, stratify=data2.group)
X_train.shape

(72, 21)

In [47]:
## XG Boosting
from sklearn.preprocessing import LabelEncoder
y_train_label = LabelEncoder().fit_transform(y_train)
y_test_label = LabelEncoder().fit_transform(y_test)
param_space =dict(n_estimators=range(1,100), max_depth=range(3,10), subsample=uniform(0.1,0.9), eta=uniform(0,1), colsample_bytree=uniform(0.1,0.9))
@scheduler.parallel(n_jobs=-1)
def objective(**params):
    global X_train, y_train
    model = XGBClassifier(**params)
    error= cross_val_score(estimator = model, X= X_train, y= y_train_label, scoring='accuracy', cv=5).mean()
    return error
tuner = Tuner(param_space, objective)
best_results = tuner.maximize()
print('best parameters:', best_results['best_params'])
print('best accuracy:', best_results['best_objective'])

params = best_results['best_params']
model = XGBClassifier(**params)
scores = cross_val_score(estimator = model, X= X_train, y= y_train_label, scoring='accuracy', cv=5)
print(scores.mean(), scores.std())
model.fit(X_train, y_train_label)
print(model.score(X_test, y_test_label))

  0%|          | 0/20 [00:00<?, ?it/s]

best parameters: {'colsample_bytree': 0.7226936106097185, 'eta': 0.5599271556658973, 'max_depth': 9, 'n_estimators': 92, 'subsample': 0.33632145829846005}
best accuracy: 0.720952380952381
0.720952380952381 0.0501788185612171
0.7894736842105263


In [48]:
# con escalizador
param_space =dict(n_estimators=range(1,100), max_depth=range(3,10), subsample=uniform(0.1,0.9), eta=uniform(0,1), colsample_bytree=uniform(0.1,0.9))
@scheduler.parallel(n_jobs=-1)
def objective(**params):
    global X_train, y_train_label
    numeric_cols = X_train.select_dtypes(include=['float64','int64']).columns.to_list()
    preprocessor = ColumnTransformer([('scale', StandardScaler(), numeric_cols)], remainder='passthrough')
    pipe = Pipeline([('preprocessing', preprocessor),('model', XGBClassifier(**params))])
    error = cross_val_score(estimator = pipe, X= X_train, y= y_train_label, scoring='accuracy', cv=5).mean()
    return error
tuner = Tuner(param_space, objective)
best_results = tuner.maximize()
print('best parameters:', best_results['best_params'])
print('best accuracy:', best_results['best_objective'])

params = best_results['best_params']
numeric_cols = X_train.select_dtypes(include=['float64','int64']).columns.to_list()
preprocessor = ColumnTransformer([('scale', StandardScaler(), numeric_cols)], remainder='passthrough')
pipe = Pipeline([('preprocessing', preprocessor),('model', XGBClassifier(**params))])
scores = cross_val_score(estimator = pipe, X= X_train, y= y_train_label, scoring='accuracy', cv=5)
print(scores.mean(), scores.std())
pipe.fit(X_train, y_train_label)
print(pipe.score(X_test, y_test_label))

  0%|          | 0/20 [00:00<?, ?it/s]

best parameters: {'colsample_bytree': 0.9199374589563524, 'eta': 0.4063259758824468, 'max_depth': 6, 'n_estimators': 9, 'subsample': 0.4906945794641526}
best accuracy: 0.7333333333333334
0.7333333333333334 0.10896336940924632
0.5789473684210527


In [45]:
# con standard_scaler y min_max_scaler
param_space =dict(n_estimators=range(1,100), max_depth=range(3,10), subsample=uniform(0.1,0.9), eta=uniform(0,1), colsample_bytree=uniform(0.1,0.9))
@scheduler.parallel(n_jobs=-1)
def objective(**params):
    global X_train, y_train_label
    continuas_cols = X_train.select_dtypes(include=['float64']).columns.to_list()
    discretas_cols = X_train.select_dtypes(include=['int64']).columns.to_list()
    preprocessor = ColumnTransformer([('scaler', StandardScaler(), continuas_cols), ('min_max', MinMaxScaler(), discretas_cols)], remainder='passthrough')
    pipe = Pipeline([('preprocessing', preprocessor),('model', XGBClassifier(**params))])
    error = cross_val_score(estimator = pipe, X= X_train, y= y_train_label, scoring='accuracy', cv=5).mean()
    return error
tuner = Tuner(param_space, objective)
best_results = tuner.maximize()
print('best parameters:', best_results['best_params'])
print('best accuracy:', best_results['best_objective'])

params = best_results['best_params']
continuas_cols = X_train.select_dtypes(include=['float64']).columns.to_list()
discretas_cols = X_train.select_dtypes(include=['int64']).columns.to_list()
preprocessor = ColumnTransformer([('scaler', StandardScaler(), continuas_cols), ('min_max', MinMaxScaler(), discretas_cols)], remainder='passthrough')
pipe = Pipeline([('preprocessing', preprocessor),('model', XGBClassifier(**params))])
scores = cross_val_score(estimator = pipe, X= X_train, y= y_train_label, scoring='accuracy', cv=5)
print(scores.mean(), scores.std())
pipe.fit(X_train, y_train_label)
print(pipe.score(X_test, y_test_label))

  0%|          | 0/20 [00:00<?, ?it/s]

best parameters: {'colsample_bytree': 0.7788533093099961, 'eta': 0.22342482980835654, 'max_depth': 5, 'n_estimators': 97, 'subsample': 0.38460649017525816}
best accuracy: 0.7342857142857142
0.7342857142857142 0.08589399151150083
0.6842105263157895


In [46]:
# con power transformer y min_max_scaler
param_space =dict(n_estimators=range(1,100), max_depth=range(3,10), subsample=uniform(0.1,0.9), eta=uniform(0,1), colsample_bytree=uniform(0.1,0.9))
@scheduler.parallel(n_jobs=-1)
def objective(**params):
    global X_train, y_train_label
    continuas_cols = X_train.select_dtypes(include=['float64']).columns.to_list()
    discretas_cols = X_train.select_dtypes(include=['int64']).columns.to_list()
    preprocessor = ColumnTransformer([('pt', PowerTransformer(), continuas_cols), ('min_max', MinMaxScaler(), discretas_cols)], remainder='passthrough')
    pipe = Pipeline([('preprocessing', preprocessor),('model', XGBClassifier(**params))])
    error = cross_val_score(estimator = pipe, X= X_train, y= y_train_label, scoring='accuracy', cv=5).mean()
    return error
tuner = Tuner(param_space, objective)
best_results = tuner.maximize()
print('best parameters:', best_results['best_params'])
print('best accuracy:', best_results['best_objective'])

params = best_results['best_params']
continuas_cols = X_train.select_dtypes(include=['float64']).columns.to_list()
discretas_cols = X_train.select_dtypes(include=['int64']).columns.to_list()
preprocessor = ColumnTransformer([('pt', PowerTransformer(), continuas_cols), ('min_max', MinMaxScaler(), discretas_cols)], remainder='passthrough')
pipe = Pipeline([('preprocessing', preprocessor),('model', XGBClassifier(**params))])
scores = cross_val_score(estimator = pipe, X= X_train, y= y_train_label, scoring='accuracy', cv=5)
print(scores.mean(), scores.std())
pipe.fit(X_train, y_train_label)
print(pipe.score(X_test, y_test_label))

  0%|          | 0/20 [00:00<?, ?it/s]

best parameters: {'colsample_bytree': 0.6616948900690665, 'eta': 0.18002468544097283, 'max_depth': 7, 'n_estimators': 85, 'subsample': 0.2506667480527316}
best accuracy: 0.7352380952380952
0.7352380952380952 0.08500233423498922
0.7368421052631579


In [22]:

# Mejor modelo
#params = {'colsample_bytree': 0.8266922339106892, 'eta': 0.9171517501210134, 'max_depth': 9, 'n_estimators': 79, 'subsample': 0.9823611838345138}
params = {'colsample_bytree': 0.9945235209206074, 'eta': 0.4071307326352137, 'max_depth': 4, 'n_estimators': 92, 'subsample': 0.16893896699926955}
continuas_cols = X_train.select_dtypes(include=['float64']).columns.to_list()
discretas_cols = X_train.select_dtypes(include=['int64']).columns.to_list()
preprocessor = ColumnTransformer([('scaler', StandardScaler(), continuas_cols), ('min_max', MinMaxScaler(), discretas_cols)], remainder='passthrough')
pipe = Pipeline([('preprocessing', preprocessor),('model', XGBClassifier(**params))])
scores = cross_val_score(estimator = pipe, X= X_train, y= y_train_label, scoring='accuracy', cv=5)
print(scores.mean(), scores.std())
pipe.fit(X_train, y_train_label)
print(pipe.score(X_test, y_test_label))

  0%|          | 0/20 [00:00<?, ?it/s]

best parameters: {'colsample_bytree': 0.6913483799779206, 'eta': 0.20571609941422297, 'max_depth': 6, 'n_estimators': 62, 'subsample': 0.5342591862748203}
best accuracy: 0.7361904761904762


In [25]:
# Análisis de relevancia
from sklearn.inspection import permutation_importance
y_label = LabelEncoder().fit_transform(y)
model = pipe.fit(X_sel, y_label)
r = permutation_importance(model, X_sel, y_label,n_repeats=10,random_state=0, scoring='accuracy')
importancia_atributos = pd.DataFrame(data=[r.importances_mean], columns=X_sel.columns, index=['mean importance']).transpose()
importancia_atributos.sort_values(by='mean importance', ascending=False)

Unnamed: 0,mean importance
EX2_score,0.143956
IRI_PT,0.1131868
AL,0.1010989
mean_eccentricity_b2i,0.09340659
mean_mpli_b1c,0.08021978
mean_eccentricity_ti,0.07692308
IRI_EC,0.04725275
IRI_PD,0.02307692
dscore_4,0.01758242
max_degree_b2c,0.01428571


In [26]:
importancia_atributos.to_csv('importancia atributos clasificación grupos conectividad sin demográficas 2.csv', index_label='feature')

In [39]:
# Extracción sujetos mal clasificados
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=5)

le = LabelEncoder()
le.fit(y_train)
y_train_label = le.fit_transform(y_train)
y_test_label = le.fit_transform(y_test)
model = pipe.fit(X_train, y_train_label)

df_errados = pd.DataFrame(columns=['predicted'])
# errores dataset de entrenamiento
for i, (train_index, test_index) in enumerate(skf.split(X_train, y_train_label)):
    model.fit(X_train.iloc[train_index], y_train_label[train_index])
    y_est = model.predict(X_train.iloc[test_index])
    errado = test_index[y_train_label[test_index] != y_est]
    y_pred = le.inverse_transform(y_est)
    y_p_df = pd.DataFrame(data=(y_pred), index=test_index, columns=['predicted'])
    errado_idx = pd.Index(errado)
    y_errados = y_p_df.loc[errado_idx].copy()
    df_errados = pd.concat([df_errados, y_errados], ignore_index=False)
y_t = y_train.reset_index().copy()
df_errados = pd.merge(y_t, df_errados, how='inner', left_index=True, right_index=True)
df_errados.set_index('subject', inplace=True)

# errores dataset de prueba
y_test_pred = model.predict(X_test)
y_test_pred = le.inverse_transform(y_test_pred)
y_test_pred_df = pd.DataFrame(data=y_test_pred, index=y_test.index, columns=['predicted'])
test_errados_df = pd.merge(y_test, y_test_pred_df, left_index=True, right_index=True)
test_errados_df = test_errados_df[test_errados_df['group'] != test_errados_df['predicted']]

df_errados = pd.concat([df_errados, test_errados_df], ignore_index=False)
df_errados

Unnamed: 0_level_0,group,predicted
subject,Unnamed: 1_level_1,Unnamed: 2_level_1
21103,ex-paramilitaries,victims
22112,controls,victims
21115,ex-guerrillas,controls
22108,controls,ex-paramilitaries
21127,ex-paramilitaries,ex-guerrillas
24057,victims,ex-paramilitaries
24043,victims,ex-guerrillas
23004,ex-paramilitaries,ex-guerrillas
21125,ex-guerrillas,ex-paramilitaries
23012,ex-guerrillas,ex-paramilitaries


In [43]:
datos_errados = pd.merge(df_errados, data2, how='inner', left_index=True, right_index=True)
datos_errados.rename(columns={'group_x':'group'}, inplace=True)
variables = lista_atributos
variables.append('group')
variables.append('predicted')
datos_errados = datos_errados[variables]
datos_errados

Unnamed: 0_level_0,max_degree_b1d,max_degree_b2c,max_degree_b2i,max_degree_dc,max_degree_gc,max_degree_gi,mean_eccentricity_b2i,mean_eccentricity_b2d,exposure_level,EX2_score,...,IRI_EC,RPQ AP,TD,AN,AL,mean_eccentricity_ti,IRI_PD,mean_mpli_b1c,group,predicted
subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21103,1,4,5,5,6,5,14.078125,2.625,high,8,...,16,15,0.0,0.0,6.25,13.34375,12,0.016898,ex-paramilitaries,victims
22112,0,5,7,7,4,7,15.875,-0.890625,low,0,...,15,13,0.0,0.0,9.0,13.140625,12,0.030263,controls,victims
21115,-1,5,6,7,5,6,11.9375,0.90625,high,3,...,13,13,0.0,0.5,5.25,19.703125,14,0.020584,ex-guerrillas,controls
22108,1,5,6,7,5,6,17.8125,-3.71875,high,5,...,19,14,0.0,1.5,9.5,15.0,9,0.011113,controls,ex-paramilitaries
21127,0,5,6,5,6,5,12.734375,-0.375,high,3,...,9,12,4.5,2.25,0.5,15.140625,8,0.005713,ex-paramilitaries,ex-guerrillas
24057,1,8,5,7,8,5,15.53125,-3.625,low,5,...,16,13,0.0,0.0,0.0,12.375,13,0.009817,victims,ex-paramilitaries
24043,-1,5,8,4,5,6,11.71875,2.3125,high,7,...,15,12,0.0,2.25,5.25,13.703125,15,0.028276,victims,ex-guerrillas
23004,0,4,9,5,5,6,12.609375,3.359375,high,11,...,8,12,0.0,0.0,0.0,13.734375,6,0.004649,ex-paramilitaries,ex-guerrillas
21125,-1,4,5,5,5,5,15.375,2.1875,high,12,...,16,12,2.5,0.0,6.5,15.390625,7,0.002886,ex-guerrillas,ex-paramilitaries
23012,1,5,6,7,6,6,13.140625,-0.125,high,11,...,14,12,0.0,0.0,0.0,10.265625,4,0.009909,ex-guerrillas,ex-paramilitaries


In [44]:
datos_errados.to_csv('sujetos errados clasificación conectividad grupos sin demográficas.csv', index_label='subject')

In [45]:
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=5)

le = LabelEncoder()
le.fit(y)
y_label = le.fit_transform(y)
model = pipe.fit(X_sel, y_label)

df_errados = pd.DataFrame(columns=['predicted'])
# errores dataset de entrenamiento
for i, (train_index, test_index) in enumerate(skf.split(X_sel, y_label)):
    model.fit(X_sel.iloc[train_index], y_label[train_index])
    y_est = model.predict(X_sel.iloc[test_index])
    errado = test_index[y_label[test_index] != y_est]
    y_pred = le.inverse_transform(y_est)
    y_p_df = pd.DataFrame(data=(y_pred), index=test_index, columns=['predicted'])
    errado_idx = pd.Index(errado)
    y_errados = y_p_df.loc[errado_idx].copy()
    df_errados = pd.concat([df_errados, y_errados], ignore_index=False)
y_t = y_train.reset_index().copy()
df_errados = pd.merge(y_t, df_errados, how='inner', left_index=True, right_index=True)
df_errados.set_index('subject', inplace=True)
df_errados

Unnamed: 0_level_0,group,predicted
subject,Unnamed: 1_level_1,Unnamed: 2_level_1
21152,ex-paramilitaries,victims
21103,ex-paramilitaries,ex-guerrillas
21116,ex-guerrillas,controls
21121,ex-guerrillas,victims
21113,ex-guerrillas,victims
23015,ex-paramilitaries,ex-guerrillas
22114,controls,ex-paramilitaries
24101,victims,ex-paramilitaries
22107,controls,victims
21147,ex-guerrillas,ex-paramilitaries


In [46]:
df_errados.sort_values(by='subject', ascending=True)

Unnamed: 0_level_0,group,predicted
subject,Unnamed: 1_level_1,Unnamed: 2_level_1
21102,ex-paramilitaries,ex-paramilitaries
21103,ex-paramilitaries,ex-guerrillas
21106,ex-guerrillas,ex-paramilitaries
21107,ex-guerrillas,ex-guerrillas
21113,ex-guerrillas,victims
21116,ex-guerrillas,controls
21121,ex-guerrillas,victims
21127,ex-paramilitaries,victims
21134,ex-paramilitaries,ex-guerrillas
21135,ex-paramilitaries,victims


# Prueba excluyendo EX2 Score

In [2]:
data = pd.read_csv('medidas_conectividad_globales_comportamentales_demograficos_2.csv')
data.set_index('subject', inplace=True)
data = data.drop(columns=['type','IAT_score_ind','IAT_score_level','dscore_1','dscore_2','dscore_3','dscore_5','dscore_6','dscore_7','dscore_8','modo_0','modo_1','modo_2', 'gender','laterality', 'school_years', 'age', 'EX2_score', 'exposure_level'], axis=1)

# Imputación de datos
# Datos ordinales se imputan con la mediana del grupo redondeada, datos nominales con el más frecuente del grupo
data['IRI_PT'].fillna(data.groupby('group')['IRI_PT'].transform(lambda x: np.around(x.median())), inplace=True)
data['IRI_PD'].fillna(data.groupby('group')['IRI_PD'].transform(lambda x: np.around(x.median())), inplace=True)
data['IRI_EC'].fillna(data.groupby('group')['IRI_EC'].transform(lambda x: np.around(x.median())), inplace=True)
data['IRI_FS'].fillna(data.groupby('group')['IRI_FS'].transform(lambda x: np.around(x.median())), inplace=True)
data['IMA'].fillna(data.groupby('group')['IMA'].transform(lambda x: np.around(x.median())), inplace=True)
data['RPQ AR'].fillna(data.groupby('group')['RPQ AR'].transform(lambda x: np.around(x.median())), inplace=True)
data['RPQ AP'].fillna(data.groupby('group')['RPQ AP'].transform(lambda x: np.around(x.median())), inplace=True)
data['victims_self'].fillna(method='backfill', inplace=True)

data[['IRI_PT', 'IRI_FS', 'IRI_EC', 'IRI_PD', 'IMA', 'RPQ AR', 'RPQ AP']] = data[['IRI_PT', 'IRI_FS', 'IRI_EC', 'IRI_PD', 'IMA', 'RPQ AR', 'RPQ AP']].astype('int64')

# La variable de salida son los grupos de actores
X= data.drop(['group'], axis=1).copy()
y = data['group'].copy()

# codificación variables binarias
X['victims_self'] = pd.factorize(X['victims_self'])[0]
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 91 entries, 21100 to 24101
Columns: 139 entries, diameter_ac to IH
dtypes: float64(113), int64(26)
memory usage: 99.5 KB


In [3]:
# Por selección por modelos
# SVC
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel
lsvc = LinearSVC(random_state=72).fit(X, y)
model = SelectFromModel(lsvc, threshold='2*mean', prefit=True)
X_new = model.transform(X)
features_lsvc = model.get_feature_names_out(input_features=X.columns)
features_svc = pd.DataFrame(data=np.ones_like(features_lsvc), columns=['features_lsvc'], index=features_lsvc)

# regresión logística l2
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
lr = LogisticRegression(penalty="l2", solver='saga', max_iter=10000, random_state=72).fit(X,y)
model = SelectFromModel(lr, threshold='2*mean', prefit=True)
X_new = model.transform(X)
features_lrl2 = model.get_feature_names_out(input_features=X.columns)
features_l2 = pd.DataFrame(data=np.ones_like(features_lrl2), columns=['features_lrl2'], index=features_lrl2)

# regresión logística l1
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
lr = LogisticRegression(penalty="l1", solver='saga', max_iter=10000, random_state=72).fit(X,y)
model = SelectFromModel(lr, threshold='2*mean', prefit=True)
X_new = model.transform(X)
features_lrl1 = model.get_feature_names_out(input_features=X.columns)
features_l1 = pd.DataFrame(data=np.ones_like(features_lrl1), columns=['features_lrl1'], index=features_lrl1)

# random forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
rf = RandomForestClassifier(random_state=72).fit(X,y)
model = SelectFromModel(rf, threshold='1.5*mean', prefit=True)
X_new = model.transform(X)
features_rf = model.get_feature_names_out(input_features=X.columns)
features_rfo = pd.DataFrame(data=np.ones_like(features_rf), columns=['features_rf'], index=features_rf)

# anova
from sklearn.feature_selection import f_classif
__, p_values = f_classif(X,y)
features_anova = pd.DataFrame(p_values, columns=['p_values'], index=X.columns)
features_anova = features_anova[features_anova['p_values']<0.05]
features_anova['features_an'] = 1

# información mutua
from sklearn.feature_selection import mutual_info_classif
mi = mutual_info_classif(X, y)
features_mi = pd.DataFrame(mi, columns=['mutual information'], index=X.columns)
features_mi = features_mi[features_mi['mutual information']>0.1]
features_mi['features_im'] = 1

# atributos seleccionados
features_sel = features_svc.join([features_l2, features_l1, features_rfo, features_anova, features_mi], how='outer')
features_sel.drop(['p_values','mutual information'], axis=1, inplace=True)
features_sel['total'] = features_sel.sum(axis=1)
features_sel = features_sel[features_sel['total']>=3]
lista_atributos = list(features_sel.index)
print(lista_atributos)

['max_degree_b2c', 'max_degree_b2i', 'max_degree_dd', 'max_degree_gc', 'max_degree_gi', 'mean_eccentricity_b2i', 'victims_self', 'dscore_4', 'IRI_EC', 'RPQ AP', 'TD', 'AN', 'AL', 'mean_eccentricity_b2d', 'IRI_PT', 'IRI_PD', 'mean_eccentricity_ti', 'mean_eccentricity_td', 'mean_mpli_b1c']


In [5]:
X_sel = X[lista_atributos].copy()
X_train, X_test, y_train, y_test = train_test_split(X_sel, y, test_size=0.2, random_state=77, stratify=y)
X_train.shape

(72, 19)

In [6]:
## XG Boosting
from sklearn.preprocessing import LabelEncoder
y_train_label = LabelEncoder().fit_transform(y_train)
y_test_label = LabelEncoder().fit_transform(y_test)
param_space =dict(n_estimators=range(1,100), max_depth=range(3,10), subsample=uniform(0.1,0.9), eta=uniform(0,1), colsample_bytree=uniform(0.1,0.9))
@scheduler.parallel(n_jobs=-1)
def objective(**params):
    global X_train, y_train
    model = XGBClassifier(**params)
    error= cross_val_score(estimator = model, X= X_train, y= y_train_label, scoring='accuracy', cv=5).mean()
    return error
tuner = Tuner(param_space, objective)
best_results = tuner.maximize()
print('best parameters:', best_results['best_params'])
print('best accuracy:', best_results['best_objective'])

params = best_results['best_params']
model = XGBClassifier(**params)
scores = cross_val_score(estimator = model, X= X_train, y= y_train_label, scoring='accuracy', cv=5)
print(scores.mean(), scores.std())
model.fit(X_train, y_train_label)
print(model.score(X_test, y_test_label))

  0%|          | 0/20 [00:00<?, ?it/s]

best parameters: {'colsample_bytree': 0.6910277286613105, 'eta': 0.2501420805352883, 'max_depth': 4, 'n_estimators': 73, 'subsample': 0.14212514049875993}
best accuracy: 0.6952380952380952
0.6952380952380952 0.1217782081194707
0.47368421052631576


In [7]:
# con standard scaler
param_space =dict(n_estimators=range(1,100), max_depth=range(3,10), subsample=uniform(0.1,0.9), eta=uniform(0,1), colsample_bytree=uniform(0.1,0.9))
@scheduler.parallel(n_jobs=-1)
def objective(**params):
    global X_train, y_train_label
    numeric_cols = X_train.select_dtypes(include=['float64','int64']).columns.to_list()
    preprocessor = ColumnTransformer([('scale', StandardScaler(), numeric_cols)], remainder='passthrough')
    pipe = Pipeline([('preprocessing', preprocessor),('model', XGBClassifier(**params))])
    error = cross_val_score(estimator = pipe, X= X_train, y= y_train_label, scoring='accuracy', cv=5).mean()
    return error
tuner = Tuner(param_space, objective)
best_results = tuner.maximize()
print('best parameters:', best_results['best_params'])
print('best accuracy:', best_results['best_objective'])

params = best_results['best_params']
numeric_cols = X_train.select_dtypes(include=['float64','int64']).columns.to_list()
preprocessor = ColumnTransformer([('scale', StandardScaler(), numeric_cols)], remainder='passthrough')
pipe = Pipeline([('preprocessing', preprocessor),('model', XGBClassifier(**params))])
scores = cross_val_score(estimator = pipe, X= X_train, y= y_train_label, scoring='accuracy', cv=5)
print(scores.mean(), scores.std())
pipe.fit(X_train, y_train_label)
print(pipe.score(X_test, y_test_label))

  0%|          | 0/20 [00:00<?, ?it/s]

best parameters: {'colsample_bytree': 0.6655154925003463, 'eta': 0.21399856688497443, 'max_depth': 4, 'n_estimators': 90, 'subsample': 0.15477576674534546}
best accuracy: 0.6923809523809524
0.6923809523809524 0.07851224322104744
0.5263157894736842


In [8]:
# con standard_scaler y min_max_scaler
param_space =dict(n_estimators=range(1,100), max_depth=range(3,10), subsample=uniform(0.1,0.9), eta=uniform(0,1), colsample_bytree=uniform(0.1,0.9))
@scheduler.parallel(n_jobs=-1)
def objective(**params):
    global X_train, y_train_label
    continuas_cols = X_train.select_dtypes(include=['float64']).columns.to_list()
    discretas_cols = X_train.select_dtypes(include=['int64']).columns.to_list()
    preprocessor = ColumnTransformer([('scaler', StandardScaler(), continuas_cols), ('min_max', MinMaxScaler(), discretas_cols)], remainder='passthrough')
    pipe = Pipeline([('preprocessing', preprocessor),('model', XGBClassifier(**params))])
    error = cross_val_score(estimator = pipe, X= X_train, y= y_train_label, scoring='accuracy', cv=5).mean()
    return error
tuner = Tuner(param_space, objective)
best_results = tuner.maximize()
print('best parameters:', best_results['best_params'])
print('best accuracy:', best_results['best_objective'])

params = best_results['best_params']
continuas_cols = X_train.select_dtypes(include=['float64']).columns.to_list()
discretas_cols = X_train.select_dtypes(include=['int64']).columns.to_list()
preprocessor = ColumnTransformer([('scaler', StandardScaler(), continuas_cols), ('min_max', MinMaxScaler(), discretas_cols)], remainder='passthrough')
pipe = Pipeline([('preprocessing', preprocessor),('model', XGBClassifier(**params))])
scores = cross_val_score(estimator = pipe, X= X_train, y= y_train_label, scoring='accuracy', cv=5)
print(scores.mean(), scores.std())
pipe.fit(X_train, y_train_label)
print(pipe.score(X_test, y_test_label))

  0%|          | 0/20 [00:00<?, ?it/s]

best parameters: {'colsample_bytree': 0.27552414150721194, 'eta': 0.2896856289080729, 'max_depth': 8, 'n_estimators': 96, 'subsample': 0.3863969358992285}
best accuracy: 0.678095238095238
0.678095238095238 0.15723229777896336
0.631578947368421


In [9]:
# con power transformer y min_max_scaler
param_space =dict(n_estimators=range(1,100), max_depth=range(3,10), subsample=uniform(0.1,0.9), eta=uniform(0,1), colsample_bytree=uniform(0.1,0.9))
@scheduler.parallel(n_jobs=-1)
def objective(**params):
    global X_train, y_train_label
    continuas_cols = X_train.select_dtypes(include=['float64']).columns.to_list()
    discretas_cols = X_train.select_dtypes(include=['int64']).columns.to_list()
    preprocessor = ColumnTransformer([('pt', PowerTransformer(), continuas_cols), ('min_max', MinMaxScaler(), discretas_cols)], remainder='passthrough')
    pipe = Pipeline([('preprocessing', preprocessor),('model', XGBClassifier(**params))])
    error = cross_val_score(estimator = pipe, X= X_train, y= y_train_label, scoring='accuracy', cv=5).mean()
    return error
tuner = Tuner(param_space, objective)
best_results = tuner.maximize()
print('best parameters:', best_results['best_params'])
print('best accuracy:', best_results['best_objective'])

params = best_results['best_params']
continuas_cols = X_train.select_dtypes(include=['float64']).columns.to_list()
discretas_cols = X_train.select_dtypes(include=['int64']).columns.to_list()
preprocessor = ColumnTransformer([('pt', PowerTransformer(), continuas_cols), ('min_max', MinMaxScaler(), discretas_cols)], remainder='passthrough')
pipe = Pipeline([('preprocessing', preprocessor),('model', XGBClassifier(**params))])
scores = cross_val_score(estimator = pipe, X= X_train, y= y_train_label, scoring='accuracy', cv=5)
print(scores.mean(), scores.std())
pipe.fit(X_train, y_train_label)
print(pipe.score(X_test, y_test_label))

  0%|          | 0/20 [00:00<?, ?it/s]

best parameters: {'colsample_bytree': 0.6940805893765363, 'eta': 0.21463913653819322, 'max_depth': 3, 'n_estimators': 86, 'subsample': 0.1756940154019607}
best accuracy: 0.7476190476190476
0.7476190476190476 0.08979694907674793
0.5789473684210527


In [10]:
# Análisis de relevancia
from sklearn.inspection import permutation_importance
y_label = LabelEncoder().fit_transform(y)
model = pipe.fit(X_sel, y_label)
r = permutation_importance(model, X_sel, y_label,n_repeats=10,random_state=0, scoring='accuracy')
importancia_atributos = pd.DataFrame(data=[r.importances_mean], columns=X_sel.columns, index=['mean importance']).transpose()
importancia_atributos.sort_values(by='mean importance', ascending=False)

Unnamed: 0,mean importance
victims_self,0.123077
IRI_PT,0.110989
AL,0.086813
mean_mpli_b1c,0.061538
IRI_EC,0.056044
IRI_PD,0.049451
mean_eccentricity_b2i,0.047253
mean_eccentricity_ti,0.036264
AN,0.035165
dscore_4,0.032967


In [11]:
importancia_atributos.to_csv('importancia atributos clasificación grupos conectividad sin demográficas ex2 score.csv', index_label='feature')

In [12]:
# Extracción sujetos mal clasificados
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=5)

le = LabelEncoder()
le.fit(y_train)
y_train_label = le.fit_transform(y_train)
y_test_label = le.fit_transform(y_test)
model = pipe.fit(X_train, y_train_label)

df_errados = pd.DataFrame(columns=['predicted'])
# errores dataset de entrenamiento
for i, (train_index, test_index) in enumerate(skf.split(X_train, y_train_label)):
    model.fit(X_train.iloc[train_index], y_train_label[train_index])
    y_est = model.predict(X_train.iloc[test_index])
    errado = test_index[y_train_label[test_index] != y_est]
    y_pred = le.inverse_transform(y_est)
    y_p_df = pd.DataFrame(data=(y_pred), index=test_index, columns=['predicted'])
    errado_idx = pd.Index(errado)
    y_errados = y_p_df.loc[errado_idx].copy()
    df_errados = pd.concat([df_errados, y_errados], ignore_index=False)
y_t = y_train.reset_index().copy()
df_errados = pd.merge(y_t, df_errados, how='inner', left_index=True, right_index=True)
df_errados.set_index('subject', inplace=True)

# errores dataset de prueba
y_test_pred = model.predict(X_test)
y_test_pred = le.inverse_transform(y_test_pred)
y_test_pred_df = pd.DataFrame(data=y_test_pred, index=y_test.index, columns=['predicted'])
test_errados_df = pd.merge(y_test, y_test_pred_df, left_index=True, right_index=True)
test_errados_df = test_errados_df[test_errados_df['group'] != test_errados_df['predicted']]

df_errados = pd.concat([df_errados, test_errados_df], ignore_index=False)
df_errados

Unnamed: 0_level_0,group,predicted
subject,Unnamed: 1_level_1,Unnamed: 2_level_1
21103,ex-paramilitaries,victims
22112,controls,victims
23005,ex-paramilitaries,victims
21139,ex-paramilitaries,controls
21134,ex-paramilitaries,victims
21115,ex-guerrillas,controls
22108,controls,ex-paramilitaries
21127,ex-paramilitaries,ex-guerrillas
24057,victims,ex-paramilitaries
24043,victims,ex-guerrillas


In [13]:
datos_errados = pd.merge(df_errados, data, how='inner', left_index=True, right_index=True)
datos_errados.rename(columns={'group_x':'group'}, inplace=True)
variables = lista_atributos
variables.append('group')
variables.append('predicted')
datos_errados = datos_errados[variables]
datos_errados

Unnamed: 0_level_0,max_degree_b2c,max_degree_b2i,max_degree_dd,max_degree_gc,max_degree_gi,mean_eccentricity_b2i,victims_self,dscore_4,IRI_EC,RPQ AP,...,AN,AL,mean_eccentricity_b2d,IRI_PT,IRI_PD,mean_eccentricity_ti,mean_eccentricity_td,mean_mpli_b1c,group,predicted
subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21103,4,5,-1,6,5,14.078125,yes,0.120374,16,15,...,0.0,6.25,2.625,16,12,13.34375,1.875,0.016898,ex-paramilitaries,victims
22112,5,7,1,4,7,15.875,no,0.156987,15,13,...,0.0,9.0,-0.890625,12,12,13.140625,-2.03125,0.030263,controls,victims
23005,9,8,0,4,8,13.46875,no,-0.198112,14,13,...,0.0,0.0,-0.703125,13,18,13.09375,-1.609375,0.113762,ex-paramilitaries,victims
21139,4,5,-1,5,5,14.65625,no,-0.449696,18,23,...,1.0,7.75,0.0,19,15,15.96875,-3.46875,0.002874,ex-paramilitaries,controls
21134,5,5,1,6,5,18.765625,yes,-0.09949,15,18,...,3.25,9.75,-5.578125,18,11,11.890625,6.875,0.016142,ex-paramilitaries,victims
21115,5,6,2,5,6,11.9375,no,-0.099458,13,13,...,0.5,5.25,0.90625,19,14,19.703125,-4.5,0.020584,ex-guerrillas,controls
22108,5,6,2,5,6,17.8125,no,-0.519923,19,14,...,1.5,9.5,-3.71875,19,9,15.0,-0.59375,0.011113,controls,ex-paramilitaries
21127,5,6,-2,6,5,12.734375,no,0.473664,9,12,...,2.25,0.5,-0.375,19,8,15.140625,1.8125,0.005713,ex-paramilitaries,ex-guerrillas
24057,8,5,0,8,5,15.53125,yes,-0.183536,16,13,...,0.0,0.0,-3.625,14,13,12.375,3.984375,0.009817,victims,ex-paramilitaries
24043,5,8,-2,5,6,11.71875,yes,-0.171814,15,12,...,2.25,5.25,2.3125,17,15,13.703125,1.078125,0.028276,victims,ex-guerrillas


In [14]:
datos_errados.to_csv('sujetos errados clasificación conectividad grupos sin demográficas ex2 score.csv')

# Prueba haciendo modificaciones al dataset propuestas por Natalia

In [2]:
data = pd.read_csv('medidas_conectividad_globales_comportamentales_demograficos_2.csv')
data.set_index('subject', inplace=True)
data = data.drop(columns=['group','IAT_score_ind','IAT_score_level','dscore_1','dscore_2','dscore_3','dscore_5','dscore_6','dscore_7','dscore_8','modo_0','modo_1','modo_2'], axis=1)
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 91 entries, 21100 to 24101
Columns: 146 entries, diameter_ac to IH
dtypes: float64(120), int64(21), object(5)
memory usage: 104.5+ KB


In [7]:
# cambio tipo de actor sujeto 22106
data.at[22106,'type'] = 'controls'
data.loc[22106]

In [None]:
# Imputación de datos
# Datos ordinales se imputan con la mediana del grupo redondeada, datos nominales con el más frecuente del grupo
data['IRI_PT'].fillna(data.groupby('type')['IRI_PT'].transform(lambda x: np.around(x.median())), inplace=True)
data['IRI_PD'].fillna(data.groupby('type')['IRI_PD'].transform(lambda x: np.around(x.median())), inplace=True)
data['IRI_EC'].fillna(data.groupby('type')['IRI_EC'].transform(lambda x: np.around(x.median())), inplace=True)
data['IRI_FS'].fillna(data.groupby('type')['IRI_FS'].transform(lambda x: np.around(x.median())), inplace=True)
data['IMA'].fillna(data.groupby('type')['IMA'].transform(lambda x: np.around(x.median())), inplace=True)
data['RPQ AR'].fillna(data.groupby('type')['RPQ AR'].transform(lambda x: np.around(x.median())), inplace=True)
data['RPQ AP'].fillna(data.groupby('type')['RPQ AP'].transform(lambda x: np.around(x.median())), inplace=True)
data['victims_self'].fillna(method='backfill', inplace=True)

data[['IRI_PT', 'IRI_FS', 'IRI_EC', 'IRI_PD', 'IMA', 'RPQ AR', 'RPQ AP']] = data[['IRI_PT', 'IRI_FS', 'IRI_EC', 'IRI_PD', 'IMA', 'RPQ AR', 'RPQ AP']].astype('int64')

# La variable de salida son los tipos de actores
X= data.drop(['type'], axis=1).copy()
y = data['type'].copy()

# codificación variables binarias
X['victims_self'] = pd.factorize(X['victims_self'])[0]
X['exposure_level'] = pd.factorize(X['exposure_level'])[0]
X['gender'] = pd.factorize(X['gender'])[0]
X['laterality'] = pd.factorize(X['laterality'])[0]
X.info()

In [37]:
# Selección de características
# Por selección por modelos
# SVC
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel
lsvc = LinearSVC(random_state=72).fit(X, y)
model = SelectFromModel(lsvc, threshold='2*mean', prefit=True)
X_new = model.transform(X)
features_lsvc = model.get_feature_names_out(input_features=X.columns)
features_svc = pd.DataFrame(data=np.ones_like(features_lsvc), columns=['features_lsvc'], index=features_lsvc)

# regresión logística l2
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
lr = LogisticRegression(penalty="l2", solver='saga', max_iter=10000, random_state=72).fit(X,y)
model = SelectFromModel(lr, threshold='2*mean', prefit=True)
X_new = model.transform(X)
features_lrl2 = model.get_feature_names_out(input_features=X.columns)
features_l2 = pd.DataFrame(data=np.ones_like(features_lrl2), columns=['features_lrl2'], index=features_lrl2)

# regresión logística l1
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
lr = LogisticRegression(penalty="l1", solver='saga', max_iter=10000, random_state=72).fit(X,y)
model = SelectFromModel(lr, threshold='2*mean', prefit=True)
X_new = model.transform(X)
features_lrl1 = model.get_feature_names_out(input_features=X.columns)
features_l1 = pd.DataFrame(data=np.ones_like(features_lrl1), columns=['features_lrl1'], index=features_lrl1)

# random forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
rf = RandomForestClassifier(random_state=72).fit(X,y)
model = SelectFromModel(rf, threshold='1.5*mean', prefit=True)
X_new = model.transform(X)
features_rf = model.get_feature_names_out(input_features=X.columns)
features_rfo = pd.DataFrame(data=np.ones_like(features_rf), columns=['features_rf'], index=features_rf)

# anova
from sklearn.feature_selection import f_classif
__, p_values = f_classif(X,y)
features_anova = pd.DataFrame(p_values, columns=['p_values'], index=X.columns)
features_anova = features_anova[features_anova['p_values']<0.05]
features_anova['features_an'] = 1

# información mutua
from sklearn.feature_selection import mutual_info_classif
mi = mutual_info_classif(X, y)
features_mi = pd.DataFrame(mi, columns=['mutual information'], index=X.columns)
features_mi = features_mi[features_mi['mutual information']>0.1]
features_mi['features_im'] = 1

# atributos seleccionados
features_sel = features_svc.join([features_l2, features_l1, features_rfo, features_anova, features_mi], how='outer')
features_sel.drop(['p_values','mutual information'], axis=1, inplace=True)
features_sel['total'] = features_sel.sum(axis=1)
features_sel = features_sel[features_sel['total']>=3]
lista_atributos = list(features_sel.index)
print(lista_atributos)

['max_degree_b1d', 'max_degree_b2c', 'max_degree_b2i', 'max_degree_gi', 'mean_eccentricity_ac', 'mean_eccentricity_b2i', 'mean_eccentricity_b2d', 'mean_eccentricity_gc', 'mean_eccentricity_gd', 'exposure_level', 'EX2_score', 'school_years', 'gender', 'IRI_PT', 'IRI_EC', 'RPQ AP', 'TD', 'AN', 'AL', 'mean_eccentricity_ti', 'IRI_PD', 'RPQ AR', 'victims_self']


In [38]:
X_sel = X[lista_atributos].copy()
X_train, X_test, y_train, y_test = train_test_split(X_sel, y, test_size=0.2, random_state=77, stratify=y)
X_train.shape

(72, 23)

In [17]:
## XG Boosting
from sklearn.preprocessing import LabelEncoder
y_train_label = LabelEncoder().fit_transform(y_train)
y_test_label = LabelEncoder().fit_transform(y_test)
param_space =dict(n_estimators=range(1,100), max_depth=range(3,10), subsample=uniform(0.1,0.9), eta=uniform(0,1), colsample_bytree=uniform(0.1,0.9))
@scheduler.parallel(n_jobs=-1)
def objective(**params):
    global X_train, y_train
    model = XGBClassifier(**params)
    error= cross_val_score(estimator = model, X= X_train, y= y_train_label, scoring='accuracy', cv=5).mean()
    return error
tuner = Tuner(param_space, objective)
best_results = tuner.maximize()
print('best parameters:', best_results['best_params'])
print('best accuracy:', best_results['best_objective'])

params = best_results['best_params']
model = XGBClassifier(**params)
scores = cross_val_score(estimator = model, X= X_train, y= y_train_label, scoring='accuracy', cv=5)
print(scores.mean(), scores.std())
model.fit(X_train, y_train_label)
print(model.score(X_test, y_test_label))

  0%|          | 0/20 [00:00<?, ?it/s]

best parameters: {'colsample_bytree': 0.4038166961694294, 'eta': 0.25914711587802153, 'max_depth': 9, 'n_estimators': 47, 'subsample': 0.6888406003092017}
best accuracy: 0.9152380952380952
0.9152380952380952 0.08348421714594716
0.7894736842105263


In [18]:
# con standard scaler
param_space =dict(n_estimators=range(1,100), max_depth=range(3,10), subsample=uniform(0.1,0.9), eta=uniform(0,1), colsample_bytree=uniform(0.1,0.9))
@scheduler.parallel(n_jobs=-1)
def objective(**params):
    global X_train, y_train_label
    numeric_cols = X_train.select_dtypes(include=['float64','int64']).columns.to_list()
    preprocessor = ColumnTransformer([('scale', StandardScaler(), numeric_cols)], remainder='passthrough')
    pipe = Pipeline([('preprocessing', preprocessor),('model', XGBClassifier(**params))])
    error = cross_val_score(estimator = pipe, X= X_train, y= y_train_label, scoring='accuracy', cv=5).mean()
    return error
tuner = Tuner(param_space, objective)
best_results = tuner.maximize()
print('best parameters:', best_results['best_params'])
print('best accuracy:', best_results['best_objective'])

params = best_results['best_params']
numeric_cols = X_train.select_dtypes(include=['float64','int64']).columns.to_list()
preprocessor = ColumnTransformer([('scale', StandardScaler(), numeric_cols)], remainder='passthrough')
pipe = Pipeline([('preprocessing', preprocessor),('model', XGBClassifier(**params))])
scores = cross_val_score(estimator = pipe, X= X_train, y= y_train_label, scoring='accuracy', cv=5)
print(scores.mean(), scores.std())
pipe.fit(X_train, y_train_label)
print(pipe.score(X_test, y_test_label))

  0%|          | 0/20 [00:00<?, ?it/s]

best parameters: {'colsample_bytree': 0.8589403000388333, 'eta': 0.10498347170421485, 'max_depth': 8, 'n_estimators': 52, 'subsample': 0.46627176177001706}
best accuracy: 0.9295238095238094
0.9295238095238094 0.06391604470690485
0.7894736842105263


In [19]:
# con standard_scaler y min_max_scaler
param_space =dict(n_estimators=range(1,100), max_depth=range(3,10), subsample=uniform(0.1,0.9), eta=uniform(0,1), colsample_bytree=uniform(0.1,0.9))
@scheduler.parallel(n_jobs=-1)
def objective(**params):
    global X_train, y_train_label
    continuas_cols = X_train.select_dtypes(include=['float64']).columns.to_list()
    discretas_cols = X_train.select_dtypes(include=['int64']).columns.to_list()
    preprocessor = ColumnTransformer([('scaler', StandardScaler(), continuas_cols), ('min_max', MinMaxScaler(), discretas_cols)], remainder='passthrough')
    pipe = Pipeline([('preprocessing', preprocessor),('model', XGBClassifier(**params))])
    error = cross_val_score(estimator = pipe, X= X_train, y= y_train_label, scoring='accuracy', cv=5).mean()
    return error
tuner = Tuner(param_space, objective)
best_results = tuner.maximize()
print('best parameters:', best_results['best_params'])
print('best accuracy:', best_results['best_objective'])

params = best_results['best_params']
continuas_cols = X_train.select_dtypes(include=['float64']).columns.to_list()
discretas_cols = X_train.select_dtypes(include=['int64']).columns.to_list()
preprocessor = ColumnTransformer([('scaler', StandardScaler(), continuas_cols), ('min_max', MinMaxScaler(), discretas_cols)], remainder='passthrough')
pipe = Pipeline([('preprocessing', preprocessor),('model', XGBClassifier(**params))])
scores = cross_val_score(estimator = pipe, X= X_train, y= y_train_label, scoring='accuracy', cv=5)
print(scores.mean(), scores.std())
pipe.fit(X_train, y_train_label)
print(pipe.score(X_test, y_test_label))

  0%|          | 0/20 [00:00<?, ?it/s]

best parameters: {'colsample_bytree': 0.3852270726188255, 'eta': 0.8289342011926254, 'max_depth': 4, 'n_estimators': 86, 'subsample': 0.4810409702671835}
best accuracy: 0.9047619047619048
0.9047619047619048 0.0898979012717273
0.7894736842105263


In [20]:
# con power transformer y min_max_scaler
param_space =dict(n_estimators=range(1,100), max_depth=range(3,10), subsample=uniform(0.1,0.9), eta=uniform(0,1), colsample_bytree=uniform(0.1,0.9))
@scheduler.parallel(n_jobs=-1)
def objective(**params):
    global X_train, y_train_label
    continuas_cols = X_train.select_dtypes(include=['float64']).columns.to_list()
    discretas_cols = X_train.select_dtypes(include=['int64']).columns.to_list()
    preprocessor = ColumnTransformer([('pt', PowerTransformer(), continuas_cols), ('min_max', MinMaxScaler(), discretas_cols)], remainder='passthrough')
    pipe = Pipeline([('preprocessing', preprocessor),('model', XGBClassifier(**params))])
    error = cross_val_score(estimator = pipe, X= X_train, y= y_train_label, scoring='accuracy', cv=5).mean()
    return error
tuner = Tuner(param_space, objective)
best_results = tuner.maximize()
print('best parameters:', best_results['best_params'])
print('best accuracy:', best_results['best_objective'])

params = best_results['best_params']
continuas_cols = X_train.select_dtypes(include=['float64']).columns.to_list()
discretas_cols = X_train.select_dtypes(include=['int64']).columns.to_list()
preprocessor = ColumnTransformer([('pt', PowerTransformer(), continuas_cols), ('min_max', MinMaxScaler(), discretas_cols)], remainder='passthrough')
pipe = Pipeline([('preprocessing', preprocessor),('model', XGBClassifier(**params))])
scores = cross_val_score(estimator = pipe, X= X_train, y= y_train_label, scoring='accuracy', cv=5)
print(scores.mean(), scores.std())
pipe.fit(X_train, y_train_label)
print(pipe.score(X_test, y_test_label))

  0%|          | 0/20 [00:00<?, ?it/s]

best parameters: {'colsample_bytree': 0.5040016234611927, 'eta': 0.17915533213844959, 'max_depth': 8, 'n_estimators': 45, 'subsample': 0.49924545178281066}
best accuracy: 0.9152380952380952
0.9152380952380952 0.08348421714594716
0.7894736842105263


In [21]:
# Nejor modelo
params = {'colsample_bytree': 0.8589403000388333, 'eta': 0.10498347170421485, 'max_depth': 8, 'n_estimators': 52, 'subsample': 0.46627176177001706}
numeric_cols = X_train.select_dtypes(include=['float64','int64']).columns.to_list()
preprocessor = ColumnTransformer([('scale', StandardScaler(), numeric_cols)], remainder='passthrough')
pipe = Pipeline([('preprocessing', preprocessor),('model', XGBClassifier(**params))])
scores = cross_val_score(estimator = pipe, X= X_train, y= y_train_label, scoring='accuracy', cv=5)
print(scores.mean(), scores.std())
pipe.fit(X_train, y_train_label)
print(pipe.score(X_test, y_test_label))

0.9295238095238094 0.06391604470690485
0.7894736842105263


In [22]:
# Análisis de relevancia
from sklearn.inspection import permutation_importance
y_label = LabelEncoder().fit_transform(y)
model = pipe.fit(X_sel, y_label)
r = permutation_importance(model, X_sel, y_label,n_repeats=10,random_state=0, scoring='accuracy')
importancia_atributos = pd.DataFrame(data=[r.importances_mean], columns=X_sel.columns, index=['mean importance']).transpose()
importancia_atributos.sort_values(by='mean importance', ascending=False)

Unnamed: 0,mean importance
EX2_score,0.151648
IRI_PT,0.08022
gender,0.040659
IRI_EC,0.038462
mean_eccentricity_ti,0.021978
mean_eccentricity_b2i,0.016484
school_years,0.014286
IRI_PD,0.013187
mean_eccentricity_gc,0.012088
mean_eccentricity_b2d,0.008791


In [24]:
importancia_atributos.to_csv('importancia atributos clasificación grupos conectividad Natalia.csv', index_label='feature')

In [26]:
# Extracción sujetos mal clasificados
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=5)

le = LabelEncoder()
le.fit(y_train)
y_train_label = le.fit_transform(y_train)
y_test_label = le.fit_transform(y_test)
model = pipe.fit(X_train, y_train_label)

df_errados = pd.DataFrame(columns=['predicted'])
# errores dataset de entrenamiento
for i, (train_index, test_index) in enumerate(skf.split(X_train, y_train_label)):
    model.fit(X_train.iloc[train_index], y_train_label[train_index])
    y_est = model.predict(X_train.iloc[test_index])
    errado = test_index[y_train_label[test_index] != y_est]
    y_pred = le.inverse_transform(y_est)
    y_p_df = pd.DataFrame(data=(y_pred), index=test_index, columns=['predicted'])
    errado_idx = pd.Index(errado)
    y_errados = y_p_df.loc[errado_idx].copy()
    df_errados = pd.concat([df_errados, y_errados], ignore_index=False)
y_t = y_train.reset_index().copy()
df_errados = pd.merge(y_t, df_errados, how='inner', left_index=True, right_index=True)
df_errados.set_index('subject', inplace=True)

# errores dataset de prueba
y_test_pred = model.predict(X_test)
y_test_pred = le.inverse_transform(y_test_pred)
y_test_pred_df = pd.DataFrame(data=y_test_pred, index=y_test.index, columns=['predicted'])
test_errados_df = pd.merge(y_test, y_test_pred_df, left_index=True, right_index=True)
test_errados_df = test_errados_df[test_errados_df['type'] != test_errados_df['predicted']]

df_errados = pd.concat([df_errados, test_errados_df], ignore_index=False)
df_errados

Unnamed: 0_level_0,type,predicted
subject,Unnamed: 1_level_1,Unnamed: 2_level_1
21137,ex-combatant,controls
24027,victims,ex-combatant
24003,victims,ex-combatant
21100,ex-combatant,victims
22108,controls,ex-combatant
22106,controls,victims
24045,victims,ex-combatant
22110,controls,ex-combatant


In [None]:
datos_errados = pd.merge(df_errados, data, how='inner', left_index=True, right_index=True)
datos_errados.rename(columns={'type_x':'type'}, inplace=True)
variables = lista_atributos
variables.append('type')
variables.append('predicted')
print(variables)
datos_errados = datos_errados[variables]
datos_errados.head()

In [42]:
datos_errados.to_csv('sujetos errados clasificación conectividad grupos Natalia.csv')