In [1]:
import importlib
import funciones_modelos_ML as ml
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PowerTransformer, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.inspection import permutation_importance
from mango import Tuner, scheduler, MetaTuner
from mango.domain.distribution import loguniform
from scipy.stats import uniform
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import ConfusionMatrixDisplay, classification_report
from xgboost import XGBClassifier
import shap
%matplotlib qt5

In [2]:
# Re-import the local helper module so that edits made to its source are picked up without restarting the kernel
importlib.reload(ml)

<module 'funciones_modelos_ML' from 'c:\\Users\\jhquiza\\OneDrive - Universidad de Medellin\\JupyterNotebooks\\IAT\\funciones_modelos_ML.py'>

In [3]:
# Configure global matplotlib parameters (font sizes, line width, legend placement, minor ticks, grid off)
params = {'axes.labelsize':20, 'axes.titlesize': 20, 'axes.grid':False, 'axes.grid.axis':'both', 'axes.grid.which':'major', 'font.size':20, 'legend.fontsize':20, 'lines.linewidth': 2.0 ,'legend.loc': 'best','xtick.labelsize': 20, 'xtick.minor.visible': True, 'ytick.labelsize': 20, 'ytick.minor.visible': True}
plt.rcParams.update(params)

In [4]:
# Alternative dataset: keep every variable and drop incomplete subjects instead of imputing values.
data_all = (
    pd.read_csv('medidas_conectividad_globales_comportamentales_demograficos_2.csv')
    .set_index('subject')
    # Remove the IAT / d-score / mode columns, which are not used in this analysis
    .drop(columns=['IAT_score_ind', 'IAT_score_level', 'dscore_1', 'dscore_2', 'dscore_3', 'dscore_5',
                   'dscore_6', 'dscore_7', 'dscore_8', 'modo_0', 'modo_1', 'modo_2'])
    # Discard every subject (row) that still has a missing value
    .dropna()
)
# Cast these columns to int64 so that the dtype-based column selection used later
# (float64 -> continuous, int64 -> discrete) treats them as discrete variables.
data_all = data_all.astype({columna: 'int64' for columna in
                            ['IRI_PT', 'IRI_FS', 'IRI_EC', 'IRI_PD', 'IMA', 'RPQ AR', 'RPQ AP']})

In [5]:
# Feature matrix: every column except the target ('type') and 'group'; categorical columns are one-hot encoded.
X = data_all.drop(['type', 'group'], axis=1).copy()
X = pd.get_dummies(X)
y = data_all['type'].copy()

# Hold out 20% of the subjects as a stratified test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1, stratify=y)

# Fit ONE encoder on the training labels and reuse it for the test labels. Two independently
# fitted encoders only agree when both splits contain exactly the same set of classes; with a
# shared encoder the integer codes are guaranteed to match, and an unseen test label raises
# an explicit error instead of being silently mis-coded.
label_encoder = LabelEncoder().fit(y_train)
y_train_label = label_encoder.transform(y_train)
y_test_label = label_encoder.transform(y_test)



In [6]:
# Select candidate features using only the training split (project helper).
# NOTE(review): the exact meaning of threshold='1.5*mean' and mi_threshold=0.1 is defined inside
# funciones_modelos_ML.select_features_clf — confirm there.
lista_atributos = ml.select_features_clf(X_train=X_train, y_train=y_train, threshold='1.5*mean', mi_threshold=0.1)
print(lista_atributos)

['max_degree_b2c', 'max_degree_b2i', 'mean_eccentricity_b2i', 'mean_eccentricity_b2d', 'mean_eccentricity_gc', 'mean_eccentricity_gd', 'mean_eccentricity_tc', 'mean_eccentricity_td', 'EX2_score', 'age', 'school_years', 'IRI_PT', 'IRI_EC', 'IRI_PD', 'RPQ AR', 'RPQ AP', 'AN', 'AL', 'gender_F', 'gender_M', 'exposure_level_high', 'exposure_level_low', 'victims_self_no', 'victims_self_yes', 'diameter_b2d']


In [7]:
# Number of selected features
print(len(lista_atributos))

25


In [6]:
# Hard-coded feature list; it matches the list printed by the feature-selection cell,
# presumably so that the selection step does not have to be re-run in a new session.
lista_atributos = ['max_degree_b2c', 'max_degree_b2i', 'mean_eccentricity_b2i', 'mean_eccentricity_b2d', 
                  'mean_eccentricity_gc', 'mean_eccentricity_gd', 'mean_eccentricity_tc', 'mean_eccentricity_td', 
                  'EX2_score', 'age', 'school_years', 'IRI_PT', 'IRI_EC', 'IRI_PD', 'RPQ AR', 'RPQ AP', 'AN', 'AL', 
                  'gender_F', 'gender_M', 'exposure_level_high', 'exposure_level_low', 'victims_self_no', 
                  'victims_self_yes', 'diameter_b2d']
print(len(lista_atributos))

25


In [7]:
# One-hot encode the full table and keep only the selected feature columns
data_all_cod = pd.get_dummies(data_all)
X_sel = data_all_cod[lista_atributos].copy()

# Modelo eliminando sujetos con datos faltantes y usando todas las variables

In [7]:
# Models on the raw (unpreprocessed) variables
best_results_np = ml.get_best_model_np(X_train=X_train, y_train_label=y_train_label, X_test=X_test, 
                                       y_test_label=y_test_label)
# Split the columns by dtype for preprocessing: float64 -> continuous, int64 -> discrete.
# Columns of any other dtype are passed through unchanged (remainder='passthrough').
continuas_cols = X_train.select_dtypes(include=['float64']).columns.to_list()
discretas_cols = X_train.select_dtypes(include=['int64']).columns.to_list()

# Models with StandardScaler (continuous) and MinMaxScaler (discrete)
preprocessor = ColumnTransformer([('scaler', StandardScaler(), continuas_cols), 
                                  ('min_max', MinMaxScaler(), discretas_cols)], remainder='passthrough')
model_names = ['Logistic Regression SC', 'KNN SC', 'Random Forest SC', 'Gradient Boosting SC', 'SVC SC', 'XGBoost SC']
best_results_sc = ml.get_best_model_sc(preprocessor=preprocessor, model_names=model_names, X_train=X_train, 
                                       y_train_label=y_train_label, X_test=X_test, y_test_label=y_test_label)

# Models with PowerTransformer (continuous) and MinMaxScaler (discrete)
preprocessor = ColumnTransformer([('pt', PowerTransformer(), continuas_cols), 
                                  ('min_max', MinMaxScaler(), discretas_cols)], remainder='passthrough')
model_names = ['Logistic Regression PT', 'KNN PT', 'Random Forest PT', 'Gradient Boosting PT', 'SVC PT', 'XGBoost PT']
best_results_pt = ml.get_best_model_sc(preprocessor=preprocessor, model_names=model_names, X_train=X_train, 
                                       y_train_label=y_train_label, X_test=X_test, y_test_label=y_test_label)

# Stack the three result tables into a single ranking
df_np = best_results_np.reset_index().copy()
df_sc = best_results_sc.reset_index().copy()
df_pt = best_results_pt.reset_index().copy()
df_all = pd.concat([df_np, df_sc, df_pt], axis=0, ignore_index=True)
# Rank by cross-validated score, like the other model-comparison cells of this notebook.
# Ranking on the held-out test score would turn the test set into a model-selection set.
df_all = df_all.sort_values(by='mean cv score', ascending=False)
df_all

<mango.tuner.Tuner object at 0x000001A9CA962EE0>


Best score: 0.7291950113378685: 100%|██████████| 40/40 [00:23<00:00,  1.74it/s]
Best score: 0.6814659197012138:   0%|          | 0/40 [00:00<?, ?it/s]

<mango.tuner.Tuner object at 0x000001A9CA962B80>


Best score: 0.7603469106941022: 100%|██████████| 40/40 [00:02<00:00, 17.05it/s]


<mango.tuner.Tuner object at 0x000001A9CA98F5E0>


Best score: 0.6233941997851772: 100%|██████████| 40/40 [00:15<00:00,  2.50it/s]


<mango.tuner.Tuner object at 0x000001A9CA9A2EB0>


Best score: 0.7880310018744341: 100%|██████████| 40/40 [00:32<00:00,  1.24it/s]
Best score: 0.6565290661719233:   0%|          | 0/40 [00:00<?, ?it/s]

<mango.tuner.Tuner object at 0x000001A9CA9A2640>


Best score: 0.679930426716141: 100%|██████████| 40/40 [00:03<00:00, 10.95it/s]


<mango.tuner.Tuner object at 0x000001A9CA962490>


Best score: 0.730094493937926: 100%|██████████| 40/40 [00:24<00:00,  1.67it/s]
Best score: 0.5635055170393517: 100%|██████████| 40/40 [00:14<00:00,  2.76it/s]
Best score: 0.6145711617980526: 100%|██████████| 40/40 [00:03<00:00, 10.83it/s]
Best score: 0.643923770465124: 100%|██████████| 40/40 [00:17<00:00,  2.23it/s] 
Best score: 0.7963935749738493: 100%|██████████| 40/40 [00:40<00:00,  1.01s/it]
Best score: 0.6104611844737895: 100%|██████████| 40/40 [00:05<00:00,  7.19it/s]
Best score: 0.7769877073636472: 100%|██████████| 40/40 [00:17<00:00,  2.34it/s]
Best score: 0.6021859844416235: 100%|██████████| 40/40 [00:42<00:00,  1.06s/it]
Best score: 0.6072911971806267: 100%|██████████| 40/40 [00:30<00:00,  1.31it/s]
Best score: 0.6434654818865345: 100%|██████████| 40/40 [00:42<00:00,  1.07s/it]
Best score: 0.7613736722759279: 100%|██████████| 40/40 [01:06<00:00,  1.66s/it]
Best score: 0.6411935786460333: 100%|██████████| 40/40 [00:33<00:00,  1.21it/s]
Best score: 0.7324129651860745: 100%|████

Unnamed: 0,model,hiperparameters,mean cv score,std cv score,test score
9,Gradient Boosting SC,"{'ccp_alpha': 0.0002714468917970071, 'n_estima...",0.796394,0.100576,0.722222
3,Gradient Boosting,"{'ccp_alpha': 0.00042274779881326885, 'n_estim...",0.788031,0.069265,0.722222
11,XGBoost SC,"{'colsample_bytree': 0.9856858325442092, 'eta'...",0.776988,0.126166,0.833333
15,Gradient Boosting PT,"{'ccp_alpha': 0.0007559302849244197, 'n_estima...",0.761374,0.111013,0.722222
1,KNN,"{'n_neighbors': 6, 'p': 1, 'weights': 'uniform'}",0.760347,0.063542,0.666667
17,XGBoost PT,"{'colsample_bytree': 0.7797682482504644, 'eta'...",0.732413,0.117587,0.611111
5,XGBoost,"{'colsample_bytree': 0.9225695976323615, 'eta'...",0.730094,0.093391,0.666667
0,Logistic Regression,"{'C': 0.007838720308709295, 'penalty': 'l2'}",0.729195,0.109274,0.777778
4,SVC,"{'C': 0.5363005433908413, 'degree': 3, 'gamma'...",0.67993,0.111279,0.555556
8,Random Forest SC,"{'ccp_alpha': 0.0015726770189548485, 'n_estima...",0.643924,0.117338,0.611111


In [44]:
# Persist the model-comparison table (row index not written)
df_all.to_csv('mejores_resultados_modelos_conectividad_actores_dataset_completo_f1.csv', index=False)

In [8]:
# Mejor modelo
df_all = df_all.set_index('model')
params= df_all.loc['XGBoost SC']['hiperparameters']
y = data_all['type'].copy()
model, model_fit = ml.mejor_modelo(params=params, X=X, y=y, pre_pipe='sc')

mean val score:  0.7769877073636472
std val score:  0.1261661878793255
test score:  0.8333333333333334


In [9]:
# Confusion matrix of the fitted model on the held-out test split.
# NOTE(review): display_labels must follow the integer order of the encoded labels (LabelEncoder
# sorts class names) — confirm that 'civilians' corresponds to the first encoded class.
disp = ConfusionMatrixDisplay.from_estimator(model_fit, X_test, y_test_label, 
                                             display_labels=['civilians', 'ex-combatants', 'victims'])

In [10]:
# Per-class precision / recall / F1 on the test split, as a DataFrame (one column per class or average)
report = pd.DataFrame(classification_report(y_test_label, model_fit.predict(X_test), 
                                            target_names=['civilians', 'ex-combatants', 'victims'], output_dict=True))
report

Unnamed: 0,civilians,ex-combatants,victims,accuracy,macro avg,weighted avg
precision,0.5,1.0,1.0,0.833333,0.833333,0.916667
recall,1.0,0.909091,0.5,0.833333,0.80303,0.833333
f1-score,0.666667,0.952381,0.666667,0.833333,0.761905,0.84127
support,3.0,11.0,4.0,0.833333,18.0,18.0


In [12]:
# Análisis de relevancia
y_label = LabelEncoder().fit_transform(y)
r = permutation_importance(model_fit, X, y_label,n_repeats=20,random_state=0, scoring='f1_weighted')
importancia_atributos = pd.DataFrame(data=[r.importances_mean], columns=X.columns, index=['mean importance']).transpose()
importancia_atributos = importancia_atributos.sort_values(by='mean importance', ascending=False)
importancia_atributos

(149, 20)

In [27]:
# Gráfica de importancia de atributos
temp = pd.DataFrame(data=r.importances, index=X.columns)
temp['mean'] = temp.mean(axis=1)
temp = temp.sort_values(by='mean', ascending=False)
temp = temp[temp['mean']>0]
temp = temp.drop(columns=['mean'], axis=1)
temp = temp.T
fig = sns.boxplot(data=temp, orient='h')

Unnamed: 0,mean importance
EX2_score,0.118246
IRI_PT,0.036690
gender_F,0.025544
AL,0.022462
IRI_FS,0.010807
...,...
tree_hierarchy_ac,-0.007086
mean_eccentricity_b2d,-0.009223
max_betweenness_b1c,-0.009266
mean_mpli_ac,-0.013638


In [58]:
# Persist the feature-importance table; the index (feature names) is written under the header 'feature'
importancia_atributos.to_csv('importancia_atributos_clasificación_dataset_completo_sc_f1.csv', index_label='feature')

In [59]:
# Sujetos mal clasificados
df_errados = ml.errores(model=model, label='type', X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)
datos_errados = pd.merge(df_errados, data_all, how='inner', left_index=True, right_index=True)
datos_errados.rename(columns={'type_x':'type'}, inplace=True)
datos_errados = datos_errados.drop(['type_y'], axis=1)
datos_errados

Unnamed: 0_level_0,type,predicted,diameter_ac,diameter_ai,diameter_ad,diameter_b1c,diameter_b1i,diameter_b1d,diameter_b2c,diameter_b2i,...,IRI_FS,IRI_EC,IRI_PD,IMA,RPQ AR,RPQ AP,TD,AN,AL,IH
subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
24015,victims,ex-combatant,0.253968,0.253968,0.0,0.380952,0.238095,0.142857,0.333333,0.269841,...,14,13,14,29,14,13,1.75,3.5,8.25,2.0
24065,victims,ex-combatant,0.285714,0.285714,0.0,0.285714,0.301587,-0.015873,0.285714,0.222222,...,10,18,8,39,18,12,4.25,5.25,7.5,0.0
22106,controls,victims,0.31746,0.31746,0.0,0.31746,0.285714,0.031746,0.301587,0.365079,...,16,17,15,57,21,14,0.0,3.25,7.5,0.0
24003,victims,ex-combatant,0.253968,0.253968,0.0,0.31746,0.285714,0.031746,0.285714,0.285714,...,16,15,14,28,12,12,0.0,2.25,5.25,0.0
24014,victims,ex-combatant,0.31746,0.253968,0.063492,0.301587,0.253968,0.047619,0.301587,0.285714,...,15,19,14,43,19,12,3.75,6.75,6.5,0.0
21102,ex-combatant,controls,0.301587,0.301587,0.0,0.285714,0.285714,0.0,0.253968,0.301587,...,16,15,10,42,13,12,0.0,0.0,4.5,0.0
21100,ex-combatant,victims,0.269841,0.222222,0.047619,0.333333,0.269841,0.063492,0.285714,0.222222,...,9,11,11,27,20,16,1.5,5.75,9.0,0.5
24045,victims,ex-combatant,0.31746,0.31746,0.0,0.285714,0.31746,-0.031746,0.269841,0.206349,...,13,13,11,49,19,18,0.0,0.5,3.75,0.25
24069,victims,ex-combatant,0.253968,0.301587,-0.047619,0.380952,0.285714,0.095238,0.396825,0.301587,...,20,19,12,28,15,12,2.5,4.0,10.0,0.0
22108,controls,ex-combatant,0.285714,0.253968,0.031746,0.238095,0.269841,-0.031746,0.285714,0.349206,...,15,19,9,33,20,14,0.0,1.5,9.5,0.0


In [60]:
# Persist the misclassified-subjects table (the index is written as well)
datos_errados.to_csv('errores_clasificación_dataset_completo_f1.csv')

# Modelos con atributos seleccionados

In [9]:
# separación datos de prueba
X_train, X_test, y_train, y_test = train_test_split(X_sel, y, test_size=0.2, random_state=1, stratify=y)
y_train_label = LabelEncoder().fit_transform(y_train)
y_test_label = LabelEncoder().fit_transform(y_test)

# separación de variables para preprocesar
continuas_cols = X_train.select_dtypes(include=['float64']).columns.to_list()
discretas_cols = X_train.select_dtypes(include=['int64']).columns.to_list()

In [15]:
# Baseline: models tuned on the raw (unpreprocessed) selected features
best_results_np_sel = ml.get_best_model_np(
    X_train=X_train, y_train_label=y_train_label, X_test=X_test, y_test_label=y_test_label
)

# Variant 1: StandardScaler on continuous columns, MinMaxScaler on discrete columns
model_names = ['Logistic Regression SC', 'KNN SC', 'Random Forest SC', 'Gradient Boosting SC', 'SVC SC', 'XGBoost SC']
preprocessor = ColumnTransformer(
    transformers=[('scaler', StandardScaler(), continuas_cols), ('min_max', MinMaxScaler(), discretas_cols)],
    remainder='passthrough',
)
best_results_sc_sel = ml.get_best_model_sc(
    preprocessor=preprocessor, model_names=model_names,
    X_train=X_train, y_train_label=y_train_label, X_test=X_test, y_test_label=y_test_label,
)

# Variant 2: PowerTransformer on continuous columns, MinMaxScaler on discrete columns
model_names = ['Logistic Regression PT', 'KNN PT', 'Random Forest PT', 'Gradient Boosting PT', 'SVC PT', 'XGBoost PT']
preprocessor = ColumnTransformer(
    transformers=[('pt', PowerTransformer(), continuas_cols), ('min_max', MinMaxScaler(), discretas_cols)],
    remainder='passthrough',
)
best_results_pt_sel = ml.get_best_model_sc(
    preprocessor=preprocessor, model_names=model_names,
    X_train=X_train, y_train_label=y_train_label, X_test=X_test, y_test_label=y_test_label,
)

# Stack the three result tables and rank by cross-validated score
df_np_sel = best_results_np_sel.reset_index().copy()
df_sc_sel = best_results_sc_sel.reset_index().copy()
df_pt_sel = best_results_pt_sel.reset_index().copy()
df_sel = pd.concat(
    [df_np_sel, df_sc_sel, df_pt_sel], axis=0, ignore_index=True
).sort_values(by='mean cv score', ascending=False)
df_sel

<mango.tuner.Tuner object at 0x0000017427DB6820>


Best score: 0.7366845685642678: 100%|██████████| 40/40 [00:11<00:00,  3.36it/s]
Best score: 0.8157894736842106:  12%|█▎        | 5/40 [00:00<00:00, 42.50it/s]

<mango.tuner.Tuner object at 0x0000017427DB64F0>


Best score: 0.8157894736842106: 100%|██████████| 40/40 [00:01<00:00, 25.69it/s]


<mango.tuner.Tuner object at 0x00000174259A8E20>


Best score: 0.8044758956214066: 100%|██████████| 40/40 [00:12<00:00,  3.18it/s]


<mango.tuner.Tuner object at 0x00000174259A8D30>


Best score: 0.8387019369151169: 100%|██████████| 40/40 [00:20<00:00,  1.99it/s]
Best score: 0.7486804721888756:   0%|          | 0/40 [00:00<?, ?it/s]

<mango.tuner.Tuner object at 0x0000017425596AF0>


Best score: 0.782563025210084: 100%|██████████| 40/40 [00:03<00:00, 12.20it/s] 


<mango.tuner.Tuner object at 0x0000017425843A90>


Best score: 0.849559613319012: 100%|██████████| 40/40 [00:09<00:00,  4.08it/s] 
Best score: 0.5019816622301094: 100%|██████████| 40/40 [00:14<00:00,  2.71it/s]
Best score: 0.836546618647459: 100%|██████████| 40/40 [00:02<00:00, 13.56it/s] 
Best score: 0.7874939449463996: 100%|██████████| 40/40 [00:15<00:00,  2.57it/s]
Best score: 0.8328967376424254: 100%|██████████| 40/40 [00:18<00:00,  2.11it/s]
Best score: 0.8079197994987467: 100%|██████████| 40/40 [00:05<00:00,  7.59it/s]
Best score: 0.8489151450053706: 100%|██████████| 40/40 [00:13<00:00,  2.92it/s]
Best score: 0.5019816622301094: 100%|██████████| 40/40 [00:19<00:00,  2.09it/s]
Best score: 0.8066833751044278: 100%|██████████| 40/40 [00:04<00:00,  8.16it/s]
Best score: 0.787360277444311: 100%|██████████| 40/40 [00:16<00:00,  2.40it/s]
Best score: 0.8516271069831441: 100%|██████████| 40/40 [00:19<00:00,  2.08it/s]
Best score: 0.7799946294307196: 100%|██████████| 40/40 [00:06<00:00,  6.13it/s]
Best score: 0.8366969243837886: 100%|████

Unnamed: 0,model,hiperparameters,mean cv score,std cv score,test score
15,Gradient Boosting PT,"{'ccp_alpha': 0.0005411161255749367, 'n_estima...",0.851627,0.061333,0.833333
5,XGBoost,"{'colsample_bytree': 0.7621922701080411, 'eta'...",0.84956,0.079072,0.833333
11,XGBoost SC,"{'colsample_bytree': 0.9531706384873114, 'eta'...",0.848915,0.078492,0.777778
3,Gradient Boosting,"{'ccp_alpha': 0.0002665244718441294, 'n_estima...",0.838702,0.051155,0.833333
17,XGBoost PT,"{'colsample_bytree': 0.73845848247027, 'eta': ...",0.836697,0.050586,0.666667
7,KNN SC,"{'n_neighbors': 6, 'p': 1, 'weights': 'uniform'}",0.836547,0.054463,0.722222
9,Gradient Boosting SC,"{'ccp_alpha': 0.006543752781176673, 'n_estimat...",0.832897,0.085865,0.833333
1,KNN,"{'n_neighbors': 5, 'p': 1, 'weights': 'uniform'}",0.815789,0.090725,0.555556
10,SVC SC,"{'C': 1170.3860791037255, 'degree': 1, 'gamma'...",0.80792,0.105901,0.611111
13,KNN PT,"{'n_neighbors': 6, 'p': 1, 'weights': 'uniform'}",0.806683,0.113673,0.722222


In [16]:
# Persist the model-comparison table for the selected features (row index not written)
df_sel.to_csv('mejores_resultados_modelos_conectividad_actores_atributos_seleccionados_f1_2_1.csv', index=False)

In [18]:
# Mejor modelo
# df_sel = df_sel.set_index('model')
params= df_sel.loc['XGBoost']['hiperparameters']
y = data_all['type'].copy()
model, model_fit = ml.mejor_modelo(params=params, X=X_sel, y=y, pre_pipe='np')

mean val score:  0.849559613319012
std val score:  0.07907171078165788
test score:  0.8333333333333334


In [19]:
# Confusion matrix of the fitted model on the held-out test split (selected features).
# NOTE(review): display_labels must follow the integer order of the encoded labels — confirm
# that 'civilians' corresponds to the first encoded class.
disp = ConfusionMatrixDisplay.from_estimator(model_fit, X_test, y_test_label, 
                                             display_labels=['civilians', 'ex-combatants', 'victims'])

In [20]:
# Per-class precision / recall / F1 on the test split, as a DataFrame (one column per class or average)
report = pd.DataFrame(classification_report(y_test_label, model_fit.predict(X_test), 
                                            target_names=['civilians', 'ex-combatants', 'victims'], output_dict=True))
report

Unnamed: 0,civilians,ex-combatants,victims,accuracy,macro avg,weighted avg
precision,0.75,0.9,0.75,0.833333,0.8,0.841667
recall,1.0,0.818182,0.75,0.833333,0.856061,0.833333
f1-score,0.857143,0.857143,0.75,0.833333,0.821429,0.833333
support,3.0,11.0,4.0,0.833333,18.0,18.0


In [21]:
# Persist the classification report; the index (metric names) is written under the header 'metric'
report.to_csv('reporte_mejor_modelo_conectividad_actores_atributos_seleccionados_f1_2_1.csv', index_label='metric')

In [22]:
# Análisis de relevancia
y_label = LabelEncoder().fit_transform(y)
r = permutation_importance(model_fit, X_sel, y_label,n_repeats=50,random_state=0, scoring='f1_weighted')
importancia_atributos = pd.DataFrame(data=[r.importances_mean], columns=X_sel.columns, index=['mean importance']).transpose()
importancia_atributos.sort_values(by='mean importance', ascending=False)

Unnamed: 0,mean importance
EX2_score,0.148873
IRI_PT,0.062832
gender_F,0.04308
IRI_EC,0.016374
mean_eccentricity_b2i,0.016116
AL,0.012021
diameter_b2d,0.0096
exposure_level_high,0.009346
mean_eccentricity_gc,0.008459
victims_self_yes,0.008434


In [23]:
# Persist the feature-importance table; the index (feature names) is written under the header 'feature'
importancia_atributos.to_csv('importancia_atributos_clasificación_atributos_seleccionados_f1_2_1.csv', index_label='feature')

In [24]:
# Box plot of the per-repeat permutation importances, one horizontal box per feature;
# only features with a positive mean importance are shown, ordered by that mean.
temp = (
    pd.DataFrame(data=r.importances, index=X_sel.columns)
    .assign(mean=lambda tabla: tabla.mean(axis=1))      # helper column used for ordering/filtering
    .sort_values(by='mean', ascending=False)
    .loc[lambda tabla: tabla['mean'] > 0]
    .drop(columns=['mean'])
    .T                                                  # features as columns, repeats as rows
)
fig = sns.boxplot(data=temp, orient='h')

In [26]:
# Misclassified subjects for the selected-feature model, joined with each subject's full record
df_errados = ml.errores(model=model, label='type', X_train=X_train, y_train=y_train,
                        X_test=X_test, y_test=y_test)
datos_errados = (
    df_errados
    .merge(data_all, how='inner', left_index=True, right_index=True)
    # both tables carry a 'type' column: keep the left one under its plain name, drop the right one
    .rename(columns={'type_x': 'type'})
    .drop(columns=['type_y'])
    .sort_index()
)
datos_errados

Unnamed: 0_level_0,type,predicted,diameter_ac,diameter_ai,diameter_ad,diameter_b1c,diameter_b1i,diameter_b1d,diameter_b2c,diameter_b2i,...,IRI_FS,IRI_EC,IRI_PD,IMA,RPQ AR,RPQ AP,TD,AN,AL,IH
subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21100,ex-combatant,victims,0.269841,0.222222,0.047619,0.333333,0.269841,0.063492,0.285714,0.222222,...,9,11,11,27,20,16,1.5,5.75,9.0,0.5
21102,ex-combatant,controls,0.301587,0.301587,0.0,0.285714,0.285714,0.0,0.253968,0.301587,...,16,15,10,42,13,12,0.0,0.0,4.5,0.0
21137,ex-combatant,victims,0.380952,0.285714,0.095238,0.333333,0.269841,0.063492,0.396825,0.269841,...,5,17,5,33,17,14,0.0,2.0,8.0,0.0
22103,controls,ex-combatant,0.222222,0.285714,-0.063492,0.269841,0.380952,-0.111111,0.238095,0.285714,...,9,12,17,28,15,12,0.0,0.0,6.0,0.0
22106,controls,victims,0.31746,0.31746,0.0,0.31746,0.285714,0.031746,0.301587,0.365079,...,16,17,15,57,21,14,0.0,3.25,7.5,0.0
22108,controls,ex-combatant,0.285714,0.253968,0.031746,0.238095,0.269841,-0.031746,0.285714,0.349206,...,15,19,9,33,20,14,0.0,1.5,9.5,0.0
22114,controls,ex-combatant,0.396825,0.333333,0.063492,0.222222,0.269841,-0.047619,0.285714,0.269841,...,9,6,13,29,11,12,0.0,0.0,6.0,0.0
23005,ex-combatant,victims,0.238095,0.285714,-0.047619,0.301587,0.301587,0.0,0.269841,0.269841,...,22,14,18,36,17,13,0.0,0.0,0.0,0.0
24003,victims,ex-combatant,0.253968,0.253968,0.0,0.31746,0.285714,0.031746,0.285714,0.285714,...,16,15,14,28,12,12,0.0,2.25,5.25,0.0
24014,victims,ex-combatant,0.31746,0.253968,0.063492,0.301587,0.253968,0.047619,0.301587,0.285714,...,15,19,14,43,19,12,3.75,6.75,6.5,0.0


In [27]:
# Persist the misclassified-subjects table (the index is written as well)
datos_errados.to_csv('errores_clasificación_atributos_seleccionados_f1_2_1.csv')

# Modelos con atributos no correlacionados seleccionados

In [10]:
# Hierarchical clustering of the selected features on their Spearman rank correlations:
# left panel = dendrogram, right panel = correlation matrix reordered to follow the dendrogram leaves.
from scipy.stats import spearmanr
from scipy.cluster import hierarchy
from scipy.spatial.distance import squareform

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 8))
corr = spearmanr(X_sel).correlation

# Ensure the correlation matrix is symmetric
corr = (corr + corr.T) / 2
np.fill_diagonal(corr, 1)

# We convert the correlation matrix to a distance matrix before performing
# hierarchical clustering using Ward's linkage.
distance_matrix = 1 - np.abs(corr)
dist_linkage = hierarchy.ward(squareform(distance_matrix))
dendro = hierarchy.dendrogram(dist_linkage, labels=X_sel.columns, ax=ax1, leaf_rotation=90, leaf_font_size=18)
dendro_idx = np.arange(0, len(dendro["ivl"]))
# Reorder rows and columns of the correlation matrix by dendrogram leaf order
ax2.imshow(corr[dendro["leaves"], :][:, dendro["leaves"]])
ax2.set_xticks(dendro_idx)
ax2.set_yticks(dendro_idx)
ax2.set_xticklabels(dendro["ivl"], fontsize=18, rotation="vertical")
ax2.set_yticklabels(dendro["ivl"], fontsize=18)
fig.tight_layout()
plt.show()

In [11]:
from collections import defaultdict

# Cut the linkage tree at distance 0.5 to obtain flat clusters of correlated features
cluster_ids = hierarchy.fcluster(dist_linkage, t=0.5, criterion="distance")

# Group the positional feature indices by the cluster they belong to
cluster_id_to_feature_ids = defaultdict(list)
for idx, cluster_id in enumerate(cluster_ids):
    cluster_id_to_feature_ids[cluster_id].append(idx)

# Keep one representative per cluster: the first feature assigned to it
selected_features = [miembros[0] for miembros in cluster_id_to_feature_ids.values()]
X_train_sel = X_train.iloc[:, selected_features]
X_test_sel = X_test.iloc[:, selected_features]
X_train_sel.shape, X_test_sel.shape

((70, 16), (18, 16))

In [13]:
# Names of the features kept after the correlation-based clustering
print(X_train_sel.columns)

Index(['max_degree_b2c', 'max_degree_b2i', 'mean_eccentricity_b2i',
       'mean_eccentricity_gc', 'mean_eccentricity_tc', 'EX2_score', 'age',
       'school_years', 'IRI_PT', 'IRI_EC', 'IRI_PD', 'RPQ AR', 'AN', 'AL',
       'gender_F', 'victims_self_no'],
      dtype='object')


In [8]:
# Hard-coded list of the de-correlated features; it matches the columns printed for X_train_sel,
# presumably so that the clustering step does not have to be re-run in a new session.
features_sel_sel = ['max_degree_b2c', 'max_degree_b2i', 'mean_eccentricity_b2i',
                    'mean_eccentricity_gc', 'mean_eccentricity_tc', 'EX2_score', 'age',
                    'school_years', 'IRI_PT', 'IRI_EC', 'IRI_PD', 'RPQ AR', 'AN', 'AL', 'gender_F', 'victims_self_no']

In [9]:
# One-hot encode the full table and keep only the de-correlated feature subset
data_all_cod = pd.get_dummies(data_all)
X_sel_sel = data_all_cod[features_sel_sel].copy()
y = data_all['type'].copy()
# Train/test split (stratified 80/20, same seed as the other splits)
X_train, X_test, y_train, y_test = train_test_split(X_sel_sel, y, test_size=0.2, random_state=1, stratify=y)
# Fit ONE encoder on the training labels and reuse it for the test labels, so both splits share
# the same label -> integer mapping (two independently fitted encoders only agree by coincidence
# of both splits containing the same classes).
label_encoder = LabelEncoder().fit(y_train)
y_train_label = label_encoder.transform(y_train)
y_test_label = label_encoder.transform(y_test)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(70, 16) (18, 16) (70,) (18,)


In [16]:
# modelos con variables sin preprocesar
best_results_np_sel = ml.get_best_model_np(X_train=X_train_sel, y_train_label=y_train_label, X_test=X_test_sel, 
                                           y_test_label=y_test_label)

# modelos con standard scaler y min_max scaler
# separación de variables para preprocesar
continuas_cols = X_train_sel.select_dtypes(include=['float64']).columns.to_list()
discretas_cols = X_train_sel.select_dtypes(include=['int64']).columns.to_list()

preprocessor = ColumnTransformer([('scaler', StandardScaler(), continuas_cols), 
                                  ('min_max', MinMaxScaler(), discretas_cols)], remainder='passthrough')
model_names = ['Logistic Regression SC', 'KNN SC', 'Random Forest SC', 'Gradient Boosting SC', 'SVC SC', 'XGBoost SC']
best_results_sc_sel = ml.get_best_model_sc(preprocessor=preprocessor, model_names=model_names, X_train=X_train_sel, 
                                           y_train_label=y_train_label, X_test=X_test_sel, y_test_label=y_test_label)

# modelos con power transformer y min_max scaler
preprocessor = ColumnTransformer([('pt', PowerTransformer(), continuas_cols), 
                                  ('min_max', MinMaxScaler(), discretas_cols)], remainder='passthrough')
model_names = ['Logistic Regression PT', 'KNN PT', 'Random Forest PT', 'Gradient Boosting PT', 'SVC PT', 'XGBoost PT']
best_results_pt_sel = ml.get_best_model_sc(preprocessor=preprocessor, model_names=model_names, X_train=X_train_sel, 
                                           y_train_label=y_train_label, X_test=X_test_sel, y_test_label=y_test_label)

df_np_sel = best_results_np_sel.reset_index().copy()
df_sc_sel = best_results_sc_sel.reset_index().copy()
df_pt_sel = best_results_pt_sel.reset_index().copy()
df_sel = pd.concat([df_np_sel, df_sc_sel, df_pt_sel], axis=0, ignore_index=True)
df_sel = df_sel.sort_values(by='mean cv score', ascending=False)
df_sel

  0%|          | 0/40 [00:00<?, ?it/s]

<mango.tuner.Tuner object at 0x0000013D7AB86A90>


Best score: 0.7800630252100842: 100%|██████████| 40/40 [00:10<00:00,  3.69it/s]
Best score: 0.784625850340136:   0%|          | 0/40 [00:00<?, ?it/s] 

<mango.tuner.Tuner object at 0x0000013D7A798520>


Best score: 0.8130730186811567: 100%|██████████| 40/40 [00:01<00:00, 25.59it/s]


<mango.tuner.Tuner object at 0x0000013D7AE23370>


Best score: 0.835513995071713: 100%|██████████| 40/40 [00:15<00:00,  2.67it/s]


<mango.tuner.Tuner object at 0x0000013D7A678FA0>


Best score: 0.835513995071713: 100%|██████████| 40/40 [00:17<00:00,  2.24it/s] 
Best score: 0.8296428571428571:   0%|          | 0/40 [00:00<?, ?it/s]

<mango.tuner.Tuner object at 0x0000013D7AE27B20>


Best score: 0.8296428571428571: 100%|██████████| 40/40 [00:03<00:00, 12.34it/s]


<mango.tuner.Tuner object at 0x0000013D7AB94D60>


Best score: 0.848248211565328: 100%|██████████| 40/40 [00:09<00:00,  4.32it/s] 
Best score: 0.45059288537549413: 100%|██████████| 40/40 [00:11<00:00,  3.54it/s]
Best score: 0.8215804216423412: 100%|██████████| 40/40 [00:02<00:00, 14.98it/s]
Best score: 0.8259903961584634: 100%|██████████| 40/40 [00:17<00:00,  2.30it/s]
Best score: 0.8303058065331397: 100%|██████████| 40/40 [00:20<00:00,  1.97it/s]
Best score: 0.8557929838602109: 100%|██████████| 40/40 [00:05<00:00,  7.83it/s]
Best score: 0.8368745392893999: 100%|██████████| 40/40 [00:15<00:00,  2.64it/s]
Best score: 0.45059288537549413: 100%|██████████| 40/40 [00:13<00:00,  2.92it/s]
Best score: 0.7971922453191803: 100%|██████████| 40/40 [00:05<00:00,  7.46it/s]
Best score: 0.8195240201343695: 100%|██████████| 40/40 [00:18<00:00,  2.17it/s]
Best score: 0.835513995071713: 100%|██████████| 40/40 [00:25<00:00,  1.55it/s] 
Best score: 0.8935532107579874: 100%|██████████| 40/40 [00:07<00:00,  5.60it/s]
Best score: 0.8507834361814902: 100%|█

Unnamed: 0,model,hiperparameters,mean cv score,std cv score,test score
16,SVC PT,"{'C': 238.43120555440737, 'degree': 4, 'gamma'...",0.893553,0.075751,0.722222
10,SVC SC,"{'C': 1572.6196729272306, 'degree': 1, 'gamma'...",0.855793,0.067832,0.722222
17,XGBoost PT,"{'colsample_bytree': 0.5079483215981613, 'eta'...",0.850783,0.050008,0.777778
5,XGBoost,"{'colsample_bytree': 0.595037627164339, 'eta':...",0.848248,0.065245,0.777778
11,XGBoost SC,"{'colsample_bytree': 0.7998223642135491, 'eta'...",0.836875,0.083119,0.722222
2,Random Forest,"{'ccp_alpha': 0.01158420531906019, 'n_estimato...",0.835514,0.08298,0.888889
3,Gradient Boosting,"{'ccp_alpha': 0.001847112773844237, 'n_estimat...",0.835514,0.08298,0.833333
15,Gradient Boosting PT,"{'ccp_alpha': 0.0034588363885953693, 'n_estima...",0.835514,0.08298,0.777778
9,Gradient Boosting SC,"{'ccp_alpha': 0.0025336573913398923, 'n_estima...",0.830306,0.085843,0.833333
4,SVC,"{'C': 7.792283952473439, 'degree': 1, 'gamma':...",0.829643,0.095388,0.666667


In [32]:
# Persist the model-comparison table for the de-correlated features (row index not written)
df_sel.to_csv('resultados_clasificación_atributos_seleccionados_f1_2_1_0.5.csv', index=False)

# Mejores modelos

## Mejor modelo RF

In [10]:
# Mejor modelo RF
params = {'ccp_alpha': 0.00010090416322484955, 'n_estimators': 88}
model_rf = RandomForestClassifier(**params)
scores = cross_val_score(estimator = model_rf, X= X_train, y= y_train_label, scoring='f1_weighted', cv=5)
print('mean val score: ', scores.mean())
print('std val score: ', scores.std())
model_fit_rf = model_rf.fit(X_train, y_train_label)
print('test score: ',model_fit_rf.score(X_test, y_test_label))

mean val score:  0.83468910371166
std val score:  0.08592677079739121
test score:  0.9444444444444444


In [86]:
# Confusion matrix of the fitted random forest on the held-out test split.
# NOTE(review): display_labels must follow the integer order of the encoded labels — confirm
# that 'civilians' corresponds to the first encoded class.
disp = ConfusionMatrixDisplay.from_estimator(model_fit_rf, X_test, y_test_label, 
                                             display_labels=['civilians', 'ex-combatants', 'victims'])

In [87]:
# Per-class precision / recall / F1 of the random forest on the test split, as a DataFrame
report = pd.DataFrame(classification_report(y_test_label, model_fit_rf.predict(X_test), 
                                            target_names=['civilians', 'ex-combatants', 'victims'], output_dict=True))
report

Unnamed: 0,civilians,ex-combatants,victims,accuracy,macro avg,weighted avg
precision,1.0,0.916667,1.0,0.944444,0.972222,0.949074
recall,0.666667,1.0,1.0,0.944444,0.888889,0.944444
f1-score,0.8,0.956522,1.0,0.944444,0.918841,0.940097
support,3.0,11.0,4.0,0.944444,18.0,18.0


In [88]:
# Persist the classification report; the index (metric names) is written under the header 'metric'
report.to_csv('reporte_clasificación_atributos_seleccionados_f1_2_1_0.5_RF.csv', index_label='metric')

In [95]:
# Análisis de relevancia
y_label = LabelEncoder().fit_transform(y)
r = permutation_importance(model_fit_rf, X_sel_sel, y_label,n_repeats=50,random_state=0, scoring='f1_weighted')
importancia_atributos = pd.DataFrame(data=[r.importances_mean], columns=X_sel_sel.columns, 
                                     index=['mean importance']).transpose()
importancia_atributos.sort_values(by='mean importance', ascending=False)

Unnamed: 0,mean importance
EX2_score,0.117901
gender_F,0.062753
IRI_PT,0.04392
IRI_EC,0.016459
school_years,0.010424
max_degree_b2c,0.009343
age,0.008875
IRI_PD,0.008438
mean_eccentricity_b2i,0.008092
mean_eccentricity_gc,0.007668


In [97]:
# Box plot of the per-repeat permutation importances, restricted to features whose
# mean importance is positive and ordered from most to least important.
temp = pd.DataFrame(data=r.importances, index=X_sel_sel.columns)
temp = (
    temp.assign(mean=temp.mean(axis=1))      # helper column used only for ordering/filtering
    .sort_values(by='mean', ascending=False)
    .loc[lambda frame: frame['mean'] > 0]
    .drop(columns=['mean'])
    .T                                       # features become columns, i.e. one box per feature
)
# seaborn returns the Axes it drew on; the notebook keeps it under the name `fig`.
fig = sns.boxplot(data=temp, orient='h')

In [98]:
# Save the Random Forest mean importances; feature names go into a 'feature' column.
importancia_atributos.to_csv(path_or_buf='importancia_atributos_seleccionados_f1_2_1_0.5_RF.csv', index_label='feature')

In [128]:
# Subjects the Random Forest gets wrong, joined with their full records from data_all.
# ml.errores presumably returns the true 'type' and the 'predicted' class per subject —
# see funciones_modelos_ML for the exact procedure.
df_errados = ml.errores(
    model=model_rf, label='type',
    X_train=X_train, y_train=y_train,
    X_test=X_test, y_test=y_test,
)
# The merge suffixes the overlapping 'type' column: the left one (type_x) gets its
# original name back and the right one (type_y) is dropped.
datos_errados = (
    pd.merge(df_errados, data_all, how='inner', left_index=True, right_index=True)
    .rename(columns={'type_x': 'type'})
    .drop(['type_y'], axis=1)
    .sort_index()
)
datos_errados

Unnamed: 0_level_0,type,predicted,diameter_ac,diameter_ai,diameter_ad,diameter_b1c,diameter_b1i,diameter_b1d,diameter_b2c,diameter_b2i,...,IRI_FS,IRI_EC,IRI_PD,IMA,RPQ AR,RPQ AP,TD,AN,AL,IH
subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21100,ex-combatant,victims,0.269841,0.222222,0.047619,0.333333,0.269841,0.063492,0.285714,0.222222,...,9,11,11,27,20,16,1.5,5.75,9.0,0.5
21137,ex-combatant,victims,0.380952,0.285714,0.095238,0.333333,0.269841,0.063492,0.396825,0.269841,...,5,17,5,33,17,14,0.0,2.0,8.0,0.0
22100,controls,ex-combatant,0.269841,0.333333,-0.063492,0.285714,0.269841,0.015873,0.301587,0.238095,...,10,8,7,29,16,12,1.5,2.75,3.5,0.0
22103,controls,ex-combatant,0.222222,0.285714,-0.063492,0.269841,0.380952,-0.111111,0.238095,0.285714,...,9,12,17,28,15,12,0.0,0.0,6.0,0.0
22106,controls,victims,0.31746,0.31746,0.0,0.31746,0.285714,0.031746,0.301587,0.365079,...,16,17,15,57,21,14,0.0,3.25,7.5,0.0
22108,controls,ex-combatant,0.285714,0.253968,0.031746,0.238095,0.269841,-0.031746,0.285714,0.349206,...,15,19,9,33,20,14,0.0,1.5,9.5,0.0
22114,controls,ex-combatant,0.396825,0.333333,0.063492,0.222222,0.269841,-0.047619,0.285714,0.269841,...,9,6,13,29,11,12,0.0,0.0,6.0,0.0
24003,victims,ex-combatant,0.253968,0.253968,0.0,0.31746,0.285714,0.031746,0.285714,0.285714,...,16,15,14,28,12,12,0.0,2.25,5.25,0.0
24014,victims,ex-combatant,0.31746,0.253968,0.063492,0.301587,0.253968,0.047619,0.301587,0.285714,...,15,19,14,43,19,12,3.75,6.75,6.5,0.0
24027,victims,ex-combatant,0.222222,0.238095,-0.015873,0.380952,0.238095,0.142857,0.285714,0.269841,...,8,14,11,32,19,13,0.0,0.0,9.0,0.0


In [None]:
# Save the Random Forest error table; the index is written as the 'subject' column.
datos_errados.to_csv(path_or_buf='errores_clasificación_atributos_seleccionados_f1_2_1_0.5_RF.csv', index_label='subject')

## Mejor modelo gradient boosting

In [11]:
# Best Gradient Boosting model (hyperparameters presumably taken from the tuning results above).
params = {'ccp_alpha': 0.0006260658638342504, 'n_estimators': 74, 'subsample': 0.39238936278396697}
# subsample < 1 makes each boosting stage train on a random subset of rows, so without a
# seed every run gives different scores. A fixed seed makes this cell and everything
# derived from the fit reproducible; the value matches the seed passed to
# permutation_importance elsewhere in this notebook.
model_gb = GradientBoostingClassifier(**params, random_state=0)
# 5-fold cross-validation on the training split, scored with weighted F1.
scores = cross_val_score(estimator = model_gb, X= X_train, y= y_train_label, scoring='f1_weighted', cv=5)
print('mean val score: ', scores.mean())
print('std val score: ', scores.std())
# fit() returns the estimator itself, so model_fit_gb and model_gb are the same fitted object.
model_fit_gb = model_gb.fit(X_train, y_train_label)
# NOTE: .score() is mean accuracy on the held-out split, not the weighted F1 used for validation.
print('test score: ',model_fit_gb.score(X_test, y_test_label))

mean val score:  0.806420357616731
std val score:  0.08849500571859763
test score:  0.8888888888888888


In [89]:
# Confusion matrix of the fitted Gradient Boosting model on the test split, with readable tick labels.
disp = ConfusionMatrixDisplay.from_estimator(
    estimator=model_fit_gb,
    X=X_test,
    y=y_test_label,
    display_labels=['civilians', 'ex-combatants', 'victims'],
)

In [90]:
# Classification report of the Gradient Boosting model on the test split, as a DataFrame
# (metrics as rows, classes and averages as columns).
report = pd.DataFrame(
    classification_report(
        y_true=y_test_label,
        y_pred=model_fit_gb.predict(X_test),
        target_names=['civilians', 'ex-combatants', 'victims'],
        output_dict=True,
    )
)
report

Unnamed: 0,civilians,ex-combatants,victims,accuracy,macro avg,weighted avg
precision,0.5,0.9,1.0,0.833333,0.8,0.855556
recall,0.666667,0.818182,1.0,0.833333,0.828283,0.833333
f1-score,0.571429,0.857143,1.0,0.833333,0.809524,0.84127
support,3.0,11.0,4.0,0.833333,18.0,18.0


In [91]:
# Save the Gradient Boosting report; the metric names in the index go into a 'metric' column.
report.to_csv(path_or_buf='reporte_clasificación_atributos_seleccionados_f1_2_1_0.5_GB.csv', index_label='metric')

In [118]:
# Feature relevance analysis: permutation importance of the fitted Gradient Boosting
# model, scored with weighted F1 over 50 shuffles per feature.
# NOTE(review): importance is computed on X_sel_sel / y, which look like the full
# data set rather than the held-out split — confirm this is intended.
y_label = LabelEncoder().fit_transform(y)
r = permutation_importance(
    model_fit_gb,
    X_sel_sel,
    y_label,
    n_repeats=50,
    random_state=0,
    scoring='f1_weighted',
)
# One row per feature, a single column holding the mean importance.
importancia_atributos = pd.DataFrame({'mean importance': r.importances_mean}, index=X_sel_sel.columns)
importancia_atributos.sort_values(by='mean importance', ascending=False)

Unnamed: 0,mean importance
EX2_score,0.171818
IRI_PT,0.076369
gender_F,0.036466
AL,0.03393
mean_eccentricity_b2i,0.023428
IRI_EC,0.020973
victims_self_no,0.008414
age,0.007933
mean_eccentricity_gc,0.005903
AN,0.005706


In [119]:
# Box plot of the per-repeat permutation importances, restricted to features whose
# mean importance is positive and ordered from most to least important.
temp = pd.DataFrame(data=r.importances, index=X_sel_sel.columns)
temp = (
    temp.assign(mean=temp.mean(axis=1))      # helper column used only for ordering/filtering
    .sort_values(by='mean', ascending=False)
    .loc[lambda frame: frame['mean'] > 0]
    .drop(columns=['mean'])
    .T                                       # features become columns, i.e. one box per feature
)
# seaborn returns the Axes it drew on; the notebook keeps it under the name `fig`.
fig = sns.boxplot(data=temp, orient='h')

In [120]:
# Save the Gradient Boosting mean importances; feature names go into a 'feature' column.
importancia_atributos.to_csv(path_or_buf='importancia_atributos_seleccionados_f1_2_1_0.5_GB.csv', index_label='feature')

In [121]:
# Subjects the Gradient Boosting model gets wrong, joined with their full records from data_all.
# ml.errores presumably returns the true 'type' and the 'predicted' class per subject —
# see funciones_modelos_ML for the exact procedure.
df_errados = ml.errores(
    model=model_gb, label='type',
    X_train=X_train, y_train=y_train,
    X_test=X_test, y_test=y_test,
)
# The merge suffixes the overlapping 'type' column: the left one (type_x) gets its
# original name back and the right one (type_y) is dropped.
datos_errados = (
    pd.merge(df_errados, data_all, how='inner', left_index=True, right_index=True)
    .rename(columns={'type_x': 'type'})
    .drop(['type_y'], axis=1)
    .sort_index()
)
datos_errados

Unnamed: 0_level_0,type,predicted,diameter_ac,diameter_ai,diameter_ad,diameter_b1c,diameter_b1i,diameter_b1d,diameter_b2c,diameter_b2i,...,IRI_FS,IRI_EC,IRI_PD,IMA,RPQ AR,RPQ AP,TD,AN,AL,IH
subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21100,ex-combatant,victims,0.269841,0.222222,0.047619,0.333333,0.269841,0.063492,0.285714,0.222222,...,9,11,11,27,20,16,1.5,5.75,9.0,0.5
21102,ex-combatant,controls,0.301587,0.301587,0.0,0.285714,0.285714,0.0,0.253968,0.301587,...,16,15,10,42,13,12,0.0,0.0,4.5,0.0
22102,controls,victims,0.285714,0.333333,-0.047619,0.253968,0.238095,0.015873,0.31746,0.301587,...,23,19,8,37,14,12,0.0,0.75,9.25,0.0
22103,controls,ex-combatant,0.222222,0.285714,-0.063492,0.269841,0.380952,-0.111111,0.238095,0.285714,...,9,12,17,28,15,12,0.0,0.0,6.0,0.0
22106,controls,victims,0.31746,0.31746,0.0,0.31746,0.285714,0.031746,0.301587,0.365079,...,16,17,15,57,21,14,0.0,3.25,7.5,0.0
22108,controls,ex-combatant,0.285714,0.253968,0.031746,0.238095,0.269841,-0.031746,0.285714,0.349206,...,15,19,9,33,20,14,0.0,1.5,9.5,0.0
22114,controls,ex-combatant,0.396825,0.333333,0.063492,0.222222,0.269841,-0.047619,0.285714,0.269841,...,9,6,13,29,11,12,0.0,0.0,6.0,0.0
23005,ex-combatant,controls,0.238095,0.285714,-0.047619,0.301587,0.301587,0.0,0.269841,0.269841,...,22,14,18,36,17,13,0.0,0.0,0.0,0.0
24003,victims,ex-combatant,0.253968,0.253968,0.0,0.31746,0.285714,0.031746,0.285714,0.285714,...,16,15,14,28,12,12,0.0,2.25,5.25,0.0
24014,victims,ex-combatant,0.31746,0.253968,0.063492,0.301587,0.253968,0.047619,0.301587,0.285714,...,15,19,14,43,19,12,3.75,6.75,6.5,0.0


In [122]:
# Save the Gradient Boosting error table; the index is written as the 'subject' column.
datos_errados.to_csv(path_or_buf='errores_clasificación_atributos_seleccionados_f1_2_1_0.5_GB.csv', index_label='subject')

## Mejor modelo XGBoost NP

In [12]:
# Best XGBoost model on the features as they are ("NP" presumably = no preprocessing).
params = dict(
    colsample_bytree=0.7355218292441176,
    eta=0.4691773326745049,
    max_depth=5,
    n_estimators=87,
    subsample=0.7220090785891405,
)
model_xgnp = XGBClassifier(**params)
# 5-fold cross-validation on the training split, scored with weighted F1.
scores = cross_val_score(model_xgnp, X_train, y_train_label, scoring='f1_weighted', cv=5)
print('mean val score: ', scores.mean())
print('std val score: ', scores.std())
model_fit_xgnp = model_xgnp.fit(X_train, y_train_label)
# NOTE: .score() is mean accuracy on the held-out split, not the weighted F1 used for validation.
print('test score: ', model_fit_xgnp.score(X_test, y_test_label))

mean val score:  0.8367278139325907
std val score:  0.08310009570639276
test score:  0.7777777777777778


In [108]:
# Subjects the XGBoost NP model gets wrong, joined with their full records from data_all.
# ml.errores presumably returns the true 'type' and the 'predicted' class per subject —
# see funciones_modelos_ML for the exact procedure.
df_errados = ml.errores(
    model=model_xgnp, label='type',
    X_train=X_train, y_train=y_train,
    X_test=X_test, y_test=y_test,
)
# The merge suffixes the overlapping 'type' column: the left one (type_x) gets its
# original name back and the right one (type_y) is dropped.
datos_errados = (
    pd.merge(df_errados, data_all, how='inner', left_index=True, right_index=True)
    .rename(columns={'type_x': 'type'})
    .drop(['type_y'], axis=1)
    .sort_index()
)
datos_errados

Unnamed: 0_level_0,type,predicted,diameter_ac,diameter_ai,diameter_ad,diameter_b1c,diameter_b1i,diameter_b1d,diameter_b2c,diameter_b2i,...,IRI_FS,IRI_EC,IRI_PD,IMA,RPQ AR,RPQ AP,TD,AN,AL,IH
subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21100,ex-combatant,victims,0.269841,0.222222,0.047619,0.333333,0.269841,0.063492,0.285714,0.222222,...,9,11,11,27,20,16,1.5,5.75,9.0,0.5
21102,ex-combatant,controls,0.301587,0.301587,0.0,0.285714,0.285714,0.0,0.253968,0.301587,...,16,15,10,42,13,12,0.0,0.0,4.5,0.0
21114,ex-combatant,victims,0.269841,0.238095,0.031746,0.31746,0.269841,0.047619,0.301587,0.301587,...,8,16,10,31,13,13,0.0,2.25,8.25,0.75
21137,ex-combatant,controls,0.380952,0.285714,0.095238,0.333333,0.269841,0.063492,0.396825,0.269841,...,5,17,5,33,17,14,0.0,2.0,8.0,0.0
22100,controls,victims,0.269841,0.333333,-0.063492,0.285714,0.269841,0.015873,0.301587,0.238095,...,10,8,7,29,16,12,1.5,2.75,3.5,0.0
22103,controls,ex-combatant,0.222222,0.285714,-0.063492,0.269841,0.380952,-0.111111,0.238095,0.285714,...,9,12,17,28,15,12,0.0,0.0,6.0,0.0
22106,controls,victims,0.31746,0.31746,0.0,0.31746,0.285714,0.031746,0.301587,0.365079,...,16,17,15,57,21,14,0.0,3.25,7.5,0.0
22107,controls,victims,0.428571,0.301587,0.126984,0.365079,0.333333,0.031746,0.238095,0.269841,...,19,15,11,31,19,13,0.0,3.75,0.75,0.0
22108,controls,ex-combatant,0.285714,0.253968,0.031746,0.238095,0.269841,-0.031746,0.285714,0.349206,...,15,19,9,33,20,14,0.0,1.5,9.5,0.0
22114,controls,ex-combatant,0.396825,0.333333,0.063492,0.222222,0.269841,-0.047619,0.285714,0.269841,...,9,6,13,29,11,12,0.0,0.0,6.0,0.0


In [37]:
# Save the XGBoost NP error table; the index is written as the 'subject' column.
datos_errados.to_csv(path_or_buf='errores_clasificación_atributos_seleccionados_f1_2_1_0.5_XGNP.csv', index_label='subject')

## Mejor modelo XGBoost PT

In [10]:
# Best XGBoost model with the 'pt' preprocessing option, built through the project helper.
# ml.mejor_modelo prints the validation/test scores and returns the model objects;
# what exactly those objects wrap is defined in funciones_modelos_ML.
params = dict(
    colsample_bytree=0.9924499363730269,
    eta=0.46201686419017307,
    max_depth=8,
    n_estimators=57,
    subsample=0.7050140622206494,
)
model_xg, model_fit_xg = ml.mejor_modelo(params=params, X=X_sel_sel, y=y, pre_pipe='pt')

mean val score:  0.8563323224026453
std val score:  0.062010477137945164
test score:  0.8333333333333334


In [11]:
# Best XGBoost model on power-transformed features, built step by step.
params = dict(
    colsample_bytree=0.9924499363730269,
    eta=0.46201686419017307,
    max_depth=8,
    n_estimators=57,
    subsample=0.7050140622206494,
)
# float64 columns go through a PowerTransformer, int64 columns through a MinMaxScaler;
# columns of any other dtype are passed through unchanged.
continuas_cols = X_train.select_dtypes(include=['float64']).columns.tolist()
discretas_cols = X_train.select_dtypes(include=['int64']).columns.tolist()
preprocessor = ColumnTransformer(
    transformers=[
        ('pt', PowerTransformer(), continuas_cols),
        ('min_max', MinMaxScaler(), discretas_cols),
    ],
    remainder='passthrough',
)
# The preprocessor is fitted on the training split only and then applied to the test split.
X_train_pt = preprocessor.fit_transform(X_train)
X_test_pt = preprocessor.transform(X_test)
model_xgpt = XGBClassifier(**params)
# NOTE(review): the folds below reuse features transformed with parameters estimated on the
# whole training split, so validation folds are not fully unseen by the preprocessor.
scores = cross_val_score(model_xgpt, X_train_pt, y_train_label, scoring='f1_weighted', cv=5)
print('mean val score: ', scores.mean())
print('std val score: ', scores.std())
model_fit_xgpt = model_xgpt.fit(X_train_pt, y_train_label)
# NOTE: .score() is mean accuracy on the held-out split, not the weighted F1 used for validation.
print('test score: ', model_fit_xgpt.score(X_test_pt, y_test_label))

mean val score:  0.8563323224026453
std val score:  0.062010477137945164
test score:  0.8333333333333334


In [92]:
# Confusion matrix of model_fit_xg on the test split, with readable tick labels.
# NOTE(review): model_fit_xg receives the untransformed X_test; this is only correct if the
# object returned by ml.mejor_modelo applies the 'pt' preprocessing itself — confirm.
disp = ConfusionMatrixDisplay.from_estimator(
    estimator=model_fit_xg,
    X=X_test,
    y=y_test_label,
    display_labels=['civilians', 'ex-combatants', 'victims'],
)

In [93]:
# Classification report of model_fit_xg on the test split, as a DataFrame
# (metrics as rows, classes and averages as columns).
# NOTE(review): predictions are made on the untransformed X_test, and the saved output shows
# accuracy 0.7778 while the XGBoost PT cells print a test score of 0.8333 — check which
# model/feature set was actually in memory when this cell ran.
report = pd.DataFrame(
    classification_report(
        y_true=y_test_label,
        y_pred=model_fit_xg.predict(X_test),
        target_names=['civilians', 'ex-combatants', 'victims'],
        output_dict=True,
    )
)
report

Unnamed: 0,civilians,ex-combatants,victims,accuracy,macro avg,weighted avg
precision,0.666667,0.9,0.6,0.777778,0.722222,0.794444
recall,0.666667,0.818182,0.75,0.777778,0.744949,0.777778
f1-score,0.666667,0.857143,0.666667,0.777778,0.730159,0.783069
support,3.0,11.0,4.0,0.777778,18.0,18.0


In [94]:
# Save the XGBoost report; the metric names in the index go into a 'metric' column.
report.to_csv(path_or_buf='reporte_clasificación_atributos_seleccionados_f1_2_1_0.5_XG.csv', index_label='metric')

In [123]:
# Feature relevance analysis: permutation importance of model_fit_xg, scored with
# weighted F1 over 50 shuffles per feature.
# NOTE(review): the model is evaluated on the untransformed X_sel_sel; that is only valid if
# model_fit_xg applies the 'pt' preprocessing internally — confirm against ml.mejor_modelo.
y_label = LabelEncoder().fit_transform(y)
r = permutation_importance(
    model_fit_xg,
    X_sel_sel,
    y_label,
    n_repeats=50,
    random_state=0,
    scoring='f1_weighted',
)
# One row per feature, a single column holding the mean importance.
importancia_atributos = pd.DataFrame({'mean importance': r.importances_mean}, index=X_sel_sel.columns)
importancia_atributos.sort_values(by='mean importance', ascending=False)

Unnamed: 0,mean importance
EX2_score,0.129605
IRI_PT,0.069544
gender_F,0.034597
IRI_EC,0.019199
school_years,0.011344
victims_self_no,0.010728
mean_eccentricity_b2i,0.00863
age,0.006039
AL,0.005194
AN,0.005055


In [124]:
# Box plot of the per-repeat permutation importances, restricted to features whose
# mean importance is positive and ordered from most to least important.
temp = pd.DataFrame(data=r.importances, index=X_sel_sel.columns)
temp = (
    temp.assign(mean=temp.mean(axis=1))      # helper column used only for ordering/filtering
    .sort_values(by='mean', ascending=False)
    .loc[lambda frame: frame['mean'] > 0]
    .drop(columns=['mean'])
    .T                                       # features become columns, i.e. one box per feature
)
# seaborn returns the Axes it drew on; the notebook keeps it under the name `fig`.
fig = sns.boxplot(data=temp, orient='h')

In [125]:
# Save the XGBoost mean importances; feature names go into a 'feature' column.
importancia_atributos.to_csv(path_or_buf='importancia_atributos_seleccionados_f1_2_1_0.5_XG.csv', index_label='feature')

In [12]:
# Wrap the transformed arrays in DataFrames so they keep the subject index and column names.
# ColumnTransformer emits its output in transformer order — the 'pt' columns, then the
# 'min_max' columns, then the passthrough remainder in input order — not in the input
# column order. The labels are therefore built in that same order; reusing
# X_train.columns directly would mislabel columns whenever the dtypes are interleaved.
# The positional layout is left untouched because model_xgpt was fitted on it.
columnas_pt = continuas_cols + discretas_cols + [
    col for col in X_train.columns if col not in continuas_cols and col not in discretas_cols
]
X_train_pt_df = pd.DataFrame(data=X_train_pt, columns=columnas_pt, index=X_train.index)
X_test_pt_df = pd.DataFrame(data=X_test_pt, columns=columnas_pt, index=X_test.index)
X_train_pt_df.head()

Unnamed: 0_level_0,max_degree_b2c,max_degree_b2i,mean_eccentricity_b2i,mean_eccentricity_gc,mean_eccentricity_tc,EX2_score,age,school_years,IRI_PT,IRI_EC,IRI_PD,RPQ AR,AN,AL,gender_F,victims_self_no
subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
21118,2.072855,-1.008351,-0.277067,-1.054432,1.152989,0.25,0.2,0.416667,0.230769,0.666667,0.64,0.85,0.944444,0.318182,0.0,0.0
24015,0.116696,-0.998473,-0.291443,1.098738,0.854671,0.5,0.2,0.5,0.173077,0.444444,0.56,0.65,0.777778,0.136364,1.0,0.0
24039,-1.860656,-0.102147,1.9877,-0.657322,1.152989,1.0,0.4,0.25,0.576923,0.777778,0.68,1.0,0.555556,0.363636,1.0,1.0
21105,0.604309,-1.459706,1.77183,1.210443,0.610058,0.5,0.4,0.583333,0.576923,0.444444,0.8,0.85,1.0,0.681818,0.0,0.0
23021,0.962608,0.138711,0.157481,-1.054432,-1.477399,0.5,0.2,0.416667,0.346154,0.611111,0.84,0.45,0.388889,0.0,0.0,0.0


In [13]:
# Subjects the XGBoost PT model gets wrong (using the transformed feature frames),
# joined with their full records from data_all.
# ml.errores presumably returns the true 'type' and the 'predicted' class per subject —
# see funciones_modelos_ML for the exact procedure.
df_errados = ml.errores(
    model=model_xgpt, label='type',
    X_train=X_train_pt_df, y_train=y_train,
    X_test=X_test_pt_df, y_test=y_test,
)
# The merge suffixes the overlapping 'type' column: the left one (type_x) gets its
# original name back and the right one (type_y) is dropped.
datos_errados = (
    pd.merge(df_errados, data_all, how='inner', left_index=True, right_index=True)
    .rename(columns={'type_x': 'type'})
    .drop(['type_y'], axis=1)
    .sort_index()
)
datos_errados

Unnamed: 0_level_0,type,predicted,diameter_ac,diameter_ai,diameter_ad,diameter_b1c,diameter_b1i,diameter_b1d,diameter_b2c,diameter_b2i,...,IRI_FS,IRI_EC,IRI_PD,IMA,RPQ AR,RPQ AP,TD,AN,AL,IH
subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21100,ex-combatant,victims,0.269841,0.222222,0.047619,0.333333,0.269841,0.063492,0.285714,0.222222,...,9,11,11,27,20,16,1.5,5.75,9.0,0.5
21102,ex-combatant,controls,0.301587,0.301587,0.0,0.285714,0.285714,0.0,0.253968,0.301587,...,16,15,10,42,13,12,0.0,0.0,4.5,0.0
21118,ex-combatant,victims,0.253968,0.269841,-0.015873,0.269841,0.222222,0.047619,0.269841,0.396825,...,23,17,17,37,18,15,1.0,0.0,9.5,0.0
21137,ex-combatant,victims,0.380952,0.285714,0.095238,0.333333,0.269841,0.063492,0.396825,0.269841,...,5,17,5,33,17,14,0.0,2.0,8.0,0.0
22103,controls,ex-combatant,0.222222,0.285714,-0.063492,0.269841,0.380952,-0.111111,0.238095,0.285714,...,9,12,17,28,15,12,0.0,0.0,6.0,0.0
22106,controls,victims,0.31746,0.31746,0.0,0.31746,0.285714,0.031746,0.301587,0.365079,...,16,17,15,57,21,14,0.0,3.25,7.5,0.0
22108,controls,ex-combatant,0.285714,0.253968,0.031746,0.238095,0.269841,-0.031746,0.285714,0.349206,...,15,19,9,33,20,14,0.0,1.5,9.5,0.0
22114,controls,ex-combatant,0.396825,0.333333,0.063492,0.222222,0.269841,-0.047619,0.285714,0.269841,...,9,6,13,29,11,12,0.0,0.0,6.0,0.0
23005,ex-combatant,victims,0.238095,0.285714,-0.047619,0.301587,0.301587,0.0,0.269841,0.269841,...,22,14,18,36,17,13,0.0,0.0,0.0,0.0
24014,victims,ex-combatant,0.31746,0.253968,0.063492,0.301587,0.253968,0.047619,0.301587,0.285714,...,15,19,14,43,19,12,3.75,6.75,6.5,0.0


In [43]:
# Save the XGBoost PT error table; the index is written as the 'subject' column.
datos_errados.to_csv(path_or_buf='errores_clasificación_atributos_seleccionados_f1_2_1_0.5_XGPT.csv', index_label='subject')

# SHAP

## Random Forest

In [101]:
# SHAP values of the fitted Random Forest for every subject in X_sel_sel, followed by
# the summary plot of those values.
explainer = shap.TreeExplainer(model=model_fit_rf)
shap_values = explainer.shap_values(X_sel_sel)
shap.summary_plot(shap_values, features=X_sel_sel)

In [None]:
# Rename the legend entries of the current SHAP summary plot to readable group names.
# shap.summary_plot orders the legend by how much each class contributes, so the order is
# data-dependent; each existing label is translated individually instead of assuming a
# fixed order. Class indices follow the encoding used elsewhere in this notebook
# (0 = controls/civilians, 1 = ex-combatants, 2 = victims).
class_display_names = {'Class 0': 'Civilians', 'Class 1': 'Ex-combatants', 'Class 2': 'Victims'}
handles, labels = plt.gca().get_legend_handles_labels()
# Labels that are not SHAP's default "Class i" names are kept as they are.
new_legend_labels = [class_display_names.get(label, label) for label in labels]
plt.legend(handles, new_legend_labels)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.show()

In [102]:
# One DataFrame of SHAP values per class (0 = controls, 1 = ex-combatants, 2 = victims),
# indexed by subject with one column per feature.
shap_values_controls, shap_values_excombatants, shap_values_victims = (
    pd.DataFrame(shap_values[class_idx], columns=X_sel_sel.columns, index=X_sel_sel.index)
    for class_idx in range(3)
)
shap_values_victims.head()


Unnamed: 0_level_0,max_degree_b2c,max_degree_b2i,mean_eccentricity_b2i,mean_eccentricity_gc,mean_eccentricity_tc,EX2_score,age,school_years,IRI_PT,IRI_EC,IRI_PD,RPQ AR,AN,AL,gender_F,victims_self_no
subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
21100,-0.006585,0.01302,0.085489,-0.100331,0.010539,0.018337,0.025362,0.019581,0.120979,-0.051866,0.025524,0.001677,0.025716,0.082895,0.07701,-0.00096
21101,-0.005375,-0.007305,-0.050879,0.008569,-0.002232,0.008354,0.001947,0.006822,-0.060492,-0.013986,-0.031795,-0.009021,0.001811,0.055586,-0.061923,-0.016421
21102,-0.007328,0.000998,-0.018327,0.040373,-0.005852,0.002243,-0.020003,0.003147,-0.053377,-0.013778,-0.013673,-0.005429,-0.012921,-0.041417,-0.058327,-0.00676
21103,-0.011211,-0.003128,-0.012788,-0.023131,-0.004724,0.005999,-0.016518,-0.002392,0.030276,-0.010321,0.002865,0.00453,-0.009254,-0.034622,-0.084809,-0.007111
21104,-0.002359,-0.004921,-0.04459,-0.049936,-0.001435,0.004629,-0.020672,0.005226,-0.053821,0.013203,-0.022045,0.00652,0.00206,0.043941,-0.056635,-0.006868


In [80]:
# Save the per-class SHAP tables; the index is written as the 'subject' column.
shap_values_controls.to_csv(path_or_buf='shap_values_controls.csv', index_label='subject')
shap_values_excombatants.to_csv(path_or_buf='shap_values_excombatants.csv', index_label='subject')
shap_values_victims.to_csv(path_or_buf='shap_values_victims.csv', index_label='subject')

In [103]:
# SHAP contributions toward the victims class for subject 21100, largest first.
shap_21100 = shap_values_victims.loc[21100].sort_values(ascending=False)
shap_21100

IRI_PT                   0.120979
mean_eccentricity_b2i    0.085489
AL                       0.082895
gender_F                 0.077010
AN                       0.025716
IRI_PD                   0.025524
age                      0.025362
school_years             0.019581
EX2_score                0.018337
max_degree_b2i           0.013020
mean_eccentricity_tc     0.010539
RPQ AR                   0.001677
victims_self_no         -0.000960
max_degree_b2c          -0.006585
IRI_EC                  -0.051866
mean_eccentricity_gc    -0.100331
Name: 21100, dtype: float64

In [104]:
# SHAP contributions toward the controls class for subject 21102, largest first.
# NOTE(review): 21102 does not appear in the Random Forest error table above — confirm
# this subject/class pair is the one intended for the RF explanation.
shap_21102 = shap_values_controls.loc[21102].sort_values(ascending=False)
shap_21102

EX2_score                0.075481
gender_F                 0.008612
mean_eccentricity_b2i    0.003718
school_years             0.002462
age                      0.001309
max_degree_b2i           0.001083
max_degree_b2c           0.000181
IRI_PD                  -0.002774
mean_eccentricity_tc    -0.004422
AN                      -0.005753
IRI_PT                  -0.008612
IRI_EC                  -0.009152
RPQ AR                  -0.016000
AL                      -0.023167
mean_eccentricity_gc    -0.026366
victims_self_no         -0.045301
Name: 21102, dtype: float64

In [105]:
# SHAP contributions toward the controls class for subject 21137, largest first.
# NOTE(review): the Random Forest error table above lists 21137 as predicted 'victims',
# while this cell inspects the controls-class values — confirm the intended class.
shap_21137 = shap_values_controls.loc[21137].sort_values(ascending=False)
shap_21137

EX2_score                0.083546
victims_self_no          0.068548
AL                       0.047270
IRI_PD                   0.030974
school_years             0.012874
age                      0.012435
mean_eccentricity_b2i    0.005436
RPQ AR                   0.002393
max_degree_b2c           0.000824
mean_eccentricity_tc    -0.000996
AN                      -0.001553
max_degree_b2i          -0.001946
IRI_PT                  -0.005449
IRI_EC                  -0.010812
gender_F                -0.030770
mean_eccentricity_gc    -0.034202
Name: 21137, dtype: float64

In [107]:
# SHAP contributions toward the ex-combatants class for subject 22100, largest first.
shap_22100 = shap_values_excombatants.loc[22100].sort_values(ascending=False)
shap_22100

AL                       0.054028
IRI_EC                   0.034971
school_years             0.019913
max_degree_b2i           0.001700
mean_eccentricity_tc     0.000953
IRI_PD                  -0.001973
max_degree_b2c          -0.006548
RPQ AR                  -0.009028
AN                      -0.013793
age                     -0.013866
mean_eccentricity_gc    -0.020452
mean_eccentricity_b2i   -0.028521
IRI_PT                  -0.047538
gender_F                -0.062223
victims_self_no         -0.068160
EX2_score               -0.250785
Name: 22100, dtype: float64

In [108]:
# SHAP contributions toward the ex-combatants class for subject 22103, largest first.
shap_22103 = shap_values_excombatants.loc[22103].sort_values(ascending=False)
shap_22103

gender_F                 0.062141
AL                       0.047976
IRI_PT                   0.035229
IRI_EC                   0.033802
age                      0.018872
AN                       0.013904
RPQ AR                   0.013167
mean_eccentricity_b2i    0.012895
max_degree_b2c           0.008033
max_degree_b2i           0.006958
IRI_PD                  -0.004251
mean_eccentricity_tc    -0.007915
mean_eccentricity_gc    -0.008231
school_years            -0.014420
victims_self_no         -0.063922
EX2_score               -0.099650
Name: 22103, dtype: float64

In [109]:
# SHAP contributions toward the victims class for subject 22106, largest first.
shap_22106 = shap_values_victims.loc[22106].sort_values(ascending=False)
shap_22106

IRI_PT                   0.110728
gender_F                 0.066051
AL                       0.038299
AN                       0.014623
IRI_PD                   0.008902
EX2_score                0.004330
school_years            -0.000128
IRI_EC                  -0.000561
max_degree_b2i          -0.002836
max_degree_b2c          -0.006658
victims_self_no         -0.007785
RPQ AR                  -0.010979
mean_eccentricity_tc    -0.033441
age                     -0.034931
mean_eccentricity_b2i   -0.070540
mean_eccentricity_gc    -0.103687
Name: 22106, dtype: float64

In [110]:
# SHAP contributions toward the ex-combatants class for subject 22108, largest first.
shap_22108 = shap_values_excombatants.loc[22108].sort_values(ascending=False)
shap_22108

gender_F                 0.041194
IRI_PT                   0.022239
EX2_score                0.020568
mean_eccentricity_b2i    0.018674
age                      0.012692
max_degree_b2c           0.000686
mean_eccentricity_gc    -0.003743
IRI_PD                  -0.005976
AN                      -0.007500
mean_eccentricity_tc    -0.008930
max_degree_b2i          -0.011795
RPQ AR                  -0.019652
school_years            -0.083587
victims_self_no         -0.102967
AL                      -0.146130
IRI_EC                  -0.171188
Name: 22108, dtype: float64

In [111]:
# SHAP contributions toward the ex-combatants class for subject 22114, largest first.
shap_22114 = shap_values_excombatants.loc[22114].sort_values(ascending=False)
shap_22114

AL                       0.059997
gender_F                 0.051546
IRI_EC                   0.036686
IRI_PT                   0.033382
RPQ AR                   0.023344
age                      0.014951
AN                       0.012491
mean_eccentricity_tc     0.010359
mean_eccentricity_b2i    0.007007
max_degree_b2i           0.006812
max_degree_b2c           0.006626
IRI_PD                  -0.005184
mean_eccentricity_gc    -0.005235
school_years            -0.023230
victims_self_no         -0.073089
EX2_score               -0.101878
Name: 22114, dtype: float64

In [112]:
# SHAP contributions toward the ex-combatants class for subject 24003, largest first.
shap_24003 = shap_values_excombatants.loc[24003].sort_values(ascending=False)
shap_24003

AL                       0.029191
EX2_score                0.013489
victims_self_no          0.013187
RPQ AR                   0.004741
max_degree_b2c           0.002543
mean_eccentricity_b2i    0.001252
max_degree_b2i          -0.000678
IRI_EC                  -0.001315
AN                      -0.004092
mean_eccentricity_tc    -0.009990
school_years            -0.023695
IRI_PD                  -0.059408
age                     -0.063062
IRI_PT                  -0.077228
mean_eccentricity_gc    -0.083118
gender_F                -0.119047
Name: 24003, dtype: float64

In [113]:
# SHAP contributions toward the ex-combatants class for subject 24014, largest first.
shap_24014 = shap_values_excombatants.loc[24014].sort_values(ascending=False)
shap_24014

EX2_score                0.038706
victims_self_no          0.027232
age                      0.020490
mean_eccentricity_b2i    0.004321
max_degree_b2i           0.004005
max_degree_b2c           0.002326
mean_eccentricity_gc    -0.007184
AL                      -0.007618
mean_eccentricity_tc    -0.008015
RPQ AR                  -0.012488
school_years            -0.017578
AN                      -0.024431
IRI_PT                  -0.054354
IRI_PD                  -0.055405
gender_F                -0.111722
IRI_EC                  -0.152791
Name: 24014, dtype: float64

In [114]:
# SHAP contributions toward the ex-combatants class for subject 24027, largest first.
shap_24027 = shap_values_excombatants.loc[24027].sort_values(ascending=False)
shap_24027

gender_F                 0.067088
victims_self_no          0.035184
EX2_score                0.018560
AN                       0.007466
IRI_EC                  -0.000863
RPQ AR                  -0.006407
mean_eccentricity_gc    -0.007211
mean_eccentricity_b2i   -0.010137
school_years            -0.016658
IRI_PD                  -0.019309
age                     -0.023935
mean_eccentricity_tc    -0.026375
max_degree_b2i          -0.032794
max_degree_b2c          -0.079886
AL                      -0.140595
IRI_PT                  -0.152723
Name: 24027, dtype: float64

In [115]:
# SHAP contributions toward the ex-combatants class for subject 24043, largest first.
# NOTE(review): 24043 is not listed in the Random Forest error table above — confirm
# why this subject is inspected here.
shap_24043 = shap_values_excombatants.loc[24043].sort_values(ascending=False)
shap_24043

AL                       0.060269
IRI_PT                   0.044518
EX2_score                0.040391
victims_self_no          0.019194
RPQ AR                   0.012145
IRI_EC                   0.008381
max_degree_b2c           0.002885
mean_eccentricity_tc     0.001904
AN                      -0.004426
mean_eccentricity_gc    -0.007375
mean_eccentricity_b2i   -0.010811
school_years            -0.021699
max_degree_b2i          -0.033751
IRI_PD                  -0.036465
age                     -0.043741
gender_F                -0.090469
Name: 24043, dtype: float64

In [116]:
# SHAP contributions toward the ex-combatants class for subject 24045, largest first.
# NOTE(review): 24045 is not listed in the Random Forest error table above — confirm
# why this subject is inspected here.
shap_24045 = shap_values_excombatants.loc[24045].sort_values(ascending=False)
shap_24045

AL                       0.054165
gender_F                 0.036617
EX2_score                0.020377
victims_self_no          0.011943
school_years             0.010906
IRI_EC                   0.008540
max_degree_b2i           0.004227
AN                       0.003340
RPQ AR                  -0.008676
mean_eccentricity_gc    -0.009382
IRI_PD                  -0.011039
mean_eccentricity_tc    -0.038024
age                     -0.061066
mean_eccentricity_b2i   -0.085316
max_degree_b2c          -0.086496
IRI_PT                  -0.193258
Name: 24045, dtype: float64

In [117]:
# SHAP contributions toward the ex-combatants class for subject 24069, largest first.
# NOTE(review): 24069 is not listed in the Random Forest error table above — confirm
# why this subject is inspected here.
shap_24069 = shap_values_excombatants.loc[24069].sort_values(ascending=False)
shap_24069

IRI_PT                   0.051405
EX2_score                0.011543
mean_eccentricity_b2i    0.005160
RPQ AR                   0.002176
max_degree_b2c          -0.000664
mean_eccentricity_tc    -0.001691
max_degree_b2i          -0.004615
IRI_PD                  -0.010530
AN                      -0.014954
mean_eccentricity_gc    -0.021675
age                     -0.022518
school_years            -0.032368
victims_self_no         -0.074277
gender_F                -0.078991
IRI_EC                  -0.150687
AL                      -0.159546
Name: 24069, dtype: float64

## XGBoost NP

In [117]:
# SHAP values of the fitted XGBoost NP model for every subject in X_sel_sel, followed by
# the summary plot of those values. This overwrites the `explainer` / `shap_values`
# computed for the Random Forest.
explainer = shap.TreeExplainer(model=model_fit_xgnp)
shap_values = explainer.shap_values(X_sel_sel)
shap.summary_plot(shap_values, features=X_sel_sel)

In [102]:
# Preview the first five rows of the full data table.
data_all.head(n=5)

Unnamed: 0_level_0,diameter_ac,diameter_ai,diameter_ad,diameter_b1c,diameter_b1i,diameter_b1d,diameter_b2c,diameter_b2i,diameter_b2d,diameter_dc,...,IRI_FS,IRI_EC,IRI_PD,IMA,RPQ AR,RPQ AP,TD,AN,AL,IH
subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21100,0.269841,0.222222,0.047619,0.333333,0.269841,0.063492,0.285714,0.222222,0.063492,0.365079,...,9,11,11,27,20,16,1.5,5.75,9.0,0.5
21101,0.222222,0.269841,-0.047619,0.269841,0.365079,-0.095238,0.333333,0.365079,-0.031746,0.31746,...,22,13,6,70,33,26,0.0,1.25,9.5,0.5
21102,0.301587,0.301587,0.0,0.285714,0.285714,0.0,0.253968,0.301587,-0.047619,0.380952,...,16,15,10,42,13,12,0.0,0.0,4.5,0.0
21103,0.333333,0.396825,-0.063492,0.269841,0.285714,-0.015873,0.349206,0.285714,0.063492,0.333333,...,19,16,12,32,16,15,0.0,0.0,6.25,0.0
21104,0.238095,0.238095,0.0,0.31746,0.396825,-0.079365,0.269841,0.380952,-0.111111,0.285714,...,15,18,8,30,16,17,2.0,1.5,7.5,0.0


In [64]:
# Relabel the legend of the current axes (assumed to be the SHAP summary plot
# drawn by the preceding cell) with the group names.
# SHAP names multiclass legend entries "Class <i>" and orders them by mean |SHAP|,
# so the order is data-dependent. Mapping by label instead of by position keeps
# each colour attached to the right group.
# Class indices follow this notebook's convention: 0 = controls/civilians,
# 1 = ex-combatants, 2 = victims.
class_display_names = {
    'Class 0': 'Civilians',
    'Class 1': 'Ex-combatants',
    'Class 2': 'Victims',
}
handles, labels = plt.gca().get_legend_handles_labels()
# Unknown labels are kept unchanged rather than being silently mislabelled.
new_legend_labels = [class_display_names.get(label, label) for label in labels]
plt.legend(handles, new_legend_labels)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.show()

The PostScript backend does not support transparency; partially transparent artists will be rendered opaque.


In [110]:
# Wrap each per-class SHAP array in a DataFrame labelled with the rows and
# columns of `X_sel_sel`. Entries 0, 1 and 2 of `shap_values` go to the
# controls, ex-combatants and victims frames respectively.
(shap_values_controls_xgnp,
 shap_values_excombatants_xgnp,
 shap_values_victims_xgnp) = (
    pd.DataFrame(shap_values[class_idx], columns=X_sel_sel.columns, index=X_sel_sel.index)
    for class_idx in (0, 1, 2)
)
shap_values_victims_xgnp.head()

Unnamed: 0_level_0,max_degree_b2c,max_degree_b2i,mean_eccentricity_b2i,mean_eccentricity_gc,mean_eccentricity_tc,EX2_score,age,school_years,IRI_PT,IRI_EC,IRI_PD,RPQ AR,AN,AL,gender_F,victims_self_no
subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
21100,-0.100669,0.0,0.304575,-0.869069,0.206855,-0.056525,-0.266199,0.0,0.536687,-0.591859,0.086399,0.096935,0.147204,0.205284,0.881012,0.0
21101,-0.070745,0.0,-0.338173,0.425068,0.01351,0.053163,0.193423,0.0,-0.735385,-0.058667,-0.252578,0.050599,-0.020604,0.350321,-1.113127,0.0
21102,-0.070745,0.0,-0.398097,0.416767,-0.204477,-0.006731,-0.30884,0.0,-0.486882,-0.019994,-0.194438,-0.064868,-0.052713,-0.156378,-1.109069,0.0
21103,-0.100669,0.0,-0.371471,-0.593914,-0.348216,-0.001265,0.120835,0.0,0.261813,0.076491,0.07631,0.013877,-0.046523,-0.178524,-1.035945,0.0
21104,0.080298,0.0,-0.374202,-0.514144,0.010372,-0.030626,-0.181341,0.0,-0.729959,0.164517,-0.210258,0.013877,-0.020604,-0.10137,-1.051696,0.0


In [111]:
# Total SHAP contribution per subject for each class (row-wise sum over the
# feature columns).
# The totals are built as standalone Series instead of being written back into
# the per-class frames. This keeps the cell idempotent (a re-run cannot fold an
# old total into the new one) and keeps the `sum_*` columns out of the later
# CSV export and the per-subject bar plot, where they would appear as a feature.
shap_values_errors = pd.concat(
    [
        shap_values_controls_xgnp.sum(axis=1).rename('sum_civilians'),
        shap_values_excombatants_xgnp.sum(axis=1).rename('sum_excombatants'),
        shap_values_victims_xgnp.sum(axis=1).rename('sum_victims'),
    ],
    axis=1,
)
# Keep only the subjects present in `datos_errados` (defined in an earlier cell;
# presumably the misclassified subjects -- TODO confirm).
shap_values_errors = shap_values_errors.loc[datos_errados.index, :]
shap_values_errors

Unnamed: 0_level_0,sum_civilians,sum_excombatants,sum_victims
subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
21100,-0.608702,-3.459136,0.580631
21102,-1.163859,1.5128,-2.656465
21114,-2.050569,-1.074434,0.394234
21137,1.896574,-2.085007,0.422361
22100,2.26217,-2.708965,-0.539031
22103,-0.087473,-0.089853,-2.027231
22106,1.579729,-3.514569,-0.624073
22107,0.782667,-3.919968,2.584012
22108,1.531988,-1.822402,-1.806883
22114,1.017761,0.656038,-2.609826


In [112]:
# Persist the per-class SHAP totals; the index is written under a 'subject' header.
shap_values_errors.to_csv(path_or_buf='shap_values_errors.csv', index_label='subject')

In [66]:
# Write each per-class SHAP frame to its own CSV, with the index stored under
# a 'subject' header.
for csv_name, shap_frame in (
    ('shap_values_controls_xgnp.csv', shap_values_controls_xgnp),
    ('shap_values_excombatants_xgnp.csv', shap_values_excombatants_xgnp),
    ('shap_values_victims_xgnp.csv', shap_values_victims_xgnp),
):
    shap_frame.to_csv(csv_name, index_label='subject')

In [100]:
# SHAP contributions for subject 24069 from `shap_values_excombatants_xgnp`,
# sorted from the most positive to the most negative.
shap_error = shap_values_excombatants_xgnp.loc[24069, :]
shap_error = shap_error.sort_values(ascending=False)
# Draw on a fresh figure. Without an explicit `ax`, seaborn plots on the current
# axes, which with an interactive backend can still be a plot left open by an
# earlier cell (e.g. the SHAP summary plot), so the bars would be overlaid on it.
error_fig, error_ax = plt.subplots()
# NOTE: `fig` still holds the Axes returned by seaborn (not a Figure), as before.
fig = sns.barplot(x=shap_error.values, y=shap_error.index, palette='bwr', ax=error_ax)
print(shap_error)

IRI_PT                   0.864156
RPQ AR                   0.215063
mean_eccentricity_b2i    0.157893
age                      0.140085
mean_eccentricity_tc     0.135861
IRI_PD                   0.001171
max_degree_b2c          -0.011413
mean_eccentricity_gc    -0.065486
school_years            -0.222207
max_degree_b2i          -0.243702
AN                      -0.310913
EX2_score               -0.374753
victims_self_no         -0.535600
gender_F                -0.581674
AL                      -0.767136
IRI_EC                  -1.031655
Name: 24069, dtype: float32


## XGBoosting PT

In [14]:
# Stack the train and test frames back into one matrix ordered by index, then
# explain the fitted `model_fit_xgpt` on it and draw the SHAP summary plot.
# `explainer` and `shap_values` are reused by later cells.
X_sel_pt = pd.concat([X_train_pt_df, X_test_pt_df], axis=0).sort_index()
explainer = shap.TreeExplainer(model=model_fit_xgpt)
shap_values = explainer.shap_values(X=X_sel_pt)
shap.summary_plot(shap_values, features=X_sel_pt)

In [59]:
# Relabel the legend of the current axes (assumed to be the SHAP summary plot
# drawn by the preceding cell) with the group names.
# SHAP names multiclass legend entries "Class <i>" and orders them by mean |SHAP|,
# so the order can differ from one model to another. Mapping by label instead
# of by position keeps each colour attached to the right group.
# Class indices follow this notebook's convention: 0 = controls/civilians,
# 1 = ex-combatants, 2 = victims.
class_display_names = {
    'Class 0': 'Civilians',
    'Class 1': 'Ex-combatants',
    'Class 2': 'Victims',
}
handles, labels = plt.gca().get_legend_handles_labels()
# Unknown labels are kept unchanged rather than being silently mislabelled.
new_legend_labels = [class_display_names.get(label, label) for label in labels]
plt.legend(handles, new_legend_labels)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.show()

The PostScript backend does not support transparency; partially transparent artists will be rendered opaque.


In [17]:
# Wrap each per-class SHAP array in a DataFrame. Entries 0, 1 and 2 of
# `shap_values` go to the controls, ex-combatants and victims frames.
# The labels must come from `X_sel_pt`, the matrix these SHAP values were
# computed on. Labelling with `X_sel_sel` (the matrix of the other model) would
# attach values to the wrong subject or feature whenever its row or column
# order differs.
shap_values_controls_xgpt = pd.DataFrame(shap_values[0], columns=X_sel_pt.columns, index=X_sel_pt.index)
shap_values_excombatants_xgpt = pd.DataFrame(shap_values[1], columns=X_sel_pt.columns, index=X_sel_pt.index)
shap_values_victims_xgpt = pd.DataFrame(shap_values[2], columns=X_sel_pt.columns, index=X_sel_pt.index)
shap_values_victims_xgpt.head()

Unnamed: 0_level_0,max_degree_b2c,max_degree_b2i,mean_eccentricity_b2i,mean_eccentricity_gc,mean_eccentricity_tc,EX2_score,age,school_years,IRI_PT,IRI_EC,IRI_PD,RPQ AR,AN,AL,gender_F,victims_self_no
subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
21100,0.618714,-0.76069,0.246692,0.174152,0.28067,-0.04155,0.154856,0.047073,-0.387526,0.041343,0.347672,-0.599455,0.003492,0.0,0.66893,0.0
21101,-0.503276,0.331122,-0.039062,-0.094043,0.214395,-0.04155,-0.218305,0.010517,0.409565,0.041343,-0.686948,-0.043697,-0.340644,0.0,-0.80384,0.0
21102,-0.326011,0.399415,-0.127808,-0.094043,-0.292837,-0.04155,0.113649,-0.014365,-0.354943,0.041343,-0.543542,0.119099,-0.340644,0.0,-0.834749,0.0
21103,-0.243454,-0.706112,-0.205437,-0.072381,-0.246504,-0.04155,-0.217125,0.047073,-0.33313,0.041343,0.503079,0.186497,0.131276,0.0,-0.697889,0.0
21104,-0.503276,-0.418433,-0.021205,-0.094043,-0.051113,0.047205,-0.218305,0.047073,-0.351645,0.041343,-0.508172,0.108354,-0.23064,0.0,-0.734621,0.0


In [18]:
# Write each per-class SHAP frame to its own CSV, with the index stored under
# a 'subject' header.
for csv_name, shap_frame in (
    ('shap_values_controls_xgpt.csv', shap_values_controls_xgpt),
    ('shap_values_excombatants_xgpt.csv', shap_values_excombatants_xgpt),
    ('shap_values_victims_xgpt.csv', shap_values_victims_xgpt),
):
    shap_frame.to_csv(csv_name, index_label='subject')

In [20]:
# Add a per-subject SHAP total (row-wise sum over the feature columns) to each
# per-class frame, then collect the three totals side by side.
for total_name, shap_frame in (
    ('sum_civilians', shap_values_controls_xgpt),
    ('sum_excombatants', shap_values_excombatants_xgpt),
    ('sum_victims', shap_values_victims_xgpt),
):
    # Exclude a total left over from a previous run of this cell; otherwise a
    # re-run would add the old total into the new one.
    shap_frame[total_name] = shap_frame.drop(columns=total_name, errors='ignore').sum(axis=1)
shap_values_errors = pd.concat([shap_values_controls_xgpt['sum_civilians'],
                                shap_values_excombatants_xgpt['sum_excombatants'],
                                shap_values_victims_xgpt['sum_victims']], axis=1)
# Keep only the subjects present in `datos_errados` (defined in an earlier cell;
# presumably the misclassified subjects -- TODO confirm).
shap_values_errors = shap_values_errors.loc[datos_errados.index, :]
shap_values_errors

Unnamed: 0_level_0,sum_civilians,sum_excombatants,sum_victims
subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
21100,-1.010216,-3.042165,0.794373
21102,-0.741568,1.883578,-2.296986
21118,0.065615,1.75616,-0.059929
21137,0.094809,-2.206912,0.138747
22103,0.027208,0.046877,-1.958764
22106,1.658798,-3.390066,-1.070342
22108,1.199577,-2.124701,-2.201005
22114,0.427119,1.352212,-2.13625
23005,-0.257836,-1.209573,-0.060678
24014,-2.398983,-1.525929,2.122966


In [22]:
# Remove the helper total columns so the per-class frames hold feature
# contributions only again. `errors='ignore'` makes the cell safe to re-run:
# without it a second execution raises KeyError because the columns are gone.
shap_values_victims_xgpt.drop(columns='sum_victims', inplace=True, errors='ignore')
shap_values_controls_xgpt.drop(columns='sum_civilians', inplace=True, errors='ignore')
shap_values_excombatants_xgpt.drop(columns='sum_excombatants', inplace=True, errors='ignore')

In [42]:
# SHAP contributions for subject 24045 from `shap_values_excombatants_xgpt`,
# sorted from the most positive to the most negative.
shap_error = shap_values_excombatants_xgpt.loc[24045, :]
shap_error = shap_error.sort_values(ascending=False)
# Draw on a fresh figure. Without an explicit `ax`, seaborn plots on the current
# axes, which with an interactive backend can still be a plot left open by an
# earlier cell (e.g. the SHAP summary plot), so the bars would be overlaid on it.
error_fig, error_ax = plt.subplots()
# NOTE: `fig` still holds the Axes returned by seaborn (not a Figure), as before.
fig = sns.barplot(x=shap_error.values, y=shap_error.index, palette='bwr', ax=error_ax)
print(shap_error)

mean_eccentricity_gc     0.354350
mean_eccentricity_tc     0.337665
IRI_EC                   0.323461
victims_self_no          0.246268
gender_F                 0.221383
IRI_PT                   0.163894
RPQ AR                   0.071177
AN                       0.054046
EX2_score                0.000000
age                     -0.080713
mean_eccentricity_b2i   -0.093334
max_degree_b2i          -0.246412
AL                      -0.380830
school_years            -0.495095
max_degree_b2c          -0.527133
IRI_PD                  -1.937395
Name: 24045, dtype: float32
