In [1]:
import importlib
import funciones_modelos_ML as ml
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PowerTransformer, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.inspection import permutation_importance
from mango import Tuner, scheduler, MetaTuner
from mango.domain.distribution import loguniform
from scipy.stats import uniform
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

In [91]:
# Re-import the local helper module so that edits made to
# funciones_modelos_ML.py are picked up without restarting the kernel.
importlib.reload(ml)

<module 'funciones_modelos_ML' from 'c:\\Users\\jhquiza\\OneDrive - Universidad de Medellin\\JupyterNotebooks\\IAT\\funciones_modelos_ML.py'>

# Modelo eliminando sujetos con datos faltantes y usando todas las variables

In [3]:
# Alternative approach: keep every variable and discard subjects that have
# missing values instead of imputing them.
data_all = (
    pd.read_csv('medidas_conectividad_globales_comportamentales_demograficos_2.csv')
    .set_index('subject')
    .drop(columns=[
        'IAT_score_ind', 'IAT_score_level',
        'dscore_1', 'dscore_2', 'dscore_3', 'dscore_5', 'dscore_6', 'dscore_7', 'dscore_8',
        'modo_0', 'modo_1', 'modo_2',
    ])
    .dropna()
    # Cast these columns to int64 so that dtype-based column selection
    # (select_dtypes on 'int64') picks them up as discrete variables.
    .astype({
        columna: 'int64'
        for columna in ['IRI_PT', 'IRI_FS', 'IRI_EC', 'IRI_PD', 'IMA', 'RPQ AR', 'RPQ AP']
    })
)

In [5]:
# Feature matrix: every column except the target ('type') and 'group',
# with categorical columns one-hot encoded.
X = pd.get_dummies(data_all.drop(['type', 'group'], axis=1))
y = data_all['type'].copy()

# Hold out a stratified 20% test split (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=77, stratify=y)

# Fit ONE encoder on the training labels and reuse it for the test labels.
# Fitting a second, independent LabelEncoder on y_test would only yield the
# same integer codes if both splits happen to contain exactly the same set of
# classes; sharing the fitted encoder guarantees a consistent mapping and
# raises explicitly if the test split holds a class unseen during training.
label_encoder = LabelEncoder().fit(y_train)
y_train_label = label_encoder.transform(y_train)
y_test_label = label_encoder.transform(y_test)


In [76]:
# Models fitted on the features as they are, without any preprocessing step.
best_results_np = ml.get_best_model_np(
    X_train=X_train,
    y_train_label=y_train_label,
    X_test=X_test,
    y_test_label=y_test_label,
)
best_results_np

<mango.tuner.Tuner object at 0x000001DBAD0739D0>


Best score: 0.7428571428571429: 100%|██████████| 40/40 [00:18<00:00,  2.13it/s]
Best score: 0.7428571428571429:   0%|          | 0/40 [00:00<?, ?it/s]

<mango.tuner.Tuner object at 0x000001DBA9D85670>


Best score: 0.7571428571428571: 100%|██████████| 40/40 [00:01<00:00, 20.95it/s]


<mango.tuner.Tuner object at 0x000001DBACEEB850>


Best score: 0.7285714285714286: 100%|██████████| 40/40 [00:16<00:00,  2.42it/s]


<mango.tuner.Tuner object at 0x000001DB88FA2040>


Best score: 0.8285714285714285: 100%|██████████| 40/40 [00:37<00:00,  1.07it/s]
Best score: 0.7285714285714285:   0%|          | 0/40 [00:00<?, ?it/s]

<mango.tuner.Tuner object at 0x000001DBACD29520>


Best score: 0.7571428571428572: 100%|██████████| 40/40 [00:03<00:00, 11.16it/s]


<mango.tuner.Tuner object at 0x000001DBAC213D00>


Best score: 0.8285714285714286: 100%|██████████| 40/40 [00:29<00:00,  1.34it/s]


Unnamed: 0_level_0,hiperparameters,mean cv score,std cv score,test score
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,"{'C': 0.005740461359935142, 'penalty': 'l2'}",0.742857,0.072843,0.777778
KNN,"{'n_neighbors': 5, 'p': 1, 'weights': 'distance'}",0.757143,0.034993,0.833333
Random Forest,"{'ccp_alpha': 0.00030419738294424026, 'n_estim...",0.728571,0.094761,0.5
Gradient Boosting,"{'ccp_alpha': 0.006604664588361813, 'n_estimat...",0.828571,0.09689,0.611111
SVC,"{'C': 4.796416130939705, 'degree': 1, 'gamma':...",0.757143,0.034993,0.777778
XGBoost,"{'colsample_bytree': 0.6236969117109337, 'eta'...",0.828571,0.106904,0.611111


In [80]:
# Split the predictors by dtype so each group can receive its own scaler.
continuas_cols = X_train.select_dtypes(include=['float64']).columns.tolist()
discretas_cols = X_train.select_dtypes(include=['int64']).columns.tolist()

# Models with StandardScaler on the float columns and MinMaxScaler on the
# integer columns; any other column is passed through untouched.
preprocessor = ColumnTransformer(
    transformers=[
        ('scaler', StandardScaler(), continuas_cols),
        ('min_max', MinMaxScaler(), discretas_cols),
    ],
    remainder='passthrough',
)
model_names = [
    f'{nombre} SC'
    for nombre in ('Logistic Regression', 'KNN', 'Random Forest', 'Gradient Boosting', 'SVC', 'XGBoost')
]
best_results_sc = ml.get_best_model_sc(
    preprocessor=preprocessor,
    model_names=model_names,
    X_train=X_train,
    y_train_label=y_train_label,
    X_test=X_test,
    y_test_label=y_test_label,
)
best_results_sc

Best score: 0.6: 100%|██████████| 40/40 [00:15<00:00,  2.54it/s]
Best score: 0.6428571428571429: 100%|██████████| 40/40 [00:03<00:00, 12.01it/s]
Best score: 0.7714285714285715: 100%|██████████| 40/40 [00:16<00:00,  2.38it/s]
Best score: 0.8428571428571429: 100%|██████████| 40/40 [00:44<00:00,  1.10s/it]
Best score: 0.6857142857142857: 100%|██████████| 40/40 [00:05<00:00,  7.56it/s]
Best score: 0.8857142857142858: 100%|██████████| 40/40 [00:15<00:00,  2.59it/s]


Unnamed: 0_level_0,hiperparameters,mean cv score,std cv score,test score
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression SC,"{'C': 0.0016367806265565259, 'penalty': 'l1'}",0.6,0.034993,0.611111
KNN SC,"{'weights': 'uniform', 'p': 1, 'n_neighbors': 5}",0.642857,0.063888,0.555556
Random Forest SC,"{'ccp_alpha': 0.012846190537283824, 'n_estimat...",0.771429,0.053452,0.722222
Gradient Boosting SC,"{'ccp_alpha': 0.00296741957384925, 'n_estimato...",0.842857,0.083299,0.611111
SVC SC,"{'C': 0.28191640439469234, 'degree': 4, 'gamma...",0.685714,0.09689,0.555556
XGBoost SC,"{'colsample_bytree': 0.6565426588537016, 'eta'...",0.885714,0.057143,0.722222


In [81]:
# Models with PowerTransformer on the continuous columns and MinMaxScaler on
# the discrete columns; any other column is passed through untouched.
preprocessor = ColumnTransformer(
    transformers=[
        ('pt', PowerTransformer(), continuas_cols),
        ('min_max', MinMaxScaler(), discretas_cols),
    ],
    remainder='passthrough',
)
model_names = [
    f'{nombre} PT'
    for nombre in ('Logistic Regression', 'KNN', 'Random Forest', 'Gradient Boosting', 'SVC', 'XGBoost')
]
best_results_pt = ml.get_best_model_sc(
    preprocessor=preprocessor,
    model_names=model_names,
    X_train=X_train,
    y_train_label=y_train_label,
    X_test=X_test,
    y_test_label=y_test_label,
)
best_results_pt

Best score: 0.6: 100%|██████████| 40/40 [00:41<00:00,  1.04s/it]
Best score: 0.6571428571428571: 100%|██████████| 40/40 [00:30<00:00,  1.30it/s]
Best score: 0.7714285714285715: 100%|██████████| 40/40 [00:48<00:00,  1.22s/it]
Best score: 0.8428571428571429: 100%|██████████| 40/40 [00:58<00:00,  1.45s/it]
Best score: 0.7142857142857143: 100%|██████████| 40/40 [00:33<00:00,  1.20it/s]
Best score: 0.8571428571428571: 100%|██████████| 40/40 [00:54<00:00,  1.37s/it]


Unnamed: 0_level_0,hiperparameters,mean cv score,std cv score,test score
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression PT,"{'C': 0.002027892996328136, 'penalty': 'l1'}",0.6,0.034993,0.611111
KNN PT,"{'n_neighbors': 8, 'p': 1, 'weights': 'uniform'}",0.657143,0.069985,0.611111
Random Forest PT,"{'ccp_alpha': 0.0001296250885245065, 'n_estima...",0.771429,0.053452,0.722222
Gradient Boosting PT,"{'ccp_alpha': 0.0010401012102184473, 'n_estima...",0.842857,0.083299,0.611111
SVC PT,"{'C': 0.4589075359145189, 'degree': 1, 'gamma'...",0.714286,0.045175,0.5
XGBoost PT,"{'colsample_bytree': 0.3974348186365755, 'eta'...",0.857143,0.063888,0.444444


In [84]:
# Move the model name out of the index of each results table, then stack the
# three tables (no preprocessing, standard scaler, power transformer) into one.
df_np, df_sc, df_pt = (
    resultado.reset_index().copy()
    for resultado in (best_results_np, best_results_sc, best_results_pt)
)
df_all = pd.concat([df_np, df_sc, df_pt], axis=0, ignore_index=True)
df_all

Unnamed: 0,model,hiperparameters,mean cv score,std cv score,test score
0,Logistic Regression,"{'C': 0.005740461359935142, 'penalty': 'l2'}",0.742857,0.072843,0.777778
1,KNN,"{'n_neighbors': 5, 'p': 1, 'weights': 'distance'}",0.757143,0.034993,0.833333
2,Random Forest,"{'ccp_alpha': 0.00030419738294424026, 'n_estim...",0.728571,0.094761,0.5
3,Gradient Boosting,"{'ccp_alpha': 0.006604664588361813, 'n_estimat...",0.828571,0.09689,0.611111
4,SVC,"{'C': 4.796416130939705, 'degree': 1, 'gamma':...",0.757143,0.034993,0.777778
5,XGBoost,"{'colsample_bytree': 0.6236969117109337, 'eta'...",0.828571,0.106904,0.611111
6,Logistic Regression SC,"{'C': 0.0016367806265565259, 'penalty': 'l1'}",0.6,0.034993,0.611111
7,KNN SC,"{'weights': 'uniform', 'p': 1, 'n_neighbors': 5}",0.642857,0.063888,0.555556
8,Random Forest SC,"{'ccp_alpha': 0.012846190537283824, 'n_estimat...",0.771429,0.053452,0.722222
9,Gradient Boosting SC,"{'ccp_alpha': 0.00296741957384925, 'n_estimato...",0.842857,0.083299,0.611111


In [85]:
# Persist the combined results table; index=False leaves the integer row index
# out of the file.
df_all.to_csv('mejores_resultados_modelos_conectividad_actores_dataset_completo.csv', index=False)

In [86]:
# Best model
# Index the results by model name only if that has not been done already, so
# that re-running this cell does not fail with a KeyError on the (by then
# consumed) 'model' column.
if df_all.index.name != 'model':
    df_all = df_all.set_index('model')
# Single (row, column) lookup instead of chained indexing.
params = df_all.loc['XGBoost SC', 'hiperparameters']
y = data_all['type'].copy()
# Rebuild the chosen model with its tuned hyperparameters and the 'sc'
# preprocessing pipeline.
model, model_fit = ml.mejor_modelo(params=params, X=X, y=y, pre_pipe='sc')

mean val score:  0.8857142857142858
std val score:  0.05714285714285717
test score:  0.7222222222222222


In [87]:
# Relevance analysis via permutation importance (accuracy drop per feature,
# averaged over 20 shuffles).
# NOTE(review): the importance is computed on the full X / y rather than only
# on the held-out split — confirm this is intended given how ml.mejor_modelo
# fits model_fit.
y_label = LabelEncoder().fit_transform(y)
r = permutation_importance(
    model_fit,
    X,
    y_label,
    n_repeats=20,
    random_state=0,
    scoring='accuracy',
)
# One row per feature, one column holding the mean importance.
importancia_atributos = pd.DataFrame({'mean importance': r.importances_mean}, index=X.columns)
importancia_atributos.sort_values(by='mean importance', ascending=False)

Unnamed: 0,mean importance
EX2_score,0.035795
victims_self_no,0.025000
AL,0.018750
gender_F,0.009659
diameter_td,0.006818
...,...
max_betweenness_tc,0.000000
max_betweenness_ti,0.000000
max_betweenness_td,0.000000
mean_mpli_b1c,-0.006250


In [88]:
# Save the (unsorted) importance table; the feature names in the index are
# written under a column labelled 'feature'.
importancia_atributos.to_csv('importancia_atributos_clasificación_dataset_completo_sc.csv', index_label='feature')

In [89]:
# Collect the subjects reported by ml.errores and attach their full records.
df_errados = ml.errores(
    model=model,
    label='type',
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)
# 'type' is present on both sides of the join, so pandas suffixes it with
# _x / _y; keep the left-hand copy under its original name and drop the other.
datos_errados = (
    pd.merge(df_errados, data_all, how='inner', left_index=True, right_index=True)
    .rename(columns={'type_x': 'type'})
    .drop(columns=['type_y'])
)
datos_errados

Unnamed: 0_level_0,type,predicted,diameter_ac,diameter_ai,diameter_ad,diameter_b1c,diameter_b1i,diameter_b1d,diameter_b2c,diameter_b2i,...,IRI_FS,IRI_EC,IRI_PD,IMA,RPQ AR,RPQ AP,TD,AN,AL,IH
subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21100,ex-combatant,victims,0.269841,0.222222,0.047619,0.333333,0.269841,0.063492,0.285714,0.222222,...,9,11,11,27,20,16,1.5,5.75,9.0,0.5
24043,victims,ex-combatant,0.31746,0.269841,0.047619,0.31746,0.238095,0.079365,0.285714,0.238095,...,18,15,15,36,11,12,0.0,2.25,5.25,0.0
21137,ex-combatant,controls,0.380952,0.285714,0.095238,0.333333,0.269841,0.063492,0.396825,0.269841,...,5,17,5,33,17,14,0.0,2.0,8.0,0.0
22108,controls,ex-combatant,0.285714,0.253968,0.031746,0.238095,0.269841,-0.031746,0.285714,0.349206,...,15,19,9,33,20,14,0.0,1.5,9.5,0.0
24027,victims,ex-combatant,0.222222,0.238095,-0.015873,0.380952,0.238095,0.142857,0.285714,0.269841,...,8,14,11,32,19,13,0.0,0.0,9.0,0.0
21131,ex-combatant,victims,0.285714,0.301587,-0.015873,0.238095,0.333333,-0.095238,0.222222,0.269841,...,7,8,15,28,12,12,0.0,2.0,4.25,0.0
23005,ex-combatant,controls,0.238095,0.285714,-0.047619,0.301587,0.301587,0.0,0.269841,0.269841,...,22,14,18,36,17,13,0.0,0.0,0.0,0.0
24003,victims,ex-combatant,0.253968,0.253968,0.0,0.31746,0.285714,0.031746,0.285714,0.285714,...,16,15,14,28,12,12,0.0,2.25,5.25,0.0
24015,victims,ex-combatant,0.253968,0.253968,0.0,0.380952,0.238095,0.142857,0.333333,0.269841,...,14,13,14,29,14,13,1.75,3.5,8.25,2.0
22106,controls,victims,0.31746,0.31746,0.0,0.31746,0.285714,0.031746,0.301587,0.365079,...,16,17,15,57,21,14,0.0,3.25,7.5,0.0


In [90]:
# Save the misclassified subjects; the index is kept in the file because
# to_csv writes it by default.
datos_errados.to_csv('errores_clasificación_dataset_completo_sc.csv')

## Modelo con selección de atributos

In [93]:
# Feature selection is computed from the training split only.
# NOTE: this cell expects X_train / y_train to be the split of the FULL feature
# matrix; a later cell reassigns X_train to the selected-feature split, so
# re-running this cell afterwards would select from the already-reduced set.
# NOTE(review): presumably 'threshold' and 'mi_threshold' are importance and
# mutual-information cut-offs — confirm in funciones_modelos_ML.
lista_atributos = ml.select_features_clf(X_train=X_train, y_train=y_train, threshold='1.5*mean', mi_threshold=0.1)
print(lista_atributos)

['max_degree_ac', 'max_degree_b2i', 'max_degree_b2d', 'max_degree_gi', 'mean_eccentricity_ai', 'mean_eccentricity_b1c', 'mean_eccentricity_b2i', 'mean_eccentricity_b2d', 'mean_eccentricity_dd', 'mean_eccentricity_gi', 'mean_eccentricity_gd', 'mean_eccentricity_td', 'EX2_score', 'age', 'school_years', 'IRI_PT', 'IRI_EC', 'RPQ AR', 'RPQ AP', 'AN', 'AL', 'exposure_level_high', 'exposure_level_low', 'victims_self_no', 'victims_self_yes', 'gender_F', 'gender_M', 'IRI_PD']


In [94]:
# One-hot encode the full table and keep only the selected features.
data_all_cod = pd.get_dummies(data_all)
X_sel = data_all_cod[lista_atributos].copy()

# Hold out a stratified 20% test split (same seed as the full-feature split).
X_train, X_test, y_train, y_test = train_test_split(X_sel, y, test_size=0.2, random_state=77, stratify=y)

# Fit ONE encoder on the training labels and reuse it for the test labels.
# Fitting a second, independent LabelEncoder on y_test would only yield the
# same integer codes if both splits happen to contain exactly the same set of
# classes; sharing the fitted encoder guarantees a consistent mapping and
# raises explicitly if the test split holds a class unseen during training.
label_encoder = LabelEncoder().fit(y_train)
y_train_label = label_encoder.transform(y_train)
y_test_label = label_encoder.transform(y_test)

# Split the predictors by dtype so each group can receive its own scaler.
continuas_cols = X_train.select_dtypes(include=['float64']).columns.to_list()
discretas_cols = X_train.select_dtypes(include=['int64']).columns.to_list()

In [95]:
# Variant 1: models fitted on the selected features without preprocessing.
best_results_np_sel = ml.get_best_model_np(
    X_train=X_train,
    y_train_label=y_train_label,
    X_test=X_test,
    y_test_label=y_test_label,
)

# Variant 2: StandardScaler on the continuous columns, MinMaxScaler on the
# discrete columns; any other column is passed through untouched.
preprocessor = ColumnTransformer(
    transformers=[
        ('scaler', StandardScaler(), continuas_cols),
        ('min_max', MinMaxScaler(), discretas_cols),
    ],
    remainder='passthrough',
)
model_names = [
    f'{nombre} SC'
    for nombre in ('Logistic Regression', 'KNN', 'Random Forest', 'Gradient Boosting', 'SVC', 'XGBoost')
]
best_results_sc_sel = ml.get_best_model_sc(
    preprocessor=preprocessor,
    model_names=model_names,
    X_train=X_train,
    y_train_label=y_train_label,
    X_test=X_test,
    y_test_label=y_test_label,
)

# Variant 3: PowerTransformer on the continuous columns, MinMaxScaler on the
# discrete columns; any other column is passed through untouched.
preprocessor = ColumnTransformer(
    transformers=[
        ('pt', PowerTransformer(), continuas_cols),
        ('min_max', MinMaxScaler(), discretas_cols),
    ],
    remainder='passthrough',
)
model_names = [
    f'{nombre} PT'
    for nombre in ('Logistic Regression', 'KNN', 'Random Forest', 'Gradient Boosting', 'SVC', 'XGBoost')
]
best_results_pt_sel = ml.get_best_model_sc(
    preprocessor=preprocessor,
    model_names=model_names,
    X_train=X_train,
    y_train_label=y_train_label,
    X_test=X_test,
    y_test_label=y_test_label,
)

# Move the model name out of the index of each results table and stack them.
df_np_sel, df_sc_sel, df_pt_sel = (
    resultado.reset_index().copy()
    for resultado in (best_results_np_sel, best_results_sc_sel, best_results_pt_sel)
)
df_sel = pd.concat([df_np_sel, df_sc_sel, df_pt_sel], axis=0, ignore_index=True)
df_sel

  0%|          | 0/40 [00:00<?, ?it/s]

<mango.tuner.Tuner object at 0x000001DBAD03C790>


Best score: 0.8142857142857143: 100%|██████████| 40/40 [00:11<00:00,  3.54it/s]
Best score: 0.8285714285714286:  12%|█▎        | 5/40 [00:00<00:00, 44.97it/s]

<mango.tuner.Tuner object at 0x000001DBAD073520>


Best score: 0.8428571428571429: 100%|██████████| 40/40 [00:01<00:00, 24.06it/s]


<mango.tuner.Tuner object at 0x000001DBA9E009D0>


Best score: 0.9: 100%|██████████| 40/40 [00:15<00:00,  2.52it/s]              


<mango.tuner.Tuner object at 0x000001DB88EB0700>


Best score: 0.8714285714285713: 100%|██████████| 40/40 [00:16<00:00,  2.38it/s]
  0%|          | 0/40 [00:00<?, ?it/s]

<mango.tuner.Tuner object at 0x000001DB86903280>


Best score: 0.8: 100%|██████████| 40/40 [00:04<00:00,  9.77it/s]


<mango.tuner.Tuner object at 0x000001DBAD0739A0>


Best score: 0.9428571428571428: 100%|██████████| 40/40 [00:12<00:00,  3.27it/s]
Best score: 0.6: 100%|██████████| 40/40 [00:17<00:00,  2.24it/s]
Best score: 0.8857142857142858: 100%|██████████| 40/40 [00:03<00:00, 12.74it/s]
Best score: 0.8571428571428573: 100%|██████████| 40/40 [00:15<00:00,  2.52it/s]
Best score: 0.9142857142857144: 100%|██████████| 40/40 [00:24<00:00,  1.64it/s]
Best score: 0.9285714285714285: 100%|██████████| 40/40 [00:04<00:00,  8.66it/s]
Best score: 0.9285714285714285: 100%|██████████| 40/40 [00:13<00:00,  2.97it/s]
Best score: 0.6: 100%|██████████| 40/40 [00:17<00:00,  2.34it/s]
Best score: 0.8714285714285713: 100%|██████████| 40/40 [00:05<00:00,  7.48it/s]
Best score: 0.8857142857142858: 100%|██████████| 40/40 [00:16<00:00,  2.40it/s]
Best score: 0.9142857142857144: 100%|██████████| 40/40 [00:21<00:00,  1.83it/s]
Best score: 0.9: 100%|██████████| 40/40 [00:06<00:00,  5.72it/s]
Best score: 0.9428571428571428: 100%|██████████| 40/40 [00:18<00:00,  2.13it/s]


Unnamed: 0,model,hiperparameters,mean cv score,std cv score,test score
0,Logistic Regression,"{'C': 0.008454073392612201, 'penalty': 'l2'}",0.814286,0.034993,0.777778
1,KNN,"{'n_neighbors': 8, 'p': 1, 'weights': 'distance'}",0.842857,0.053452,0.666667
2,Random Forest,"{'ccp_alpha': 0.001611177163140738, 'n_estimat...",0.9,0.034993,0.666667
3,Gradient Boosting,"{'ccp_alpha': 0.00670159344582178, 'n_estimato...",0.871429,0.053452,0.777778
4,SVC,"{'C': 2544.2112581201072, 'degree': 3, 'gamma'...",0.8,0.094761,0.833333
5,XGBoost,"{'colsample_bytree': 0.7096775206800409, 'eta'...",0.942857,0.028571,0.722222
6,Logistic Regression SC,"{'C': 0.0005923589176855333, 'penalty': 'l2'}",0.6,0.034993,0.611111
7,KNN SC,"{'weights': 'distance', 'p': 1, 'n_neighbors':...",0.885714,0.057143,0.722222
8,Random Forest SC,"{'ccp_alpha': 0.0019489191859846907, 'n_estima...",0.857143,0.063888,0.833333
9,Gradient Boosting SC,"{'ccp_alpha': 0.0015476131800837907, 'n_estima...",0.914286,0.028571,0.722222


In [97]:
# Persist the selected-feature results table; index=False leaves the integer
# row index out of the file.
df_sel.to_csv('mejores_resultados_modelos_conectividad_actores_atributos_seleccionados.csv', index=False)

In [100]:
# Best model
# Index the results by model name only if that has not been done already, so
# that re-running this cell does not fail with a KeyError on the (by then
# consumed) 'model' column.
if df_sel.index.name != 'model':
    df_sel = df_sel.set_index('model')
# Single (row, column) lookup instead of chained indexing.
params = df_sel.loc['XGBoost PT', 'hiperparameters']
y = data_all['type'].copy()
# Rebuild the chosen model with its tuned hyperparameters and the 'pt'
# preprocessing pipeline, using the selected features only.
model, model_fit = ml.mejor_modelo(params=params, X=X_sel, y=y, pre_pipe='pt')

mean val score:  0.9428571428571428
std val score:  0.02857142857142856
test score:  0.7777777777777778


In [101]:
# Relevance analysis via permutation importance (accuracy drop per feature,
# averaged over 20 shuffles) on the selected features.
# NOTE(review): the importance is computed on the full X_sel / y rather than
# only on the held-out split — confirm this is intended given how
# ml.mejor_modelo fits model_fit.
y_label = LabelEncoder().fit_transform(y)
r = permutation_importance(
    model_fit,
    X_sel,
    y_label,
    n_repeats=20,
    random_state=0,
    scoring='accuracy',
)
# One row per feature, one column holding the mean importance.
importancia_atributos = pd.DataFrame({'mean importance': r.importances_mean}, index=X_sel.columns)
importancia_atributos.sort_values(by='mean importance', ascending=False)

Unnamed: 0,mean importance
EX2_score,0.026136
AL,0.023295
gender_F,0.014205
school_years,0.013068
IRI_PT,0.0125
victims_self_yes,0.010795
exposure_level_low,0.010795
exposure_level_high,0.010795
victims_self_no,0.010227
IRI_EC,0.007955


In [102]:
# Save the (unsorted) importance table; the feature names in the index are
# written under a column labelled 'feature'.
importancia_atributos.to_csv('importancia_atributos_clasificación_atributos_seleccionados_pt.csv', index_label='feature')

In [103]:
# Collect the subjects reported by ml.errores and attach their full records.
df_errados = ml.errores(
    model=model,
    label='type',
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)
# 'type' is present on both sides of the join, so pandas suffixes it with
# _x / _y; keep the left-hand copy under its original name and drop the other.
datos_errados = (
    pd.merge(df_errados, data_all, how='inner', left_index=True, right_index=True)
    .rename(columns={'type_x': 'type'})
    .drop(columns=['type_y'])
)
datos_errados

Unnamed: 0_level_0,type,predicted,diameter_ac,diameter_ai,diameter_ad,diameter_b1c,diameter_b1i,diameter_b1d,diameter_b2c,diameter_b2i,...,IRI_FS,IRI_EC,IRI_PD,IMA,RPQ AR,RPQ AP,TD,AN,AL,IH
subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21100,ex-combatant,victims,0.269841,0.222222,0.047619,0.333333,0.269841,0.063492,0.285714,0.222222,...,9,11,11,27,20,16,1.5,5.75,9.0,0.5
21137,ex-combatant,victims,0.380952,0.285714,0.095238,0.333333,0.269841,0.063492,0.396825,0.269841,...,5,17,5,33,17,14,0.0,2.0,8.0,0.0
22108,controls,ex-combatant,0.285714,0.253968,0.031746,0.238095,0.269841,-0.031746,0.285714,0.349206,...,15,19,9,33,20,14,0.0,1.5,9.5,0.0
24027,victims,ex-combatant,0.222222,0.238095,-0.015873,0.380952,0.238095,0.142857,0.285714,0.269841,...,8,14,11,32,19,13,0.0,0.0,9.0,0.0
22106,controls,victims,0.31746,0.31746,0.0,0.31746,0.285714,0.031746,0.301587,0.365079,...,16,17,15,57,21,14,0.0,3.25,7.5,0.0
23002,ex-combatant,controls,0.269841,0.238095,0.031746,0.206349,0.222222,-0.015873,0.269841,0.269841,...,12,14,13,29,19,14,0.0,0.0,0.0,0.0
22110,controls,ex-combatant,0.31746,0.269841,0.047619,0.222222,0.301587,-0.079365,0.269841,0.253968,...,14,13,8,31,18,12,0.0,2.0,8.0,0.0
24045,victims,ex-combatant,0.31746,0.31746,0.0,0.285714,0.31746,-0.031746,0.269841,0.206349,...,13,13,11,49,19,18,0.0,0.5,3.75,0.25
21111,ex-combatant,victims,0.253968,0.269841,-0.015873,0.253968,0.238095,0.015873,0.349206,0.31746,...,15,13,10,44,24,15,4.5,2.25,3.0,0.25


In [104]:
# Save the misclassified subjects; the index is kept in the file because
# to_csv writes it by default.
datos_errados.to_csv('errores_clasificación_atributos_seleccionados_pt.csv')