In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

In [2]:
import lightgbm as lgb
import xgboost as xgb

In [3]:
# Importar librerias para los modelos
from sklearn.linear_model import LogisticRegression #Regresion Logistica
from sklearn.naive_bayes import MultinomialNB #Naive Bayes, se utiliza este por tener mas de dos clases como variable dependiente.
from sklearn.ensemble import RandomForestClassifier #RandomForest
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis #LDA
from sklearn.svm import SVC #SVM
from sklearn.tree import DecisionTreeClassifier #Arbol de decisión
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis #QDA
from xgboost import XGBClassifier #XG Boost
from lightgbm import LGBMClassifier #LGBM
from sklearn.ensemble import AdaBoostClassifier #Ada Boost
from sklearn.ensemble import GradientBoostingClassifier # Gradient Boosting
from sklearn.neighbors import KNeighborsClassifier #KNN
 
#Importar metricas
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

#Gestion train-test
from sklearn.model_selection import train_test_split

#Transformaciones
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

In [4]:
proyecto_data_dos = pd.read_csv('Obesity_data_proyecto.csv')
proyecto_data_dos.head()

Unnamed: 0,Age,Gender,Height,Weight,CALC,FAVC,FCVC,NCP,SCC,SMOKE,CH2O,family_history_with_overweight,FAF,TUE,CAEC,MTRANS,NObeyesdad
0,0.442507,283,0.358093,0.345082,639,238,0.63093,0.429517,2015,2067,0.63093,1726,0.334526,0.5,1765,1495,287
1,0.442507,912,0.152287,0.251561,1401,238,1.0,0.675605,96,44,1.0,1726,1.0,0.0,1765,1495,287
2,0.54179,916,0.698413,0.474596,70,238,0.63093,0.675605,2015,2067,0.63093,1726,0.666667,0.5,1765,1495,287
3,0.716781,283,0.698413,0.560113,70,238,1.0,0.429517,2015,2067,0.63093,385,0.334526,0.0,1765,55,290
4,0.493277,916,0.662323,0.582299,1401,238,0.63093,0.0,2015,2067,0.63093,385,0.0,0.0,1765,1495,290


### 5. Construcción de los modelos

In [5]:
#Seleccionar el target (y) y features (x)

X = proyecto_data_dos.drop('NObeyesdad', axis=1)
y = proyecto_data_dos['NObeyesdad']

In [6]:
#split para train y test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2025, shuffle=True)

In [7]:
#Creamos el scaler, ultimo paso de la ingenieria

scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

#### 5.1. Naive Bayes

In [11]:
#Configuracion de hiperparametros
resultados_nb =[]

model1_nb = MultinomialNB(alpha=0.5, fit_prior=True)
model2_nb = MultinomialNB(alpha=1.0, fit_prior=True)
model3_nb = MultinomialNB(alpha=1.5, fit_prior=False)
model4_nb = MultinomialNB(alpha=0.5, fit_prior=False)
model5_nb = MultinomialNB(alpha=1.0, fit_prior=False)

models_nb = [model1_nb, model2_nb, model3_nb, model4_nb, model5_nb]

for i, model in enumerate(models_nb):
    model.fit(X_train_scaled, y_train)
    nb_predicts = model.predict(X_test_scaled)

    accuracy = accuracy_score(y_test, nb_predicts)
    resultados_nb.append({'Modelo': f'model{i+1}', 'Accuracy': accuracy})


df_resultados = pd.DataFrame(resultados_nb).sort_values(by='Accuracy', ascending= False, ignore_index= True)
print(df_resultados)

   Modelo  Accuracy
0  model3  0.479495
1  model4  0.476341
2  model5  0.476341
3  model1  0.268139
4  model2  0.268139


#### 5.2. LDA - Análisis de Discriminante Lineal

In [12]:
#Configuracion de hiperparametros
resultados_lda=[]

model1_lda = LinearDiscriminantAnalysis(solver='svd')
model2_lda = LinearDiscriminantAnalysis(solver='lsqr', shrinkage='auto')
model3_lda = LinearDiscriminantAnalysis(solver='lsqr', shrinkage=0.5)
model4_lda = LinearDiscriminantAnalysis(solver='eigen', shrinkage='auto')
model5_lda = LinearDiscriminantAnalysis(solver='eigen', shrinkage=0.3, n_components=3)

models_lda = [model1_lda, model2_lda, model3_lda, model4_lda, model5_lda]

for i, model in enumerate(models_lda):
    model.fit(X_train_scaled, y_train)
    lda_predicts = model.predict(X_test_scaled)
    
    accuracy = accuracy_score(y_test, lda_predicts)
    resultados_lda.append({'Modelo': f'model{i+1}', 'Accuracy': accuracy})


df_resultados = pd.DataFrame(resultados_lda).sort_values(by='Accuracy', ascending= False, ignore_index= True)
print(df_resultados)

   Modelo  Accuracy
0  model1  0.875394
1  model2  0.835962
2  model4  0.835962
3  model5  0.679811
4  model3  0.588328


#### 5.3. Regresión Logística

In [14]:
#Configuracion de hiperparametros
resultados_lg=[]

model1_lg = LogisticRegression(penalty='l2', C=1.0, solver='lbfgs', max_iter=1000, multi_class='multinomial')
model2_lg = LogisticRegression(penalty='l1', C=0.5, solver='saga', max_iter=1000, multi_class='multinomial')
model3_lg = LogisticRegression(penalty='elasticnet', C=1.0, solver='saga', l1_ratio=0.5, max_iter=1000, multi_class='multinomial')
model4_lg = LogisticRegression(penalty='l2', C=0.1, solver='lbfgs', max_iter=1000, multi_class='multinomial')
model5_lg = LogisticRegression(penalty='l2', solver='sag', max_iter=1000, multi_class='multinomial')

models_lg = [model1_lg, model2_lg, model3_lg, model4_lg, model5_lg]

for i, model in enumerate(models_lg):
    model.fit(X_train_scaled, y_train)
    predicts_lg = model.predict(X_test_scaled)

    accuracy = accuracy_score(y_test, predicts_lg)
    resultados_lg.append({'Modelo': f'model{i+1}', 'Accuracy': accuracy})


df_resultados = pd.DataFrame(resultados_lg).sort_values(by='Accuracy', ascending= False, ignore_index= True)
print(df_resultados)



   Modelo  Accuracy
0  model2  0.821767
1  model3  0.747634
2  model1  0.708202
3  model5  0.708202
4  model4  0.575710




#### 5.4. SVM

In [15]:
#Configuracion de hiperparametros
resultados_svm=[]

model1_SVM = SVC(C=1.0, kernel='linear')
model2_SVM = SVC(C=1.0, kernel='poly', degree=3, gamma='scale')
model3_SVM = SVC(C=1.0, kernel='rbf', gamma='scale')
model4_SVM = SVC(C=1.0, kernel='sigmoid', gamma='scale')
model5_SVM = SVC(C=0.1, kernel='rbf', gamma=0.1)

models_SVM = [model1_SVM, model2_SVM, model3_SVM, model4_SVM, model5_SVM]

for i, model in enumerate(models_SVM):
    model.fit(X_train_scaled, y_train)
    predicts_SVM = model.predict(X_test_scaled)
    
    accuracy = accuracy_score(y_test, predicts_SVM)
    resultados_svm.append({'Modelo': f'model{i+1}', 'Accuracy': accuracy})


df_resultados = pd.DataFrame(resultados_svm).sort_values(by='Accuracy', ascending= False, ignore_index= True)
print(df_resultados)

   Modelo  Accuracy
0  model2  0.908517
1  model1  0.859621
2  model3  0.824921
3  model5  0.422713
4  model4  0.250789


#### 5.5. Arboles de decisión

In [17]:
#Configuracion de hiperparametros
resultados_dtc= []

model1_dtc = DecisionTreeClassifier(criterion='gini', max_depth=5)
model2_dtc = DecisionTreeClassifier(criterion='entropy', max_depth=10)
model3_dtc = DecisionTreeClassifier(criterion='gini', max_depth=None, min_samples_split=10)
model4_dtc = DecisionTreeClassifier(criterion='entropy', max_depth=5, min_samples_leaf=5)
model5_dtc = DecisionTreeClassifier(criterion='log_loss', max_depth=7, min_samples_split=20, min_samples_leaf=10)

models_dtc = [model1_dtc, model2_dtc, model3_dtc, model4_dtc, model5_dtc]

for i, model in enumerate(models_dtc):
    model.fit(X_train_scaled, y_train)
    predicts_dtc = model.predict(X_test_scaled)
    
    accuracy = accuracy_score(y_test, predicts_dtc)
    resultados_dtc.append({'Modelo': f'model{i+1}', 'Accuracy': accuracy})


df_resultados = pd.DataFrame(resultados_dtc).sort_values(by='Accuracy', ascending= False, ignore_index= True)
print(df_resultados)

   Modelo  Accuracy
0  model2  0.932177
1  model3  0.913249
2  model5  0.867508
3  model1  0.824921
4  model4  0.805994


#### 5.6. Random Forest

In [19]:
#Configuracion de hiperparametros
resultados_rf=[]

model1_rf = RandomForestClassifier(n_estimators=50, criterion='gini', max_depth=10)
model2_rf = RandomForestClassifier(n_estimators=100, criterion='entropy', max_depth=15, min_samples_split=10)
model3_rf = RandomForestClassifier(n_estimators=150, criterion='gini', max_depth=None, min_samples_leaf=5)
model4_rf = RandomForestClassifier(n_estimators=200, criterion='entropy', max_depth=10, min_samples_split=5, min_samples_leaf=3)
model5_rf = RandomForestClassifier(n_estimators=100, criterion='log_loss', max_depth=20, min_samples_split=15, max_features='sqrt')

models_rf = [model1_rf, model2_rf, model3_rf, model4_rf, model5_rf]

for i, model in enumerate(models_rf):
    model.fit(X_train_scaled, y_train)
    predicts_rf = model.predict(X_test_scaled)
    
    accuracy = accuracy_score(y_test, predicts_rf)
    resultados_rf.append({'Modelo': f'model{i+1}', 'Accuracy': accuracy})


df_resultados = pd.DataFrame(resultados_rf).sort_values(by='Accuracy', ascending= False, ignore_index= True)
print(df_resultados)

   Modelo  Accuracy
0  model2  0.929022
1  model4  0.921136
2  model3  0.913249
3  model5  0.910095
4  model1  0.905363


### 5.7 KNN - K-Nearest Neighbors

In [21]:
resultados_knn=[]

#configuracion de hiperparametros
model1_knn = KNeighborsClassifier(n_neighbors = 3, weights = 'uniform')
model2_knn = KNeighborsClassifier(n_neighbors = 5, weights = 'uniform')
model3_knn = KNeighborsClassifier(n_neighbors = 7, weights = 'uniform')
model4_knn = KNeighborsClassifier(n_neighbors = 3, weights = 'distance')
model5_knn = KNeighborsClassifier(n_neighbors = 5, weights = 'distance')

models_knn = [model1_knn, model2_knn, model3_knn, model4_knn, model5_knn]

for i, model in enumerate(models_knn):
    model.fit(X_train_scaled, y_train)
    predicts_knn = model.predict(X_test_scaled)
    
    accuracy = accuracy_score(y_test, predicts_knn)
    resultados_knn.append({'Modelo': f'model{i+1}', 'Accuracy': accuracy})


df_resultados = pd.DataFrame(resultados_knn).sort_values(by='Accuracy', ascending= False, ignore_index= True)
print(df_resultados)

   Modelo  Accuracy
0  model4  0.752366
1  model5  0.747634
2  model1  0.733438
3  model2  0.728707
4  model3  0.714511


### 5.8 QDA - Análisis de Discriminante Cuadrático

In [22]:
resultados_qda=[]

# configuracion de hiperparametros
model1_qda = QuadraticDiscriminantAnalysis(reg_param =  0.0, store_covariance= False, tol= 0.0001)
model2_qda = QuadraticDiscriminantAnalysis(reg_param =  0.1, store_covariance= False, tol= 0.0001)
model3_qda = QuadraticDiscriminantAnalysis(reg_param =  0.2, store_covariance= True, tol= 0.0001)
model4_qda = QuadraticDiscriminantAnalysis(reg_param =  0.3, store_covariance= True, tol= 0.001)
model5_qda = QuadraticDiscriminantAnalysis(reg_param =  0.4, store_covariance= False, tol= 0.001)

models_qda = [model1_qda, model2_qda, model3_qda, model4_qda, model5_qda]

for i, model in enumerate(models_qda):
    model.fit(X_train_scaled, y_train)
    predicts_qda = model.predict(X_test_scaled)
    
    accuracy = accuracy_score(y_test, predicts_qda)
    resultados_qda.append({'Modelo': f'model{i+1}', 'Accuracy': accuracy})


df_resultados = pd.DataFrame(resultados_qda).sort_values(by='Accuracy', ascending= False, ignore_index= True)
print(df_resultados)

   Modelo  Accuracy
0  model2  0.547319
1  model3  0.492114
2  model4  0.465300
3  model5  0.429022
4  model1  0.154574


  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])


### 5.9 Ada Boost

In [23]:
resultados_ab=[]

#configuracion de hiperparametros
model1_ab = AdaBoostClassifier(n_estimators= 50, learning_rate= 1.0, algorithm= 'SAMME')
model2_ab = AdaBoostClassifier(n_estimators= 100, learning_rate= 0.5, algorithm = 'SAMME')
model3_ab = AdaBoostClassifier(n_estimators= 200, learning_rate= 0.1, algorithm= 'SAMME')
model4_ab = AdaBoostClassifier(n_estimators= 50, learning_rate= 0.5, algorithm= 'SAMME')
model5_ab = AdaBoostClassifier(n_estimators= 500, learning_rate= 0.5, algorithm= 'SAMME')

models_ab = [model1_ab, model2_ab, model3_ab, model4_ab, model5_ab]

for i, model in enumerate(models_ab):
    model.fit(X_train_scaled, y_train)
    predicts_ab = model.predict(X_test_scaled)
    
    accuracy = accuracy_score(y_test, predicts_ab)
    resultados_ab.append({'Modelo': f'model{i+1}', 'Accuracy': accuracy})


df_resultados = pd.DataFrame(resultados_ab).sort_values(by='Accuracy', ascending= False, ignore_index= True)
print(df_resultados)


   Modelo  Accuracy
0  model2  0.673502
1  model1  0.664038
2  model5  0.646688
3  model3  0.627760
4  model4  0.564669


### 5.10 Gradient Boosting

In [24]:
resultados_gb=[]

#configuracion de hiperparametros
model1_gb = GradientBoostingClassifier(n_estimators= 50, learning_rate= 0.1, max_depth= 3)
model2_gb = GradientBoostingClassifier(n_estimators= 100, learning_rate= 0.1, max_depth= 3)
model3_gb = GradientBoostingClassifier(n_estimators= 100, learning_rate= 0.01, max_depth= 3)
model4_gb = GradientBoostingClassifier(n_estimators= 200, learning_rate= 0.1, max_depth= 4)
model5_gb = GradientBoostingClassifier(n_estimators= 100, learning_rate= 0.1, max_depth= 5)

models_gb = [model1_gb, model2_gb, model3_gb, model4_gb, model5_gb]

for i, model in enumerate(models_gb):
    model.fit(X_train_scaled, y_train)
    predicts_gb = model.predict(X_test_scaled)
    
    accuracy = accuracy_score(y_test, predicts_gb)
    resultados_gb.append({'Modelo': f'model{i+1}', 'Accuracy': accuracy})


df_resultados = pd.DataFrame(resultados_gb).sort_values(by='Accuracy', ascending= False, ignore_index= True)
print(df_resultados)

   Modelo  Accuracy
0  model5  0.947950
1  model4  0.938486
2  model1  0.927445
3  model2  0.925868
4  model3  0.804416


### 5.11 XG Boost

In [26]:
#Se codifica las etiquetas de y_train, y_test para clasificarlas en clases enteras de 0 hasta el número de clases menos uno.
encoder = LabelEncoder()
y_encoded_train = encoder.fit_transform(y_train)
y_encoded_test = encoder.fit_transform(y_test)

In [27]:
resultados_xg=[]

#configuracion de hiperparametros
model1_xg = XGBClassifier(n_estimators= 50, learning_rate= 0.1, max_depth= 3)
model2_xg = XGBClassifier(n_estimators= 100, learning_rate= 0.1, max_depth= 3)
model3_xg = XGBClassifier(n_estimators= 100, learning_rate= 0.01, max_depth= 3)
model4_xg = XGBClassifier(n_estimators= 200, learning_rate= 0.1, max_depth= 4)
model5_xg = XGBClassifier(n_estimators= 100, learning_rate= 0.1, max_depth= 5)

models_xg = [model1_xg, model2_xg, model3_xg, model4_xg, model5_xg]

for i, model in enumerate(models_xg):
    model.fit(X_train_scaled, y_encoded_train)
    predicts_xg = model.predict(X_test_scaled)
    
    accuracy = accuracy_score(y_encoded_test, predicts_xg)
    resultados_xg.append({'Modelo': f'model{i+1}', 'Accuracy': accuracy})


df_resultados = pd.DataFrame(resultados_xg).sort_values(by='Accuracy', ascending= False, ignore_index= True)
print(df_resultados)

   Modelo  Accuracy
0  model5  0.951104
1  model2  0.947950
2  model4  0.946372
3  model1  0.917981
4  model3  0.772871


### 5.12 LGBM

In [28]:
resultados_lgbm=[]

#configuracion de hiperparametros
model1_lgbm = LGBMClassifier(n_estimators= 50, learning_rate= 0.1, max_depth= 3)
model2_lgbm = LGBMClassifier(n_estimators= 100, learning_rate= 0.05, max_depth= 5)
model3_lgbm = LGBMClassifier(n_estimators= 200, learning_rate= 0.01, max_depth= 4)
model4_lgbm = LGBMClassifier(n_estimators= 150, learning_rate= 0.1, max_depth= 6)
model5_lgbm = LGBMClassifier(n_estimators= 120, learning_rate= 0.05, max_depth= 2)

models_lgbm = [model1_lgbm, model2_lgbm, model3_lgbm, model4_lgbm, model5_lgbm]

for i, model in enumerate(models_lgbm):
    model.fit(X_train_scaled, y_train)
    predicts_lgbm = model.predict(X_test_scaled)
    
    accuracy = accuracy_score(y_test, predicts_lgbm)
    resultados_lgbm.append({'Modelo': f'model{i+1}', 'Accuracy': accuracy})


df_resultados = pd.DataFrame(resultados_lgbm).sort_values(by='Accuracy', ascending= False, ignore_index= True)
print(df_resultados)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001643 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1881
[LightGBM] [Info] Number of data points in the train set: 1477, number of used features: 16
[LightGBM] [Info] Start training from score -2.014565
[LightGBM] [Info] Start training from score -2.029910
[LightGBM] [Info] Start training from score -1.267083
[LightGBM] [Info] Start training from score -1.999451
[LightGBM] [Info] Start training from score -1.877233
[LightGBM] [Info] Start training from score -1.800600
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000251 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1881
[LightGBM] [Info] Number of data points in the train set: 1477, number of used features: 16
[LightGBM] [Info] Start training fro