In [1]:
import pandas as pd 
data = pd.read_csv('C:\\Users\\lclai\\Desktop\\data_dementia\\clean\\vitalsigns.csv')

In [3]:
data['DEMENTIA'].value_counts()

DEMENTIA
0    366
1     36
Name: count, dtype: int64

In [4]:
#undersampling

import pandas as pd

df_alzheimer = data[data['DEMENTIA'] == 1]
df_no_alzheimer = data[data['DEMENTIA'] == 0]

df_no_alzheimer_sampled = pd.DataFrame()

for (age, gender), group in df_alzheimer.groupby(['AGE', 'GENDER']):
 
    matched_controls = df_no_alzheimer[(df_no_alzheimer['AGE'] == age) & (df_no_alzheimer['GENDER'] == gender)]
    
    if len(matched_controls) > len(group):
        matched_controls = matched_controls.sample(n=len(group), random_state=42)
    
    df_no_alzheimer_sampled = pd.concat([df_no_alzheimer_sampled, matched_controls])

df_balanced = pd.concat([df_alzheimer, df_no_alzheimer_sampled])

df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

In [6]:
# outliers from numerical values

def remove_outliers_iqr(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]

df_balanced = remove_outliers_iqr(df_balanced, 'AGE')
df_balanced = remove_outliers_iqr(df_balanced, 'LAST_ADMISSION_LENGTH')
df_balanced = remove_outliers_iqr(df_balanced, 'HEART_RATE')
df_balanced = remove_outliers_iqr(df_balanced, 'RESPIRATORY_RATE')
df_balanced = remove_outliers_iqr(df_balanced, 'DIASTOLIC_BP')
df_balanced = remove_outliers_iqr(df_balanced, 'SYSTOLIC_BP')
df_balanced = remove_outliers_iqr(df_balanced, 'MEAN_BP')
df_balanced = remove_outliers_iqr(df_balanced, 'OXYGEN_SATURATION')

In [7]:
df_balanced.columns

Index(['AGE', 'GENDER', 'DEMENTIA', 'HYPERTENSION', 'CORONARY_ATHEROSCLEROSIS',
       'ATRIAL_FIBRILLATION', 'HEART_FAILURE', 'KIDNEY_FAILURE',
       'HYPERLIPIDEMIA', 'DIABETES', 'RESPIRATORY_FAILURE', 'UTI',
       'PROPHYLAXIS_VIRAL_HEPATITIS', 'OBSERVATION_INFECTIOUS',
       'HYPERCHOLESTEROLEMIA', 'ESOPHAGEAL_REFLUX', 'ANEMIA', 'PNEUMONIA',
       'POSTHEMORRHAGIC_ANEMIA', 'ACIDOSIS', 'LAST_ADMISSION_LENGTH',
       'ETHNICITY', 'MARITAL_STATUS', 'POTASSIUM_CHLORIDE',
       'SODIUM_CHLORIDE_FLUSH', 'ACETAMINOPHEN', 'MAGNESIUM_SULFATE',
       'INSULIN', 'HEPARIN', 'DOCUSATE_SODIUM', 'D5W', 'ISO_OSMOTIC_DEXTROSE',
       'MORPHINE_SULFATE', 'NS', 'SW', 'FUROSEMIDE', 'CALCIUM_GLUCONATE',
       'BISACODYL', 'HEART_RATE', 'RESPIRATORY_RATE', 'DIASTOLIC_BP',
       'SYSTOLIC_BP', 'MEAN_BP', 'OXYGEN_SATURATION'],
      dtype='object')

In [8]:
df_balanced['DEMENTIA'].value_counts()

DEMENTIA
1    26
0    24
Name: count, dtype: int64

In [9]:
binary = ['GENDER', 'DEMENTIA', 'HYPERTENSION',
       'CORONARY_ATHEROSCLEROSIS', 'ATRIAL_FIBRILLATION', 'HEART_FAILURE',
       'KIDNEY_FAILURE', 'HYPERLIPIDEMIA', 'DIABETES', 'RESPIRATORY_FAILURE',
       'UTI', 'PROPHYLAXIS_VIRAL_HEPATITIS', 'OBSERVATION_INFECTIOUS',
       'HYPERCHOLESTEROLEMIA', 'ESOPHAGEAL_REFLUX', 'ANEMIA', 'PNEUMONIA',
       'POSTHEMORRHAGIC_ANEMIA', 'ACIDOSIS', 'LAST_ADMISSION_LENGTH',
       'ETHNICITY', 'MARITAL_STATUS', 'POTASSIUM_CHLORIDE',
       'SODIUM_CHLORIDE_FLUSH', 'ACETAMINOPHEN', 'MAGNESIUM_SULFATE',
       'INSULIN', 'HEPARIN', 'DOCUSATE_SODIUM', 'D5W', 'ISO_OSMOTIC_DEXTROSE',
       'MORPHINE_SULFATE', 'NS', 'SW', 'FUROSEMIDE', 'CALCIUM_GLUCONATE',
       'BISACODYL']
categorical = ['ETHNICITY','MARITAL_STATUS']
numerical = ['LAST_ADMISSION_LENGTH','AGE','HEART_RATE', 'RESPIRATORY_RATE', 'DIASTOLIC_BP',
       'SYSTOLIC_BP', 'MEAN_BP', 'OXYGEN_SATURATION']

In [10]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

le = LabelEncoder()
scaler = StandardScaler()

for col in binary + categorical:
    df_balanced[col] = le.fit_transform(df_balanced[col]) 

df_balanced[numerical] = scaler.fit_transform(df_balanced[numerical])
df_balanced

Unnamed: 0,AGE,GENDER,DEMENTIA,HYPERTENSION,CORONARY_ATHEROSCLEROSIS,ATRIAL_FIBRILLATION,HEART_FAILURE,KIDNEY_FAILURE,HYPERLIPIDEMIA,DIABETES,...,SW,FUROSEMIDE,CALCIUM_GLUCONATE,BISACODYL,HEART_RATE,RESPIRATORY_RATE,DIASTOLIC_BP,SYSTOLIC_BP,MEAN_BP,OXYGEN_SATURATION
0,0.65834,1,1,0,1,0,1,0,1,1,...,0,1,1,1,-0.684817,0.912699,0.112433,0.414525,2.218348,0.0
2,-0.001466,0,0,1,0,0,0,1,0,0,...,1,1,1,1,0.904421,-0.548918,0.205352,-0.370211,-0.134126,0.0
3,0.365093,0,1,0,0,0,1,0,1,1,...,1,0,1,0,0.615468,0.425493,-0.723842,-1.29343,0.216989,0.0
4,0.365093,0,0,1,0,0,0,0,1,0,...,1,1,1,1,-0.299547,-0.061713,-0.863221,-0.877982,-0.028791,0.0
5,-0.001466,0,1,1,1,0,1,1,1,1,...,1,1,1,0,-0.540341,-0.386516,0.762869,0.876135,0.497882,0.0
7,0.438405,1,1,1,1,1,0,1,1,0,...,1,1,1,1,0.422834,-0.873722,-1.467198,-1.847362,-0.520353,0.0
8,-0.807896,0,0,0,1,0,0,0,0,0,...,0,0,1,1,-1.262722,-0.71132,-0.305705,-0.416372,-0.695911,0.0
9,-1.614326,0,1,0,0,0,1,1,0,0,...,0,1,0,1,-0.20323,-1.52333,-0.305705,-0.508694,-0.028791,0.0
10,0.365093,1,1,1,0,0,0,1,0,1,...,1,1,1,1,0.326516,1.88711,-1.327819,-1.431913,-0.485242,0.0
11,-1.247767,1,1,0,1,1,1,1,0,0,...,0,1,1,1,-0.5885,-0.386516,-0.352165,-0.185567,-0.590576,0.0


In [30]:
from sklearn.model_selection import train_test_split

y = df_balanced['DEMENTIA']  
X = df_balanced.drop(columns=['DEMENTIA','OXYGEN_SATURATION','DIASTOLIC_BP','SYSTOLIC_BP']) 

In [56]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score, LeaveOneOut
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.pipeline import Pipeline

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

k_best = SelectKBest(score_func=f_classif, k=10)  

rf = RandomForestClassifier(random_state=42)

rf_param_grid = {
    'rf__n_estimators': [30, 35, 40, 50],  
    'rf__max_depth': [None, 10, 20],  
    'rf__min_samples_split': [2, 5], 
    'rf__min_samples_leaf': [1, 2, 4], 
    'rf__max_features': ['sqrt', 'log2'],  
    'rf__bootstrap': [False],  
    'rf__max_samples': [None],  
}

pipeline = Pipeline([
    ('feature_selection', k_best),
    ('rf', rf)
])

rf_grid_search = GridSearchCV(pipeline, rf_param_grid, cv=cv, scoring='roc_auc', n_jobs=-1)
rf_grid_search.fit(X, y)

print("Best parameters found: ", rf_grid_search.best_params_)

rf_auc = cross_val_score(rf_grid_search.best_estimator_, X, y, cv=cv, scoring='roc_auc').mean()

print(f"Random Forest AUC: {rf_auc}")


Best parameters found:  {'rf__bootstrap': False, 'rf__max_depth': None, 'rf__max_features': 'sqrt', 'rf__max_samples': None, 'rf__min_samples_leaf': 4, 'rf__min_samples_split': 2, 'rf__n_estimators': 35}
Random Forest AUC: 0.6126666666666667


  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw


In [None]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.pipeline import Pipeline

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

k_best = SelectKBest(score_func=f_classif, k=10)

xgb_model = xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')

xgb_param_grid = {
    'xgb__n_estimators': [3, 5, 15, 30],
    'xgb__max_depth': [3, 6, 10, 20],
    'xgb__learning_rate': [0.01, 0.05, 0.1, 0.2],
    'xgb__subsample': [0.6, 0.8, 1.0],
    'xgb__colsample_bytree': [0.6, 0.8, 1.0],
    'xgb__gamma': [0, 0.1, 0.2],
}

pipeline = Pipeline([
    ('feature_selection', k_best),
    ('xgb', xgb_model)
])

xgb_grid_search = GridSearchCV(pipeline, xgb_param_grid, cv=cv, scoring='roc_auc', n_jobs=-1)
xgb_grid_search.fit(X, y)

print("Best parameters found: ", xgb_grid_search.best_params_)

xgb_auc = cross_val_score(xgb_grid_search.best_estimator_, X, y, cv=cv, scoring='roc_auc').mean()

print(f"XGBoost AUC: {xgb_auc}")


  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw


Best parameters found:  {'xgb__colsample_bytree': 1.0, 'xgb__gamma': 0, 'xgb__learning_rate': 0.2, 'xgb__max_depth': 3, 'xgb__n_estimators': 30, 'xgb__subsample': 1.0}
XGBoost AUC: 0.7141666666666666


  f = msb / msw
  f = msb / msw


In [64]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.pipeline import Pipeline

# Definir validación cruzada
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Selección de características
k_best = SelectKBest(score_func=f_classif, k=10)

# Definir el modelo KNN
knn_model = KNeighborsClassifier()

# Definir el grid de parámetros
knn_param_grid = {
    'knn__n_neighbors': [10, 15, 20, 25, 30],  
    'knn__weights': ['uniform', 'distance'], 
    'knn__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'], 
    'knn__leaf_size': [20, 30, 40],
    'knn__p': [1, 2], 
}

# Crear pipeline con selección de características y el clasificador KNN
pipeline = Pipeline([
    ('feature_selection', k_best),
    ('knn', knn_model)
])

# Realizar búsqueda de hiperparámetros con GridSearchCV
knn_grid_search = GridSearchCV(pipeline, knn_param_grid, cv=cv, scoring='roc_auc', n_jobs=-1)
knn_grid_search.fit(X, y)

# Mostrar los mejores parámetros encontrados
print("Best parameters found: ", knn_grid_search.best_params_)

# Calcular el AUC con validación cruzada usando el mejor modelo
knn_auc = cross_val_score(knn_grid_search.best_estimator_, X, y, cv=cv, scoring='roc_auc').mean()

# Imprimir el AUC promedio
print(f"KNN AUC: {knn_auc}")


Best parameters found:  {'knn__algorithm': 'auto', 'knn__leaf_size': 20, 'knn__n_neighbors': 10, 'knn__p': 1, 'knn__weights': 'distance'}
KNN AUC: 0.6486666666666667


  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw


In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.pipeline import Pipeline

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

nb_model = GaussianNB()


nb_param_grid = {
    'nb__var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5]  
}

pipeline = Pipeline([
    ('nb', nb_model)
])

nb_grid_search = GridSearchCV(pipeline, nb_param_grid, cv=cv, scoring='roc_auc', n_jobs=-1)
nb_grid_search.fit(X, y)

print("Best parameters found: ", nb_grid_search.best_params_)

nb_auc = cross_val_score(nb_grid_search.best_estimator_, X, y, cv=cv, scoring='roc_auc').mean()

print(f"Naive Bayes AUC: {nb_auc}")


Best parameters found:  {'nb__var_smoothing': 1e-06}
Naive Bayes AUC: 0.5413333333333333


In [68]:
from sklearn.neural_network import MLPClassifier

mlp_model = MLPClassifier(max_iter=1000, random_state=42)

mlp_param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 100)],  
    'activation': ['relu', 'tanh', 'logistic'],
    'solver': ['adam', 'sgd'], 
    'alpha': [0.0001, 0.001, 0.01],  
    'learning_rate': ['constant', 'invscaling', 'adaptive'], 
    'learning_rate_init': [0.001, 0.01, 0.1],  
}

mlp_grid_search = GridSearchCV(mlp_model, mlp_param_grid, cv=cv, scoring='roc_auc', n_jobs=-1)
mlp_grid_search.fit(X, y)
print("Best parameters found: ", mlp_grid_search.best_params_)
mlp_auc = cross_val_score(mlp_grid_search.best_estimator_, X, y, cv=cv, scoring='roc_auc').mean()
print(f"Neural Network AUC: {mlp_auc}")

Best parameters found:  {'activation': 'relu', 'alpha': 0.01, 'hidden_layer_sizes': (100, 100), 'learning_rate': 'constant', 'learning_rate_init': 0.1, 'solver': 'adam'}
Neural Network AUC: 0.613


In [73]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

svm_model = SVC(random_state=42)

svm_param_grid = {
    'svm__C': [0.1, 1, 10, 100], 
    'svm__kernel': ['linear', 'rbf', 'poly', 'sigmoid'],  
    'svm__degree': [2, 3, 4], 
    'svm__gamma': ['scale', 'auto', 0.1, 1, 10],
    'svm__coef0': [0.0, 0.1, 0.5],  
    'svm__shrinking': [True, False]
}


pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Estandarizar las características
    ('select_k_best', SelectKBest(f_classif, k=10)),  # Selección de las 10 mejores características
    ('svm', svm_model)  # Modelo SVM
])

svm_grid_search = GridSearchCV(pipeline, svm_param_grid, cv=cv, scoring='roc_auc', n_jobs=-1)
svm_grid_search.fit(X, y)

print("Best parameters found: ", svm_grid_search.best_params_)

svm_auc = cross_val_score(svm_grid_search.best_estimator_, X, y, cv=cv, scoring='roc_auc').mean()

print(f"SVM AUC: {svm_auc}")


Best parameters found:  {'svm__C': 0.1, 'svm__coef0': 0.0, 'svm__degree': 2, 'svm__gamma': 10, 'svm__kernel': 'sigmoid', 'svm__shrinking': True}
SVM AUC: 0.7028095238095238


  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
