In [40]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline


In [41]:
data = pd.read_csv('data/hospital_joined.csv')

In [42]:
data.head()

Unnamed: 0,country_Azerbaijan,country_Other,country_T.C.,age,sex,station_Fall,station_Spring,station_Summer,station_Winter,fever_temperature,...,aids_hiv,diabetes_mellitus_type_1,diabetes_mellitus_type_2,rheumatologic_disorder,dementia,tuberculosis,smoking,other_risks,pcr_result,previous_positives
0,0.0,0.0,1.0,47.0,1,0.0,0.0,0.0,1.0,38.2,...,0.0,0.0,0.0,0.0,0.0,0,0,0,1,0
1,0.0,0.0,1.0,23.0,1,0.0,0.0,0.0,1.0,37.0,...,0.0,0.0,0.0,0.0,0.0,0,0,0,1,0
2,0.0,0.0,1.0,26.0,1,0.0,0.0,0.0,1.0,38.0,...,0.0,0.0,0.0,0.0,0.0,0,0,0,1,1
3,0.0,0.0,1.0,60.0,1,0.0,0.0,0.0,1.0,36.5,...,0.0,0.0,0.0,0.0,0.0,0,0,0,1,0
4,0.0,0.0,1.0,72.0,1,0.0,0.0,0.0,1.0,36.9,...,0.0,0.0,0.0,0.0,0.0,0,0,0,1,0


In [43]:
x_train, x_test, y_train, y_test = train_test_split(data.drop('pcr_result', axis=1), data['pcr_result'], test_size=0.2, random_state=42)

In [44]:
#validation 
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.1, random_state=42)

# Regresión Logística 

In [45]:
# 1. RFE con Regresión Logística
log_reg = LogisticRegression(max_iter=1000)
rfe = RFE(estimator=log_reg, n_features_to_select=10)
rfe.fit(x_train, y_train)

In [46]:
# Ver las características seleccionadas
selected_features = x_train.columns[rfe.support_]
print("Características seleccionadas con RFE:", selected_features)

Características seleccionadas con RFE: Index(['country_Azerbaijan', 'country_Other', 'country_T.C.', 'station_Fall',
       'history_of_fever', 'shortness_of_breath', 'loss_of_taste',
       'chronic_pulmonary_disease', 'liver_disease', 'smoking'],
      dtype='object')


In [47]:
from sklearn.model_selection import GridSearchCV

# Definir los hiperparámetros para GridSearch
param_grid_lr = {
    'C': [0.1, 1, 10, 100],  # Parámetro de regularización
    'penalty': ['l1','l2'],  #penalización
}

# GridSearchCV para Logistic Regression
grid_search_lr = GridSearchCV(estimator=LogisticRegression(max_iter=1000, solver='liblinear'), param_grid=param_grid_lr, cv=5, scoring='accuracy')
grid_search_lr.fit(x_train[selected_features], y_train)  # Utilizando las características seleccionadas

# Mejor combinación de hiperparámetros
print("Mejores parámetros para Regresión Logística:", grid_search_lr.best_params_)

# Accuracy en el conjunto de validación
print("Accuracy en el conjunto de validación:", grid_search_lr.score(x_val[selected_features], y_val))

# Matriz de confusión en el conjunto de validación
print("Matriz de confusión en el conjunto de validación:")
print(confusion_matrix(y_val, grid_search_lr.predict(x_val[selected_features])))

Mejores parámetros para Regresión Logística: {'C': 10, 'penalty': 'l1'}
Accuracy en el conjunto de validación: 0.8570747975226298
Matriz de confusión en el conjunto de validación:
[[  38  280]
 [  20 1761]]


# Random Forest

In [48]:
# RFE con Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rfe = RFE(estimator=rf, n_features_to_select=10)
rfe.fit(x_train, y_train)

In [49]:
# Ver las características seleccionadas
selected_features = x_train.columns[rfe.support_]
print("Características seleccionadas con RFE:", selected_features)

Características seleccionadas con RFE: Index(['country_T.C.', 'age', 'sex', 'station_Fall', 'fever_temperature',
       'oxygen_saturation', 'history_of_fever', 'sore_throat',
       'fatigue_malaise', 'previous_positives'],
      dtype='object')


In [None]:
# Definir los hiperparámetros para GridSearch
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [30, 50, 70],
    'min_samples_split': [5, 10],
    'min_samples_leaf': [2, 5],
    'max_features': ['sqrt', None]
}

grid_search_rf = GridSearchCV(estimator=RandomForestClassifier(random_state=42), param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search_rf.fit(x_train[selected_features], y_train)  # Utilizando las características seleccionadas

# Mejor combinación de hiperparámetros
print("Mejores parámetros para Random Forest:", grid_search_rf.best_params_)
# Accuracy en el conjunto de validación
print("Accuracy en el conjunto de validación:", grid_search_rf.score(x_val[selected_features], y_val))
# Matriz de confusión en el conjunto de validación
print("Matriz de confusión en el conjunto de validación:")
print(confusion_matrix(y_val, grid_search_rf.predict(x_val[selected_features])))

Mejores parámetros para Random Forest: {'max_depth': 30, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 200}
Accuracy en el conjunto de validación: 0.859456884230586
Matriz de confusión en el conjunto de validación:
[[ 118  200]
 [  95 1686]]
