<a href="https://colab.research.google.com/github/jserrataylor/ed_project/blob/main/CASIS_Risk_Detection_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## CASIS Risk Detection Project

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from imblearn.over_sampling import SMOTE
from scipy.stats import uniform, randint
import joblib  # Importar joblib para guardar los modelos

# Location of the CASIS dataset (CSV hosted on GitHub).
data_url = 'https://raw.githubusercontent.com/jserrataylor/CASIS/main/casis_datasets.csv'

# Columns kept for modelling; 'CASIC' is the column used as the target below.
columns_to_keep = [
    'Tengodificultadesconelsueño',
    'Consideréseriamentelastimaraotrapersona',
    'Sentílanecesidaddereducirelusodebebidasalcóholicasyodrogas',
    'Asistíaconsejeríaopsicoterapiaporasuntosrelacionadosconmisalud',
    'Hetenidoataquesdepánicoepisodiosdeansiedadseveraqueduranalreded',
    'Tengopreocupacionesrelacionadasamialimentacióndietasnosaludable',
    'Hetenidocontactossexualesuotrasexperienciasdeíndolesexualsindes',
    'CASIC'
]

# Load -> select columns -> coerce every column to numeric (unparseable
# values become NaN) -> drop rows containing any NaN -> cast the target to int.
df = (
    pd.read_csv(data_url)[columns_to_keep]
    .apply(pd.to_numeric, errors='coerce')
    .dropna()
    .assign(CASIC=lambda frame: frame['CASIC'].astype(int))
)

# Separate features from the target.
X = df.drop('CASIC', axis=1)
y = df['CASIC']

# Split BEFORE oversampling. SMOTE builds synthetic minority samples by
# interpolating between neighbouring rows; resampling the full dataset first
# would let information from test rows leak into the training set and would
# put synthetic rows into the test set, inflating every reported metric.
# `stratify=y` keeps the original class proportions in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Oversample the training partition only; the test set stays untouched,
# real data. The seed makes the synthetic samples reproducible.
smote = SMOTE(random_state=42)
X_sm, y_sm = smote.fit_resample(X_train, y_train)

# X_sm / y_sm are kept as names for the resampled data; the models below
# train on them through X_train / y_train.
X_train, y_train = X_sm, y_sm

# Candidate classifiers, each paired with the hyper-parameter search space
# handed to RandomizedSearchCV further down.
# Maps display name -> (estimator with default settings, search space).
# Lists are sampled as discrete choices; scipy distributions are sampled
# via their `rvs` method.
modelos = {
    'Logistic Regression': (
        LogisticRegression(),
        dict(
            C=uniform(0.01, 100),
            solver=['liblinear', 'saga'],
            penalty=['l1', 'l2'],
        ),
    ),
    'SVM': (
        SVC(),
        dict(
            C=uniform(0.1, 100),
            kernel=['rbf', 'poly', 'sigmoid'],
            gamma=['scale', 'auto', 0.1, 0.01, 0.001],
            degree=randint(2, 5),
        ),
    ),
    'Random Forest': (
        RandomForestClassifier(),
        dict(
            n_estimators=randint(100, 400),
            max_depth=[10, 20, 30, None],
            min_samples_split=randint(2, 11),
            min_samples_leaf=randint(1, 5),
        ),
    ),
    'KNN': (
        KNeighborsClassifier(),
        dict(
            n_neighbors=randint(3, 11),
            weights=['uniform', 'distance'],
            algorithm=['auto', 'ball_tree', 'kd_tree', 'brute'],
        ),
    ),
}

# Baseline: fit every estimator with its default hyper-parameters and score
# its predictions on the held-out test split.
initial_results = {}
for name, (model, _) in modelos.items():
    # fit() returns the fitted estimator, so prediction can be chained.
    y_pred_initial = model.fit(X_train, y_train).predict(X_test)
    initial_results[name] = {
        label: scorer(y_test, y_pred_initial)
        for label, scorer in (
            ("Accuracy", accuracy_score),
            ("Precision", precision_score),
            ("Recall", recall_score),
            ("F1 Score", f1_score),
        )
    }

# Hyper-parameter tuning: for each estimator run a 5-fold randomized search
# (100 sampled candidates, selected by recall), score the best estimator on
# the test split and persist it to disk.
optimized_results = {}
for name, (model, params) in modelos.items():
    random_search = RandomizedSearchCV(
        estimator=model,
        param_distributions=params,
        n_iter=100,
        cv=5,
        scoring='recall',
        verbose=1,
        random_state=42,
    )
    # fit() returns the search object itself, so the winner can be read off
    # directly.
    best_model = random_search.fit(X_train, y_train).best_estimator_
    y_pred_optimized = best_model.predict(X_test)
    optimized_results[name] = {
        "Best Parameters": random_search.best_params_,
        **{
            label: scorer(y_test, y_pred_optimized)
            for label, scorer in (
                ("Accuracy", accuracy_score),
                ("Precision", precision_score),
                ("Recall", recall_score),
                ("F1 Score", f1_score),
            )
        },
    }
    # Save the tuned model; the file name is derived from the display name.
    joblib.dump(best_model, f'{name}_optimized_model.pkl')

# Print the metrics of every model before and after optimization.
for model_name in modelos:
    print(f"Resultados iniciales para {model_name}:")
    for metric, value in initial_results[model_name].items():
        print(f"{metric}: {value:.4f}")
    # This heading must be an f-string; without the prefix the literal text
    # "{model_name}" is printed instead of the model's name.
    print(f"\nResultados optimizados para {model_name}:")
    for metric, value in optimized_results[model_name].items():
        if metric == "Best Parameters":
            # The best-parameters entry is a dict, not a float, so it cannot
            # take the numeric format spec used for the scores.
            print(f"{metric}: {value}")
        else:
            print(f"{metric}: {value:.4f}")
    print("\n")
