In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from itertools import product
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import average_precision_score, roc_auc_score, f1_score, recall_score, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm.auto import tqdm
import math
import mlflow
import os

# Databricks connection settings used by the MLflow "databricks" tracking URI.
os.environ["DATABRICKS_HOST"] = "https://dbc-2d843358-2bd3.cloud.databricks.com/"

# SECURITY: the personal access token must never be stored in the notebook.
# It is taken from an already-exported DATABRICKS_TOKEN, and only when that is
# missing is it asked for interactively (the typed value is not echoed).
# NOTE(review): the token that used to be embedded in this cell has been
# exposed in the notebook history and should be revoked in the workspace.
if not os.environ.get("DATABRICKS_TOKEN"):
    from getpass import getpass

    os.environ["DATABRICKS_TOKEN"] = getpass("Databricks token: ")

mlflow.set_tracking_uri("databricks")
# Kept as the last expression so the cell still displays the Experiment object.
mlflow.set_experiment("/Desarrollo_de_Soluciones/Coachrane_Dataset")

  from .autonotebook import tqdm as notebook_tqdm


<Experiment: artifact_location='dbfs:/databricks/mlflow-tracking/1290665620565639', creation_time=1760389737801, experiment_id='1290665620565639', last_update_time=1760389747368, lifecycle_stage='active', name='/Desarrollo_de_Soluciones/Coachrane_Dataset', tags={'mlflow.experiment.sourceName': '/Desarrollo_de_Soluciones/Coachrane_Dataset',
 'mlflow.experimentType': 'MLFLOW_EXPERIMENT',
 'mlflow.ownerEmail': 'maiauniandes@gmail.com',
 'mlflow.ownerId': '78764819635503'}>

In [2]:
# Load the data: the three cleaned splits plus the external Cochrane sample.
# The files are read in the same order as the names on the left-hand side.
data_training, data_test, data_validation, data_coachrane = map(
    pd.read_csv,
    (
        "../data/cleaned_train_dataset.csv",
        "../data/cleaned_test_dataset.csv",
        "../data/cleaned_val_dataset.csv",
        "../data/cochrane_sample_large.csv",
    ),
)

In [3]:
# Show the last five rows of the training split as a quick look at its columns.
data_training.tail(n=5)

Unnamed: 0,text,label
811,higher blood levels of pyridoxal 5'-phosphate ...,0
812,pectin-based films are wrapping that is made f...,0
813,ste20 (sterile 20)/sps-1 related proline/alani...,1
814,development of a human model of hemorrhage has...,1
815,"the local, systemic, and referred causes of fi...",1


In [4]:
# Map the Cochrane labels onto the training schema (plain=0, technical=1).
# Already-encoded values ("0"/"1") are accepted as well, so re-running this
# cell after the column has been converted is a no-op instead of an error from
# the `.str` accessor on an integer column.
label_to_id = {"plain": 0, "technical": 1, "0": 0, "1": 1}
normalized_labels = data_coachrane["label"].astype(str).str.strip().str.lower()
encoded_labels = normalized_labels.map(label_to_id)

# Report the offending values explicitly; otherwise unmapped labels would only
# surface as a NaN-to-integer cast failure in the `astype` below.
unknown_labels = sorted(normalized_labels[encoded_labels.isna()].unique())
if unknown_labels:
    raise ValueError(f"Unexpected Cochrane labels: {unknown_labels}")

data_coachrane["label"] = encoded_labels.astype("int8")

# sanity check
assert set(data_coachrane["label"].unique()) <= {0, 1}
data_coachrane.tail()

Unnamed: 0,text,label
1002,Antenatal abdominal decompression was studied ...,0
1003,Trials compared absorbable stitches with other...,0
1004,Trials of acupuncture and similar therapies fo...,0
1005,Randomised trials compared giving birth at hom...,0
1006,Trials tested antiseptic and antibiotic soluti...,0


In [5]:
# Hyperparameters for the vectorizer and the models.

# One seed shared by every estimator below that accepts `random_state`, so that
# repeated runs of the notebook log the same metrics.
SEED = 42

# TF-IDF: unigrams and bigrams, terms must appear in at least 1% of documents,
# vocabulary capped at 10,000 features.
tfidf_params = {
    "ngram_range": (1, 2),
    "min_df": 0.01,
    "max_features": 10000,
}

logreg_params = {
    "C": 1.0,
    "solver": "liblinear",
    "random_state": SEED,
}

xgb_params = {
    "learning_rate": 0.1,
    "n_estimators": 400,
    "max_depth": 4,
    "random_state": SEED,
}

# MultinomialNB has no random component, so it takes no seed.
nb_params = {
    "alpha": 0.1,
}

svm_params = {
    "kernel": "linear",
    "C": 1.0,
    # Needed so the fitted SVC exposes `predict_proba`.
    "probability": True,
    "random_state": SEED,
}


In [6]:
# Build the TF-IDF feature matrices and the integer label vectors.

tfidf_vectorizer = TfidfVectorizer(**tfidf_params)

# The vectorizer is fitted on the training texts only; every other split is
# transformed with that already-fitted vectorizer.
X_train = tfidf_vectorizer.fit_transform(data_training["text"].astype(str).tolist())
X_test, X_val, X_cochrane = (
    tfidf_vectorizer.transform(frame["text"].astype(str).tolist())
    for frame in (data_test, data_validation, data_coachrane)
)

# Labels as plain integer arrays, one per split.
y_train, y_test, y_val, y_cochrane = (
    frame["label"].astype(int).values
    for frame in (data_training, data_test, data_validation, data_coachrane)
)

In [7]:
# Report the size of the vectorized training matrix (rows = samples, columns = TF-IDF features).
print(f"tamanio de la data vectorizada: {X_train.shape[0]} muestras de {X_train.shape[1]} dimensiones")

tamanio de la data vectorizada: 816 muestras de 4060 dimensiones


In [8]:
# Models: one instance per classifier family, built from the hyperparameter
# dictionaries defined earlier in the notebook.

modelos = [
    LogisticRegression(**logreg_params),
    XGBClassifier(objective="binary:logistic", **xgb_params),
    MultinomialNB(alpha=nb_params["alpha"]),
    SVC(**svm_params),
]

# Individual handles to the same objects held in `modelos`.
logreg, xgboost, naive_bayes, svm = modelos

In [9]:
def get_scores_and_preds(model, X, threshold=None):
    """Return continuous scores and thresholded 0/1 predictions for `X`.

    Probabilities are used when the model exposes `predict_proba`; otherwise
    the output of `decision_function` is used.

    Parameters
    ----------
    model : fitted estimator
        Must provide `predict_proba` or `decision_function`.
    X : feature matrix
        Passed unchanged to the model.
    threshold : float, optional
        Scores greater than or equal to this value are predicted as 1.
        When omitted, 0.5 is used for probabilities and 0 for decision values.

    Returns
    -------
    (scores, preds) : tuple of numpy arrays
        `scores` is column 1 of `predict_proba` or the raw decision values;
        `preds` is the integer array obtained by thresholding `scores`.
    """
    if hasattr(model, "predict_proba"):
        # Column 1 holds the score of the second class.
        scores = np.asarray(model.predict_proba(X))[:, 1]
        default_cutoff = 0.5
    else:
        scores = np.asarray(model.decision_function(X))
        default_cutoff = 0.0

    cutoff = default_cutoff if threshold is None else threshold
    preds = (scores >= cutoff).astype(int)
    return scores, preds

In [10]:
def _evaluate_split(prefix, model, X, y_true):
    """Score `model` on one split and return its metrics keyed `<prefix>_<metric>`."""
    scores, y_pred = get_scores_and_preds(model, X)
    return {
        # Ranking metrics use the continuous scores ...
        f"{prefix}_pr_auc": float(average_precision_score(y_true, scores)),
        f"{prefix}_roc_auc": float(roc_auc_score(y_true, scores)),
        # ... while these use the thresholded 0/1 predictions.
        f"{prefix}_f1": float(f1_score(y_true, y_pred)),
        f"{prefix}_recall": float(recall_score(y_true, y_pred)),
        f"{prefix}_accuracy": float(accuracy_score(y_true, y_pred)),
    }


# Train and evaluate every model, one MLflow run per model.
pbar = tqdm(modelos, desc="Model eval", leave=True)
for model in pbar:
    run_name = model.__class__.__name__
    pbar.set_description(run_name)

    with mlflow.start_run(run_name=run_name):
        mlflow.set_tag("model_name", run_name)

        # Record the configuration next to the metrics so runs can be compared
        # and reproduced. Unset (None) estimator parameters are skipped.
        mlflow.log_params({f"tfidf_{key}": value for key, value in tfidf_params.items()})
        mlflow.log_params(
            {key: value for key, value in model.get_params().items() if value is not None}
        )

        # Fit on the training split ONLY.
        model.fit(X_train, y_train)

        # --- Internal validation (val) ---
        metrics_val = _evaluate_split("val", model, X_val, y_val)
        mlflow.log_metrics(metrics_val)

        # --- External evaluation (Cochrane) ---
        metrics_coc = _evaluate_split("coc", model, X_cochrane, y_cochrane)
        mlflow.log_metrics(metrics_coc)

LogisticRegression:   0%|          | 0/4 [00:00<?, ?it/s]

🏃 View run LogisticRegression at: https://dbc-2d843358-2bd3.cloud.databricks.com/ml/experiments/1290665620565639/runs/d70c7166c8b74f39b876b750bdb0aec2
🧪 View experiment at: https://dbc-2d843358-2bd3.cloud.databricks.com/ml/experiments/1290665620565639


MultinomialNB:  50%|█████     | 2/4 [00:05<00:05,  2.62s/it]     

🏃 View run XGBClassifier at: https://dbc-2d843358-2bd3.cloud.databricks.com/ml/experiments/1290665620565639/runs/00ef680873d0435990bee21e5dcfcc68
🧪 View experiment at: https://dbc-2d843358-2bd3.cloud.databricks.com/ml/experiments/1290665620565639
🏃 View run MultinomialNB at: https://dbc-2d843358-2bd3.cloud.databricks.com/ml/experiments/1290665620565639/runs/34763a1f5a87430fa9b31175734ee52d
🧪 View experiment at: https://dbc-2d843358-2bd3.cloud.databricks.com/ml/experiments/1290665620565639


SVC: 100%|██████████| 4/4 [00:10<00:00,  2.58s/it]          

🏃 View run SVC at: https://dbc-2d843358-2bd3.cloud.databricks.com/ml/experiments/1290665620565639/runs/aa9e67db2f8d40d3b45c1c6d6e19f11e
🧪 View experiment at: https://dbc-2d843358-2bd3.cloud.databricks.com/ml/experiments/1290665620565639





In [11]:
# Completion marker: prints a fixed success message (no checks are performed here).
print("Experimento Concluido con Exito!")

Experimento Concluido con Exito!


In [15]:
from IPython.display import Markdown

def export_code_cells():
    """Return a Markdown object containing the session's non-blank inputs.

    The entries of the `In` list in the IPython user namespace are joined with
    blank lines between them and wrapped in a single ```python fenced block.
    """
    from IPython import get_ipython

    history = get_ipython().user_ns['In']
    # Entries that are empty or whitespace-only are dropped before joining.
    non_blank = filter(str.strip, history)
    code = '\n\n'.join(non_blank)
    return Markdown(f'```python\n{code}\n```')

#export_code_cells()