In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from itertools import product
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import average_precision_score, roc_auc_score, f1_score, recall_score, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm.auto import tqdm
import math
import mlflow
import os

# Databricks credentials are taken from the environment so that no secret is
# stored in the notebook source or in its saved outputs. Export
# DATABRICKS_TOKEN (a personal access token) before starting the kernel.
# NOTE: any token that was ever committed inside this notebook must be
# considered compromised and be revoked/rotated in the Databricks workspace.
os.environ.setdefault("DATABRICKS_HOST", "https://dbc-e0c2984f-335b.cloud.databricks.com/")
if not os.environ.get("DATABRICKS_TOKEN"):
    raise RuntimeError(
        "DATABRICKS_TOKEN is not set; export it in the environment before running this notebook."
    )

# Point MLflow at the Databricks-hosted tracking server and select the experiment.
mlflow.set_tracking_uri("databricks")
mlflow.set_experiment("/Desarrollo_de_Soluciones/Combined_Datasets")

<Experiment: artifact_location='dbfs:/databricks/mlflow-tracking/510210994784371', creation_time=1758028589440, experiment_id='510210994784371', last_update_time=1758028713501, lifecycle_stage='active', name='/Desarrollo_de_Soluciones/Combined_Datasets', tags={'mlflow.experiment.sourceName': '/Desarrollo_de_Soluciones/Combined_Datasets',
 'mlflow.experimentKind': 'custom_model_development',
 'mlflow.experimentType': 'MLFLOW_EXPERIMENT',
 'mlflow.ownerEmail': 'j.rico566@uniandes.edu.co',
 'mlflow.ownerId': '75126365786967'}>

In [2]:
# Load the cleaned train/test/validation CSVs and the Cochrane sample.

data_training = pd.read_csv("../data/cleaned_train_dataset.csv")
data_test = pd.read_csv("../data/cleaned_test_dataset.csv")
data_validation = pd.read_csv("../data/cleaned_val_dataset.csv")
data_coachrane = pd.read_csv("../data/cochrane_sample_large.csv")

# Cochrane labels are text: normalise whitespace and case, then encode them as
# plain -> 0, technical -> 1 and store the result as int8.
label_codes = {"plain": 0, "technical": 1}
normalised_labels = data_coachrane["label"].str.strip().str.lower()
data_coachrane["label"] = normalised_labels.map(label_codes).astype("int8")

In [3]:
# Stack all four sources into one frame, keeping only the text/label columns.
shared_cols = ["text", "label"]
frames = [
    source[shared_cols]
    for source in (data_training, data_validation, data_test, data_coachrane)
]

# Drop rows missing either column, then make sure the label is an integer.
data_all = (
    pd.concat(frames, ignore_index=True)
    .dropna(subset=shared_cols)
    .assign(label=lambda combined: combined["label"].astype(int))
)

# Shuffled 80/20 split, stratified on the label and seeded for repeatability.
train_df, val_df = train_test_split(
    data_all,
    test_size=0.2,
    shuffle=True,
    random_state=42,
    stratify=data_all["label"],
)

In [4]:
# Hyperparameters for the vectorizer and for each model.

# Single seed shared by every estimator that draws random numbers, so that the
# metrics logged for a run can be reproduced by re-running the notebook.
RANDOM_STATE = 42

# TF-IDF: unigrams + bigrams, terms must appear in at least 1% of documents,
# vocabulary capped at 10000 entries.
tfidf_params = {
    "ngram_range": (1, 2),
    "min_df": 0.01,
    "max_features": 10000,
}

logreg_params = {
    "C": 1.0,
    "solver": "liblinear",
    # liblinear shuffles the data internally; fix the seed.
    "random_state": RANDOM_STATE,
}

xgb_params = {
    "learning_rate": 0.1,
    "n_estimators": 400,
    "max_depth": 4,
    "random_state": RANDOM_STATE,
}

nb_params = {
    "alpha": 0.1,
}

svm_params = {
    "kernel": "linear",
    "C": 1.0,
    # probability=True fits an internal cross-validated calibration whose
    # shuffling is random; without a seed predict_proba varies between fits.
    "probability": True,
    "random_state": RANDOM_STATE,
}


In [5]:
# Turn the raw text into TF-IDF vectors. The vocabulary is fitted on the
# training split only and then reused unchanged for the validation split.

vec = TfidfVectorizer(**tfidf_params)

train_texts = train_df["text"].astype(str)
val_texts = val_df["text"].astype(str)
X_train = vec.fit_transform(train_texts)
X_val = vec.transform(val_texts)

y_train, y_val = train_df["label"].values, val_df["label"].values

In [6]:
# Report how many training rows were vectorised and how many columns each has.
n_samples, n_dims = X_train.shape
print(f"tamanio de la data vectorizada: {n_samples} muestras de {n_dims} dimensiones")

tamanio de la data vectorizada: 1862 muestras de 2693 dimensiones


In [7]:
# Instantiate the four candidate classifiers from their hyperparameter dicts.

svm = SVC(**svm_params)
naive_bayes = MultinomialNB(alpha=nb_params["alpha"])
xgboost = XGBClassifier(objective="binary:logistic", **xgb_params)
logreg = LogisticRegression(**logreg_params)

# Order in which the models are trained and reported.
modelos = [logreg, xgboost, naive_bayes, svm]

In [8]:
def get_scores_and_preds(model, X):
    """Return continuous scores and hard 0/1 predictions for ``X``.

    If ``model`` exposes ``predict_proba``, the score is the second column of
    its output and the prediction is 1 where that score is >= 0.5. Otherwise
    the score is ``model.decision_function(X)`` and the prediction is 1 where
    it is >= 0.

    Returns:
        tuple: ``(scores, predictions)``; ``predictions`` is the thresholded
        score array cast to ``int``.
    """
    if not hasattr(model, "predict_proba"):
        # Margin-based models: the decision boundary sits at zero.
        margins = model.decision_function(X)
        return margins, (margins >= 0).astype(int)

    # Probabilistic models: column 1 is taken as the positive-class score.
    positive_scores = model.predict_proba(X)[:, 1]
    return positive_scores, (positive_scores >= 0.5).astype(int)

In [9]:
# Parameters that are the same for every run: split sizes and TF-IDF settings.
shared_run_params = {
    "n_train": int(X_train.shape[0]),
    "n_val": int(X_val.shape[0]),
    "vocab_size": int(X_train.shape[1]),
    "tfidf_ngram_range": str(tfidf_params.get("ngram_range")),
    "tfidf_min_df": tfidf_params.get("min_df"),
    "tfidf_max_features": tfidf_params.get("max_features"),
}

# Metrics computed from the continuous scores vs. from the hard predictions.
score_based_metrics = (
    ("val_pr_auc", average_precision_score),
    ("val_roc_auc", roc_auc_score),
)
label_based_metrics = (
    ("val_f1", f1_score),
    ("val_recall", recall_score),
    ("val_accuracy", accuracy_score),
)

pbar = tqdm(modelos, desc="Combined eval", leave=True)
for model in pbar:
    # One MLflow run per model, named after the estimator class.
    run_name = model.__class__.__name__
    pbar.set_description(run_name)

    with mlflow.start_run(run_name=run_name):
        mlflow.set_tag("dataset", "combined_old+cochrane")
        for param_name, param_value in shared_run_params.items():
            mlflow.log_param(param_name, param_value)

        # Train
        model.fit(X_train, y_train)

        # Validate
        s_val, yhat_val = get_scores_and_preds(model, X_val)
        metrics_val = {
            metric_name: float(metric_fn(y_val, s_val))
            for metric_name, metric_fn in score_based_metrics
        }
        metrics_val.update(
            (metric_name, float(metric_fn(y_val, yhat_val)))
            for metric_name, metric_fn in label_based_metrics
        )
        mlflow.log_metrics(metrics_val)

pbar.close()

Combined eval:   0%|          | 0/4 [00:00<?, ?it/s]

🏃 View run LogisticRegression at: https://dbc-e0c2984f-335b.cloud.databricks.com/ml/experiments/510210994784371/runs/af0ede2ce46e40d397db9022f7164c43
🧪 View experiment at: https://dbc-e0c2984f-335b.cloud.databricks.com/ml/experiments/510210994784371
🏃 View run XGBClassifier at: https://dbc-e0c2984f-335b.cloud.databricks.com/ml/experiments/510210994784371/runs/51ad8d75b7e64e4c8a079506d37c3982
🧪 View experiment at: https://dbc-e0c2984f-335b.cloud.databricks.com/ml/experiments/510210994784371
🏃 View run MultinomialNB at: https://dbc-e0c2984f-335b.cloud.databricks.com/ml/experiments/510210994784371/runs/c442a56783924a17a1099ffd9f92b384
🧪 View experiment at: https://dbc-e0c2984f-335b.cloud.databricks.com/ml/experiments/510210994784371
🏃 View run SVC at: https://dbc-e0c2984f-335b.cloud.databricks.com/ml/experiments/510210994784371/runs/70bca014085c46419abdf05170b6e7da
🧪 View experiment at: https://dbc-e0c2984f-335b.cloud.databricks.com/ml/experiments/510210994784371


In [10]:
# End-of-experiment marker printed after the training/logging cell.
print("Experimento Concluido con Exito!")

Experimento Concluido con Exito!


In [25]:
#texto= "The initial symptoms were similar to other viral diseases that are still extant, such as influenza and the common cold: fever of at least 38.3 °C (101 °F), muscle pain, malaise, headache and fatigue. As the digestive tract was commonly involved, nausea, vomiting, and backache often occurred. The early prodromal stage usually lasted 2–4 days. By days 12–15, the first visible lesions – small reddish spots called enanthem – appeared on mucous membranes of the mouth, tongue, palate, and throat, and the temperature fell to near-normal. These lesions rapidly enlarged and ruptured, releasing large amounts of virus into the saliva."
# Sample English passage that the next cell vectorises and classifies with every model.
texto = "In contrast to earlier views on BPD, this condition can remit, and symptoms can be reduced and managed. Nevertheless, specific symptoms such as fear of abandonment, impulsivity, intense anger, and an unstable self-image may persist. Individuals with BPD may also continue to experience impairments in social and occupational functioning and may have a need for ongoing treatment. Rates of suicide attempts and episodes of self-harm also decline over time, but they continue to occur more often than in individuals without BPD. Furthermore, in longitudinal studies, BPD is associated with increases in deaths due to suicide as well as with all-cause mortality. Thus, the lifetime burden and psychosocial impairment associated with BPD can be substantial because it typically has an onset in adolescence or early adulthood and can persist for many years. In addition, individuals with BPD experience increases in health care costs related to BPD and to other physical conditions."

In [26]:
# Vectorise the sample text with the already-fitted TF-IDF vocabulary.
vector = vec.transform([texto.lower()])

# Collect one prediction array per model before printing anything.
preds = []
for clf in modelos:
    preds.append(clf.predict(vector))

# Numeric class -> human-readable name.
inv_map = {0: "plain", 1: "technical"}
for clf, pred in zip(modelos, preds):
    label = int(pred[0])  # single-sample prediction; pred.item() is an alternative
    model_name = type(clf).__name__
    print(f"segun {model_name} el texto es {inv_map[label]}")

segun LogisticRegression el texto es technical
segun XGBClassifier el texto es technical
segun MultinomialNB el texto es technical
segun SVC el texto es technical


In [11]:
from IPython.display import Markdown

def export_code_cells():
    """Return the session's executed cell sources as one Markdown code block.

    Reads the ``In`` list from the IPython user namespace, skips entries that
    are empty or whitespace-only, joins the rest with a blank line between
    them, and wraps the result in a fenced ``python`` block.

    Every executed cell is included verbatim, so anything typed into a cell
    (including credentials) ends up in the exported text.

    Returns:
        IPython.display.Markdown: the rendered code listing.
    """
    from IPython import get_ipython

    history = get_ipython().user_ns['In']
    non_blank_sources = [source for source in history if source.strip()]
    code = '\n\n'.join(non_blank_sources)
    return Markdown(f'```python\n{code}\n```')

#export_code_cells()