# Aproximación clásica

In [2]:
import json
from pathlib import Path

import numpy as np
import pandas as pd

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB

import joblib

In [3]:
DATASET_PATH = Path("../data/processed/dataset.csv")
df = pd.read_csv(DATASET_PATH)

# Asegurar tipos
df["journal_id"] = df["journal_id"].astype(int)
df["text"] = df["text"].fillna("").astype(str)

df.shape, df["journal_id"].value_counts().sort_index()

((17468, 9),
 journal_id
 1    1083
 2    2331
 3    9589
 5    4465
 Name: count, dtype: int64)

In [4]:
X = df["text"].values
y = df["journal_id"].values

X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(
    X, y, np.arange(len(df)),
    test_size=0.2,
    random_state=42,
    stratify=y
)

len(X_train), len(X_test), np.unique(y_train, return_counts=True)

(13974, 3494, (array([1, 2, 3, 5]), array([ 866, 1865, 7671, 3572])))

In [5]:
# Ensure punkt and punkt_tab tokenizers are available; download if missing
for pkg, res in (("punkt", "tokenizers/punkt"), ("punkt_tab", "tokenizers/punkt_tab")):
    try:
        nltk.data.find(res)
    except LookupError:
        nltk.download(pkg)

nltk.download("stopwords")

stop = set(stopwords.words("english"))

def nltk_analyzer(text):
    toks = word_tokenize(text.lower())
    toks = [t for t in toks if t.isalpha()]      # quita puntuación/números
    toks = [t for t in toks if t not in stop]    # stopwords
    return toks


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\imano\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
tfidf = TfidfVectorizer(
    # lowercase=True,
    # stop_words="english",     # puedes cambiarlo o quitarlo; documéntalo en memoria
    analyzer=nltk_analyzer,
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.95
)

models = {
    "logreg": LogisticRegression(max_iter=2000, n_jobs=None),
    "linear_svm": LinearSVC(),
    "mnb": MultinomialNB(),
}

pipelines = {
    name: Pipeline([("tfidf", tfidf), ("clf", clf)])
    for name, clf in models.items()
}
pipelines
for name, pipeline in pipelines.items():
    print(f"Training {name}...")
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    print(f"Results for {name}:")
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))



Training logreg...
Results for logreg:
              precision    recall  f1-score   support

           1       0.98      0.88      0.93       217
           2       0.75      0.45      0.56       466
           3       0.80      0.90      0.85      1918
           5       0.67      0.65      0.66       893

    accuracy                           0.77      3494
   macro avg       0.80      0.72      0.75      3494
weighted avg       0.77      0.77      0.77      3494

[[ 192    0   23    2]
 [   0  211  136  119]
 [   3   28 1723  164]
 [   0   43  273  577]]
Training linear_svm...




Results for linear_svm:
              precision    recall  f1-score   support

           1       0.96      0.95      0.96       217
           2       0.65      0.46      0.54       466
           3       0.81      0.87      0.84      1918
           5       0.65      0.64      0.64       893

    accuracy                           0.76      3494
   macro avg       0.77      0.73      0.74      3494
weighted avg       0.76      0.76      0.76      3494

[[ 206    0    9    2]
 [   0  215  131  120]
 [   7   52 1666  193]
 [   1   62  255  575]]
Training mnb...




Results for mnb:
              precision    recall  f1-score   support

           1       1.00      0.55      0.71       217
           2       0.97      0.18      0.31       466
           3       0.71      0.91      0.80      1918
           5       0.63      0.59      0.61       893

    accuracy                           0.71      3494
   macro avg       0.83      0.56      0.61      3494
weighted avg       0.74      0.71      0.68      3494

[[ 120    0   96    1]
 [   0   86  245  135]
 [   0    2 1738  178]
 [   0    1  362  530]]


In [7]:
results = {}

for name, pipe in pipelines.items():
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    rep = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
    cm = confusion_matrix(y_test, y_pred)

    results[name] = {
        "pipeline": pipe,
        "report": rep,
        "cm": cm
    }

    print("\n===", name, "===")
    print(classification_report(y_test, y_pred, zero_division=0))
    print("Confusion matrix:\n", cm)




=== logreg ===
              precision    recall  f1-score   support

           1       0.98      0.88      0.93       217
           2       0.75      0.45      0.56       466
           3       0.80      0.90      0.85      1918
           5       0.67      0.65      0.66       893

    accuracy                           0.77      3494
   macro avg       0.80      0.72      0.75      3494
weighted avg       0.77      0.77      0.77      3494

Confusion matrix:
 [[ 192    0   23    2]
 [   0  211  136  119]
 [   3   28 1723  164]
 [   0   43  273  577]]





=== linear_svm ===
              precision    recall  f1-score   support

           1       0.96      0.95      0.96       217
           2       0.65      0.46      0.54       466
           3       0.81      0.87      0.84      1918
           5       0.65      0.64      0.64       893

    accuracy                           0.76      3494
   macro avg       0.77      0.73      0.74      3494
weighted avg       0.76      0.76      0.76      3494

Confusion matrix:
 [[ 206    0    9    2]
 [   0  215  131  120]
 [   7   52 1666  193]
 [   1   62  255  575]]





=== mnb ===
              precision    recall  f1-score   support

           1       1.00      0.55      0.71       217
           2       0.97      0.18      0.31       466
           3       0.71      0.91      0.80      1918
           5       0.63      0.59      0.61       893

    accuracy                           0.71      3494
   macro avg       0.83      0.56      0.61      3494
weighted avg       0.74      0.71      0.68      3494

Confusion matrix:
 [[ 120    0   96    1]
 [   0   86  245  135]
 [   0    2 1738  178]
 [   0    1  362  530]]


In [8]:
def macro_f1(res):
    return res["report"]["macro avg"]["f1-score"]

best_name = max(results.keys(), key=lambda n: macro_f1(results[n]))
best = results[best_name]["pipeline"]

best_name, macro_f1(results[best_name])

('logreg', 0.749954807790024)

In [9]:
MODELS_DIR = Path("../models/sklearn")
MODELS_DIR.mkdir(parents=True, exist_ok=True)

joblib.dump(best, MODELS_DIR / f"best_model_{best_name}.joblib")
print("Saved:", MODELS_DIR / f"best_model_{best_name}.joblib")

Saved: ..\models\sklearn\best_model_logreg.joblib


In [10]:
OUT_DIR = Path("../reports/sklearn")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Guardar métricas (JSON)
metrics = {
    "best_model": best_name,
    "all_models_macro_f1": {k: macro_f1(v) for k, v in results.items()},
}
with open(OUT_DIR / "metrics_summary.json", "w", encoding="utf-8") as f:
    json.dump(metrics, f, ensure_ascii=False, indent=2)

# Errores del mejor modelo
y_pred_best = best.predict(X_test)

errors = df.iloc[idx_test].copy()
errors["y_true"] = y_test
errors["y_pred"] = y_pred_best
errors = errors[errors["y_true"] != errors["y_pred"]]

errors.to_csv(OUT_DIR / "errors_best_model.csv", index=False)
errors.shape, (OUT_DIR / "errors_best_model.csv")

((791, 11), WindowsPath('../reports/sklearn/errors_best_model.csv'))