<a href="https://colab.research.google.com/github/jeramirez169/DataScience_SGS_Classification/blob/main/models/02_classic_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Clonar tu repositorio desde GitHub
!git clone https://github.com/jeramirez169/DataScience_SGS_Classification.git
%cd DataScience_SGS_Classification

# Instalar dependencias necesarias
!pip install -q pandas numpy scikit-learn unidecode spacy imbalanced-learn transformers
!python -m spacy download es_core_news_lg


Cloning into 'DataScience_SGS_Classification'...
remote: Enumerating objects: 82, done.[K
remote: Counting objects: 100% (82/82), done.[K
remote: Compressing objects: 100% (78/78), done.[K
remote: Total 82 (delta 36), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (82/82), 23.95 MiB | 22.29 MiB/s, done.
Resolving deltas: 100% (36/36), done.
/content/DataScience_SGS_Classification/DataScience_SGS_Classification/DataScience_SGS_Classification
Collecting es-core-news-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/es_core_news_lg-3.8.0/es_core_news_lg-3.8.0-py3-none-any.whl (568.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m568.0/568.0 MB[0m [31m837.7 kB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('es_core_news_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to re

In [None]:
# Extract the cleaned dataset next to its archive (data/Dataset_SGS_clean.csv).
#
# Paths are relative to the project root and the working directory is never
# changed. With the former `%cd data` / `!unzip` / `%cd ..` sequence, a failed
# first `%cd` left the kernel one level above the repository, and a re-run
# stopped at unzip's interactive "replace?" prompt. ZipFile.extractall simply
# overwrites existing files, so this cell is idempotent.
import zipfile
from pathlib import Path

DATA_DIR = Path("data")
ZIP_PATH = DATA_DIR / "Dataset_SGS_clean.zip"

with zipfile.ZipFile(ZIP_PATH) as archive:
    archive.extractall(DATA_DIR)
    print("Archivos extraídos:", ", ".join(archive.namelist()))


/content/DataScience_SGS_Classification/DataScience_SGS_Classification/DataScience_SGS_Classification/data
Archive:  Dataset_SGS_clean.zip
  inflating: ./Dataset_SGS_clean.csv  
/content/DataScience_SGS_Classification/DataScience_SGS_Classification/DataScience_SGS_Classification


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, cross_validate
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from xgboost import XGBClassifier
import joblib

# Set seaborn's global plotting style to "whitegrid".
sns.set(style="whitegrid")

In [None]:
# Load the cleaned dataset, create a stratified train/test split and build the
# TF-IDF features used by the models in the following cells.
TEXT_COL = "texto_truncado_lematizado"   # predictor: lemmatised text column
TARGET_COL = "Oficina"                   # target: class label column
TEST_SIZE = 0.20
SEED = 42

ruta = "data/Dataset_SGS_clean.csv"
df = pd.read_csv(ruta, encoding="utf-8")

print("Columnas del dataset:", df.columns.tolist())
print("Tamaño del dataset:", df.shape)

# Fail early, with an explicit message, when an expected column is absent.
missing_cols = {TEXT_COL, TARGET_COL} - set(df.columns)
if missing_cols:
    raise KeyError(f"Faltan columnas requeridas en {ruta}: {sorted(missing_cols)}")

# read_csv turns empty cells into NaN. TfidfVectorizer rejects NaN documents,
# and a NaN label breaks both `stratify` and `sorted(y.unique())`, so rows
# missing either field are dropped (and reported). `df` itself is left intact.
df_model = df.dropna(subset=[TEXT_COL, TARGET_COL])
n_dropped = len(df) - len(df_model)
if n_dropped:
    print(f"Filas descartadas por texto/etiqueta faltante: {n_dropped}")

# Predictor and target
X = df_model[TEXT_COL].astype(str)
y = df_model[TARGET_COL]

# Stratified train/test split: class proportions are kept in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=TEST_SIZE,
    stratify=y,
    random_state=SEED
)

print("Tamaño entrenamiento:", len(X_train))
print("Tamaño prueba:", len(X_test))

# Shared TF-IDF vectoriser: unigrams + bigrams, ignoring terms that appear in
# fewer than 3 documents or in more than 95% of them, with sublinear tf scaling.
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=3,
    max_df=0.95,
    sublinear_tf=True
)

# Fit on the training texts only; the test texts are merely transformed.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

print("Dimensiones TF-IDF (train):", X_train_tfidf.shape)

# Fixed class order used for the confusion-matrix axes.
labels = sorted(y.unique())


# SVM

In [None]:
# Linear SVM: repeated stratified cross-validation on the training split, then
# a final fit on the whole training split evaluated on the held-out test split.
from sklearn.base import clone
from sklearn.pipeline import Pipeline

svm_model = LinearSVC(
    C=1.0,
    loss="squared_hinge",
    class_weight="balanced",   # re-weight classes inversely to their frequency
    max_iter=5000,
    random_state=42
)

# 5 folds x 3 repetitions; also reused by the XGBoost cell.
rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=42)

# Cross-validation
# TF-IDF is placed inside the cross-validated pipeline and fed raw text, so the
# vocabulary and IDF weights are re-fitted on each fold's training part only.
# Cross-validating on the precomputed X_train_tfidf would let every validation
# fold influence its own features (data leakage). Clones keep the already
# fitted `tfidf` and `svm_model` objects untouched.
svm_cv_pipeline = Pipeline([
    ("tfidf", clone(tfidf)),
    ("clf", clone(svm_model)),
])

scores_svm = cross_validate(
    svm_cv_pipeline,
    X_train,
    y_train,
    cv=rskf,
    scoring=["accuracy", "precision_macro", "recall_macro", "f1_macro"],
    n_jobs=-1
)

print("\nResultados de Cross-Validation SVM (5x3):")
for metric, values in scores_svm.items():
    # cross_validate also returns fit/score timings; report only test metrics.
    if "test" in metric:
        print(f"{metric}: {values.mean():.4f} ± {values.std():.4f}")

# Final training on the full training split
svm_model.fit(X_train_tfidf, y_train)
y_pred_svm = svm_model.predict(X_test_tfidf)

print("\nReporte de clasificación SVM:")
print(classification_report(y_test, y_pred_svm))

# Confusion matrix (rows: true class, columns: predicted class)
cm_svm = confusion_matrix(y_test, y_pred_svm, labels=labels)

fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(cm_svm, annot=True, fmt="d", cmap="Blues",
            xticklabels=labels, yticklabels=labels, ax=ax)
ax.set_title("Matriz de Confusión – SVM")
ax.set_xlabel("Etiqueta predicha")
ax.set_ylabel("Etiqueta real")
ax.tick_params(axis="x", labelrotation=45)
ax.tick_params(axis="y", labelrotation=0)
fig.tight_layout()
plt.show()

In [None]:
# XGBoost: label encoding, repeated stratified cross-validation on the training
# split, final fit, and evaluation on the held-out test split.
from sklearn.base import clone
from sklearn.pipeline import Pipeline


def plot_confusion_heatmap(matrix, class_labels, title, fmt="d", cmap="Greens"):
    """Draw an annotated confusion-matrix heatmap.

    Parameters
    ----------
    matrix : 2-D array with true classes on rows and predicted classes on columns.
    class_labels : tick labels, in the same order as the matrix rows/columns.
    title : figure title.
    fmt : annotation format ("d" for counts, ".2f" for proportions).
    cmap : matplotlib/seaborn colour map name.
    """
    fig, ax = plt.subplots(figsize=(10, 7))
    sns.heatmap(matrix, annot=True, fmt=fmt, cmap=cmap,
                xticklabels=class_labels, yticklabels=class_labels, ax=ax)
    ax.set_title(title)
    ax.set_xlabel("Etiqueta predicha")
    ax.set_ylabel("Etiqueta real")
    ax.tick_params(axis="x", labelrotation=45)
    ax.tick_params(axis="y", labelrotation=0)
    fig.tight_layout()
    plt.show()


# Encoding: the model is trained on integer class codes.
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_test_enc = le.transform(y_test)

print("\nMapeo etiqueta → código:")
# The code of each class is its position in le.classes_.
for code, cls in enumerate(le.classes_):
    print(f"{cls:25s} -> {code}")

# XGBoost model
xgb_model = XGBClassifier(
    objective="multi:softprob",
    num_class=len(le.classes_),
    max_depth=6,
    learning_rate=0.1,
    n_estimators=300,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_alpha=1.0,
    reg_lambda=1.0,
    tree_method="hist",         # CPU
    eval_metric="mlogloss",
    random_state=42
)

# Cross-validation
# TF-IDF is placed inside the cross-validated pipeline and fed raw text, so the
# vocabulary and IDF weights are re-fitted on each fold's training part only.
# Cross-validating on the precomputed X_train_tfidf would let every validation
# fold influence its own features (data leakage). Clones keep the already
# fitted `tfidf` and the `xgb_model` instance untouched.
# NOTE(review): n_jobs=-1 runs folds in parallel while XGBoost may itself be
# multi-threaded; check for CPU oversubscription if this step is slow.
xgb_cv_pipeline = Pipeline([
    ("tfidf", clone(tfidf)),
    ("clf", clone(xgb_model)),
])

scores_xgb = cross_validate(
    xgb_cv_pipeline,
    X_train,
    y_train_enc,
    cv=rskf,
    scoring=["accuracy", "precision_macro", "recall_macro", "f1_macro"],
    n_jobs=-1
)

print("\nResultados de Cross-Validation XGBoost (5x3):")
for metric, values in scores_xgb.items():
    # cross_validate also returns fit/score timings; report only test metrics.
    if "test" in metric:
        print(f"{metric}: {values.mean():.4f} ± {values.std():.4f}")

# Final training on the full training split
print("\nEntrenando XGBoost...")
xgb_model.fit(X_train_tfidf, y_train_enc)

# Predict integer codes, then map them back to the original labels so the
# report and confusion matrix use the same label space as y_test.
y_pred_xgb_enc = xgb_model.predict(X_test_tfidf)
y_pred_xgb = le.inverse_transform(y_pred_xgb_enc)

print("\nReporte de clasificación XGBoost:")
print(classification_report(y_test, y_pred_xgb))

# Confusion matrix (rows: true class, columns: predicted class)
cm_xgb = confusion_matrix(y_test, y_pred_xgb, labels=labels)
plot_confusion_heatmap(cm_xgb, labels, "Matriz de Confusión – XGBoost")

# Row-normalised matrix: each row shows how the samples of one true class were
# distributed across predictions. A class with no samples in y_test has a zero
# row sum; its row is left at 0 instead of producing NaN from 0/0.
row_totals = cm_xgb.sum(axis=1, keepdims=True)
cm_xgb_norm = np.divide(
    cm_xgb.astype(float),
    row_totals,
    out=np.zeros(cm_xgb.shape, dtype=float),
    where=row_totals != 0,
)
plot_confusion_heatmap(cm_xgb_norm, labels,
                       "Matriz de Confusión Normalizada – XGBoost", fmt=".2f")