In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
import pandas as pd

In [22]:
# Load the training split first: raw text comes from the "Title" column and
# the class labels from the "label" column.
X_train = pd.read_csv("../data/splits/X_train.csv")["Title"]
y_train = pd.read_csv("../data/splits/y_train.csv")["label"]

# Fit the label encoder on the training labels that were just loaded.
# This cell used to fit on `y`, a name that no visible cell of the notebook
# defines, so it raised NameError on a fresh kernel (Restart & Run All) and
# only worked when `y` happened to be left over from an earlier session.
# NOTE(review): the training cell fits the models on the raw `y_train`
# labels, not on `y_enc`; `y_enc` is kept so the name stays available.
le = LabelEncoder()
y_enc = le.fit_transform(y_train)

# Choose vectorizer: CountVectorizer or TfidfVectorizer
# Unigrams and bigrams, vocabulary capped at 5000 features.
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))

In [None]:
import os
import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from imblearn.over_sampling import RandomOverSampler

# --- Output locations --------------------------------------------------
VECTORIZER_DIR = "../data/vectorizers"
MODEL_DIR = "../data/models"

# Make sure both artifact directories exist; already-present ones are fine.
for artifact_dir in (VECTORIZER_DIR, MODEL_DIR):
    os.makedirs(artifact_dir, exist_ok=True)

# --- Text features (vectorizer is fit on the training split only) -------
X_train_vec = vectorizer.fit_transform(X_train)

# Persist the fitted vectorizer next to the other artifacts.
vectorizer_path = f"{VECTORIZER_DIR}/text_vectorizer.joblib"
joblib.dump(vectorizer, vectorizer_path)

# --- Oversampling (training split only) ---------------------------------
# RandomOverSampler is seeded so the resampled training set is repeatable.
ros = RandomOverSampler(random_state=42)
X_train_res, y_train_res = ros.fit_resample(X_train_vec, y_train)


Training logistic_regression...
Training random_forest...
Training multinomial_nb...


['../data/models/label_encoder.joblib']

In [None]:
# --- Candidate classifiers ----------------------------------------------
# Maps a short model name to an unfitted estimator. The name is reused by
# the training cell for the progress message and the saved file name.
models = dict(
    logistic_regression=LogisticRegression(
        class_weight="balanced",
        max_iter=500,
    ),
    random_forest=RandomForestClassifier(
        class_weight="balanced",
        n_estimators=200,
        random_state=42,
    ),
    multinomial_nb=MultinomialNB(),
)

In [None]:
# --- Fit every candidate on the resampled training data and persist it ---
for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train_res, y_train_res)

    # One file per model, named after its key in `models`.
    model_path = f"{MODEL_DIR}/{name}.joblib"
    joblib.dump(model, model_path)

# Also store the label encoder alongside the models (optional but
# recommended). Kept as the cell's last expression so the written path is
# displayed as the cell output.
joblib.dump(le, f"{MODEL_DIR}/label_encoder.joblib")