In [1]:
# -----------------------------------------------------------------------------
# Embedding Vectorization for Justice Dataset
# (Synchronized with Traditional Vectorization)
# -----------------------------------------------------------------------------
# Description:
#   Creates 8 embedding-based vectorization variants (Word2Vec, FastText, GloVe, etc.)
#   for the Justice Dataset. Uses fixed master splits for regression, binary, and multiclass tasks.
#   Structure and outputs are synchronized with traditional_vectorization.ipynb.
# -----------------------------------------------------------------------------

# ---------------------------
# 1. Setup and Imports
# ---------------------------
# This import-and-call runs ahead of the remaining imports in this cell.
# NOTE(review): presumably it creates Artifacts/master_split_indices.joblib when
# the file is missing, so that section 6 can load it - confirm in
# generate_master_split.py.
from generate_master_split import ensure_master_split
ensure_master_split()

import os
import joblib
import pandas as pd

from helpers_text_preprocessing import (
    load_spacy_model,
    enhanced_spacy_preprocessing
)
from helpers_embedding_vectorization import (
    vectorize_embeddings,
    save_results
)

# File holding the fixed split indices that section 6 loads with joblib.
# NOTE: this is a relative path, so it is resolved against the current working
# directory, whereas ARTIFACTS_DIR (section 5) is anchored at BASE_DIR.
MASTER_SPLIT_PATH = "Artifacts/master_split_indices.joblib"

# ---------------------------
# 2. Load Dataset
# ---------------------------
# Read the raw CSV (path is relative to the working directory), then fail fast
# if any column that the later sections rely on is absent.
justice_df = pd.read_csv("../Justice_Dataset/justice.csv")

required_cols = ["facts", "first_party_winner", "issue_area", "term"]
available_cols = set(justice_df.columns)
# Keep the order of required_cols so the error message lists them predictably.
missing = [name for name in required_cols if name not in available_cols]
if missing:
    raise ValueError(f"Missing columns in dataset: {missing}")

print(f"✅ Dataset loaded successfully: {justice_df.shape[0]} rows")

# ---------------------------
# 3. Text Preprocessing
# ---------------------------
# Load the spaCy pipeline once and reuse it for the whole "facts" column.
nlp = load_spacy_model("en_core_web_sm")

# Options forwarded verbatim to enhanced_spacy_preprocessing.
# NOTE(review): their exact effect is defined in helpers_text_preprocessing;
# batch_size / n_process are presumably passed through to spaCy - confirm there.
preprocessing_options = {
    "keep_negations": True,
    "preserve_legal_terms": True,
    "batch_size": 100,
    "n_process": 4,
}

# The cleaned text is stored next to the raw text as a new column.
justice_df["facts_clean"] = enhanced_spacy_preprocessing(
    texts=justice_df["facts"],
    nlp=nlp,
    **preprocessing_options,
)
print("Text preprocessing complete.")

# ---------------------------
# 4. Embedding Variants
# ---------------------------
# Every embedding family is combined with both pooling schemes, giving eight
# variant names of the form "<family>_<pooling>":
#   word2vec_avg, word2vec_tfidf, fasttext_avg, fasttext_tfidf,
#   glove_avg, glove_tfidf, trainable_fasttext_avg, trainable_fasttext_tfidf
# The family-major order below matches that listing exactly.
embedding_variants = [
    f"{family}_{pooling}"
    for family in ("word2vec", "fasttext", "glove", "trainable_fasttext")
    for pooling in ("avg", "tfidf")
]

# ---------------------------
# 5. Path Configuration
# ---------------------------
# __file__ is not defined when this code runs as a notebook cell; in that case
# the current working directory is used as the base instead.
BASE_DIR = (
    os.path.abspath(os.path.dirname(__file__))
    if "__file__" in globals()
    else os.getcwd()
)

ARTIFACTS_DIR = os.path.join(BASE_DIR, "Artifacts")

# Create the root output folder first, then one sub-folder per task.
output_dirs = [ARTIFACTS_DIR] + [
    os.path.join(ARTIFACTS_DIR, task_folder)
    for task_folder in ("binary", "multiclass", "regression")
]
for output_dir in output_dirs:
    os.makedirs(output_dir, exist_ok=True)

print(f"Saving all results under: {ARTIFACTS_DIR}")

# ---------------------------
# 6. Load Master Split Indices (3 Tasks)
# ---------------------------
# Load the fixed split indices; all_splits is indexed by task name
# ("binary", "multiclass", "regression") in the sections below.
# NOTE(review): joblib.load unpickles the file, so MASTER_SPLIT_PATH must only
# ever point at a trusted, locally generated artifact.
try:
    all_splits = joblib.load(MASTER_SPLIT_PATH)
except FileNotFoundError as err:
    # Chain the original error explicitly so the traceback still shows which
    # path could not be opened.
    raise RuntimeError(
        "Master split not found. Run generate_master_split.py first."
    ) from err
else:
    # Only the load itself is guarded; the status line runs on success.
    print("Loaded fixed master split indices for all tasks.")

# ---------------------------
# Shared helpers for the task sections (7-9)
# ---------------------------
def _build_task_splits(task_df, split_indices, target_col, text_col="facts_clean"):
    """Slice a task frame into train/val/test text and target Series.

    Args:
        task_df: DataFrame containing ``text_col`` and ``target_col``.
        split_indices: Mapping with ``train_idx``, ``val_idx`` and ``test_idx``
            entries. Each entry is handed to ``task_df.loc`` as a row selector,
            i.e. rows are picked by index label.
        target_col: Name of the column holding the prediction target.
        text_col: Name of the column holding the preprocessed text.

    Returns:
        A dict with the keys ``X_train``, ``X_val``, ``X_test``, ``y_train``,
        ``y_val`` and ``y_test``, inserted in that order.

    Raises:
        KeyError: If ``split_indices`` lacks one of the three ``*_idx`` entries.
    """
    # NOTE(review): assumes the stored indices are labels that are still present
    # in task_df after its dropna() - confirm against generate_master_split.py.
    splits = {}
    for prefix, column in (("X", text_col), ("y", target_col)):
        for part in ("train", "val", "test"):
            splits[f"{prefix}_{part}"] = task_df.loc[split_indices[f"{part}_idx"], column]
    return splits


def _vectorize_and_save(task_name, task_splits):
    """Vectorize one task with every embedding variant and save each result.

    Args:
        task_name: Task identifier ("binary", "multiclass" or "regression").
            It is used as the sub-folder of ARTIFACTS_DIR, as the filename
            prefix, and as the third argument to ``save_results``.
        task_splits: Split dict as produced by ``_build_task_splits``.
    """
    for variant in embedding_variants:
        print(f"→ Vectorizing {task_name} task with {variant} ...")
        results = vectorize_embeddings(task_splits, variant)
        save_results(
            results,
            os.path.join(ARTIFACTS_DIR, task_name, f"{task_name}_{variant}_300f.joblib"),
            task_name,
        )


# ---------------------------
# 7. Binary Classification
# ---------------------------
print("\n Binary classification (first_party_winner)")
# Rows missing either the cleaned text or the label are discarded.
binary_df = justice_df[["facts_clean", "first_party_winner"]].dropna()
split_bin = all_splits["binary"]

binary_splits = _build_task_splits(binary_df, split_bin, "first_party_winner")
_vectorize_and_save("binary", binary_splits)

# ---------------------------
# 8. Multiclass Classification
# ---------------------------
print("\nMulticlass classification (issue_area)")
# .copy() makes the frame explicitly independent of justice_df before a column
# is reassigned on it, which avoids pandas chained-assignment warnings.
multiclass_df = justice_df[["facts_clean", "issue_area"]].dropna().copy()
# Fold the two listed issue areas into the existing "Miscellaneous" label.
merge_map = {"Private Action": "Miscellaneous", "Interstate Relations": "Miscellaneous"}
multiclass_df["issue_area"] = multiclass_df["issue_area"].replace(merge_map)

split_multi = all_splits["multiclass"]

multiclass_splits = _build_task_splits(multiclass_df, split_multi, "issue_area")
_vectorize_and_save("multiclass", multiclass_splits)

# ---------------------------
# 9. Regression (Term Prediction)
# ---------------------------
print("\nRegression task (term)")
# errors="coerce" turns non-numeric terms into NaN; the dropna() below then
# removes those rows from the regression frame.
justice_df["term_year"] = pd.to_numeric(justice_df["term"], errors="coerce")
regression_df = justice_df[["facts_clean", "term_year"]].dropna()

split_reg = all_splits["regression"]

regression_splits = _build_task_splits(regression_df, split_reg, "term_year")
_vectorize_and_save("regression", regression_splits)

# ---------------------------
# 10. Completion
# ---------------------------
# Final status line; '_300f.joblib' is the filename suffix hard-coded into every
# save path built in the task sections above.
print("\n All embedding variants successfully generated and saved with '_300f.joblib' naming convention.")

Master split already exists at: Artifacts/master_split_indices.joblib
✅ Dataset loaded successfully: 3303 rows


INFO:helpers_text_preprocessing:Successfully loaded spaCy model: en_core_web_sm
INFO:helpers_text_preprocessing:Starting enhanced text preprocessing pipeline...
INFO:helpers_text_preprocessing:  Step 1/4: Unicode normalization and mojibake removal
INFO:helpers_text_preprocessing:  Step 2/4: Preserving legal multiword terms
INFO:helpers_text_preprocessing:  Step 3/4: spaCy processing (batch_size=100, n_process=4)
INFO:helpers_text_preprocessing:  Step 4/4: Final cleanup and validation
INFO:helpers_text_preprocessing:Preprocessing complete. Average tokens per document: 92.1
INFO:gensim.models.word2vec:collecting all words and their counts
INFO:gensim.models.word2vec:PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO:gensim.models.word2vec:collected 13053 word types from a corpus of 211815 raw words and 2301 sentences
INFO:gensim.models.word2vec:Creating a fresh vocabulary
INFO:gensim.utils:Word2Vec lifecycle event {'msg': 'effective_min_count=2 retains 8705 unique wor

Text preprocessing complete.
Saving all results under: /Users/fpmuga/Documents/00-MathClasses/2025-2026/1_FirstSemester_2025-2026/MATH_130.1-UV1_2025-1/MUGA_LAB/03_Notebooks/MUGA_LAB/Artifacts
Loaded fixed master split indices for all tasks.

 Binary classification (first_party_winner)
→ Vectorizing binary task with word2vec_avg ...


INFO:gensim.models.word2vec:EPOCH 0: training on 211815 raw words (186431 effective words) took 0.1s, 1979870 effective words/s
INFO:gensim.models.word2vec:EPOCH 1: training on 211815 raw words (186507 effective words) took 0.1s, 2137061 effective words/s
INFO:gensim.models.word2vec:EPOCH 2: training on 211815 raw words (186555 effective words) took 0.1s, 2138094 effective words/s
INFO:gensim.models.word2vec:EPOCH 3: training on 211815 raw words (186458 effective words) took 0.1s, 1975400 effective words/s
INFO:gensim.models.word2vec:EPOCH 4: training on 211815 raw words (186599 effective words) took 0.1s, 1906472 effective words/s
INFO:gensim.utils:Word2Vec lifecycle event {'msg': 'training on 1059075 raw words (932550 effective words) took 0.5s, 1980609 effective words/s', 'datetime': '2025-10-11T16:44:03.450126', 'gensim': '4.3.3', 'python': '3.10.18 | packaged by conda-forge | (main, Jun  4 2025, 14:46:00) [Clang 18.1.8 ]', 'platform': 'macOS-26.0.1-arm64-arm-64bit', 'event': 'trai

Saved word2vec_avg results to Artifacts/binary/binary_word2vec_avg_300f.joblib
→ Vectorizing binary task with word2vec_tfidf ...


INFO:gensim.models.word2vec:EPOCH 0: training on 211815 raw words (186331 effective words) took 0.1s, 1924453 effective words/s
INFO:gensim.models.word2vec:EPOCH 1: training on 211815 raw words (186585 effective words) took 0.1s, 2015564 effective words/s
INFO:gensim.models.word2vec:EPOCH 2: training on 211815 raw words (186419 effective words) took 0.1s, 2190588 effective words/s
INFO:gensim.models.word2vec:EPOCH 3: training on 211815 raw words (186589 effective words) took 0.1s, 2117355 effective words/s
INFO:gensim.models.word2vec:EPOCH 4: training on 211815 raw words (186398 effective words) took 0.1s, 2043205 effective words/s
INFO:gensim.utils:Word2Vec lifecycle event {'msg': 'training on 1059075 raw words (932322 effective words) took 0.5s, 1999083 effective words/s', 'datetime': '2025-10-11T16:44:04.375012', 'gensim': '4.3.3', 'python': '3.10.18 | packaged by conda-forge | (main, Jun  4 2025, 14:46:00) [Clang 18.1.8 ]', 'platform': 'macOS-26.0.1-arm64-arm-64bit', 'event': 'trai

Saved word2vec_tfidf results to Artifacts/binary/binary_word2vec_tfidf_300f.joblib
→ Vectorizing binary task with fasttext_avg ...


INFO:gensim.utils:FastText lifecycle event {'update': False, 'trim_rule': 'None', 'datetime': '2025-10-11T16:44:06.635139', 'gensim': '4.3.3', 'python': '3.10.18 | packaged by conda-forge | (main, Jun  4 2025, 14:46:00) [Clang 18.1.8 ]', 'platform': 'macOS-26.0.1-arm64-arm-64bit', 'event': 'build_vocab'}
INFO:gensim.utils:FastText lifecycle event {'msg': 'training model with 4 workers on 8705 vocabulary and 300 features, using sg=0 hs=0 sample=0.001 negative=5 window=5 shrink_windows=True', 'datetime': '2025-10-11T16:44:06.635580', 'gensim': '4.3.3', 'python': '3.10.18 | packaged by conda-forge | (main, Jun  4 2025, 14:46:00) [Clang 18.1.8 ]', 'platform': 'macOS-26.0.1-arm64-arm-64bit', 'event': 'train'}
INFO:gensim.models.word2vec:EPOCH 0: training on 211815 raw words (186459 effective words) took 0.6s, 300087 effective words/s
INFO:gensim.models.word2vec:EPOCH 1: training on 211815 raw words (186553 effective words) took 0.6s, 303115 effective words/s
INFO:gensim.models.word2vec:EPOC

Saved fasttext_avg results to Artifacts/binary/binary_fasttext_avg_300f.joblib
→ Vectorizing binary task with fasttext_tfidf ...


INFO:gensim.utils:FastText lifecycle event {'update': False, 'trim_rule': 'None', 'datetime': '2025-10-11T16:44:12.054434', 'gensim': '4.3.3', 'python': '3.10.18 | packaged by conda-forge | (main, Jun  4 2025, 14:46:00) [Clang 18.1.8 ]', 'platform': 'macOS-26.0.1-arm64-arm-64bit', 'event': 'build_vocab'}
INFO:gensim.utils:FastText lifecycle event {'msg': 'training model with 4 workers on 8705 vocabulary and 300 features, using sg=0 hs=0 sample=0.001 negative=5 window=5 shrink_windows=True', 'datetime': '2025-10-11T16:44:12.054855', 'gensim': '4.3.3', 'python': '3.10.18 | packaged by conda-forge | (main, Jun  4 2025, 14:46:00) [Clang 18.1.8 ]', 'platform': 'macOS-26.0.1-arm64-arm-64bit', 'event': 'train'}
INFO:gensim.models.word2vec:EPOCH 0: training on 211815 raw words (186458 effective words) took 0.7s, 283437 effective words/s
INFO:gensim.models.word2vec:EPOCH 1: training on 211815 raw words (186535 effective words) took 0.7s, 280384 effective words/s
INFO:gensim.models.word2vec:EPOC

Saved fasttext_tfidf results to Artifacts/binary/binary_fasttext_tfidf_300f.joblib
→ Vectorizing binary task with glove_avg ...


INFO:gensim.models.keyedvectors:loading projection weights from /Users/fpmuga/gensim-data/glove-wiki-gigaword-300/glove-wiki-gigaword-300.gz
INFO:gensim.utils:KeyedVectors lifecycle event {'msg': 'loaded (400000, 300) matrix of type float32 from /Users/fpmuga/gensim-data/glove-wiki-gigaword-300/glove-wiki-gigaword-300.gz', 'binary': False, 'encoding': 'utf8', 'datetime': '2025-10-11T16:44:44.720536', 'gensim': '4.3.3', 'python': '3.10.18 | packaged by conda-forge | (main, Jun  4 2025, 14:46:00) [Clang 18.1.8 ]', 'platform': 'macOS-26.0.1-arm64-arm-64bit', 'event': 'load_word2vec_format'}
INFO:gensim.models.keyedvectors:loading projection weights from /Users/fpmuga/gensim-data/glove-wiki-gigaword-300/glove-wiki-gigaword-300.gz


Saved glove_avg results to Artifacts/binary/binary_glove_avg_300f.joblib
→ Vectorizing binary task with glove_tfidf ...


INFO:gensim.utils:KeyedVectors lifecycle event {'msg': 'loaded (400000, 300) matrix of type float32 from /Users/fpmuga/gensim-data/glove-wiki-gigaword-300/glove-wiki-gigaword-300.gz', 'binary': False, 'encoding': 'utf8', 'datetime': '2025-10-11T16:45:13.077078', 'gensim': '4.3.3', 'python': '3.10.18 | packaged by conda-forge | (main, Jun  4 2025, 14:46:00) [Clang 18.1.8 ]', 'platform': 'macOS-26.0.1-arm64-arm-64bit', 'event': 'load_word2vec_format'}
INFO:gensim.models.word2vec:collecting all words and their counts
INFO:gensim.models.word2vec:PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO:gensim.models.word2vec:collected 13053 word types from a corpus of 211815 raw words and 2301 sentences
INFO:gensim.models.word2vec:Creating a fresh vocabulary
INFO:gensim.utils:Word2Vec lifecycle event {'msg': 'effective_min_count=1 retains 13053 unique words (100.00% of original 13053, drops 0)', 'datetime': '2025-10-11T16:45:13.722776', 'gensim': '4.3.3', 'python': '3.10.18 | 

Saved glove_tfidf results to Artifacts/binary/binary_glove_tfidf_300f.joblib
→ Vectorizing binary task with trainable_fasttext_avg ...


INFO:gensim.models.word2vec:EPOCH 0: training on 211815 raw words (190979 effective words) took 0.1s, 1892260 effective words/s
INFO:gensim.models.word2vec:EPOCH 1: training on 211815 raw words (191283 effective words) took 0.1s, 1986430 effective words/s
INFO:gensim.models.word2vec:EPOCH 2: training on 211815 raw words (191202 effective words) took 0.1s, 2057783 effective words/s
INFO:gensim.models.word2vec:EPOCH 3: training on 211815 raw words (191428 effective words) took 0.1s, 1884534 effective words/s
INFO:gensim.models.word2vec:EPOCH 4: training on 211815 raw words (191403 effective words) took 0.1s, 1924481 effective words/s
INFO:gensim.models.word2vec:EPOCH 5: training on 211815 raw words (191082 effective words) took 0.1s, 1914742 effective words/s
INFO:gensim.models.word2vec:EPOCH 6: training on 211815 raw words (191127 effective words) took 0.1s, 1998358 effective words/s
INFO:gensim.models.word2vec:EPOCH 7: training on 211815 raw words (191280 effective words) took 0.1s, 20

Saved trainable_fasttext_avg results to Artifacts/binary/binary_trainable_fasttext_avg_300f.joblib
→ Vectorizing binary task with trainable_fasttext_tfidf ...


INFO:gensim.models.fasttext:estimated required memory for 13053 words, 2000000 buckets and 300 dimensions: 2440377316 bytes
INFO:gensim.models.word2vec:resetting layer weights
INFO:gensim.utils:FastText lifecycle event {'update': False, 'trim_rule': 'None', 'datetime': '2025-10-11T16:45:17.946656', 'gensim': '4.3.3', 'python': '3.10.18 | packaged by conda-forge | (main, Jun  4 2025, 14:46:00) [Clang 18.1.8 ]', 'platform': 'macOS-26.0.1-arm64-arm-64bit', 'event': 'build_vocab'}
INFO:gensim.utils:FastText lifecycle event {'msg': 'training model with 4 workers on 13053 vocabulary and 300 features, using sg=0 hs=0 sample=0.001 negative=5 window=5 shrink_windows=True', 'datetime': '2025-10-11T16:45:17.947102', 'gensim': '4.3.3', 'python': '3.10.18 | packaged by conda-forge | (main, Jun  4 2025, 14:46:00) [Clang 18.1.8 ]', 'platform': 'macOS-26.0.1-arm64-arm-64bit', 'event': 'train'}
INFO:gensim.models.word2vec:EPOCH 0: training on 211815 raw words (191259 effective words) took 0.7s, 271542 

Saved trainable_fasttext_tfidf results to Artifacts/binary/binary_trainable_fasttext_tfidf_300f.joblib

Multiclass classification (issue_area)
→ Vectorizing multiclass task with word2vec_avg ...


INFO:gensim.models.word2vec:EPOCH 0: training on 206121 raw words (180802 effective words) took 0.1s, 2123913 effective words/s
INFO:gensim.models.word2vec:EPOCH 1: training on 206121 raw words (180647 effective words) took 0.1s, 2101713 effective words/s
INFO:gensim.models.word2vec:EPOCH 2: training on 206121 raw words (180712 effective words) took 0.1s, 2097559 effective words/s
INFO:gensim.models.word2vec:EPOCH 3: training on 206121 raw words (180798 effective words) took 0.1s, 2024426 effective words/s
INFO:gensim.models.word2vec:EPOCH 4: training on 206121 raw words (180718 effective words) took 0.1s, 2015571 effective words/s
INFO:gensim.utils:Word2Vec lifecycle event {'msg': 'training on 1030605 raw words (903677 effective words) took 0.4s, 2035313 effective words/s', 'datetime': '2025-10-11T16:45:32.723022', 'gensim': '4.3.3', 'python': '3.10.18 | packaged by conda-forge | (main, Jun  4 2025, 14:46:00) [Clang 18.1.8 ]', 'platform': 'macOS-26.0.1-arm64-arm-64bit', 'event': 'trai

Saved word2vec_avg results to Artifacts/multiclass/multiclass_word2vec_avg_300f.joblib
→ Vectorizing multiclass task with word2vec_tfidf ...


INFO:gensim.models.word2vec:EPOCH 0: training on 206121 raw words (180877 effective words) took 0.1s, 1859040 effective words/s
INFO:gensim.models.word2vec:EPOCH 1: training on 206121 raw words (180810 effective words) took 0.1s, 2000041 effective words/s
INFO:gensim.models.word2vec:EPOCH 2: training on 206121 raw words (180823 effective words) took 0.1s, 2096530 effective words/s
INFO:gensim.models.word2vec:EPOCH 3: training on 206121 raw words (180857 effective words) took 0.1s, 2129123 effective words/s
INFO:gensim.models.word2vec:EPOCH 4: training on 206121 raw words (180778 effective words) took 0.1s, 2194412 effective words/s
INFO:gensim.utils:Word2Vec lifecycle event {'msg': 'training on 1030605 raw words (904145 effective words) took 0.5s, 2007375 effective words/s', 'datetime': '2025-10-11T16:45:33.603456', 'gensim': '4.3.3', 'python': '3.10.18 | packaged by conda-forge | (main, Jun  4 2025, 14:46:00) [Clang 18.1.8 ]', 'platform': 'macOS-26.0.1-arm64-arm-64bit', 'event': 'trai

Saved word2vec_tfidf results to Artifacts/multiclass/multiclass_word2vec_tfidf_300f.joblib
→ Vectorizing multiclass task with fasttext_avg ...


INFO:gensim.utils:FastText lifecycle event {'update': False, 'trim_rule': 'None', 'datetime': '2025-10-11T16:45:35.855434', 'gensim': '4.3.3', 'python': '3.10.18 | packaged by conda-forge | (main, Jun  4 2025, 14:46:00) [Clang 18.1.8 ]', 'platform': 'macOS-26.0.1-arm64-arm-64bit', 'event': 'build_vocab'}
INFO:gensim.utils:FastText lifecycle event {'msg': 'training model with 4 workers on 8499 vocabulary and 300 features, using sg=0 hs=0 sample=0.001 negative=5 window=5 shrink_windows=True', 'datetime': '2025-10-11T16:45:35.855868', 'gensim': '4.3.3', 'python': '3.10.18 | packaged by conda-forge | (main, Jun  4 2025, 14:46:00) [Clang 18.1.8 ]', 'platform': 'macOS-26.0.1-arm64-arm-64bit', 'event': 'train'}
INFO:gensim.models.word2vec:EPOCH 0: training on 206121 raw words (180777 effective words) took 0.6s, 281179 effective words/s
INFO:gensim.models.word2vec:EPOCH 1: training on 206121 raw words (180800 effective words) took 0.6s, 282665 effective words/s
INFO:gensim.models.word2vec:EPOC

Saved fasttext_avg results to Artifacts/multiclass/multiclass_fasttext_avg_300f.joblib
→ Vectorizing multiclass task with fasttext_tfidf ...


INFO:gensim.utils:FastText lifecycle event {'update': False, 'trim_rule': 'None', 'datetime': '2025-10-11T16:45:41.233883', 'gensim': '4.3.3', 'python': '3.10.18 | packaged by conda-forge | (main, Jun  4 2025, 14:46:00) [Clang 18.1.8 ]', 'platform': 'macOS-26.0.1-arm64-arm-64bit', 'event': 'build_vocab'}
INFO:gensim.utils:FastText lifecycle event {'msg': 'training model with 4 workers on 8499 vocabulary and 300 features, using sg=0 hs=0 sample=0.001 negative=5 window=5 shrink_windows=True', 'datetime': '2025-10-11T16:45:41.234333', 'gensim': '4.3.3', 'python': '3.10.18 | packaged by conda-forge | (main, Jun  4 2025, 14:46:00) [Clang 18.1.8 ]', 'platform': 'macOS-26.0.1-arm64-arm-64bit', 'event': 'train'}
INFO:gensim.models.word2vec:EPOCH 0: training on 206121 raw words (180803 effective words) took 0.6s, 292815 effective words/s
INFO:gensim.models.word2vec:EPOCH 1: training on 206121 raw words (180729 effective words) took 0.6s, 292333 effective words/s
INFO:gensim.models.word2vec:EPOC

Saved fasttext_tfidf results to Artifacts/multiclass/multiclass_fasttext_tfidf_300f.joblib
→ Vectorizing multiclass task with glove_avg ...


INFO:gensim.utils:KeyedVectors lifecycle event {'msg': 'loaded (400000, 300) matrix of type float32 from /Users/fpmuga/gensim-data/glove-wiki-gigaword-300/glove-wiki-gigaword-300.gz', 'binary': False, 'encoding': 'utf8', 'datetime': '2025-10-11T16:46:13.193160', 'gensim': '4.3.3', 'python': '3.10.18 | packaged by conda-forge | (main, Jun  4 2025, 14:46:00) [Clang 18.1.8 ]', 'platform': 'macOS-26.0.1-arm64-arm-64bit', 'event': 'load_word2vec_format'}
INFO:gensim.models.keyedvectors:loading projection weights from /Users/fpmuga/gensim-data/glove-wiki-gigaword-300/glove-wiki-gigaword-300.gz


Saved glove_avg results to Artifacts/multiclass/multiclass_glove_avg_300f.joblib
→ Vectorizing multiclass task with glove_tfidf ...


INFO:gensim.utils:KeyedVectors lifecycle event {'msg': 'loaded (400000, 300) matrix of type float32 from /Users/fpmuga/gensim-data/glove-wiki-gigaword-300/glove-wiki-gigaword-300.gz', 'binary': False, 'encoding': 'utf8', 'datetime': '2025-10-11T16:46:41.870072', 'gensim': '4.3.3', 'python': '3.10.18 | packaged by conda-forge | (main, Jun  4 2025, 14:46:00) [Clang 18.1.8 ]', 'platform': 'macOS-26.0.1-arm64-arm-64bit', 'event': 'load_word2vec_format'}
INFO:gensim.models.word2vec:collecting all words and their counts
INFO:gensim.models.word2vec:PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO:gensim.models.word2vec:collected 12919 word types from a corpus of 206121 raw words and 2212 sentences
INFO:gensim.models.word2vec:Creating a fresh vocabulary
INFO:gensim.utils:Word2Vec lifecycle event {'msg': 'effective_min_count=1 retains 12919 unique words (100.00% of original 12919, drops 0)', 'datetime': '2025-10-11T16:46:42.472964', 'gensim': '4.3.3', 'python': '3.10.18 | 

Saved glove_tfidf results to Artifacts/multiclass/multiclass_glove_tfidf_300f.joblib
→ Vectorizing multiclass task with trainable_fasttext_avg ...


INFO:gensim.models.word2vec:EPOCH 0: training on 206121 raw words (185810 effective words) took 0.1s, 1881061 effective words/s
INFO:gensim.models.word2vec:EPOCH 1: training on 206121 raw words (185735 effective words) took 0.1s, 1871341 effective words/s
INFO:gensim.models.word2vec:EPOCH 2: training on 206121 raw words (185642 effective words) took 0.1s, 2010462 effective words/s
INFO:gensim.models.word2vec:EPOCH 3: training on 206121 raw words (185619 effective words) took 0.1s, 1956910 effective words/s
INFO:gensim.models.word2vec:EPOCH 4: training on 206121 raw words (185577 effective words) took 0.1s, 2035847 effective words/s
INFO:gensim.models.word2vec:EPOCH 5: training on 206121 raw words (185416 effective words) took 0.1s, 1979137 effective words/s
INFO:gensim.models.word2vec:EPOCH 6: training on 206121 raw words (185598 effective words) took 0.1s, 2031093 effective words/s
INFO:gensim.models.word2vec:EPOCH 7: training on 206121 raw words (185699 effective words) took 0.1s, 20

Saved trainable_fasttext_avg results to Artifacts/multiclass/multiclass_trainable_fasttext_avg_300f.joblib
→ Vectorizing multiclass task with trainable_fasttext_tfidf ...


INFO:gensim.models.fasttext:estimated required memory for 12919 words, 2000000 buckets and 300 dimensions: 2439962364 bytes
INFO:gensim.models.word2vec:resetting layer weights
INFO:gensim.utils:FastText lifecycle event {'update': False, 'trim_rule': 'None', 'datetime': '2025-10-11T16:46:46.585091', 'gensim': '4.3.3', 'python': '3.10.18 | packaged by conda-forge | (main, Jun  4 2025, 14:46:00) [Clang 18.1.8 ]', 'platform': 'macOS-26.0.1-arm64-arm-64bit', 'event': 'build_vocab'}
INFO:gensim.utils:FastText lifecycle event {'msg': 'training model with 4 workers on 12919 vocabulary and 300 features, using sg=0 hs=0 sample=0.001 negative=5 window=5 shrink_windows=True', 'datetime': '2025-10-11T16:46:46.585536', 'gensim': '4.3.3', 'python': '3.10.18 | packaged by conda-forge | (main, Jun  4 2025, 14:46:00) [Clang 18.1.8 ]', 'platform': 'macOS-26.0.1-arm64-arm-64bit', 'event': 'train'}
INFO:gensim.models.word2vec:EPOCH 0: training on 206121 raw words (185810 effective words) took 0.6s, 289176 

Saved trainable_fasttext_tfidf results to Artifacts/multiclass/multiclass_trainable_fasttext_tfidf_300f.joblib

Regression task (term)
→ Vectorizing regression task with word2vec_avg ...


INFO:gensim.models.word2vec:EPOCH 0: training on 210531 raw words (184976 effective words) took 0.1s, 1853577 effective words/s
INFO:gensim.models.word2vec:EPOCH 1: training on 210531 raw words (185033 effective words) took 0.1s, 1974167 effective words/s
INFO:gensim.models.word2vec:EPOCH 2: training on 210531 raw words (185018 effective words) took 0.1s, 1970213 effective words/s
INFO:gensim.models.word2vec:EPOCH 3: training on 210531 raw words (185183 effective words) took 0.1s, 2054615 effective words/s
INFO:gensim.models.word2vec:EPOCH 4: training on 210531 raw words (184859 effective words) took 0.1s, 2086507 effective words/s
INFO:gensim.utils:Word2Vec lifecycle event {'msg': 'training on 1052655 raw words (925069 effective words) took 0.5s, 1942906 effective words/s', 'datetime': '2025-10-11T16:47:00.913801', 'gensim': '4.3.3', 'python': '3.10.18 | packaged by conda-forge | (main, Jun  4 2025, 14:46:00) [Clang 18.1.8 ]', 'platform': 'macOS-26.0.1-arm64-arm-64bit', 'event': 'trai

Saved word2vec_avg results to Artifacts/regression/regression_word2vec_avg_300f.joblib
→ Vectorizing regression task with word2vec_tfidf ...


INFO:gensim.models.word2vec:EPOCH 0: training on 210531 raw words (184976 effective words) took 0.1s, 1971505 effective words/s
INFO:gensim.models.word2vec:EPOCH 1: training on 210531 raw words (185066 effective words) took 0.1s, 2071456 effective words/s
INFO:gensim.models.word2vec:EPOCH 2: training on 210531 raw words (184985 effective words) took 0.1s, 1859016 effective words/s
INFO:gensim.models.word2vec:EPOCH 3: training on 210531 raw words (185183 effective words) took 0.1s, 1903648 effective words/s
INFO:gensim.models.word2vec:EPOCH 4: training on 210531 raw words (184974 effective words) took 0.1s, 1988406 effective words/s
INFO:gensim.utils:Word2Vec lifecycle event {'msg': 'training on 1052655 raw words (925184 effective words) took 0.5s, 1919769 effective words/s', 'datetime': '2025-10-11T16:47:01.828525', 'gensim': '4.3.3', 'python': '3.10.18 | packaged by conda-forge | (main, Jun  4 2025, 14:46:00) [Clang 18.1.8 ]', 'platform': 'macOS-26.0.1-arm64-arm-64bit', 'event': 'trai

Saved word2vec_tfidf results to Artifacts/regression/regression_word2vec_tfidf_300f.joblib
→ Vectorizing regression task with fasttext_avg ...


INFO:gensim.utils:FastText lifecycle event {'update': False, 'trim_rule': 'None', 'datetime': '2025-10-11T16:47:03.994909', 'gensim': '4.3.3', 'python': '3.10.18 | packaged by conda-forge | (main, Jun  4 2025, 14:46:00) [Clang 18.1.8 ]', 'platform': 'macOS-26.0.1-arm64-arm-64bit', 'event': 'build_vocab'}
INFO:gensim.utils:FastText lifecycle event {'msg': 'training model with 4 workers on 8615 vocabulary and 300 features, using sg=0 hs=0 sample=0.001 negative=5 window=5 shrink_windows=True', 'datetime': '2025-10-11T16:47:03.995359', 'gensim': '4.3.3', 'python': '3.10.18 | packaged by conda-forge | (main, Jun  4 2025, 14:46:00) [Clang 18.1.8 ]', 'platform': 'macOS-26.0.1-arm64-arm-64bit', 'event': 'train'}
INFO:gensim.models.word2vec:EPOCH 0: training on 210531 raw words (184901 effective words) took 0.6s, 288813 effective words/s
INFO:gensim.models.word2vec:EPOCH 1: training on 210531 raw words (185085 effective words) took 0.7s, 280249 effective words/s
INFO:gensim.models.word2vec:EPOC

Saved fasttext_avg results to Artifacts/regression/regression_fasttext_avg_300f.joblib
→ Vectorizing regression task with fasttext_tfidf ...


INFO:gensim.utils:FastText lifecycle event {'update': False, 'trim_rule': 'None', 'datetime': '2025-10-11T16:47:09.566753', 'gensim': '4.3.3', 'python': '3.10.18 | packaged by conda-forge | (main, Jun  4 2025, 14:46:00) [Clang 18.1.8 ]', 'platform': 'macOS-26.0.1-arm64-arm-64bit', 'event': 'build_vocab'}
INFO:gensim.utils:FastText lifecycle event {'msg': 'training model with 4 workers on 8615 vocabulary and 300 features, using sg=0 hs=0 sample=0.001 negative=5 window=5 shrink_windows=True', 'datetime': '2025-10-11T16:47:09.567245', 'gensim': '4.3.3', 'python': '3.10.18 | packaged by conda-forge | (main, Jun  4 2025, 14:46:00) [Clang 18.1.8 ]', 'platform': 'macOS-26.0.1-arm64-arm-64bit', 'event': 'train'}
INFO:gensim.models.word2vec:EPOCH 0: training on 210531 raw words (184901 effective words) took 0.6s, 292573 effective words/s
INFO:gensim.models.word2vec:EPOCH 1: training on 210531 raw words (185140 effective words) took 0.6s, 300464 effective words/s
INFO:gensim.models.word2vec:EPOC

Saved fasttext_tfidf results to Artifacts/regression/regression_fasttext_tfidf_300f.joblib
→ Vectorizing regression task with glove_avg ...


INFO:gensim.utils:KeyedVectors lifecycle event {'msg': 'loaded (400000, 300) matrix of type float32 from /Users/fpmuga/gensim-data/glove-wiki-gigaword-300/glove-wiki-gigaword-300.gz', 'binary': False, 'encoding': 'utf8', 'datetime': '2025-10-11T16:47:41.777462', 'gensim': '4.3.3', 'python': '3.10.18 | packaged by conda-forge | (main, Jun  4 2025, 14:46:00) [Clang 18.1.8 ]', 'platform': 'macOS-26.0.1-arm64-arm-64bit', 'event': 'load_word2vec_format'}
INFO:gensim.models.keyedvectors:loading projection weights from /Users/fpmuga/gensim-data/glove-wiki-gigaword-300/glove-wiki-gigaword-300.gz


Saved glove_avg results to Artifacts/regression/regression_glove_avg_300f.joblib
→ Vectorizing regression task with glove_tfidf ...


INFO:gensim.utils:KeyedVectors lifecycle event {'msg': 'loaded (400000, 300) matrix of type float32 from /Users/fpmuga/gensim-data/glove-wiki-gigaword-300/glove-wiki-gigaword-300.gz', 'binary': False, 'encoding': 'utf8', 'datetime': '2025-10-11T16:48:10.378482', 'gensim': '4.3.3', 'python': '3.10.18 | packaged by conda-forge | (main, Jun  4 2025, 14:46:00) [Clang 18.1.8 ]', 'platform': 'macOS-26.0.1-arm64-arm-64bit', 'event': 'load_word2vec_format'}
INFO:gensim.models.word2vec:collecting all words and their counts
INFO:gensim.models.word2vec:PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO:gensim.models.word2vec:collected 12996 word types from a corpus of 210531 raw words and 2270 sentences
INFO:gensim.models.word2vec:Creating a fresh vocabulary
INFO:gensim.utils:Word2Vec lifecycle event {'msg': 'effective_min_count=1 retains 12996 unique words (100.00% of original 12996, drops 0)', 'datetime': '2025-10-11T16:48:10.954897', 'gensim': '4.3.3', 'python': '3.10.18 | 

Saved glove_tfidf results to Artifacts/regression/regression_glove_tfidf_300f.joblib
→ Vectorizing regression task with trainable_fasttext_avg ...


INFO:gensim.models.word2vec:EPOCH 0: training on 210531 raw words (189716 effective words) took 0.1s, 1873418 effective words/s
INFO:gensim.models.word2vec:EPOCH 1: training on 210531 raw words (189851 effective words) took 0.1s, 1977108 effective words/s
INFO:gensim.models.word2vec:EPOCH 2: training on 210531 raw words (189641 effective words) took 0.1s, 1909881 effective words/s
INFO:gensim.models.word2vec:EPOCH 3: training on 210531 raw words (189813 effective words) took 0.1s, 1977222 effective words/s
INFO:gensim.models.word2vec:EPOCH 4: training on 210531 raw words (189799 effective words) took 0.1s, 1964138 effective words/s
INFO:gensim.models.word2vec:EPOCH 5: training on 210531 raw words (189969 effective words) took 0.1s, 1956900 effective words/s
INFO:gensim.models.word2vec:EPOCH 6: training on 210531 raw words (189804 effective words) took 0.1s, 1915710 effective words/s
INFO:gensim.models.word2vec:EPOCH 7: training on 210531 raw words (189842 effective words) took 0.1s, 20

Saved trainable_fasttext_avg results to Artifacts/regression/regression_trainable_fasttext_avg_300f.joblib
→ Vectorizing regression task with trainable_fasttext_tfidf ...


INFO:gensim.utils:FastText lifecycle event {'update': False, 'trim_rule': 'None', 'datetime': '2025-10-11T16:48:15.168006', 'gensim': '4.3.3', 'python': '3.10.18 | packaged by conda-forge | (main, Jun  4 2025, 14:46:00) [Clang 18.1.8 ]', 'platform': 'macOS-26.0.1-arm64-arm-64bit', 'event': 'build_vocab'}
INFO:gensim.utils:FastText lifecycle event {'msg': 'training model with 4 workers on 12996 vocabulary and 300 features, using sg=0 hs=0 sample=0.001 negative=5 window=5 shrink_windows=True', 'datetime': '2025-10-11T16:48:15.168479', 'gensim': '4.3.3', 'python': '3.10.18 | packaged by conda-forge | (main, Jun  4 2025, 14:46:00) [Clang 18.1.8 ]', 'platform': 'macOS-26.0.1-arm64-arm-64bit', 'event': 'train'}
INFO:gensim.models.word2vec:EPOCH 0: training on 210531 raw words (189678 effective words) took 0.7s, 276527 effective words/s
INFO:gensim.models.word2vec:EPOCH 1: training on 210531 raw words (189674 effective words) took 0.7s, 265071 effective words/s
INFO:gensim.models.word2vec:EPO

Saved trainable_fasttext_tfidf results to Artifacts/regression/regression_trainable_fasttext_tfidf_300f.joblib

 All embedding variants successfully generated and saved with '_300f.joblib' naming convention.
