# 01 ‚Äî Text (DistilBERT) fine-tune
Train a text classifier and export per-sample probabilities and embeddings into `cache/`.
**TODO**: point `df` to your dataframe with columns `[sample_id, text, label]`.


In [1]:
import os, re, glob, subprocess, sys

import pandas as pd

try:
    import kagglehub  # pip install kagglehub[pandas-datasets]
    from kagglehub import KaggleDatasetAdapter
except ModuleNotFoundError:
    print("kagglehub not found; installing kagglehub[pandas-datasets]...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "kagglehub[pandas-datasets]"])
    import kagglehub
    from kagglehub import KaggleDatasetAdapter

DATASET_ID = "andrewmvd/cyberbullying-classification"
FILE_CANDIDATES = [
    "cyberbullying_tweets.csv",
    "train.csv",
    "TRAIN.csv",
]

def clean_tweet(text: str) -> str:
    text = str(text).lower()
    text = re.sub(r"http\S+|www\.\S+", " ", text)
    text = re.sub(r"@\w+", " ", text)
    text = re.sub(r"#(\w+)", r"\1", text)
    text = re.sub(r"[^a-z\s']", " ", text)
    text = re.sub(r"(.)\1{2,}", r"\1\1", text)
    return re.sub(r"\s+", " ", text).strip()

df_raw = None
source_name = None
load_errors = []

for file_path in FILE_CANDIDATES:
    try:
        df_raw = kagglehub.load_dataset(
            KaggleDatasetAdapter.PANDAS,
            DATASET_ID,
            file_path,
        )
        source_name = f"{DATASET_ID}/{file_path}"
        print(f"Loaded '{file_path}' from {DATASET_ID} via kagglehub.load_dataset")
        break
    except FileNotFoundError as exc:
        load_errors.append((file_path, exc))
    except Exception as exc:
        load_errors.append((file_path, exc))

if df_raw is None:
    print("Falling back to dataset_download; load_dataset attempts failed:")
    for candidate, exc in load_errors:
        print(f"  candidate='{candidate}': {exc}")
    dataset_dir = kagglehub.dataset_download(DATASET_ID)
    candidates = glob.glob(os.path.join(dataset_dir, "**/*.csv"), recursive=True)

    csv_path = None
    for path in candidates:
        try:
            hdr = pd.read_csv(path, nrows=0).columns.str.lower().tolist()
            if ("tweet_text" in hdr or "text" in hdr) and "cyberbullying_type" in hdr:
                csv_path = path
                break
        except Exception:
            continue

    if csv_path is None:
        raise FileNotFoundError(
            f"Could not locate a CSV with tweet_text/text and cyberbullying_type in {DATASET_ID}."
        )

    print(f"Using fallback CSV: {csv_path}")
    source_name = csv_path
    df_raw = pd.read_csv(csv_path)

print(f"Data source: {source_name}")

# Normalize expected column names
cols = {c.lower(): c for c in df_raw.columns}
text_col = cols.get("tweet_text", cols.get("text"))
label_col = cols.get("cyberbullying_type")
assert text_col and label_col, "CSV must contain tweet_text (or text) and cyberbullying_type."

# Basic clean ‚Üí clean_text
df = (
    df_raw
    .dropna(subset=[text_col, label_col])
    .drop_duplicates(subset=[text_col])
    .copy()
)
df["clean_text"] = df[text_col].astype(str).apply(clean_tweet)
df = df[df["clean_text"].str.len() > 0].reset_index(drop=True)

# Final columns expected by the DistilBERT notebook
df = df.rename(columns={label_col: "label"})
df.insert(0, "sample_id", [f"s{i:07d}" for i in range(len(df))])  # stable IDs
df = df[["sample_id", "clean_text", "label"]]

df = df.rename(columns={"clean_text": "text"})[["sample_id", "text", "label"]]

print(df.head(3))
print(df.label.value_counts())
print("df shape:", df.shape)


  df_raw = kagglehub.load_dataset(


Loaded 'cyberbullying_tweets.csv' from andrewmvd/cyberbullying-classification via kagglehub.load_dataset
Data source: andrewmvd/cyberbullying-classification/cyberbullying_tweets.csv
  sample_id                                               text  \
0  s0000000  in other words katandandre your food was crapi...   
1  s0000001  why is aussietv so white mkr theblock imaceleb...   
2  s0000002         a classy whore or more red velvet cupcakes   

               label  
0  not_cyberbullying  
1  not_cyberbullying  
2  not_cyberbullying  
label
religion               7995
age                    7992
ethnicity              7951
gender                 7878
not_cyberbullying      7847
other_cyberbullying    6089
Name: count, dtype: int64
df shape: (45752, 3)


In [2]:
from pathlib import Path

text_data_dir = Path("../data/phase1/text")
text_data_dir.mkdir(parents=True, exist_ok=True)
text_parquet_path = text_data_dir / "cyberbullying_text.parquet"

df.to_parquet(text_parquet_path, index=False)
print(f"Saved cleaned text dataframe to {text_parquet_path} with {len(df)} rows")


Saved cleaned text dataframe to ..\data\phase1\text\cyberbullying_text.parquet with 45752 rows


In [None]:
"""Lightweight fallback using frozen DistilBERT embeddings + logistic regression."""
import subprocess
import sys
from pathlib import Path

for pkg, import_name in [
    ("transformers", "transformers"),
    ("scikit-learn", "sklearn"),
    ("joblib", "joblib"),
]:
    try:
        __import__(import_name)
    except ModuleNotFoundError:
        print(f"Installing missing dependency: {pkg}")
        subprocess.check_call([sys.executable, "-m", "pip", "install", pkg])

import joblib
import numpy as np
import torch
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from transformers import AutoModel, AutoTokenizer

fast_artifact_dir = Path("../artifacts/phase1/text/distilbert_fast")
fast_artifact_dir.mkdir(parents=True, exist_ok=True)

label2id = {label: idx for idx, label in enumerate(sorted(df.label.unique()))}
id2label = {idx: label for label, idx in label2id.items()}
labels = df["label"].map(label2id).to_numpy()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModel.from_pretrained("distilbert-base-uncased").to(device)
model.eval()

batch_size = 128
embedding_batches = []
for start in range(0, len(df), batch_size):
    end = start + batch_size
    batch_text = df.iloc[start:end]["text"].tolist()
    encoded = tokenizer(
        batch_text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=128,
    )
    encoded = {k: v.to(device) for k, v in encoded.items()}
    with torch.no_grad():
        outputs = model(**encoded)
    cls_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
    embedding_batches.append(cls_embeddings)

embeddings = np.vstack(embedding_batches)
np.save(fast_artifact_dir / "embeddings.npy", embeddings)

indices = np.arange(len(df))
# Reserve 10% for test, then split remaining into train/validation (~80/10/10 overall)
train_val_idx, test_idx, train_val_labels, test_labels = train_test_split(
    indices,
    labels,
    test_size=0.1,
    stratify=labels,
    random_state=42,
)
train_idx, val_idx, y_train, y_val = train_test_split(
    train_val_idx,
    train_val_labels,
    test_size=0.1111111111,  # 0.1 / 0.9
    stratify=train_val_labels,
    random_state=42,
)

X_train = embeddings[train_idx]
X_val = embeddings[val_idx]
X_test = embeddings[test_idx]

train_df = df.iloc[train_idx].reset_index(drop=True)
val_df = df.iloc[val_idx].reset_index(drop=True)
test_df = df.iloc[test_idx].reset_index(drop=True)
train_df.insert(0, "row_index", train_idx)
val_df.insert(0, "row_index", val_idx)
test_df.insert(0, "row_index", test_idx)

clf = LogisticRegression(
    multi_class="multinomial",
    solver="lbfgs",
    max_iter=500,
    class_weight='balanced',  # Address class imbalance (1.31:1 ratio)
)
clf.fit(X_train, y_train)
joblib.dump(clf, fast_artifact_dir / "logreg.pkl")

val_preds = clf.predict(X_val)
val_accuracy = accuracy_score(y_val, val_preds)
print(f"Validation accuracy (fast fallback): {val_accuracy:.4f}")

y_test = clf.predict(X_test)
print(f"Test predictions generated for {len(y_test)} samples (stored for cache export).")

probs = clf.predict_proba(embeddings)
np.save(fast_artifact_dir / "probabilities.npy", probs)

torch.save({"label2id": label2id, "id2label": id2label}, fast_artifact_dir / "label_maps.pt")

per_sample = df.copy().reset_index(drop=True)
for idx, label in id2label.items():
    per_sample[f"prob_{label}"] = probs[:, idx]

per_sample_path = fast_artifact_dir / "per_sample_probs.parquet"
per_sample.to_parquet(per_sample_path, index=False)

print(f"Saved fast fallback artifacts to {fast_artifact_dir}")
print("Artifacts include embeddings.npy, probabilities.npy, logreg.pkl, per_sample_probs.parquet, label_maps.pt")
print("You can skip the fine-tune cell if this baseline suffices.")

In [4]:
import subprocess
import sys
from pathlib import Path

for pkg, import_name in [
    ("transformers", "transformers"),
    ("datasets", "datasets"),
    ("accelerate", "accelerate"),
]:
    try:
        __import__(import_name)
    except ModuleNotFoundError:
        print(f"Installing missing dependency: {pkg}")
        subprocess.check_call([sys.executable, "-m", "pip", "install", pkg])

from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import (AutoModelForSequenceClassification, AutoTokenizer,
                          Trainer, TrainingArguments)
import numpy as np
import torch

# Use absolute path from project root
PROJECT_ROOT = Path.cwd().parent if Path.cwd().name == 'notebooks' else Path.cwd()
artifact_dir = PROJECT_ROOT / "artifacts" / "phase1" / "text" / "distilbert"
artifact_dir.mkdir(parents=True, exist_ok=True)

print(f"üìÅ Model will be saved to: {artifact_dir}")
print(f"   (Absolute path: {artifact_dir.resolve()})")

train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df["label"],
    random_state=42,
)

label2id = {label: idx for idx, label in enumerate(sorted(df.label.unique()))}
id2label = {idx: label for label, idx in label2id.items()}

def encode_labels(frame):
    mapped = frame.copy()
    mapped["label_id"] = mapped["label"].map(label2id)
    return mapped

train_df = encode_labels(train_df)
val_df = encode_labels(val_df)

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


def tokenize_batch(batch):
    return tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=128,
    )

train_dataset = Dataset.from_pandas(train_df[["text", "label_id"]], preserve_index=False)
val_dataset = Dataset.from_pandas(val_df[["text", "label_id"]], preserve_index=False)
train_dataset = train_dataset.map(tokenize_batch, batched=True)
val_dataset = val_dataset.map(tokenize_batch, batched=True)
train_dataset = train_dataset.rename_column("label_id", "labels")
val_dataset = val_dataset.rename_column("label_id", "labels")
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
val_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
)

training_args = TrainingArguments(
    output_dir=str(artifact_dir),
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = (predictions == labels).mean()
    return {"accuracy": accuracy}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

print("\nüöÄ Starting training...")
train_output = trainer.train()
print(train_output)

print(f"\nüíæ Saving model to {artifact_dir}...")
trainer.save_model(str(artifact_dir))
tokenizer.save_pretrained(str(artifact_dir))
torch.save({"label2id": label2id, "id2label": id2label}, artifact_dir / "label_maps.pt")

# Verify model files were saved
print(f"\n‚úÖ Model saved successfully!")
print(f"\nüìã Saved files:")
for file in sorted(artifact_dir.glob("*")):
    size_mb = file.stat().st_size / 1024 / 1024
    print(f"   - {file.name} ({size_mb:.1f} MB)")

print(f"\n‚ú® Fine-tuned DistilBERT model is ready at: {artifact_dir}")
print(f"   You can now use this model for inference on new data!")


üìÅ Model will be saved to: c:\Users\wasd0\OneDrive\Documents\SMU\Y3S1\DM\Data-Mining-G2T2\artifacts\phase1\text\distilbert
   (Absolute path: C:\Users\wasd0\OneDrive\Documents\SMU\Y3S1\DM\Data-Mining-G2T2\artifacts\phase1\text\distilbert)


Map:   0%|          | 0/36601 [00:00<?, ? examples/s]

Map:   0%|          | 0/9151 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



üöÄ Starting training...




Epoch,Training Loss,Validation Loss,Accuracy
1,0.3337,0.358383,0.852694
2,0.2565,0.362834,0.864824
3,0.1258,0.47278,0.864277




TrainOutput(global_step=6864, training_loss=0.2711372054540194, metrics={'train_runtime': 18929.5453, 'train_samples_per_second': 5.801, 'train_steps_per_second': 0.363, 'total_flos': 3636588838749696.0, 'train_loss': 0.2711372054540194, 'epoch': 3.0})

üíæ Saving model to c:\Users\wasd0\OneDrive\Documents\SMU\Y3S1\DM\Data-Mining-G2T2\artifacts\phase1\text\distilbert...

‚úÖ Model saved successfully!

üìã Saved files:
   - checkpoint-2288 (0.0 MB)
   - checkpoint-4576 (0.0 MB)
   - checkpoint-6864 (0.0 MB)
   - config.json (0.0 MB)
   - label_maps.pt (0.0 MB)
   - model.safetensors (255.4 MB)
   - runs (0.0 MB)
   - special_tokens_map.json (0.0 MB)
   - tokenizer.json (0.7 MB)
   - tokenizer_config.json (0.0 MB)
   - training_args.bin (0.0 MB)
   - vocab.txt (0.2 MB)

‚ú® Fine-tuned DistilBERT model is ready at: c:\Users\wasd0\OneDrive\Documents\SMU\Y3S1\DM\Data-Mining-G2T2\artifacts\phase1\text\distilbert
   You can now use this model for inference on new data!


In [5]:
import numpy as np
import pandas as pd
import torch
from pathlib import Path
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Use absolute paths
PROJECT_ROOT = Path.cwd().parent if Path.cwd().name == 'notebooks' else Path.cwd()
base_dir = PROJECT_ROOT / "artifacts" / "phase1" / "text"
distilbert_dir = base_dir / "distilbert"
fast_dir = base_dir / "distilbert_fast"

label_map_file = distilbert_dir / "label_maps.pt"
model_available = distilbert_dir.exists() and label_map_file.exists()

print(f"üîç Checking for model at: {distilbert_dir}")

if model_available:
    print(f"‚úÖ Found fine-tuned model!")
    
    label_maps = torch.load(label_map_file)
    label2id = label_maps["label2id"]
    id2label = label_maps["id2label"]

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = AutoModelForSequenceClassification.from_pretrained(distilbert_dir).to(device)
    model.eval()
    tokenizer = AutoTokenizer.from_pretrained(distilbert_dir)

    print(f"üìä Generating probabilities and embeddings for {len(df)} samples...")

    batch_size = 64
    prob_batches = []
    embed_batches = []

    for start in range(0, len(df), batch_size):
        end = start + batch_size
        batch_text = df.iloc[start:end]["text"].tolist()
        encoded = tokenizer(
            batch_text,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=128,
        )
        encoded = {k: v.to(device) for k, v in encoded.items()}
        with torch.no_grad():
            outputs = model(**encoded, output_hidden_states=True)
        prob_batches.append(outputs.logits.softmax(dim=-1).cpu().numpy())
        embed_batches.append(outputs.hidden_states[-1][:, 0, :].cpu().numpy())

    probs = np.vstack(prob_batches)
    embeddings = np.vstack(embed_batches)

    np.save(base_dir / "probabilities.npy", probs)
    np.save(base_dir / "embeddings.npy", embeddings)

    per_sample = df.copy().reset_index(drop=True)
    for idx, label in id2label.items():
        per_sample[f"prob_{label}"] = probs[:, idx]

    per_sample_path = base_dir / "per_sample_probs.parquet"
    per_sample.to_parquet(per_sample_path, index=False)

    np.save(base_dir / "distilbert_sentence_embeddings.npy", embeddings)

    print(f"‚úÖ Saved probability matrix to {base_dir / 'probabilities.npy'} with shape {probs.shape}")
    print(f"‚úÖ Saved CLS embeddings to {base_dir / 'embeddings.npy'} with shape {embeddings.shape}")
    print(f"‚úÖ Saved per-sample probabilities to {per_sample_path}")

elif fast_dir.exists():
    print("‚ö†Ô∏è  Fine-tuned artifacts not found; using fast fallback artifacts.")
    probs_path = fast_dir / "probabilities.npy"
    embeddings_path = fast_dir / "embeddings.npy"
    per_sample_path = fast_dir / "per_sample_probs.parquet"
    label_maps = torch.load(fast_dir / "label_maps.pt")
    label2id = label_maps["label2id"]
    id2label = label_maps["id2label"]

    if not probs_path.exists() or not embeddings_path.exists():
        raise FileNotFoundError("Fallback artifacts incomplete. Run the fast baseline cell first.")

    probs = np.load(probs_path)
    embeddings = np.load(embeddings_path)
    per_sample = pd.read_parquet(per_sample_path)

    np.save(base_dir / "probabilities.npy", probs)
    np.save(base_dir / "embeddings.npy", embeddings)
    per_sample.to_parquet(base_dir / "per_sample_probs.parquet", index=False)

    print("‚úÖ Copied fallback probabilities/embeddings into the standard artifact directory.")
    print(f"   Probability matrix shape: {probs.shape}")
    print(f"   Embeddings shape: {embeddings.shape}")
    print(f"   Per-sample probabilities saved to {base_dir / 'per_sample_probs.parquet'}")
else:
    raise FileNotFoundError(
        f"Neither fine-tuned DistilBERT nor fast fallback artifacts are available.\n"
        f"Expected locations:\n"
        f"  - {distilbert_dir}\n"
        f"  - {fast_dir}\n"
        f"Run one of the previous cells first."
    )

üîç Checking for model at: c:\Users\wasd0\OneDrive\Documents\SMU\Y3S1\DM\Data-Mining-G2T2\artifacts\phase1\text\distilbert
‚úÖ Found fine-tuned model!
üìä Generating probabilities and embeddings for 45752 samples...
‚úÖ Saved probability matrix to c:\Users\wasd0\OneDrive\Documents\SMU\Y3S1\DM\Data-Mining-G2T2\artifacts\phase1\text\probabilities.npy with shape (45752, 6)
‚úÖ Saved CLS embeddings to c:\Users\wasd0\OneDrive\Documents\SMU\Y3S1\DM\Data-Mining-G2T2\artifacts\phase1\text\embeddings.npy with shape (45752, 768)
‚úÖ Saved per-sample probabilities to c:\Users\wasd0\OneDrive\Documents\SMU\Y3S1\DM\Data-Mining-G2T2\artifacts\phase1\text\per_sample_probs.parquet


In [6]:
from pathlib import Path
from sklearn.model_selection import train_test_split

cache_dir = Path("../cache")
cache_dir.mkdir(parents=True, exist_ok=True)

if 'per_sample' not in globals():
    raise RuntimeError("Run the inference cell first to populate per_sample outputs.")

# Ensure we have consistent train/val/test splits
# Check if splits were already defined from fine-tuning, otherwise create them
if 'train_df' not in globals() or 'val_df' not in globals():
    print("Creating consistent train/val/test splits...")
    
    # Use the same random state and strategy as the fast fallback
    indices = np.arange(len(df))
    labels = df["label"].map(label2id).to_numpy()
    
    # Reserve 10% for test, then split remaining into train/validation (~80/10/10 overall)
    train_val_idx, test_idx, train_val_labels, test_labels = train_test_split(
        indices,
        labels,
        test_size=0.1,
        stratify=labels,
        random_state=42,
    )
    train_idx, val_idx, y_train, y_val = train_test_split(
        train_val_idx,
        train_val_labels,
        test_size=0.1111111111,  # 0.1 / 0.9 ‚âà 10% of total
        stratify=train_val_labels,
        random_state=42,
    )
    
    # Create dataframes with consistent splits
    train_df = df.iloc[train_idx].reset_index(drop=True)
    val_df = df.iloc[val_idx].reset_index(drop=True)
    test_df = df.iloc[test_idx].reset_index(drop=True)
    
    print(f"Split sizes - Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}")

# Create split lookup dictionary
split_lookup = {sid: "train" for sid in train_df["sample_id"]}
split_lookup.update({sid: "validation" for sid in val_df["sample_id"]})
split_lookup.update({sid: "test" for sid in test_df["sample_id"]})

# Add split information to per_sample dataframe
per_sample_enriched = per_sample.reset_index(drop=False).rename(columns={"index": "row_index"})
per_sample_enriched["split"] = per_sample_enriched["sample_id"].map(split_lookup)

# Handle any samples not in the lookup (shouldn't happen, but safety check)
if per_sample_enriched["split"].isna().any():
    print("‚ö†Ô∏è  Warning: Some samples couldn't be mapped to splits. Assigning to 'test'.")
    per_sample_enriched["split"] = per_sample_enriched["split"].fillna("test")

print("Split distribution:")
print(per_sample_enriched["split"].value_counts())

# Save split-specific files to cache
for split_name in sorted(per_sample_enriched["split"].unique()):
    subset = per_sample_enriched[per_sample_enriched["split"] == split_name].copy()
    if subset.empty:
        print(f"‚ö†Ô∏è  No samples found for split '{split_name}', skipping...")
        continue
    
    # Get row indices for embeddings
    row_indices = subset.pop("row_index").to_numpy()
    emb_subset = embeddings[row_indices]
    
    # Save probability and embedding files
    probs_path = cache_dir / f"text_probs_{split_name}.parquet"
    emb_path = cache_dir / f"text_emb_{split_name}.npy"
    
    subset.to_parquet(probs_path, index=False)
    np.save(emb_path, emb_subset)
    
    print(f"‚úÖ Saved {len(subset)} rows to {probs_path}")
    print(f"‚úÖ Saved embeddings with shape {emb_subset.shape} to {emb_path}")

print(f"\nüéâ All cache files saved to {cache_dir}")
print("Available files:")
for cache_file in sorted(cache_dir.glob("text_*")):
    print(f"  - {cache_file.name}")

Split distribution:
split
train         36601
test           4576
validation     4575
Name: count, dtype: int64
‚úÖ Saved 4576 rows to ..\cache\text_probs_test.parquet
‚úÖ Saved embeddings with shape (4576, 768) to ..\cache\text_emb_test.npy
‚úÖ Saved 36601 rows to ..\cache\text_probs_train.parquet
‚úÖ Saved embeddings with shape (36601, 768) to ..\cache\text_emb_train.npy
‚úÖ Saved 4575 rows to ..\cache\text_probs_validation.parquet
‚úÖ Saved embeddings with shape (4575, 768) to ..\cache\text_emb_validation.npy

üéâ All cache files saved to ..\cache
Available files:
  - text_emb_test.npy
  - text_emb_train.npy
  - text_emb_validation.npy
  - text_probs_test.parquet
  - text_probs_train.parquet
  - text_probs_validation.parquet
