# Natural Language Processing (NLP) - Advanced Topics in DL

Group Z:<br>
    - Iliya Morgunov - 206361412<br>
    - Eadan Schechter - 209793553

# Imports

In [1]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import time
from tqdm import tqdm
from sklearn.metrics import (
    accuracy_score, f1_score,
    precision_score, recall_score,
    classification_report, confusion_matrix
)

Paths & Load Data

In [2]:
# Input CSVs are given as relative paths, so they are resolved against the
# notebook's current working directory.
TRAIN_CSV = "df_train_final.csv"
TEST_CSV = "df_test_final.csv"

df_train = pd.read_csv(TRAIN_CSV)
df_test = pd.read_csv(TEST_CSV)

Label Setup (maps strings → ids 0..4)

In [3]:
# Canonical class order: a label's position in this list is its class id.
CANDIDATE_LABELS = [
    "very negative",
    "negative",
    "neutral",
    "positive",
    "very positive",
]
label2id = dict(zip(CANDIDATE_LABELS, range(len(CANDIDATE_LABELS))))

# Sentiment strings -> class ids, listed in the same order as CANDIDATE_LABELS
# ("Extremely Negative" is "very negative", "Extremely Positive" is "very positive").
sentiment_map = dict(
    zip(
        [
            "Extremely Negative",
            "Negative",
            "Neutral",
            "Positive",
            "Extremely Positive",
        ],
        range(5),
    )
)

def ensure_label_column(df, string_col="Sentiment"):
    """Make sure ``df`` carries an integer ``label`` column, modifying it in place.

    Behaviour, in priority order:
      1. If ``label`` already exists it is cast to int. When every value lies
         in 1..5 the ids are shifted down by one; otherwise they are kept.
      2. Else, if ``string_col`` exists, its strings are mapped to ids through
         the module-level ``sentiment_map``.
      3. Else a note is printed and ``df`` is left without a ``label`` column.

    Args:
        df: DataFrame to update in place.
        string_col: Name of the column holding sentiment strings.

    Returns:
        None.

    Raises:
        ValueError: if ``string_col`` contains NaNs or strings that are not
            keys of ``sentiment_map``. ``df`` is not modified in that case.
    """
    if "label" in df.columns:
        ids = df["label"].astype(int)
        # NOTE: a 0-based column that happens to contain no 0 also satisfies
        # this check and is shifted; the two cases cannot be told apart here.
        if set(ids.unique()).issubset({1, 2, 3, 4, 5}):
            ids = ids - 1
        # Always store the integer-cast ids, so float/str labels that needed
        # no shift still end up with an integer dtype.
        df["label"] = ids.astype(int)
        return

    if string_col not in df.columns:
        print(f"Note: '{string_col}' not found in df. Labels unavailable here.")
        return

    if df[string_col].isna().any():
        raise ValueError(f"Found NaNs in '{string_col}'.")

    # Validate on a local Series first: assigning before the check would leave
    # a NaN-containing "label" column behind when the error is raised.
    mapped = df[string_col].map(sentiment_map)
    unmapped = mapped.isna()
    if unmapped.any():
        bad = df.loc[unmapped, string_col].unique()
        raise ValueError(f"Unrecognized sentiment strings: {bad[:5]}")
    df["label"] = mapped.astype(int)

# Normalise labels on both splits in place. If the test split has no
# "Sentiment" column it simply gets no "label" column and metrics are
# skipped later on.
for _split in (df_train, df_test):
    ensure_label_column(_split, "Sentiment")

Pick Which Text Columns to Run

In [4]:
# Text representations to evaluate, as (dataframe column, short prefix) pairs.
# The prefix is used to name the prediction columns written to df_test.
TEXT_COLS = [
    ("OriginalTweet", "orig"),
    ("TweetWithDateLocation", "twl"),
]

# Warn early about any representation that is missing from either split.
for column_name, _ in TEXT_COLS:
    absent_from_train = column_name not in df_train.columns
    absent_from_test = column_name not in df_test.columns
    if absent_from_train:
        print(f"[warn] Train missing column: {column_name}")
    if absent_from_test:
        print(f"[warn] Test  missing column: {column_name}")

Encoder (HF Transformers) + Mean Pooling

In [5]:
from transformers import AutoTokenizer, AutoModel

MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

# Run on the GPU when one is visible to torch, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Device:", device)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# The encoder is only used for inference: move it to the device and
# switch it to eval mode once, up front.
encoder = AutoModel.from_pretrained(MODEL_NAME)
encoder = encoder.to(device)
encoder = encoder.eval()

@torch.no_grad()
def encode_texts(texts, batch_size=64, max_length=128):
    """Embed ``texts`` with the module-level ``tokenizer``/``encoder`` using masked mean pooling.

    Args:
        texts: Sliceable sequence of strings (callers in this notebook pass lists).
        batch_size: Number of texts tokenized and encoded per forward pass.
        max_length: Token limit handed to the tokenizer; longer inputs are truncated.

    Returns:
        float32 numpy array of shape [N, H], one row per input text. For empty
        input the result has shape [0, H], with H read from
        ``encoder.config.hidden_size``.
    """
    if len(texts) == 0:
        # torch.cat rejects an empty list of tensors, so hand back an empty
        # matrix of the right width instead of failing at the end.
        return np.empty((0, encoder.config.hidden_size), dtype=np.float32)

    pooled_batches = []
    for start in tqdm(range(0, len(texts), batch_size), desc="Encoding", leave=False):
        batch = texts[start:start + batch_size]
        inputs = tokenizer(
            batch, padding=True, truncation=True, max_length=max_length, return_tensors="pt"
        ).to(device)
        outputs = encoder(**inputs)
        last_hidden = outputs.last_hidden_state               # [B, T, H]
        attn_mask = inputs["attention_mask"].unsqueeze(-1)    # [B, T, 1]
        # Positions where the attention mask is 0 contribute nothing to the sum.
        summed = (last_hidden * attn_mask).sum(dim=1)         # [B, H]
        # Clamp so a row with an all-zero mask divides by 1 instead of 0.
        counts = attn_mask.sum(dim=1).clamp(min=1)            # [B, 1]
        mean_pooled = summed / counts
        pooled_batches.append(mean_pooled.cpu().float())
    return torch.cat(pooled_batches, dim=0).numpy()

Device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Select Shots & Build Prototypes

In [6]:
def select_shots(df, text_col, label_col="label", n_shots=1, random_state=42):
    """Randomly pick up to ``n_shots`` example texts for every class.

    One generator seeded with ``random_state`` is shared across all classes,
    and classes are visited in ``CANDIDATE_LABELS`` order. A class with fewer
    than ``n_shots`` rows contributes all of its rows.

    Returns:
        dict mapping class_id -> list[str] of sampled texts.

    Raises:
        ValueError: if some class has no rows in ``df``.
    """
    generator = np.random.default_rng(random_state)
    exemplars = {}
    for class_id, class_name in enumerate(CANDIDATE_LABELS):
        class_rows = df[df[label_col] == class_id]
        n_available = len(class_rows)
        if n_available == 0:
            raise ValueError(f"No examples in train for class {class_id} ('{class_name}').")
        n_take = min(n_shots, n_available)
        # Sample row positions without replacement, then read the texts back.
        positions = generator.choice(n_available, size=n_take, replace=False)
        picked_texts = class_rows.iloc[positions][text_col]
        exemplars[class_id] = picked_texts.astype(str).tolist()
    return exemplars

def build_prototypes(shots_dict, batch_size=64, max_length=128):
    """Turn per-class exemplar texts into one unit-length prototype per class.

    Args:
        shots_dict: class_id -> list[str]; must contain every id in
            ``range(len(CANDIDATE_LABELS))``.
        batch_size: Forwarded to ``encode_texts``.
        max_length: Forwarded to ``encode_texts``.

    Returns:
        np.array of shape [C, H]: the L2-normalized mean embedding of each
        class, stacked in class-id order.
    """
    n_classes = len(CANDIDATE_LABELS)
    rows = []
    for class_id in range(n_classes):
        vectors = encode_texts(                                   # [k, H]
            shots_dict[class_id], batch_size=batch_size, max_length=max_length
        )
        centroid = vectors.mean(axis=0, keepdims=True)            # [1, H]
        # The small epsilon keeps the division finite for a zero centroid.
        length = np.linalg.norm(centroid, axis=1, keepdims=True)
        rows.append(centroid / (length + 1e-12))
    return np.vstack(rows)  # [C, H]

Predict by Cosine Similarity to Prototypes

In [7]:
def cosine_predict(texts, prototypes, batch_size=256, max_length=128):
    """Assign each text to the class whose prototype it is most similar to.

    Args:
        texts: Texts to classify; encoded with ``encode_texts``.
        prototypes: [C, H] array of L2-normalized class prototypes.
        batch_size: Forwarded to ``encode_texts``.
        max_length: Forwarded to ``encode_texts``.

    Returns:
        Tuple ``(pred_ids, pred_scores, sims)``:
          pred_ids: [N] index of the best-matching prototype per text
          pred_scores: [N] cosine similarity to that prototype
          sims: [N, C] full cosine similarity matrix
    """
    embeddings = encode_texts(texts, batch_size=batch_size, max_length=max_length)  # [N, H]
    # Unit-normalize the rows so the dot product below is a cosine similarity.
    row_norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    unit_embeddings = embeddings / (row_norms + 1e-12)
    sims = unit_embeddings @ prototypes.T
    best_class = sims.argmax(axis=1)
    best_score = sims.max(axis=1)
    return best_class, best_score, sims

Run Few-shot on Both Columns + Metrics

In [9]:
# Few-shot evaluation: for every text representation in TEXT_COLS, build one
# prototype per class from N_SHOTS training examples, classify the test set by
# cosine similarity to the prototypes, store the predictions on df_test and,
# when test labels exist, print macro metrics, a cross-entropy loss and timing.
N_SHOTS = 100  # exemplars sampled per class to build each prototype
ce_loss_fn = nn.CrossEntropyLoss(reduction="mean")

for text_col, prefix in TEXT_COLS:
    if text_col not in df_train.columns or text_col not in df_test.columns:
        print(f"[skip] '{text_col}' missing in train or test.")
        continue

    # ----- Build class prototypes from TRAIN -----
    shots = select_shots(df_train, text_col, label_col="label", n_shots=N_SHOTS, random_state=42)
    protos = build_prototypes(shots, batch_size=64, max_length=128)  # [C, H]

    # ----- Inference (timed) on TEST -----
    test_texts = df_test[text_col].astype(str).tolist()

    # Synchronize around the timed region so pending GPU work is not
    # attributed to (or left out of) the measured interval.
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    t0 = time.perf_counter()

    pred_ids, pred_scores, sims = cosine_predict(
        test_texts, protos, batch_size=256, max_length=128
    )  # sims: [N, C]  (use as logits)

    if torch.cuda.is_available():
        torch.cuda.synchronize()
    t1 = time.perf_counter()

    # Attach predictions (adds three prefixed columns to df_test in place)
    df_test[f"{prefix}_fs_pred_id"]    = pred_ids
    df_test[f"{prefix}_fs_pred_label"] = [CANDIDATE_LABELS[i] for i in pred_ids]
    df_test[f"{prefix}_fs_conf"]       = pred_scores

    # ----- Metrics & CE Loss (only if test has labels) -----
    if "label" in df_test.columns:
        y_true = df_test["label"].to_numpy()
        y_pred = np.asarray(pred_ids)

        # Macro metrics. zero_division=0 is passed to all three so a class
        # that is never predicted scores 0 without an undefined-metric warning.
        acc   = accuracy_score(y_true, y_pred)
        f1m   = f1_score(y_true, y_pred, average="macro", zero_division=0)
        precm = precision_score(y_true, y_pred, average="macro", zero_division=0)
        recm  = recall_score(y_true, y_pred, average="macro", zero_division=0)

        # Cross-Entropy Loss from cosine “logits”.
        # NOTE(review): the similarities are bounded in [-1, 1], so the softmax
        # over 5 classes is close to uniform and this loss stays near
        # ln(5) ≈ 1.609 regardless of accuracy; consider a temperature if the
        # value is meant to be compared against trained classifiers.
        logits  = torch.tensor(sims, dtype=torch.float32)  # [N, C]
        targets = torch.tensor(y_true, dtype=torch.long)   # [N]
        loss_val = ce_loss_fn(logits, targets).item()

        # Inference time (covers encoding the test texts and the similarity step)
        total_sec = t1 - t0
        per_sample = total_sec / len(test_texts) if len(test_texts) > 0 else float("nan")

        print("\n" + "="*70)
        print(f"Column: {text_col}  [prefix: {prefix}]")
        print("="*70)

        # metrics
        print(f"Loss: {loss_val:.4f}")
        print(f"F1 Score (macro): {f1m:.4f}")
        print(f"Accuracy: {acc:.4f}")
        print(f"Precision (macro): {precm:.4f}")
        print(f"Recall (macro): {recm:.4f}")
        print(f"Inference Time (sec): {total_sec:.4f}")
        print(f"Inference Time (sec/sample): {per_sample:.4f}")

        print("-"*70)
    else:
        print(f"[{text_col}] No ground-truth labels in test — metrics skipped.")




Column: OriginalTweet  [prefix: orig]
Loss: 1.6000
F1 Score (macro): 0.3131
Accuracy: 0.3060
Precision (macro): 0.3081
Recall (macro): 0.3246
Inference Time (sec): 1.0853
Inference Time (sec/sample): 0.0003
----------------------------------------------------------------------


                                                         


Column: TweetWithDateLocation  [prefix: twl]
Loss: 1.6012
F1 Score (macro): 0.3259
Accuracy: 0.3210
Precision (macro): 0.3224
Recall (macro): 0.3464
Inference Time (sec): 1.0583
Inference Time (sec/sample): 0.0003
----------------------------------------------------------------------


