# Natural Languange Processing (NLP)- Advanced Topics in DL

Group Z:</br>
    - Iliya Morgunov - 206361412</br>
    - Eadan Schechter - 209793553

# Imports

In [1]:
import os
import math
import gc
import time
from typing import List, Tuple, Dict

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from tqdm import tqdm
from transformers import pipeline, logging as hf_logging

# quiet transformers warnings
hf_logging.set_verbosity_error()

# Choose device for zero-shot pipeline:
#   GPU if available -> device index 0
#   else CPU -> device = -1
ZSL_DEVICE = 0 if torch.cuda.is_available() else -1

print(f"Zero-shot NLI will run on: {'GPU:0' if ZSL_DEVICE >= 0 else 'CPU'}")


Zero-shot NLI will run on: GPU:0


NLI model choice & label set

In [2]:
# Default NLI model (change if you want a different one)
NLI_MODEL_NAME = "facebook/bart-large-mnli"

CANDIDATE_LABELS = [
    "very negative",
    "negative",
    "neutral",
    "positive",
    "very positive",
]

# Must be positional "{}", not "{label}" or anything named.
HYPOTHESIS_TEMPLATE = "This text expresses {} sentiment."

label2id = {lab: i for i, lab in enumerate(CANDIDATE_LABELS)}

Build zero-shot pipeline

In [3]:
def build_zero_shot_pipeline(model_name: str = NLI_MODEL_NAME, device: int = ZSL_DEVICE):
    """
    Returns a Hugging Face zero-shot-classification pipeline on the requested device.
    """
    zsl = pipeline(
        task="zero-shot-classification",
        model=model_name,
        device=device,          # 0..N for CUDA devices, -1 for CPU
    )
    return zsl

zero_shot = build_zero_shot_pipeline()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Paths & Load Data

In [5]:
df_train = pd.read_csv("df_train_final.csv")
df_test = pd.read_csv("df_test_final.csv")

Prediction utility (GPU/CPU, batched)

In [8]:
def zsl_predict(
    texts: List[str],
    pipe,
    candidate_labels: List[str] = CANDIDATE_LABELS,
    hypothesis_template: str = HYPOTHESIS_TEMPLATE,
    batch_size: int = 16,
) -> Tuple[List[int], List[str], List[List[float]]]:
    """
    Predict sentiment for a list of texts using a zero-shot pipeline.

    Returns:
        pred_ids:        class indices aligned with candidate_labels
        pred_labels:     class names (strings)
        all_scores:      scores per label in the order of candidate_labels
    """
    pred_ids: List[int] = []
    pred_labels: List[str] = []
    all_scores: List[List[float]] = []

    n = len(texts)
    for i in tqdm(range(0, n, batch_size), desc="Zero-shot inference", leave=False):
        batch = texts[i:i + batch_size]
        results = pipe(
            sequences=batch,
            candidate_labels=candidate_labels,
            hypothesis_template=hypothesis_template,
            multi_label=False,  # exactly one class per example
        )
        # Pipeline returns either a dict (single item) or list of dicts; normalize
        if isinstance(results, dict):
            results = [results]

        for r in results:
            # r["labels"] is sorted by score; map back to our fixed label order
            score_map = {lab: sc for lab, sc in zip(r["labels"], r["scores"])}
            scores_in_order = [score_map[lab] for lab in candidate_labels]
            idx = int(np.argmax(scores_in_order))
            pred_ids.append(idx)
            pred_labels.append(candidate_labels[idx])
            all_scores.append(scores_in_order)

    # small memory cleanup on CPU runs
    gc.collect()
    if ZSL_DEVICE >= 0:
        torch.cuda.empty_cache()

    return pred_ids, pred_labels, all_scores

Map Sentiment strings to 0–4 ids

In [9]:
sentiment_map = {
    "Extremely Negative": 0,  # very negative
    "Negative": 1,
    "Neutral": 2,
    "Positive": 3,
    "Extremely Positive": 4,  # very positive
}

if "Sentiment" in df_test.columns:
    df_test["label"] = df_test["Sentiment"].map(sentiment_map)
else:
    print("No 'Sentiment' column found in df_test; cannot create labels.")

Inference - Run ZSL on both `OriginalTweet` and `TweetWithDateLocation` and reports metrics.

In [11]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

cols_to_run = [
    ("OriginalTweet", "orig"),
    ("TweetWithDateLocation", "twl"),
]

any_evaluated = False
ce_loss_fn = nn.CrossEntropyLoss(reduction="mean")

for col, prefix in cols_to_run:
    if col not in df_test.columns:
        print(f"[Skip] Column '{col}' not found in df_test.")
        continue

    texts = df_test[col].astype(str).tolist()

    # --- timed inference ---
    if ZSL_DEVICE >= 0:
        torch.cuda.synchronize()
    t0 = time.perf_counter()

    pred_ids, pred_labels, all_scores = zsl_predict(
        texts,
        zero_shot,
        candidate_labels=CANDIDATE_LABELS,
        hypothesis_template=HYPOTHESIS_TEMPLATE,
        batch_size=16,
    )

    if ZSL_DEVICE >= 0:
        torch.cuda.synchronize()
    t1 = time.perf_counter()

    # Attach predictions with a clear prefix for each source column
    df_test[f"{prefix}_zsl_pred_label"] = pred_labels
    df_test[f"{prefix}_zsl_pred_id"] = [label2id[lbl] for lbl in pred_labels]

    # Compute metrics only if ground-truth exists
    if "label" in df_test.columns:
        y_true = df_test["label"].to_numpy()
        y_pred = df_test[f"{prefix}_zsl_pred_id"].to_numpy()

        # Accuracy / F1 / Precision / Recall (macro)
        acc = accuracy_score(y_true, y_pred)
        f1m = f1_score(y_true, y_pred, average="macro")
        precm = precision_score(y_true, y_pred, average="macro", zero_division=0)
        recm = recall_score(y_true, y_pred, average="macro", zero_division=0)

        # Cross-Entropy Loss from the pipeline probabilities
        probs = torch.tensor(all_scores, dtype=torch.float32)
        logits_like = torch.log(probs.clamp_min(1e-12))  # safe log
        targets = torch.tensor(y_true, dtype=torch.long)
        loss_val = ce_loss_fn(logits_like, targets).item()

        # Inference time
        total_sec = t1 - t0
        per_sample = total_sec / len(texts) if len(texts) > 0 else float("nan")

        # ===== Clear identification for each column =====
        print("\n" + "="*70)
        print(f"Column: {col}  [prefix: {prefix}]")
        print("="*70)

        # print metrics
        print(f"Loss: {loss_val:.4f}")
        print(f"F1 Score (macro): {f1m:.4f}")
        print(f"Accuracy: {acc:.4f}")
        print(f"Precision (macro): {precm:.4f}")
        print(f"Recall (macro): {recm:.4f}")
        print(f"Inference Time (sec): {total_sec:.4f}")
        print(f"Inference Time (sec/sample): {per_sample:.4f}")

        print("-"*70)

        any_evaluated = True
    else:
        print(f"[{col}] Note: 'label' column not found; skipping evaluation.")

if "label" in df_test.columns and not any_evaluated:
    print("Note: Evaluation skipped because none of the specified text columns were present.")




Column: OriginalTweet  [prefix: orig]
Loss: 2.4844
F1 Score (macro): 0.1381
Accuracy: 0.2954
Precision (macro): 0.5494
Recall (macro): 0.2196
Inference Time (sec): 389.5192
Inference Time (sec/sample): 0.1026
----------------------------------------------------------------------





Column: TweetWithDateLocation  [prefix: twl]
Loss: 2.5196
F1 Score (macro): 0.1527
Accuracy: 0.3041
Precision (macro): 0.3870
Recall (macro): 0.2280
Inference Time (sec): 391.6296
Inference Time (sec/sample): 0.1031
----------------------------------------------------------------------
