In [1]:
# 01 - Setup: device, imports, seeds, paths, model
import os
from pathlib import Path
import random
import re
import numpy as np
import pandas as pd
from typing import List, Dict, Tuple, Optional

# ---- Single toggle: "cpu" or "gpu"
RUN_DEVICE = "cpu"   # change to "gpu" if you have CUDA
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import torch

if RUN_DEVICE.lower() == "gpu" and torch.cuda.is_available():
    SELECT_DEVICE = "cuda"
else:
    SELECT_DEVICE = "cpu"
    os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

device = torch.device(SELECT_DEVICE)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
if device.type == "cpu":
    torch.set_num_threads(max(1, os.cpu_count() // 2))

SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

from sklearn.metrics import f1_score, classification_report, confusion_matrix
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM
)

# Paths
EXCEL_PATH = Path("Punjabi Dataset.xlsx")
OUT_DIR = Path("outputs_punjabi_metaphor")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Model - UPDATED TO QWEN
MODEL_NAME = "Qwen/Qwen2.5-3B-Instruct"  # <-- Changed
# MODEL_NAME = "meta-llama/Llama-3.2-3B-Instruct"

MAX_NEW_TOKENS = 1
DO_SAMPLE = False
print(f"Using device: {device.type.upper()} | Model: {MODEL_NAME}")

Using device: CPU | Model: Qwen/Qwen2.5-3B-Instruct


In [2]:
#02 - Load and inspect data

# Load the Punjabi metaphor dataset
df = pd.read_excel(EXCEL_PATH)

print(f"Dataset shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")
print(f"\nFirst 3 rows:")
print(df.head(3))
print(f"\nData types:\n{df.dtypes}")

# For this dataset, we'll treat "Metaphor_Used" as the key metaphor expression
# We need to assign binary labels (0=literal, 1=metaphorical)
# Since we don't have ground truth labels, let's create a binary target based on sentence analysis

# Create a simple feature: if the target sentence explicitly says "रूपक" (metaphor in Hindi)
# or similar indicators, mark as 1 (metaphorical), else 0 (literal)

df['Label'] = 0  # Default to literal
# Mark as metaphorical if sentence mentions it's a metaphor
df.loc[df['Target_Sentence'].str.contains('रूपक|ਰੂਪਕ|metaphor', case=False, na=False), 'Label'] = 1

print(f"\nLabel distribution:")
print(df['Label'].value_counts())
print(f"\nData ready for classification!")

Dataset shape: (50, 5)
Columns: ['ID', 'Metaphor_Used', 'Prev_Sentence', 'Target_Sentence', 'Final_Sentence']

First 3 rows:
   ID         Metaphor_Used                    Prev_Sentence  \
0   1  ਕਾਰ ਵਿਚ ਅੱਗ ਲੱਗ ਜਾਣਾ      ਚਾਹ ਕਾਫ਼ੀ ਠੰਢੀ ਹੋ ਚੁੱਕੀ ਸੀ।   
1   2          ਕੂਏਂ ਦਾ ਮੱਛੀ  ਰਾਤ ਨੂੰ ਕੁੱਤਾ ਬਹੁਤ ਭੌਂਕਦਾ ਰਿਹਾ।   
2   3          ਅੱਗ ਲੱਗ ਜਾਣਾ      ਚਾਹ ਕਾਫ਼ੀ ਠੰਢੀ ਹੋ ਚੁੱਕੀ ਸੀ।   

                                     Target_Sentence  \
0  ਕਾਰ ਵਿਚ ਅੱਗ ਲੱਗ ਜਾਣਾ ਸੱਚਮੁੱਚ ਹੋਇਆ ਸੀ, ਜਿਵੇਂ ਕਿ...   
1  ਕੂਏਂ ਦਾ ਮੱਛੀ ਸਿਰਫ਼ ਰੂਪਕ ਸੀ, ਪਰ ਇਸਦਾ ਮਤਲਬ ਕਾਫ਼ੀ...   
2  ਅੱਗ ਲੱਗ ਜਾਣਾ ਸਿਰਫ਼ ਰੂਪਕ ਸੀ, ਪਰ ਇਸਦਾ ਮਤਲਬ ਕਾਫ਼ੀ...   

                    Final_Sentence  
0   ਫਿਰ ਉਸਨੇ ਘਰ ਜਾਣ ਦਾ ਫੈਸਲਾ ਕੀਤਾ।  
1   ਫਿਰ ਸਾਰੇ ਆਪਣੇ ਕੰਮ ਵਿੱਚ ਲੱਗ ਗਏ।  
2  ਉਸ ਤੋਂ ਬਾਅਦ ਮਾਹੌਲ ਸ਼ਾਂਤ ਹੋ ਗਿਆ।  

Data types:
ID                  int64
Metaphor_Used      object
Prev_Sentence      object
Target_Sentence    object
Final_Sentence     object
dtype: object

Label distribution:
Label
0    25
1    25
Name: count, dtype: int64

Data ready for cla

In [3]:
# 03 - Load model and tokenizer

import gc
from huggingface_hub import login as hf_login
from getpass import getpass

# Optional HF login for gated models
hf_token = os.getenv("HF_TOKEN", "").strip()
if not hf_token:
    try:
        hf_token = getpass("Enter HuggingFace token (or press Enter to skip): ").strip()
    except:
        pass

if hf_token:
    try:
        hf_login(token=hf_token)
        print("✓ Logged in to Hugging Face")
    except Exception as e:
        print(f"Warning: HF login failed: {e}")

_token_arg = {"token": hf_token} if hf_token else {}

# Load tokenizer
print(f"Loading tokenizer for {MODEL_NAME}...")
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    use_fast=True,
    trust_remote_code=True,
    **_token_arg
)

# Load model
print(f"Loading model...")
if device.type == "cuda":
    dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        torch_dtype=dtype,
        device_map="auto",
        trust_remote_code=True,
        low_cpu_mem_usage=True,
        **_token_arg
    )
else:
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        torch_dtype=torch.float32,
        device_map={"": "cpu"},
        trust_remote_code=True,
        low_cpu_mem_usage=True,
        **_token_arg
    )

model.eval()

# Ensure pad token
if tokenizer.pad_token_id is None:
    if tokenizer.eos_token_id is not None:
        tokenizer.pad_token = tokenizer.eos_token
    else:
        tokenizer.add_special_tokens({"pad_token": "[PAD]"})
        model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = tokenizer.pad_token_id

print(f"✓ Model and tokenizer loaded!")

Loading tokenizer for Qwen/Qwen2.5-3B-Instruct...


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Loading model...


config.json:   0%|          | 0.00/661 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.97G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

✓ Model and tokenizer loaded!


In [4]:
# 04 - Prompt templates
from typing import List, Dict

# The instruction is now split into System and User roles for better model compliance.

INSTR_ZS_SYSTEM = (
    "You are a helpful and accurate classification model. "
    "Your task is to determine the usage of a Punjabi metaphor/idiom. "
    "You must only output '1' if the usage is metaphorical (figurative) or '0' if it is literal. "
    "Do not provide any explanation, text, or reasoning."
)

INSTR_ZS_USER_TEMPLATE = (
    "Is the Punjabi metaphor/idiom used metaphorically (figuratively) in the sentence below?"
    "\n\nMetaphor: {metaphor}\n"
    "Previous Sentence: {prev_sent}\n"
    "Target Sentence: {target_sent}\n"
    "Next Sentence: {next_sent}"
)

def make_prompt(metaphor: str, prev_sent: str, target_sent: str, next_sent: str) -> str:
    """
    Creates a prompt using the Qwen model's ChatML template.
    
    The tokenizer is assumed to be loaded and assigned to the global `tokenizer` variable.
    """
    
    # 1. Format the User content with the specific example
    user_content = INSTR_ZS_USER_TEMPLATE.format(
        metaphor=metaphor,
        prev_sent=prev_sent,
        target_sent=target_sent,
        next_sent=next_sent
    )

    # 2. Structure the prompt using the standard Hugging Face ChatML format (used by Qwen)
    messages: List[Dict[str, str]] = [
        {"role": "system", "content": INSTR_ZS_SYSTEM},
        {"role": "user", "content": user_content}
    ]

    # tokenizer.apply_chat_template handles the specific token wrappers for Qwen.
    # add_generation_prompt=True adds the final '<|im_start|>assistant\n' header, 
    # which primes the model to output the answer (the '0' or '1').
    prompt_text = tokenizer.apply_chat_template(
        messages, 
        tokenize=False, 
        add_generation_prompt=True
    )
    return prompt_text

# Test prompt
test_row = df.iloc[0]
test_prompt = make_prompt(
    test_row['Metaphor_Used'],
    test_row['Prev_Sentence'],
    test_row['Target_Sentence'],
    test_row['Final_Sentence'])
    
print("Sample prompt (Qwen ChatML Format):")
print(test_prompt[:400] + "...")

Sample prompt (Qwen ChatML Format):
<|im_start|>system
You are a helpful and accurate classification model. Your task is to determine the usage of a Punjabi metaphor/idiom. You must only output '1' if the usage is metaphorical (figurative) or '0' if it is literal. Do not provide any explanation, text, or reasoning.<|im_end|>
<|im_start|>user
Is the Punjabi metaphor/idiom used metaphorically (figuratively) in the sentence below?

Met...


In [5]:
# 05 - Classification function

# Cache token IDs for "0" and "1"
_id0 = None
_id1 = None

def _candidate_token_id_for_digit(d: str) -> Optional[int]:
    ids = tokenizer.encode(d, add_special_tokens=False)
    if len(ids) == 1: return ids[0]
    ids = tokenizer.encode(" " + d, add_special_tokens=False)
    if len(ids) == 1: return ids[0]
    ids = tokenizer.encode(d + "\n", add_special_tokens=False)
    if len(ids) == 1: return ids[0]
    return None

def _init_digit_ids():
    global _id0, _id1
    if _id0 is None: _id0 = _candidate_token_id_for_digit("0")
    if _id1 is None: _id1 = _candidate_token_id_for_digit("1")

_init_digit_ids()
print(f"Token ID for '0': {_id0}")
print(f"Token ID for '1': {_id1}")

@torch.no_grad()
def classify_prompt(prompt: str) -> int:
    """Classify a single prompt by comparing logits of '0' and '1' tokens"""
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(device)
    logits = model(**inputs).logits
    next_logits = logits[0, -1, :]
    
    if _id0 is not None and _id1 is not None:
        logit0 = next_logits[_id0].item()
        logit1 = next_logits[_id1].item()
        return 1 if logit1 >= logit0 else 0
    
    # Fallback: generate 1 token and check content
    gen = model.generate(
        **inputs,
        max_new_tokens=MAX_NEW_TOKENS,
        do_sample=DO_SAMPLE,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id
    )
    text = tokenizer.decode(gen[0][inputs['input_ids'].shape[-1]:], skip_special_tokens=True)
    return 1 if "1" in text else 0

print("Classification function ready!")

Token ID for '0': 15
Token ID for '1': 16
Classification function ready!


In [6]:
# 06 - Batch classification with progress

from tqdm.auto import tqdm

def classify_batch(df_batch: pd.DataFrame) -> List[int]:
    """Classify a batch of samples"""
    predictions = []
    for _, row in tqdm(df_batch.iterrows(), total=len(df_batch), desc="Classifying"):
        prompt = make_prompt(
            row['Metaphor_Used'],
            row['Prev_Sentence'],
            row['Target_Sentence'],
            row['Final_Sentence']
        )
        pred = classify_prompt(prompt)
        predictions.append(pred)
    return predictions

# Run classification on the full dataset
print(f"Starting classification on {len(df)} samples...")
y_pred = classify_batch(df)
y_true = df['Label'].tolist()

print(f"\n✓ Classification complete!")
print(f"Predictions: {pd.Series(y_pred).value_counts().to_dict()}")

Starting classification on 50 samples...


Classifying:   0%|          | 0/50 [00:00<?, ?it/s]


✓ Classification complete!
Predictions: {0: 31, 1: 19}


In [7]:
# 07 - Evaluation

f1 = f1_score(y_true, y_pred, average="binary")
print(f"\n=== Classification Results ===")
print(f"Macro F1 Score: {f1:.4f}")
print(f"\nClassification Report:")
print(classification_report(y_true, y_pred, target_names=['Literal', 'Metaphorical'], digits=4))
print(f"\nConfusion Matrix:")
print(confusion_matrix(y_true, y_pred))

# Save results
results_df = df.copy()
results_df['Prediction'] = y_pred
results_df['Correct'] = (results_df['Label'] == results_df['Prediction']).astype(int)

# --- Naming the output file differently (using the model name) ---
model_tag = MODEL_NAME.split('/')[-1].replace('-', '_').replace('.', '_') # e.g., "Qwen_2_5_3B_Instruct"
results_path = OUT_DIR / f"punjabi_metaphor_predictions_{model_tag}.csv" # <-- Updated File Name
results_df.to_csv(results_path, index=False)
print(f"\n✓ Results saved to {results_path}")

# Summary
print(f"\nAccuracy: {results_df['Correct'].mean():.4f}")


=== Classification Results ===
Macro F1 Score: 0.6364

Classification Report:
              precision    recall  f1-score   support

     Literal     0.6452    0.8000    0.7143        25
Metaphorical     0.7368    0.5600    0.6364        25

    accuracy                         0.6800        50
   macro avg     0.6910    0.6800    0.6753        50
weighted avg     0.6910    0.6800    0.6753        50


Confusion Matrix:
[[20  5]
 [11 14]]

✓ Results saved to outputs_punjabi_metaphor/punjabi_metaphor_predictions_Qwen2_5_3B_Instruct.csv

Accuracy: 0.6800
