# Introduction

## Preliminaries

### Login

In [1]:
token = "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"
!huggingface-cli login --token $token

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
The token `mistral v0.2+v0.3` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `mistral v0.2+v0.3`


### Data

In [2]:
import requests
import os
import pandas as pd

# Create directory structure if it doesn't exist
base_dir = 'Assignment 2/data'
if not os.path.exists(base_dir):
    os.makedirs(base_dir)

# Updated URLs for the raw files on GitHub
urls = {
    'test': 'https://raw.githubusercontent.com/lt-nlp-lab-unibo/nlp-course-material/main/2024-2025/Assignment%202/data/a2_test.csv',
    'demos': 'https://raw.githubusercontent.com/lt-nlp-lab-unibo/nlp-course-material/main/2024-2025/Assignment%202/data/demonstrations.csv'
}

# Download files
for name, url in urls.items():
    response = requests.get(url)
    if response.status_code == 200:
        filepath = os.path.join(base_dir, f'{name}.csv')
        with open(filepath, 'wb') as f:
            f.write(response.content)
        print(f"Successfully downloaded {filepath}")
    else:
        print(f"Failed to download {name} dataset. Status code: {response.status_code}")
        print(f"URL attempted: {url}")

# Verify files exist before loading
if os.path.exists(os.path.join(base_dir, 'test.csv')) and os.path.exists(os.path.join(base_dir, 'demos.csv')):
    # Load the datasets using pandas

    test_df = pd.read_csv(os.path.join(base_dir, 'test.csv'))
    demos_df = pd.read_csv(os.path.join(base_dir, 'demos.csv'))

    # Display first few rows of test dataset
    print("\nFirst few rows of test dataset:")
    print(test_df.head())
    # Display basic information about both datasets
    print("\nTest dataset info:")
    print(test_df.info())
    print("\nDemonstrations dataset info:")
    print(demos_df.info())
else:
    print("\nFiles were not downloaded successfully. Please check the repository URLs.")

Successfully downloaded Assignment 2/data/test.csv
Successfully downloaded Assignment 2/data/demos.csv

First few rows of test dataset:
                  rewire_id  \
0  sexism2022_english-17133   
1  sexism2022_english-14197   
2   sexism2022_english-3018   
3   sexism2022_english-5301   
4  sexism2022_english-17796   

                                                text label_sexist  
0  The boys must be gaming because there goes the...   not sexist  
1  Look at those eyes. Either someone unexpectedl...       sexist  
2                  Old man mogs everyone in this sub   not sexist  
3  Excellent, I was just looking at another post ...   not sexist  
4  So you run back to daddy whenever you need hel...       sexist  

Test dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   rewire_id     300 non-null    object
 1   text          

# [Task 1]: Model(s) setup

In [3]:
# system packages
from pathlib import Path
import shutil
import urllib
import tarfile
import sys

# data and numerical management packages
import pandas as pd
import numpy as np

# useful during debugging (progress bars)
from tqdm import tqdm
!pip install transformers
!pip install datasets
!pip install accelerate -U
!pip install evaluate
!pip install bitsandbytes
import torch
torch.cuda.is_available()
!nvidia-smi


Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [4]:
from transformers import BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

## Mistral-7B-v0.3

In [5]:
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer

mistral_modelcard = "mistralai/Mistral-7B-Instruct-v0.3" # choose mistral model
mistral_tokenizer = AutoTokenizer.from_pretrained(mistral_modelcard) # autotokenizer
mistral_tokenizer.pad_token = mistral_tokenizer.eos_token # Set the padding token to be the same as the end-of-sequence token


# Define special tokens that indicate the end of generation
mistral_terminators = [
    mistral_tokenizer.eos_token_id,
    mistral_tokenizer.convert_tokens_to_ids("<|eot_id|>")
]



mistral_model = AutoModelForCausalLM.from_pretrained(
    mistral_modelcard,
    return_dict=True,
    quantization_config=bnb_config,
    device_map='auto'
)

# Clear CUDA cache
torch.cuda.empty_cache()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/141k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/601 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

## Phi 3 mini 4k instruct

In [6]:
phi3_modelcard = "microsoft/Phi-3-mini-4k-instruct" # choose phi3 model
phi3_tokenizer = AutoTokenizer.from_pretrained(phi3_modelcard, trust_remote_code=True)
# Define special tokens that indicate the end of generation
phi3_terminators = [
    phi3_tokenizer.eos_token_id,
    phi3_tokenizer.convert_tokens_to_ids("<|eot_id|>")
]
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig


phi3_model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-mini-4k-instruct",
    trust_remote_code=True,
    device_map="auto",
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True
)

# Clear CUDA cache
torch.cuda.empty_cache()

tokenizer_config.json:   0%|          | 0.00/3.44k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.94M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

configuration_phi3.py:   0%|          | 0.00/11.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-4k-instruct:
- configuration_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_phi3.py:   0%|          | 0.00/73.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-4k-instruct:
- modeling_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json:   0%|          | 0.00/16.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

## Qwen 2 7B Instruct

In [7]:
model_card = "Qwen/Qwen2-7B-Instruct"
# Configure memory and device mapping
max_memory = {
    0: "13GB",      # Reserve most T4 GPU memory
    "cpu": "25GB"   # Allow CPU offloading
}

# Load tokenizer
qwen_tokenizer = AutoTokenizer.from_pretrained(model_card)

# Load model with optimized settings
qwen_model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2-7B-Instruct",
    quantization_config=bnb_config,
    device_map="balanced",  #  Balanced memory distribution
    max_memory=max_memory,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    offload_folder="offload"  # Enable disk offloading if needed
)

tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/663 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/27.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/3.95G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/3.56G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/243 [00:00<?, ?B/s]

# [Task 2] Prompt setup

In [8]:
prompt = [
    {
        'role': 'system',
        'content': 'You are an annotator for sexism detection.'
    },
    {
        'role': 'user',
        'content': """Your task is to classify input text as containing sexism or not. Respond only YES or NO.

        TEXT:
        {text}

        ANSWER:
        """
    }
]

In [9]:
def prepare_prompts(texts, prompt_template, tokenizer):
    """
    This function formats input text samples into instruction prompts.
    """
    formatted_prompts = []

    for text in texts:
        # Create a deep copy of the template to avoid modifying the original
        current_prompt = [
            prompt_template[0].copy(),
            prompt_template[1].copy()
        ]

        # Format the user content by replacing the {text} placeholder
        current_prompt[1]['content'] = current_prompt[1]['content'].format(text=text)

        # Convert the prompt list into a chat format string
        chat_text = f"{current_prompt[0]['content']}\n\n{current_prompt[1]['content']}"

        # Tokenize the formatted prompt
        encoded_prompt = tokenizer(
            chat_text,
            padding=True,
            truncation=True,
            return_tensors="pt"
        )

        formatted_prompts.append(encoded_prompt)

    return formatted_prompts

# [Task 3] Inference

In [10]:
def generate_responses(model, prompt_examples):
    """
    This function implements the inference loop for a LLM model.
    """
    responses = []
    device = next(model.parameters()).device

    # Extract model name and tokenizer with detailed debugging
    model_vars = [name for name, obj in globals().items() if obj is model]
    if not model_vars:
        raise ValueError("Could not find model variable name in global scope")

    #print("\nDebugging tokenizer lookup:")
    #print(f"1. Found model object: {model}")
    #print(f"2. Found model variable name(s): {model_vars}")
    model_name = model_vars[0].replace('_model', '')
    #print(f"3. Extracted model name: {model_name}")
    tokenizer_name = f"{model_name}_tokenizer"
    #print(f"4. Looking for tokenizer with name: {tokenizer_name}")
    #print(f"5. Available global variables: {[name for name in globals().keys() if 'tokenizer' in name.lower()]}")

    try:
        tokenizer = globals()[tokenizer_name]
        #print(f"6. Successfully found tokenizer: {tokenizer}")
    except KeyError:
        raise KeyError(f"Could not find tokenizer '{tokenizer_name}' in globals. Available tokenizers: {[name for name in globals().keys() if 'tokenizer' in name.lower()]}")

    # Rest of the function remains the same
    yes_tokens = tokenizer(" YES", add_special_tokens=False)['input_ids']
    no_tokens = tokenizer(" NO", add_special_tokens=False)['input_ids']

    with torch.no_grad():
        for encoded_prompt in prompt_examples:
            input_ids = encoded_prompt['input_ids'].to(device)
            attention_mask = encoded_prompt['attention_mask'].to(device)

            outputs = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_new_tokens=3,
                do_sample=False,
                num_beams=1,
                pad_token_id=model.config.eos_token_id,
                eos_token_id=tokenizer.eos_token_id,
            )

            new_tokens = outputs[0][input_ids.shape[1]:]
            decoded_output = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

            if 'YES' in decoded_output.upper():
                decoded_output = 'YES'
            elif 'NO' in decoded_output.upper():
                decoded_output = 'NO'
            else:
                logits = model(input_ids, attention_mask=attention_mask).logits
                next_token_logits = logits[0, -1]
                yes_score = sum(next_token_logits[tid] for tid in yes_tokens)
                no_score = sum(next_token_logits[tid] for tid in no_tokens)
                decoded_output = 'YES' if yes_score > no_score else 'NO'

            responses.append(decoded_output)
            torch.cuda.empty_cache()

    return responses

In [11]:

def process_response(response):
    """
    This function processes model responses into binary labels.

    Inputs:
      response: the raw text response from the model

    Outputs:
      binary label (sexist/not sexist)
    """
    # Convert response to lowercase for case-insensitive matching
    response = response.lower().strip()
    #print(f"response: {response}")
    # Check for "yes" indicating sexist content
    if "yes" in response:
        return 1
    # Check for "no" indicating not sexist content
    elif "no" in response:
        return 0
    else:
        # invalid repsonse
        return None

In [12]:
import json
def test_random_samples_with_timing(model, tokenizer, test_df, prompt_template, num_samples=10):
    results = []
    device = next(model.parameters()).device
    tokenizer.pad_token = tokenizer.eos_token

    # Get equal number of samples from both classes
    samples_per_class = num_samples // 2

    # Get indices for both classes
    sexist_indices = test_df[test_df['label_sexist'] == 'sexist'].index
    not_sexist_indices = test_df[test_df['label_sexist'] == 'not sexist'].index

    # Randomly sample indices from both classes
    selected_sexist = random.sample(list(sexist_indices), min(samples_per_class, len(sexist_indices)))
    selected_not_sexist = random.sample(list(not_sexist_indices), min(samples_per_class, len(not_sexist_indices)))

    # Combine and shuffle the selected indices
    selected_indices = selected_sexist + selected_not_sexist
    random.shuffle(selected_indices)

    start_time = time.time()

    # Loop over the selected samples
    for i, idx in enumerate(selected_indices):
        # Get the sample text and true label
        sample_text = test_df.iloc[idx]['text']
        true_label = test_df.iloc[idx]['label_sexist']

        print(f"\nSample {i+1}/{len(selected_indices)}:")
        print(f"Selected text: {sample_text}")
        print(f"True label: {true_label}")

        # Format the prompt and generate response
        formatted_prompts = prepare_prompts([sample_text], prompt_template, tokenizer)

        # Print the formatted prompt
        input_ids = formatted_prompts[0]['input_ids'][0]
        formatted_text = tokenizer.decode(input_ids)

        # Move tensors to device
        for j in range(len(formatted_prompts)):
            formatted_prompts[j] = {k: v.to(device) for k, v in formatted_prompts[j].items()}

        responses = generate_responses(model, formatted_prompts)
        binary_label = process_response(responses[0])
        predicted_label = "sexist" if binary_label == 1 else "not sexist"

        print(f"Predicted label: {predicted_label}")
        print("-" * 80)

        # Save result
        result = {
            'sample_idx': idx,
            'text': sample_text,
            'true_label': true_label,
            'predicted_label': predicted_label,
            'raw_response': responses[0],
            'correct': predicted_label == true_label
        }
        results.append(result)

    total_time = time.time() - start_time

    # Calculate summary statistics
    correct_predictions = sum(result['correct'] for result in results)
    accuracy = correct_predictions / len(results)

    sexist_correct = sum(1 for r in results
                        if r['true_label'] == 'sexist' and r['correct'])
    not_sexist_correct = sum(1 for r in results
                            if r['true_label'] == 'not sexist' and r['correct'])

    sexist_total = sum(1 for r in results if r['true_label'] == 'sexist')
    not_sexist_total = sum(1 for r in results if r['true_label'] == 'not sexist')

    print("\nResults Summary:")
    print(f"Overall Accuracy: {accuracy:.2%}")
    print(f"Sexist Accuracy: {sexist_correct/sexist_total:.2%} ({sexist_correct}/{sexist_total})")
    print(f"Not Sexist Accuracy: {not_sexist_correct/not_sexist_total:.2%} ({not_sexist_correct}/{not_sexist_total})")

    # Add timestamp to results metadata
    test_metadata = {
        'timestamp': time.strftime('%Y-%m-%d_%H-%M-%S'),
        'model': model_card,
        'num_samples': num_samples,
        'overall_accuracy': float(accuracy),  # Convert to float for JSON serialization
        'sexist_accuracy': float(sexist_correct/sexist_total),
        'not_sexist_accuracy': float(not_sexist_correct/not_sexist_total),
        'execution_time': float(total_time),
        'avg_time_per_sample': float(total_time/len(results))
    }

    # Create results directory if it doesn't exist
    os.makedirs('results', exist_ok=True)

    # Save detailed results to CSV
    results_df = pd.DataFrame(results)
    results_df.to_csv(f'results/test_results_{test_metadata["timestamp"]}.csv', index=False)

    # Save metadata and summary to JSON
    with open(f'results/test_metadata_{test_metadata["timestamp"]}.json', 'w') as f:
        json.dump(test_metadata, f, indent=4)

    print(f"\nResults saved to:")
    print(f"- results/test_results_{test_metadata['timestamp']}.csv")
    print(f"- results/test_metadata_{test_metadata['timestamp']}.json")

    return results

## Run with Mistral

In [13]:
import time, random
from datetime import timedelta

# Run test on n samples
tokenizer = mistral_tokenizer
start_time = time.time()
mistral_results = test_random_samples_with_timing(mistral_model, tokenizer, test_df, prompt, num_samples=30)
total_time = time.time() - start_time
print(f"\nTotal execution time: {str(timedelta(seconds=total_time))}")
print(f"Average time per sample: {str(timedelta(seconds=total_time/100))}")
# Print summary of results
mistral_correct_predictions = sum(result['correct'] for result in mistral_results)
mistral_accuracy = mistral_correct_predictions / len(mistral_results)

print("\nTest Summary:")
print(f"Total samples tested: {len(mistral_results)}")
print(f"Accuracy: {mistral_accuracy:.2%}")
print(f"Results: {mistral_results}")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.



Sample 1/30:
Selected text: new male successes will arise out of the ashes of lost ones like comic culture, pc gaming, herioc fantasy, boyscouts etc. may they forever remained scorned and ridiculed by women.
True label: sexist
Predicted label: sexist
--------------------------------------------------------------------------------

Sample 2/30:
Selected text: Hahaha, what a cuckold. I would've just ghosted her. "I will not tell her what to do" YOU ARE IN A RELATIONSHIP WITH HER, ITS YOUR DUTY TO TELL HER WHAT TO DO. Cuckold motherfucker. Now it's too late, dump the bitch.
True label: sexist
Predicted label: sexist
--------------------------------------------------------------------------------

Sample 3/30:
Selected text: Why the fuck would you go for crazy bitches?
True label: sexist
Predicted label: sexist
--------------------------------------------------------------------------------

Sample 4/30:
Selected text: Women in the military is a joke. Almost any man could beat the shit ou

## Run with Phi-3

In [14]:
import time, random
from datetime import timedelta

# Run test on n samples
start_time = time.time()
phi3_results = test_random_samples_with_timing(phi3_model, phi3_tokenizer, test_df, prompt, num_samples=30)
total_time = time.time() - start_time
print(f"\nTotal execution time: {str(timedelta(seconds=total_time))}")
print(f"Average time per sample: {str(timedelta(seconds=total_time/100))}")
# Print summary of results
phi3_correct_predictions = sum(result['correct'] for result in phi3_results)
phi3_accuracy = phi3_correct_predictions / len(phi3_results)

print("\nTest Summary:")
print(f"Total samples tested: {len(phi3_results)}")
print(f"Accuracy: {phi3_accuracy:.2%}")
print(f"Results: {phi3_results}")

The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.
`get_max_cache()` is deprecated for all Cache classes. Use `get_max_cache_shape()` instead. Calling `get_max_cache()` will raise error from v4.48



Sample 1/30:
Selected text: Half the human population counts as a 'minority' to you? Some women don't give a shit about pushing paper as an office drone for 50 years of their life.
True label: sexist
Predicted label: sexist
--------------------------------------------------------------------------------

Sample 2/30:
Selected text: It hasn't for many because this greedy bastard had to try and get it all for himself.
True label: not sexist
Predicted label: sexist
--------------------------------------------------------------------------------

Sample 3/30:
Selected text: "Only love can beat hate!" It was not love that drove the Ottomans from Europe and liberated those they subjugated and oppressed, it was the winged hussars might and bravery.
True label: not sexist
Predicted label: not sexist
--------------------------------------------------------------------------------

Sample 4/30:
Selected text: Chelsea Clinton leaned over to her husband and said something about Melania. Not sure 

## Run with Qwen-2

In [15]:
import time, random
from datetime import timedelta

# Run test on n samples
start_time = time.time()
qwen_results = test_random_samples_with_timing(qwen_model, qwen_tokenizer, test_df, prompt, num_samples=30)
total_time = time.time() - start_time
print(f"\nTotal execution time: {str(timedelta(seconds=total_time))}")
print(f"Average time per sample: {str(timedelta(seconds=total_time/100))}")
# Print summary of results
qwen_correct_predictions = sum(result['correct'] for result in qwen_results)
qwen_accuracy = qwen_correct_predictions / len(qwen_results)

print("\nTest Summary:")
print(f"Total samples tested: {len(qwen_results)}")
print(f"Accuracy: {qwen_accuracy:.2%}")
print(f"Results: {qwen_results}")


Sample 1/30:
Selected text: The black African will soon be labeling her a racist. Race card coming soon
True label: not sexist




Predicted label: not sexist
--------------------------------------------------------------------------------

Sample 2/30:
Selected text: I'd be shocked if it ever worked, usually these thots are getting stretched on the side, and after Leafyishere did the content nuke on her, the tier 3 subs lost it. She has a boyfriend is now etched in their collective memories
True label: sexist
Predicted label: sexist
--------------------------------------------------------------------------------

Sample 3/30:
Selected text: As an Eagle Scout, I'm less than pleased with all of this. If women wanted to destroy the BSA, they've done it.
True label: sexist
Predicted label: sexist
--------------------------------------------------------------------------------

Sample 4/30:
Selected text: Na he's not flirting with her so she's reading him as gay.
True label: not sexist
Predicted label: not sexist
--------------------------------------------------------------------------------

Sample 5/30:
Selected te

In [16]:
print(qwen_results)

[{'sample_idx': 10, 'text': 'The black African will soon be labeling her a racist. Race card coming soon', 'true_label': 'not sexist', 'predicted_label': 'not sexist', 'raw_response': 'NO', 'correct': True}, {'sample_idx': 68, 'text': "I'd be shocked if it ever worked, usually these thots are getting stretched on the side, and after Leafyishere did the content nuke on her, the tier 3 subs lost it. She has a boyfriend is now etched in their collective memories", 'true_label': 'sexist', 'predicted_label': 'sexist', 'raw_response': 'YES', 'correct': True}, {'sample_idx': 154, 'text': "As an Eagle Scout, I'm less than pleased with all of this. If women wanted to destroy the BSA, they've done it.", 'true_label': 'sexist', 'predicted_label': 'sexist', 'raw_response': 'YES', 'correct': True}, {'sample_idx': 17, 'text': "Na he's not flirting with her so she's reading him as gay.", 'true_label': 'not sexist', 'predicted_label': 'not sexist', 'raw_response': 'NO', 'correct': True}, {'sample_idx'

# [Task 4] Metrics

In [17]:
def compute_metrics(results):
    """
    Compute metrics with proper error handling
    """
    # Check if results is empty
    if not results:
        return {
            'accuracy': 0.0,
            'fail_ratio': 0.0,
            'total_samples': 0,
            'failed_samples': 0,
            'correct_predictions': 0,
            'valid_predictions': 0,
            'sexist_accuracy': 0.0,
            'not_sexist_accuracy': 0.0,
            'sexist_correct': 0,
            'sexist_total': 0,
            'not_sexist_correct': 0,
            'not_sexist_total': 0
        }

    total_samples = len(results)
    failed_samples = 0
    correct_predictions = 0
    sexist_correct = 0
    sexist_total = 0
    not_sexist_correct = 0
    not_sexist_total = 0

    for result in results:
        if result['true_label'] == 'sexist':
            sexist_total += 1
            if result['correct']:
                sexist_correct += 1
        else:
            not_sexist_total += 1
            if result['correct']:
                not_sexist_correct += 1

        raw_response = result['raw_response'].strip().upper()
        if raw_response not in ['YES', 'NO']:
            failed_samples += 1
            continue

        if result['correct']:
            correct_predictions += 1

    valid_predictions = total_samples - failed_samples
    fail_ratio = failed_samples / total_samples if total_samples > 0 else 0.0
    accuracy = correct_predictions / valid_predictions if valid_predictions > 0 else 0.0
    sexist_accuracy = sexist_correct / sexist_total if sexist_total > 0 else 0.0
    not_sexist_accuracy = not_sexist_correct / not_sexist_total if not_sexist_total > 0 else 0.0

    return {
        'accuracy': accuracy,
        'fail_ratio': fail_ratio,
        'total_samples': total_samples,
        'failed_samples': failed_samples,
        'correct_predictions': correct_predictions,
        'valid_predictions': valid_predictions,
        'sexist_accuracy': sexist_accuracy,
        'not_sexist_accuracy': not_sexist_accuracy,
        'sexist_correct': sexist_correct,
        'sexist_total': sexist_total,
        'not_sexist_correct': not_sexist_correct,
        'not_sexist_total': not_sexist_total
    }


In [18]:
def print_model_results(model_name, results):
    """
    Pretty print the metrics results for a given model
    """
    metrics = compute_metrics(results)

    # Create a styled header
    header = f"\n{'='*50}"
    model_title = f"| {model_name.upper()} RESULTS |"

    print(header)
    print(model_title.center(50))
    print(header)

    # Print main metrics
    print("\nMAIN METRICS:")
    print(f"• Overall Accuracy: {metrics['accuracy']:.2%}")
    print(f"• Fail Ratio: {metrics['fail_ratio']:.2%}")

    # Print detailed counts
    print("\nDETAILED COUNTS:")
    print(f"• Total Samples: {metrics['total_samples']}")
    print(f"• Valid Predictions: {metrics['valid_predictions']}")
    print(f"• Failed Responses: {metrics['failed_samples']}")
    print(f"• Correct Predictions: {metrics['correct_predictions']}")

    # Print class-specific metrics
    print("\nCLASS-SPECIFIC PERFORMANCE:")
    print(f"• Sexist Accuracy: {metrics['sexist_accuracy']:.2%} "
          f"({metrics['sexist_correct']}/{metrics['sexist_total']})")
    print(f"• Not Sexist Accuracy: {metrics['not_sexist_accuracy']:.2%} "
          f"({metrics['not_sexist_correct']}/{metrics['not_sexist_total']})")

    print(header + "\n")

    return metrics

# Print results for each model
print_model_results("Mistral", mistral_results)
print_model_results("Phi-3", phi3_results)
print_model_results("Qwen", qwen_results)


               | MISTRAL RESULTS |                


MAIN METRICS:
• Overall Accuracy: 56.67%
• Fail Ratio: 0.00%

DETAILED COUNTS:
• Total Samples: 30
• Valid Predictions: 30
• Failed Responses: 0
• Correct Predictions: 17

CLASS-SPECIFIC PERFORMANCE:
• Sexist Accuracy: 93.33% (14/15)
• Not Sexist Accuracy: 20.00% (3/15)



                | PHI-3 RESULTS |                 


MAIN METRICS:
• Overall Accuracy: 46.67%
• Fail Ratio: 0.00%

DETAILED COUNTS:
• Total Samples: 30
• Valid Predictions: 30
• Failed Responses: 0
• Correct Predictions: 14

CLASS-SPECIFIC PERFORMANCE:
• Sexist Accuracy: 60.00% (9/15)
• Not Sexist Accuracy: 33.33% (5/15)



                 | QWEN RESULTS |                 


MAIN METRICS:
• Overall Accuracy: 73.33%
• Fail Ratio: 0.00%

DETAILED COUNTS:
• Total Samples: 30
• Valid Predictions: 30
• Failed Responses: 0
• Correct Predictions: 22

CLASS-SPECIFIC PERFORMANCE:
• Sexist Accuracy: 93.33% (14/15)
• Not Sexist Accuracy: 53.33% (8/15)




{'accuracy': 0.7333333333333333,
 'fail_ratio': 0.0,
 'total_samples': 30,
 'failed_samples': 0,
 'correct_predictions': 22,
 'valid_predictions': 30,
 'sexist_accuracy': 0.9333333333333333,
 'not_sexist_accuracy': 0.5333333333333333,
 'sexist_correct': 14,
 'sexist_total': 15,
 'not_sexist_correct': 8,
 'not_sexist_total': 15}

In [19]:

# Print results for each model
print_model_results("Mistral", mistral_results)
print_model_results("Phi-3", phi3_results)
print_model_results("Qwen", qwen_results)


               | MISTRAL RESULTS |                


MAIN METRICS:
• Overall Accuracy: 56.67%
• Fail Ratio: 0.00%

DETAILED COUNTS:
• Total Samples: 30
• Valid Predictions: 30
• Failed Responses: 0
• Correct Predictions: 17

CLASS-SPECIFIC PERFORMANCE:
• Sexist Accuracy: 93.33% (14/15)
• Not Sexist Accuracy: 20.00% (3/15)



                | PHI-3 RESULTS |                 


MAIN METRICS:
• Overall Accuracy: 46.67%
• Fail Ratio: 0.00%

DETAILED COUNTS:
• Total Samples: 30
• Valid Predictions: 30
• Failed Responses: 0
• Correct Predictions: 14

CLASS-SPECIFIC PERFORMANCE:
• Sexist Accuracy: 60.00% (9/15)
• Not Sexist Accuracy: 33.33% (5/15)



                 | QWEN RESULTS |                 


MAIN METRICS:
• Overall Accuracy: 73.33%
• Fail Ratio: 0.00%

DETAILED COUNTS:
• Total Samples: 30
• Valid Predictions: 30
• Failed Responses: 0
• Correct Predictions: 22

CLASS-SPECIFIC PERFORMANCE:
• Sexist Accuracy: 93.33% (14/15)
• Not Sexist Accuracy: 53.33% (8/15)




{'accuracy': 0.7333333333333333,
 'fail_ratio': 0.0,
 'total_samples': 30,
 'failed_samples': 0,
 'correct_predictions': 22,
 'valid_predictions': 30,
 'sexist_accuracy': 0.9333333333333333,
 'not_sexist_accuracy': 0.5333333333333333,
 'sexist_correct': 14,
 'sexist_total': 15,
 'not_sexist_correct': 8,
 'not_sexist_total': 15}

# [Task 5] Few-show inference

In [20]:
import pandas as pd

# Load the demonstrations.csv
url = "https://raw.githubusercontent.com/nlp-unibo/nlp-course-material/refs/heads/main/2024-2025/Assignment%202/data/demonstrations.csv"
demonstrations = pd.read_csv(url)
print(demonstrations.head())

                  rewire_id  \
0  sexism2022_english-10348   
1   sexism2022_english-7950   
2   sexism2022_english-7742   
3  sexism2022_english-16170   
4   sexism2022_english-9857   

                                                text label_sexist  
0  I would never have my kid go to a daycare with...   not sexist  
1           Democrat women are so butt fucking ugly.       sexist  
2  If you really wanted to twist the knife you’d ...   not sexist  
3  Those women are pro-rape. They didn’t care abo...       sexist  
4  Married a bitch... Is it the standard scenario...       sexist  


In [21]:
prompt = [
    {
        'role': 'system',
        'content': 'You are an annotator for sexism detection.'
    },
    {
        'role': 'user',
        'content': """Your task is to classify input text as containing sexism or not. Respond only YES or NO.

        EXAMPLES:
        {examples}

        TEXT:
        {text}

        ANSWER:
        """
    }
]

In [22]:
def build_few_shot_demonstrations(demonstrations, num_shots):
    """
    Build demonstrations with specified number of shots per class.

    Args:
        demonstrations (DataFrame): DataFrame containing demonstration examples
        num_shots (int): Number of examples per class to include
    """
    # Filter and sample demonstrations
    sexist_examples = demonstrations[demonstrations['label_sexist'] == 'sexist'].sample(num_shots)
    not_sexist_examples = demonstrations[demonstrations['label_sexist'] == 'not sexist'].sample(num_shots)

    # Format examples into a string
    examples = ""
    for _, row in sexist_examples.iterrows():
        examples += f"TEXT: **{row['text']}**\nANSWER: YES\n"
    for _, row in not_sexist_examples.iterrows():
        examples += f"TEXT: **{row['text']}**\nANSWER: NO\n"

    return examples

In [23]:
def create_prompt(text, demonstrations):
    num_per_class = 2  # or any other number you choose
    examples = build_few_shot_demonstrations(demonstrations, num_per_class)

    prompt = [
        {
            'role': 'system',
            'content': 'You are an annotator for sexism detection.'
        },
        {
            'role': 'user',
            'content': f"""Your task is to classify input text as containing sexism or not. Respond only YES or NO.

            EXAMPLES:
            {examples}

            TEXT:
            {text}

            ANSWER:
            """
        }
    ]
    return prompt

In [24]:
# Test example
sample_text = "Women belong in the kitchen, not in the workplace."

# Create a prompt with few-shot examples
test_prompt = create_prompt(sample_text, demonstrations)

# Print the formatted prompt to see how it looks
print("Generated Prompt Structure:")
print("\nSystem Role:")
print(test_prompt[0]['content'])
print("\nUser Role:")
print(test_prompt[1]['content'])

Generated Prompt Structure:

System Role:
You are an annotator for sexism detection.

User Role:
Your task is to classify input text as containing sexism or not. Respond only YES or NO.

            EXAMPLES:
            TEXT: **Fuck the single mother, she made the choice to have children, we didn't choose our fate.**
ANSWER: YES
TEXT: **ROFL!! Yes! I will never look that video in the same way. My interpretation: Never let her control shit. HB10 has nothing on me. Push her off and take the reins. Strongest reality wins and I'm riding this one. lmao.**
ANSWER: YES
TEXT: **Shame, shame, your white privilege is showing.**
ANSWER: NO
TEXT: **my comment says i don’t agree with her stabbing him**
ANSWER: NO


            TEXT:
            Women belong in the kitchen, not in the workplace.

            ANSWER:
            


In [25]:
# Run few-shot inference with Mistral
import time, random
from datetime import timedelta

def test_n_shot_samples(model, tokenizer, test_df, demonstrations, num_shots, num_samples=30):
    """
    Test model performance with specified number of shots and error handling
    """
    try:
        results = []
        device = next(model.parameters()).device
        tokenizer.pad_token = tokenizer.eos_token

        # Sample selection logic
        samples_per_class = num_samples // 2
        sexist_indices = test_df[test_df['label_sexist'] == 'sexist'].index
        not_sexist_indices = test_df[test_df['label_sexist'] == 'not sexist'].index

        selected_sexist = random.sample(list(sexist_indices), min(samples_per_class, len(sexist_indices)))
        selected_not_sexist = random.sample(list(not_sexist_indices), min(samples_per_class, len(not_sexist_indices)))
        selected_indices = selected_sexist + selected_not_sexist
        random.shuffle(selected_indices)

        for i, idx in enumerate(selected_indices):
            sample_text = test_df.iloc[idx]['text']
            true_label = test_df.iloc[idx]['label_sexist']

            # Create n-shot prompt
            examples = build_few_shot_demonstrations(demonstrations, num_shots)
            few_shot_prompt = [
                {'role': 'system', 'content': 'You are an annotator for sexism detection.'},
                {'role': 'user', 'content': f"""Your task is to classify input text as containing sexism or not. Respond only YES or NO.

                EXAMPLES:
                {examples}

                TEXT:
                {sample_text}

                ANSWER:
                """}
            ]

            formatted_prompts = prepare_prompts([sample_text], few_shot_prompt, tokenizer)

            # Move tensors to device
            for j in range(len(formatted_prompts)):
                formatted_prompts[j] = {k: v.to(device) for k, v in formatted_prompts[j].items()}

            responses = generate_responses(model, formatted_prompts)
            binary_label = process_response(responses[0])
            predicted_label = "sexist" if binary_label == 1 else "not sexist"

            results.append({
                'sample_idx': idx,
                'text': sample_text,
                'true_label': true_label,
                'predicted_label': predicted_label,
                'raw_response': responses[0],
                'correct': predicted_label == true_label
            })

            print(f"\t\t\tProgress: {i+1}/{len(selected_indices)} samples processed")
            torch.cuda.empty_cache()

        return results

    except Exception as e:
        print(f"Error during testing with {num_shots} shots: {str(e)}")
        return []


In [26]:
def print_shot_comparison(results_dict, model_name):
    """
    Print comparison of different numbers of shots

    Args:
        results_dict: Dictionary with number of shots as keys and results as values
        model_name: Name of the model being evaluated
    """
    header = f"\n{'='*90}"
    model_title = f"| {model_name.upper()} PERFORMANCE ACROSS DIFFERENT NUMBERS OF SHOTS |"

    print(header)
    print(model_title.center(90))
    print(header)

    # Print metrics table
    print("\nPERFORMANCE METRICS:")
    print(f"{'Metric':<20}", end='')
    for n_shots in sorted(results_dict.keys()):
        print(f"{f'{n_shots}-shot':>12}", end='')
    print()
    print("-" * 90)

    # Get metrics for each shot count
    metrics_by_shots = {
        n_shots: compute_metrics(results)
        for n_shots, results in results_dict.items()
    }

    # Print accuracy metrics
    print(f"Overall Accuracy:", end='')
    for n_shots in sorted(results_dict.keys()):
        print(f"{metrics_by_shots[n_shots]['accuracy']:>12.1%}", end='')
    print()

    print(f"Sexist Accuracy:", end='')
    for n_shots in sorted(results_dict.keys()):
        print(f"{metrics_by_shots[n_shots]['sexist_accuracy']:>12.1%}", end='')
    print()

    print(f"Not Sexist Accuracy:", end='')
    for n_shots in sorted(results_dict.keys()):
        print(f"{metrics_by_shots[n_shots]['not_sexist_accuracy']:>12.1%}", end='')
    print()

    print(header + "\n")

    return metrics_by_shots


In [27]:

# Test with different numbers of shots
shot_counts = [2, 4, 6, 8, 10]
test_samples = 50

# Initialize results dictionaries
mistral_shot_results = {}
phi3_shot_results = {}
qwen_shot_results = {}

# Test each model with progress tracking
for model_name, model_tuple in [
    ("Mistral", (mistral_model, mistral_tokenizer, mistral_shot_results)),
    ("Phi-3", (phi3_model, phi3_tokenizer, phi3_shot_results)),
    ("Qwen-2", (qwen_model, qwen_tokenizer, qwen_shot_results))
]:
    print(f"\nTesting {model_name}...")
    model, tokenizer, results_dict = model_tuple

    for n_shots in shot_counts:
        print(f"\n\tTesting with {n_shots} shots...")
        results = test_n_shot_samples(
            model,
            tokenizer,
            test_df,
            demonstrations,
            n_shots,
            test_samples
        )
        if results:  # Only store results if they're not empty
            results_dict[n_shots] = results
        print(f"Completed {n_shots}-shot testing for {model_name}")

# Print comparisons with error handling
print("\nResults Summary:")
for model_name, results_dict in [
    ("Mistral", mistral_shot_results),
    ("Phi-3", phi3_shot_results),
    ("Qwen-2", qwen_shot_results)
]:
    if results_dict:
        print(f"\n{model_name} Results:")
        metrics = print_shot_comparison(results_dict, model_name)
    else:
        print(f"\n{model_name}: No valid results collected")


Testing Mistral...

	Testing with 2 shots...
			Progress: 1/50 samples processed
			Progress: 2/50 samples processed
			Progress: 3/50 samples processed
			Progress: 4/50 samples processed
			Progress: 5/50 samples processed
			Progress: 6/50 samples processed
			Progress: 7/50 samples processed
			Progress: 8/50 samples processed
			Progress: 9/50 samples processed
			Progress: 10/50 samples processed
			Progress: 11/50 samples processed
			Progress: 12/50 samples processed
			Progress: 13/50 samples processed
			Progress: 14/50 samples processed
			Progress: 15/50 samples processed
			Progress: 16/50 samples processed
			Progress: 17/50 samples processed
			Progress: 18/50 samples processed
			Progress: 19/50 samples processed
			Progress: 20/50 samples processed
			Progress: 21/50 samples processed
			Progress: 22/50 samples processed
			Progress: 23/50 samples processed
			Progress: 24/50 samples processed
			Progress: 25/50 samples processed
			Progress: 26/50 samples processed
	



			Progress: 1/50 samples processed
			Progress: 2/50 samples processed
			Progress: 3/50 samples processed
			Progress: 4/50 samples processed
			Progress: 5/50 samples processed
			Progress: 6/50 samples processed
			Progress: 7/50 samples processed
			Progress: 8/50 samples processed
			Progress: 9/50 samples processed
			Progress: 10/50 samples processed
			Progress: 11/50 samples processed
			Progress: 12/50 samples processed
			Progress: 13/50 samples processed
			Progress: 14/50 samples processed
			Progress: 15/50 samples processed
			Progress: 16/50 samples processed
			Progress: 17/50 samples processed
			Progress: 18/50 samples processed
			Progress: 19/50 samples processed
			Progress: 20/50 samples processed
			Progress: 21/50 samples processed
			Progress: 22/50 samples processed
			Progress: 23/50 samples processed
			Progress: 24/50 samples processed
			Progress: 25/50 samples processed
			Progress: 26/50 samples processed
			Progress: 27/50 samples processed
			Progres

# [Task 6] Error Analysis

# [Task 7] Report