# Intro

This notebook is part of a series of notebooks that reuse open-source LLMs to perform a binary classification task.

Each notebook can be run completely independently of the others; apart from dataset_utils.py, they share no local dependencies. (As a result,
you can expect a little code redundancy between notebooks.)

**The task is to detect toxic comments among text comments collected from various news websites.**

For more information, see dataset_utils.py or search for 'Civil Comments dataset' online.

-----
This notebook loads models locally via the Hugging Face Transformers package and **performs zero-shot classification**.

In [4]:
from typing import Iterable, Mapping, Any
from tqdm import tqdm

import torch
import evaluate
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer

from utils import dataset_utils

Datasets cache is False


# Load Dataset

In [2]:
# Quick-experiment mode: only a sample of the dataset is loaded, not the full data!
comments_dataset = dataset_utils.load_sampled_ds(
    ds_size=200,
)

Downloading readme:   0%|          | 0.00/7.73k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/194M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/187M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/20.8M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1804874 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/97320 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/97320 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [5]:
# Our dataset already comes with 3 splits ready to use

# Our target is the binary 'is_toxic' column
# The main feature we'll use is the free-text 'text' column
comments_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'toxicity', 'severe_toxicity', 'obscene', 'threat', 'insult', 'identity_attack', 'sexual_explicit', 'is_toxic'],
        num_rows: 200
    })
    validation: Dataset({
        features: ['text', 'toxicity', 'severe_toxicity', 'obscene', 'threat', 'insult', 'identity_attack', 'sexual_explicit', 'is_toxic'],
        num_rows: 200
    })
    test: Dataset({
        features: ['text', 'toxicity', 'severe_toxicity', 'obscene', 'threat', 'insult', 'identity_attack', 'sexual_explicit', 'is_toxic'],
        num_rows: 200
    })
})

# Load model

In [6]:
# Pick a model, or add a different one you'd like to experiment with!
# For gated repos you need to log in first: huggingface-cli login
model_name = "Qwen/Qwen2.5-1.5B-Instruct" # Ok on T4 - https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct
#model_name = "microsoft/Phi-3.5-mini-instruct" # Ok on T4 - https://huggingface.co/microsoft/Phi-3.5-mini-instruct
#model_name = "HuggingFaceTB/SmolLM2-1.7B-Instruct" # not tested, should be ok on T4 - https://huggingface.co/HuggingFaceTB/SmolLM2-1.7B-Instruct
#model_name = "meta-llama/Llama-3.2-1B" # not tested, should be ok on T4 - https://huggingface.co/meta-llama/Llama-3.2-1B
#model_name = "facebook/MobileLLM-1B"  # not tested, should be ok on T4 - https://huggingface.co/facebook/MobileLLM-1B
#model_name = "google/gemma-2-2b-it"  # not tested, should be ok on T4 - https://huggingface.co/google/gemma-2-2b-it
#model_name = "mistralai/Mistral-7B-Instruct-v0.2" # Ok on A100 - https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2
#model_name = "Qwen/Qwen2.5-7B-Instruct" # Ok on A100 - https://huggingface.co/Qwen/Qwen2.5-7B-Instruct
#model_name = "meta-llama/Llama-3.2-3B-Instruct" # Ok on A100 - https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto"
)

print(f"model is mapped to {model.device}")
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    padding_side='left' # See tokenise_batch_and_generate_answer for explanations!
)

# Prompts are tokenised in padded batches further down, which requires a pad token.
# Some tokenizers ship without one; reusing the EOS token is the usual fallback.
# This is a no-op when the tokenizer already defines a pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

config.json:   0%|          | 0.00/660 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

model is mapped to cuda:0


tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

# Setup Zero-Shot predictions

In [11]:
# Format comment 'text' field into a _user_ prompt 
def build_basic_prompt(example: Mapping[str, Any]):
    """
    Turn one dataset example into a yes/no toxicity question for the model.

    The example's "text" value is wrapped in single quotes and placed between
    a fixed question and a fixed answering instruction.

    Returns a dict with a single "prompt" key holding the full prompt string.
    """
    # [Exercise] How to improve the prompt being used here?
    # [Exercise] Let's say we find the model is biased towards always answering 'No', how would we tune the prompt to steer it the other direction?
    question = "Is the following comment in quotes toxic?"
    answer_instruction = "Please answer by Yes only or No only, nothing else."
    quoted_comment = f"'{example['text']}'."

    return {"prompt": " ".join([question, quoted_comment, answer_instruction])}

# Add a 'prompt' column to every split of the dataset
comments_dataset = comments_dataset.map(function=build_basic_prompt)

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [12]:
# Tokenise in batches to optimise for speed!
def tokenise_batch_and_generate_answer(example_batch: Mapping[str, Iterable[Any]], print_intermediate_outputs: bool = False):
    """
    Simple implementation of zero-shot classification on an input batch of examples.

    Each prompt is wrapped in a single-turn chat, the batch is tokenised with the
    tokenizer's chat template, the model generates a short answer, and the decoded
    answer is parsed into a binary output.

    Relies on the notebook-level `tokenizer` and `model` objects.

    Args:
        example_batch: Batch in columnar form; its "prompt" entry holds the user prompts.
        print_intermediate_outputs: When True, prints the intermediate outputs
            (templated input, raw model output, decoded answer, final boolean)
            for the first example of the batch.

    Returns:
        A dict {"prediction": [...]} with one bool per prompt: True when the decoded
        answer contains "yes" (case-insensitive), False otherwise - including when
        the model answers neither Yes nor No.
    """

    messages_batch = [
      # [Exercise] Some models can expect in their template a system prompt to tune their behaviour, we can try with and without and observe
      [
        # {"role": "system", "content": ""},
        {"role": "user", "content": prompt}
      ]
      for prompt in example_batch["prompt"]
    ]

    # Nothing to tokenise or generate for an empty batch
    if not messages_batch:
        return {"prediction": []}

    # !!! We are not only doing text autocompletion here but we need to make sure
    # the prompts we are creating correspond to the 'template' the model
    # learnt (different chat templates for different models)
    inputs_batch = tokenizer.apply_chat_template(
        messages_batch,
        add_generation_prompt=True, # indicate we are expecting an answer from the model
        return_dict=True,
        return_tensors="pt",
        truncation=True, # inputs that are too long will be truncated, we should check the context size of the model
        padding=True, # not all inputs have the same length,
        # make sure tokeniser has padding_side='left' because we are padding all inputs to the same size
    )

    if print_intermediate_outputs:
        print("---")
        print("Here is the first input of the batch to the model after applying chat template:")
        print("---")
        # Render with the same add_generation_prompt setting as the real inputs above,
        # otherwise the text shown here is not what the model actually receives
        print(tokenizer.apply_chat_template(messages_batch[0], tokenize=False, add_generation_prompt=True))

    # Mapping input to GPU for faster processing
    inputs_mapped_to_device_batch = {k: v.to(model.device) for k, v in inputs_batch.items()}

    # [Exercise] If you added a 'long' initial prompt, you are recomputing the same initial attention
    # values multiple times, how can you optimise for processing speed?
    generated_ids_batch = model.generate(
        **inputs_mapped_to_device_batch,
        do_sample=False, # no need to be creative here
        max_new_tokens=5 # We are expecting even less (1 word and perhaps some punctuation)
    )

    if print_intermediate_outputs:
        print("---")
        print("Here is the output of the model after applying chat template:")
        print("---")
        print(tokenizer.batch_decode(generated_ids_batch, skip_special_tokens=True)[0])

    # Generated_ids also contain input_ids that we need to filter out:
    # keep only the columns that come after the (padded) prompt length
    prompt_length = inputs_batch["input_ids"].shape[1]
    generated_ids_batch = generated_ids_batch[:, prompt_length:]

    decoded_answers_batch = tokenizer.batch_decode(generated_ids_batch, skip_special_tokens=True)

    if print_intermediate_outputs:
        print("---")
        print("Here is the decoded answer, after removing the inputs")
        print("---")
        print(decoded_answers_batch[0])

    # !!! we are returning False if the model doesn't say Yes or No back
    # [Exercise] How can we constrain the model to return only certain tokens?
    def get_binary_output(answer: str) -> bool:
        return 'yes' in answer.lower()

    binary_output_batch = [get_binary_output(answer) for answer in decoded_answers_batch]

    if print_intermediate_outputs:
        print("---")
        print("Here is the final binary output:")
        print("---")
        print(f"is_toxic = {binary_output_batch[0]}")

    return {"prediction": binary_output_batch}

# Sanity-check the whole pipeline on a tiny hand-written batch, printing every intermediate step
tokenise_batch_and_generate_answer(
    {
        "prompt": [
            build_basic_prompt({"text": sample_text})["prompt"]
            for sample_text in ("I hate you, you are dumb", "I love you")
        ]
    },
    print_intermediate_outputs=True,
)


---
Here is the first input of the batch to the model after applying chat template:
---
<|im_start|>system
You are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>
<|im_start|>user
Is the following comment in quotes toxic? 'I hate you, you are dumb'. Please answer by Yes only or No only, nothing else.<|im_end|>





---
Here is the output of the model after applying chat template:
---
system
You are Qwen, created by Alibaba Cloud. You are a helpful assistant.
user
Is the following comment in quotes toxic? 'I hate you, you are dumb'. Please answer by Yes only or No only, nothing else.
assistant
Yes
---
Here is the decoded answer, after removing the inputs
---
Yes
---
Here is the final binary output:
---
is_toxic = True


{'prediction': [True, False]}

In [13]:
# Run the zero-shot classifier over the dataset, batch by batch
# Tune the parameters below to fit the memory/storage constraints of your environment
# [Exercise] Is your GPU memory well utilised? if not, how to tune parameters towards better GPU utilisation?
comments_dataset = comments_dataset.map(
    tokenise_batch_and_generate_answer,
    batched=True,
    batch_size=16,
    writer_batch_size=16,
    keep_in_memory=True,
    load_from_cache_file=False,
)

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

# Evaluate

In [14]:
# Bundle the classification metrics so that a single compute() call reports all of them
clf_metrics = evaluate.combine(
    [
        "accuracy",
        "f1",
        "precision",
        "recall",
    ]
)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.55k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

In [15]:
# Zero-shot accuracy is not expected to be awesome, but the predictions at least single out
# a few examples, so in a way they are more useful than a dummy baseline
clf_metrics.compute(
    predictions=comments_dataset["validation"]["prediction"],
    references=comments_dataset["validation"]["is_toxic"],
)

{'accuracy': 0.39,
 'f1': 0.24691358024691357,
 'precision': 0.14285714285714285,
 'recall': 0.9090909090909091}

# Final test

In [16]:
# Once you're happy with your tuning, evaluate on the test set and report your results on the sheet!
clf_metrics.compute(
    predictions=comments_dataset["test"]["prediction"],
    references=comments_dataset["test"]["is_toxic"],
)

{'accuracy': 0.325,
 'f1': 0.15094339622641506,
 'precision': 0.08163265306122448,
 'recall': 1.0}