## Classifier: In-Context Learning

In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import load_dataset, load_from_disk
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tqdm import tqdm

import numpy as np
import os
import time
import random
import torch

  from .autonotebook import tqdm as notebook_tqdm


## Define Parameters

In [2]:
# Expose only physical GPU 0 to this process.
# NOTE(review): this runs after `import torch`; it presumably still takes effect
# because CUDA is only initialised on first use - confirm on the target machine.
os.environ.update({"CUDA_VISIBLE_DEVICES": "0"})

# Prefer that GPU when CUDA is usable, otherwise fall back to the CPU
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

In [3]:
# Experiment settings. Each trailing comment lists the alternative values.
# NOTE(review): none of the cells shown in this notebook read these constants;
# the evaluation loop builds its dataset path from a hard-coded string instead.
SAMPLING = "random" # options: "random", "stratified", "clustered", "shared_domain"
SUFFIX = "_extended" # options: "", "_holdout", "_extended"
SPLIT = "test" # options: "train", "test", "holdout", "extended"
MAX_CONTENT_LENGTH = 384 # alternatives: 496, 192
OVERLAP = 64
FEATURES = "url_and_content" # options: "url", "content", "url_and_content"

In [4]:
# Topics to evaluate; each one is interpolated into the prompt and the dataset path
topics = ["cannabis", "energie", "kinder"]

# Field order of one model specification row
_MODEL_FIELDS = ("name", "model", "tokenizer_class", "model_class")

# One row per candidate checkpoint; only the uncommented rows are loaded and evaluated.
_MODEL_SPECS = [
    ("aya-101", "CohereForAI/aya-101", "AutoTokenizer", "AutoModelForSeq2SeqLM"),
    # ("vicuna-13b", "lmsys/vicuna-13b-v1.5", "LlamaTokenizer", "LlamaForCausalLM"),
    # ("vicuna-7b", "lmsys/vicuna-7b-v1.5", "LlamaTokenizer", "LlamaForCausalLM"),
    # ("FLAN-t5-base", "google/flan-t5-base", "AutoTokenizer", "AutoModelForSeq2SeqLM"),
    # ("FLAN-t5-large", "google/flan-t5-large", "AutoTokenizer", "AutoModelForSeq2SeqLM"),
    # ("FLAN-t5-xxl", "google/flan-t5-xxl", "AutoTokenizer", "AutoModelForSeq2SeqLM"),
    # ("leo-hessianai-13b", "LeoLM/leo-hessianai-13b", "AutoTokenizer", "AutoModelForCausalLM"),
    # ("leo-hessianai-7b", "LeoLM/leo-hessianai-7b", "AutoTokenizer", "AutoModelForCausalLM"),
]

# Expand every row into the dict layout consumed by the model-loading helper
models = [dict(zip(_MODEL_FIELDS, spec)) for spec in _MODEL_SPECS]

## Load Model

In [5]:
# # Load model directly
# from transformers import AutoTokenizer, AutoModelForCausalLM

# tokenizer = AutoTokenizer.from_pretrained("LeoLM/leo-hessianai-13b", trust_remote_code=True)
# model = AutoModelForCausalLM.from_pretrained("LeoLM/leo-hessianai-13b", trust_remote_code=True, device_map="auto", load_in_8bit=True)

In [6]:
# # Load model directly
# from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# model_name = "CohereForAI/aya-101"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForSeq2SeqLM.from_pretrained(
#     model_name, device_map="auto", load_in_8bit=True)

In [7]:
# model_name = "lmsys/vicuna-13b-v1.5" #"lmsys/vicuna-13b-v1.5"

# from transformers import LlamaTokenizer, LlamaForCausalLM
# tokenizer = LlamaTokenizer.from_pretrained(model_name)
# model = LlamaForCausalLM.from_pretrained(model_name, device_map="auto", load_in_8bit=True)

## Define Prompt Template:

In [8]:
# Zero-shot prompt with three placeholders: {lang}, {topic} and {webpage_text}
PROMPT_TEMPLATE = (
    "Given the following text in {lang}, does it contain information about '{topic}'? "
    "Please answer with 'Yes' or 'No' only.\n"
    "\n"
    'Text: "{webpage_text}"\n'
    "\n"
    "Answer:"
)

# Smoke-test the template by rendering it with placeholder content
_example_fields = {
    "topic": "Cannabis",
    "lang": 'German',
    "webpage_text": 'Lorem ipsum dolor sit amet, consectetur adipiscing elit.',
}
prompt_test = PROMPT_TEMPLATE.format(**_example_fields)
print(prompt_test)

Given the following text in German, does it contain information about 'Cannabis'? Please answer with 'Yes' or 'No' only.

Text: "Lorem ipsum dolor sit amet, consectetur adipiscing elit."

Answer:


## Define Parameter for Text Generation

Each parameter influences the text generation in a specific way. Below are the parameters along with a brief explanation:

**`max_length`**:
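* Sets the maximum total number of tokens of the sequence; for decoder-only models this count includes the prompt (the Hugging Face default is 20). It is overridden by `max_new_tokens` when that is also set.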
* Generation stops if the maximum length is reached before the model produces an EOS token.
* A higher `max_length` allows for longer generated texts but may increase the time and computational resources required.

**`min_length`**:
* Sets the minimum number of tokens in the generated sequence (the Hugging Face default is 0).
* The EOS token is suppressed until this minimum length is reached, so generation cannot stop earlier.

**`num_beams`**:
* In beam search, sets the number of "beams" or hypotheses to keep at each step (the Hugging Face default is 1, i.e. no beam search).
* A higher number of beams increases the chances of finding a good output but also increases the computational cost.

**`num_return_sequences`**:
* Specifies the number of independently computed sequences to return for each input (the Hugging Face default is 1).
* When using sampling, multiple different sequences are generated independently from each other.

**`early_stopping`**:
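* Controls when beam search stops (the Hugging Face default is `False`). With `True`, generation ends as soon as `num_beams` complete candidates exist; with `False`, a heuristic ends it once better candidates are very unlikely.
* It only affects beam-based decoding. Stopping at the EOS (End Of Sequence) token (often represented as `</s>`) happens regardless of this setting.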

**`do_sample`**:
* When `True`, tokens are selected probabilistically based on their likelihood scores; when `False` (the Hugging Face default), decoding is greedy or pure beam search.
* Introduces randomness into the generation process for diverse outputs.
* The level of randomness is controlled by the 'temperature' parameter.

**`temperature`**:
* Adjusts the probability distribution used for sampling the next token (the Hugging Face default is 1.0; this notebook uses 0.6).
* Higher values make the generation more random, while lower values make it more deterministic.

**`top_k`**:
* Limits the number of tokens considered for sampling at each step to the top K most likely tokens (default is 50).
* Can make the generation process faster and more focused.

**`top_p`**:
* Also known as nucleus sampling, sets a cumulative probability threshold (the Hugging Face default is 1.0, which disables the filter).
* Tokens are sampled only from the smallest set whose cumulative probability exceeds this threshold.

**`repetition_penalty`**:
* Discourages the model from repeating the same token by modifying the token's score (the Hugging Face default is 1.0, i.e. no penalty).
* Values greater than 1.0 penalize repetitions, and values less than 1.0 encourage repetitions.


In [9]:
# Keyword arguments that are unpacked into `model.generate(...)` by generate_answers.
# Previously tried and currently disabled options: max_length=100, min_length=1,
# logprobs=1, n=1, best_of=1, num_beam_groups=2, repetition_penalty=1.0
params = dict(
    do_sample=True,
    early_stopping=True,
    num_beams=2,
    num_return_sequences=1,
    max_new_tokens=1024,
    min_new_tokens=1,
    output_scores=True,
    temperature=0.6,
    top_k=50,
    top_p=1.0,
)

## Helper Functions

In [10]:
def compile_prompt(article, template, topic, lang='German'):
    """Render the prompt template for a single article.

    Args:
        article: Mapping whose "text" entry is inserted as the webpage text
            (a missing entry is rendered as the string "None").
        template: Format string with {topic}, {lang} and {webpage_text} placeholders.
        topic: Topic the model is asked about.
        lang: Language name mentioned in the prompt.

    Returns:
        The formatted prompt string.
    """
    placeholders = {
        "topic": topic,
        "lang": lang,
        "webpage_text": article.get("text"),
    }
    return template.format(**placeholders)

In [11]:
def calculate_input_length(prompt):
    """Return how many tokens the prompt is encoded into.

    Relies on the notebook-level `tokenizer` variable; the prompt is encoded
    without special tokens, truncation or padding.

    Args:
        prompt: Text to measure.

    Returns:
        Size of dimension 1 of the encoded `input_ids`
        (presumably the sequence length of a (1, seq_len) tensor - TODO confirm).
    """
    encoding_options = dict(
        return_tensors="pt",
        add_special_tokens=False,
        truncation=False,
        padding=False,
    )
    token_ids = tokenizer(prompt, **encoding_options).input_ids
    return token_ids.size(1)

In [12]:
def generate_answers(model, tokenizer, prompt, params, remove_input=True):
    """Generate answers from a language model for a given prompt.

    Args:
        model: Model exposing `generate(input_ids, **params)`.
        tokenizer: Tokenizer exposing `encode`, `decode` and `model_max_length`.
        prompt: Prompt text to encode and feed to the model.
        params: Keyword arguments forwarded unchanged to `model.generate`.
        remove_input: When True, the prompt is stripped from each decoded output
            and surrounding whitespace is trimmed; when False the full decoded
            text is returned.

    Returns:
        List with one decoded string per generated sequence.
    """
    encoded_input = tokenizer.encode(prompt, return_tensors="pt")

    # Follow the model's own placement instead of hard-coding "cuda", so the
    # helper also works on CPU and with whatever device the model reports.
    target_device = getattr(model, "device", None)
    if target_device is not None:
        encoded_input = encoded_input.to(target_device)

    prompt_length = encoded_input.size()[1]
    if prompt_length > tokenizer.model_max_length:
        # The input is deliberately NOT truncated: cutting it would drop the
        # trailing "Answer:" cue of the prompt. Only report the overflow.
        print(f"Input has {prompt_length} tokens, exceeding the model maximum of "
              f"{tokenizer.model_max_length}; passing it on untruncated.")

    generated_outputs = model.generate(encoded_input, **params)
    # With return_dict_in_generate=True the token ids live in `.sequences`
    sequences = getattr(generated_outputs, "sequences", generated_outputs)

    # Decoder-only models echo the prompt tokens at the start of every output;
    # encoder-decoder models return only newly generated tokens.
    is_encoder_decoder = getattr(
        getattr(model, "config", None), "is_encoder_decoder", False)
    input_text_wo_st = tokenizer.decode(
        encoded_input[0], skip_special_tokens=True)

    outputs = []
    for output in sequences:
        if not remove_input:
            outputs.append(tokenizer.decode(output, skip_special_tokens=True))
            continue
        if not is_encoder_decoder:
            # Drop the prompt by token position, which does not depend on the
            # decoded text matching the original prompt exactly.
            output = output[prompt_length:]
        decoded_text = tokenizer.decode(output, skip_special_tokens=True)
        # Also remove a textual echo of the prompt, as before
        outputs.append(decoded_text.replace(input_text_wo_st, "").strip())

    return outputs

In [13]:
def parse_response(output_text):
    """Map the model's answer to a binary label: "Yes" -> 1, "No" -> 0.

    The first stand-alone word "yes" or "no" (case-insensitive) decides the
    label, so words that merely contain those letters ("not", "know", "eyes")
    are ignored.

    Args:
        output_text: Decoded model output.

    Returns:
        1 for a "yes" answer, 0 for a "no" answer.

    Raises:
        ValueError: If the text contains neither word. The exception is raised
            rather than returned so that it can never end up among the
            predictions.
    """
    import re  # local import keeps this helper self-contained

    match = re.search(r"\b(yes|no)\b", output_text.lower())
    if match is None:
        raise ValueError("Ambiguous response.")
    return 1 if match.group(1) == "yes" else 0

In [14]:
def calc_metrics(labels, preds):
    """Score binary predictions against the reference labels.

    Args:
        labels: Ground-truth labels.
        preds: Predicted labels, aligned with `labels`.

    Returns:
        Dict with the keys 'accuracy', 'precision', 'recall' and 'f1'; the last
        three are computed with average='binary'.
    """
    scores = {'accuracy': accuracy_score(labels, preds)}

    binary_scorers = (
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
    )
    for key, scorer in binary_scorers:
        scores[key] = scorer(labels, preds, average='binary')

    return scores

In [15]:
def sample_random_from_dataset(dataset, n=5, subset='test'):
    """Draw up to `n` distinct random rows from one split of the dataset.

    Args:
        dataset: Mapping from split name to a split that supports `len()` and
            `.select(indices)`.
        n: Desired sample size; capped at the size of the split.
        subset: Name of the split to sample from.

    Returns:
        The result of `.select(...)` on the chosen split for the drawn indices.
    """
    split = dataset[subset]
    split_size = len(split)
    chosen_indices = random.sample(range(split_size), min(n, split_size))
    return split.select(chosen_indices)

In [16]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM
def load_model_and_tokenizer(model_details):
    """Instantiate the tokenizer and the 8-bit model described by `model_details`.

    Args:
        model_details: Dict with the keys 'model' (checkpoint id),
            'tokenizer_class' and 'model_class' (names of transformers classes).

    Returns:
        Tuple `(tokenizer, model)`.

    Raises:
        ValueError: If the tokenizer/model class pairing is not one of the
            supported combinations.
    """
    checkpoint = model_details['model']
    pairing = (model_details['tokenizer_class'], model_details['model_class'])

    # Supported pairings and the extra from_pretrained arguments each one needs
    extra_kwargs_by_pairing = {
        # Cohere models and FLAN models
        ("AutoTokenizer", "AutoModelForSeq2SeqLM"): {},
        # Vicuna models
        ("LlamaTokenizer", "LlamaForCausalLM"): {},
        # LeoLM models
        ("AutoTokenizer", "AutoModelForCausalLM"): {"trust_remote_code": True},
    }
    if pairing not in extra_kwargs_by_pairing:
        raise ValueError("Model class not supported.")
    extra_kwargs = extra_kwargs_by_pairing[pairing]

    # Imported only after validation so an unsupported pairing fails first
    import transformers
    tokenizer_cls = getattr(transformers, pairing[0])
    model_cls = getattr(transformers, pairing[1])

    tokenizer = tokenizer_cls.from_pretrained(checkpoint, **extra_kwargs)
    model = model_cls.from_pretrained(
        checkpoint, device_map="auto", load_in_8bit=True, **extra_kwargs)

    return tokenizer, model


## Generate Answers

In [17]:
import gc

In [18]:
from collections import defaultdict

# Nested results: eval_results[model_name][topic] -> metrics dict
eval_results = defaultdict(dict)

# Number of test rows drawn per topic (small sample for a quick run)
N_EVAL_SAMPLES = 5

for topic in topics:  # ----------------------------------------------------------------------
    print(f"Evaluating topic {topic}")

    # Load and subsample ONCE per topic, outside the model loop, so every model
    # is scored on the same rows and the dataset is not re-read for each model.
    # NOTE(review): the path is hard-coded and does not use the configuration constants.
    print(f"Loading dataset for {topic}")
    dataset = load_from_disk(f"../data/tmp/processed_dataset_{topic}_buffed_chunkified_random_192")
    sampled_test = sample_random_from_dataset(dataset, n=N_EVAL_SAMPLES, subset='test')
    labels = sampled_test['label']

    for model_details in models:  # -------------------------------------------------------------

        # Load model and tokenizer
        model_name = model_details['model']
        print(f"Loading model {model_name}")
        tokenizer, model = load_model_and_tokenizer(model_details)

        # Generate one answer per sampled row (first returned sequence only)
        answers = []
        for row in tqdm(sampled_test):  # ---------------------------------------------------
            prompt = compile_prompt(row, PROMPT_TEMPLATE, topic)
            answers.append(generate_answers(model, tokenizer, prompt, params)[0])

        # Attach this model's answers to a fresh copy of the sample and persist it
        dataset['test'] = sampled_test.add_column("answers", answers)
        dataset.save_to_disk(f"../data/tmp/processed_dataset_{topic}_answers_0s_{model_name.split('/')[1]}")

        # Calculate metrics
        metrics = calc_metrics(labels, [parse_response(ans) for ans in answers])
        eval_results[model_name][topic] = metrics

        # Clear GPU memory to avoid memory errors before the next model is loaded
        model.cpu()
        torch.cuda.empty_cache()
        del model, tokenizer
        gc.collect()  # Explicitly invoking garbage collection
        torch.cuda.empty_cache()  # Clear cache again after garbage collection
        time.sleep(5)

Evaluating topic cannabis
Loading dataset for cannabis


FileNotFoundError: Directory ../data/tmp/processed_dataset_cannabis_buffed_chunkified_random_192 not found

In [17]:
# Inspect the first record of the test split. `dataset` is not defined in this
# cell: it is whatever the evaluation loop last assigned, so this only works
# after that loop has run.
dataset['test'][0]

{'_id': '648c2ad98e8cadbd29055709',
 'batch_id': 15,
 'domain': 'kinder-grund-sicherung.de',
 'view_url': 'www.kinder-grund-sicherung.de/',
 'lang': 'de',
 'text': 'beantragen zu können führt zu einem einfachen Zugang zur Leistung. Auch das Bewilligungsverfahren ist übersichtlich. 4. Wenig Bürokratie Bisherige Sozialleistungen sind in der einen Kindergrundsicherung zusammengefasst. Die Einkommensprüfung ist einfach. 5. Sozialer Einkommensbegriff Durch Verwendung des sozialrechtlichen Einkommensbegriffs im Rahmen der Kindergrundsicherung wird die Existenzsicherung in den Vordergrund gestellt. 6. Vorrang von Unterhaltsleistungen Der Kindergrundsicherung gehen Unterhaltsleistungen und anderer zur Sicherung des Unterhalts bestimmte Sozialleistungen, wie Unterhaltsvorschuss vor. 7. Nachrrang von Bürgergeld Die Kindergrundsicherung geht dem Bürgergeld vor. Es gibt keine doppelte Zuständigkeit von Jobcenter und Familienkasse als Kindergrundsicherungsbehörde. 8. Förderung von Ausbildung und Ar

Accuracy: 0.834
Accuracy: 0.94

## Save and Output Results

In [18]:
from tabulate import tabulate

In [19]:
import json

# Destination of the serialized evaluation metrics
file_path = "eval_results_icl_zero_shot.json"

# Persist the nested {model: {topic: metrics}} mapping as JSON text
with open(file_path, mode="w") as file:
    file.write(json.dumps(eval_results))

In [20]:
import json

# Location of the previously saved evaluation metrics
file_path = "eval_results_icl_zero_shot.json"

# Read the file back and parse it into a plain dictionary
with open(file_path, mode="r") as file:
    eval_results = json.loads(file.read())

In [21]:
# Identify all topics (assuming all models are evaluated on the same topics)
topics = list(next(iter(eval_results.values())).keys())

# Prepare headers for the table: each topic will have four metrics
headers = ["Model"] + \
    [f"{topic} {metric}" for topic in topics for metric in [
        "Acc.", "Prec.", "Rec.", "F1"]]

# Prepare rows: one row per model, containing metrics for each topic
rows = []
for model, topics_metrics in eval_results.items():
    row = [model]  # Start with the model name
    for topic in topics:
        metrics = topics_metrics.get(topic, {})
        row.extend([metrics.get('accuracy', 0.0), metrics.get(
            'precision', 0.0), metrics.get('recall', 0.0), metrics.get('f1', 0.0)])
    rows.append(row)

# Generate the HTML table
table_html = tabulate(rows, headers=headers, tablefmt="html",
                      showindex="never", floatfmt=".3f")

In [22]:
# Render the HTML results table built in the previous cell as rich notebook output
from IPython.display import display, HTML
display(HTML(table_html))

Model,cannabis Acc.,cannabis Prec.,cannabis Rec.,cannabis F1,energie Acc.,energie Prec.,energie Rec.,energie F1,kinder Acc.,kinder Prec.,kinder Rec.,kinder F1
CohereForAI/aya-101,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.6,0.0,0.0,0.0
lmsys/vicuna-13b-v1.5,0.8,1.0,0.5,0.667,0.8,0.667,1.0,0.8,0.8,1.0,0.667,0.8
lmsys/vicuna-7b-v1.5,0.4,0.0,0.0,0.0,0.4,0.4,1.0,0.571,0.8,0.8,1.0,0.889
google/flan-t5-base,0.8,1.0,0.667,0.8,0.6,0.5,1.0,0.667,0.8,1.0,0.667,0.8
google/flan-t5-large,0.8,1.0,0.5,0.667,1.0,0.0,0.0,0.0,0.6,1.0,0.333,0.5
google/flan-t5-xxl,0.8,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.8,1.0,0.75,0.857
