In [None]:
import os
import json
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig
from trl import SFTTrainer

In [2]:
# Load the dataset
whole_dataset = load_dataset("tatsu-lab/alpaca")

# Split the dataset into training and testing sets
split_datasets = whole_dataset['train'].train_test_split(test_size=0.0005, seed=42)

# Access the training and testing sets
train_dataset = split_datasets['train']
test_dataset = split_datasets['test']

In [None]:
# logging.set_verbosity(logging.CRITICAL)
model_path = 'Mistral-7B-v0.1-fine-tuned'
finetuned_model = AutoModelForCausalLM.from_pretrained(model_path)
finetuned_tokenizer = AutoTokenizer.from_pretrained(model_path)

In [None]:
from tqdm.auto import tqdm

logging.set_verbosity_error()

# pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, device=0)
pipe = pipeline(
    task="text-generation",
    model=finetuned_model,
    tokenizer=finetuned_tokenizer,
    device=0,
    # top_k=50,  # Set top_k to your desired value
    # num_beams=5,  # Set beam_size to your desired value
    # temperature=1  # Set temperature to your desired value
)

batch_size = 20

num_examples = len(test_dataset)
print(num_examples)
total_batches = (num_examples + batch_size - 1) // batch_size
generated_output = []

for i in tqdm(range(0, num_examples, batch_size), total=total_batches, desc="Generating text"):
    batch_indices = range(i, min(i + batch_size, num_examples))
    batch = test_dataset.select(batch_indices)
    prompts = [example['text'].split('\n\n### Response:\n')[0] for example in batch]
    # print(prompts)
    # Generate text for the batch
    results = pipe(prompts, max_new_tokens=64)
    
    for result in results:
        generated_text = result[0]['generated_text']
        generated_output.append(generated_text)

        # Uncomment the following lines if you want to print the prompts and generated text
        prompt = prompts[results.index(result)]
        # print(f"Prompt: {prompt}")
        print(f"Generated Text: {generated_text}")
        print("------")

In [5]:
# generated_texts = [x.split('\n\n### Response:\n')[1] for x in generated_output]
generated_texts = [x.split('\n\n### Response:\n')[1] if '\n\n### Response:\n' in x else "" for x in generated_output]
# generated_texts[0]

In [6]:
def human_evaluation_print(dataset, generated_responses):

    # Make sure you have the correct number of responses
    assert len(dataset) == len(generated_responses), "The number of generated responses must match the number of dataset entries"

    for i in range(len(dataset)):
        reference_answer = dataset[i]['output']
        generated_answer = generated_responses[i]
        print(i)
        print(f"Reference answer:\n{reference_answer}\n\nGenerated answer:\n{generated_answer}\n")

human_evaluation_print(test_dataset, generated_texts)

0
Reference answer:
For someone with arthritis, the best type of exercise would be low-impact activities like yoga, swimming, or walking. These exercises provide the benefits of exercise without exacerbating the symptoms of arthritis.

Generated answer:
Low-impact exercises such as swimming, walking, and yoga are recommended for people with arthritis. These exercises help to strengthen the muscles and joints without putting too much strain on the joints. Additionally, stretching and range of motion exercises can help to improve flexibility and

1
Reference answer:
The atomic mass for lithium is 6.941 u (units). This is determined from the number of protons and neutrons in the nucleus of a lithium atom, which is 3 protons and 4 neutrons. Each proton and neutron has a mass of 1.007 u, resulting in a total mass of 6.941 u.

Generated answer:
The atomic mass for lithium is 6.941. 

The atomic mass for lithium is 6.941. This is the mass of one atom of lithium, which is made up of three prot

**Scores for Each Sample:**

### Sample 1 (Atomic Mass of Lithium):
- Grammatical Correctness: 0.8
- Coherence: 0.8
- Correctness of Answer: 1
- **Average: 0.87**

### Sample 2:
- Grammatical Correctness: 1
- Coherence: 1
- Correctness of Answer: 1
- **Average: 1**

### Sample 3:
- Grammatical Correctness: N/A
- Coherence: N/A
- Correctness of Answer: 0
- **Average: 0**

### Sample 4:
- Grammatical Correctness: 1
- Coherence: 1
- Correctness of Answer: 1
- **Average: 1**

### Sample 5:
- Grammatical Correctness: 1
- Coherence: 1
- Correctness of Answer: 1
- **Average: 1**

### Sample 6:
- Grammatical Correctness: 1
- Coherence: 0.8
- Correctness of Answer: 0.8
- **Average: 0.87**

### Sample 7:
- Grammatical Correctness: 1
- Coherence: 1
- Correctness of Answer: 0.8
- **Average: 0.93**

### Sample 8:
- Grammatical Correctness: 1
- Coherence: 1
- Correctness of Answer: 1
- **Average: 1**

### Sample 9:
- Grammatical Correctness: 1
- Coherence: 0.8
- Correctness of Answer: 0.8
- **Average: 0.87**

### Sample 10:
- Grammatical Correctness: 1
- Coherence: 1
- Correctness of Answer: 1
- **Average: 1**

### Sample 11:
- Grammatical Correctness: 1
- Coherence: 1
- Correctness of Answer: 1
- **Average: 1**

### Sample 12:
- Grammatical Correctness: 0.8
- Coherence: 0.8
- Correctness of Answer: 1
- **Average: 0.87**

### Sample 13:
- Grammatical Correctness: 1
- Coherence: 1
- Correctness of Answer: 0
- **Average: 0.67**

### Sample 14:
- Grammatical Correctness: 1
- Coherence: 1
- Correctness of Answer: 1
- **Average: 1**

### Sample 15:
- Grammatical Correctness: 1
- Coherence: 1
- Correctness of Answer: 1
- **Average: 1**

### Sample 16:
- Grammatical Correctness: 0.8
- Coherence: 0.8
- Correctness of Answer: 1
- **Average: 0.87**

### Sample 17:
- Grammatical Correctness: 1
- Coherence: 1
- Correctness of Answer: 1
- **Average: 1**

### Sample 18:
- Grammatical Correctness: 1
- Coherence: 1
- Correctness of Answer: 1
- **Average: 1**

### Sample 19:
- Grammatical Correctness: 1
- Coherence: 1
- Correctness of Answer: 1
- **Average: 1**

### Sample 20:
- Grammatical Correctness: 1
- Coherence: 1
- Correctness of Answer: 1
- **Average: 1**

### Sample 21:
- Grammatical Correctness: 1
- Coherence: 1
- Correctness of Answer: 1
- **Average: 1**

### Sample 22:
- Grammatical Correctness: 1
- Coherence: 1
- Correctness of Answer: 1
- **Average: 1**

### Sample 23:
- Grammatical Correctness: 1
- Coherence: 1
- Correctness of Answer: 1
- **Average: 1**

### Sample 24:
- Grammatical Correctness: 1
- Coherence: 1
- Correctness of Answer: 0.8
- **Average: 0.93**

### Sample 25:
- Grammatical Correctness: 1
- Coherence: 1
- Correctness of Answer: 1
- **Average: 1**

### Sample 26:
- Grammatical Correctness: 1
- Coherence: 1
- Correctness of Answer: 1
- **Average: 1**



**Overall Average Score for All Samples:** 0.95


In [7]:
from sacrebleu import corpus_bleu
from rouge_score import rouge_scorer
from bert_score import score
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load GPT-2 model and tokenizer
model = GPT2LMHeadModel.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

def evaluate_model_performance(dataset, generated_responses):
    # Initialize metrics and lists to save answers
    bleu_scores = []
    rouge_l_scores = []
    bert_f1_scores = []
    perplexity_scores = []

    # Make sure you have the correct number of responses
    assert len(dataset) == len(generated_responses), "The number of generated responses must match the number of dataset entries"

    for i in range(len(dataset)):
        reference_answer = dataset[i]['output']
        generated_answer = generated_responses[i]
        
        # Compute BLEU score
        bleu_score = corpus_bleu([generated_answer], [[reference_answer]])
        bleu_score_normalized = bleu_score.score / 100.0
        bleu_scores.append(bleu_score_normalized)
        
        rouge_l_scores.append(rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True).score(reference_answer, generated_answer)['rougeL'].fmeasure)
        
        bert_f1_scores.append(score([generated_answer], [reference_answer], lang='en')[2].mean().item())

        # Calculate perplexity
        # encodings = tokenizer(generated_answer, return_tensors='pt')
        # with torch.no_grad():
        #     outputs = model(**encodings, labels=encodings['input_ids'])
        #     loss = outputs.loss
        #     perplexity = torch.exp(loss).item()
        # perplexity_scores.append(perplexity)
        # Check if generated_answer is not empty
        if len(generated_answer) > 0:
            encodings = tokenizer(generated_answer, return_tensors='pt')
            with torch.no_grad():
                outputs = model(**encodings, labels=encodings['input_ids'])
                loss = outputs.loss
                perplexity = torch.exp(loss).item()
            perplexity_scores.append(perplexity)
        else:
            # Handle empty generated_answer, e.g., by appending a default value or skipping
            perplexity_scores.append(0.0)


    # Calculate average scores
    average_bleu = sum(bleu_scores) / len(bleu_scores)
    average_rouge_l = sum(rouge_l_scores) / len(rouge_l_scores)
    average_bert_f1 = sum(bert_f1_scores) / len(bert_f1_scores)
    average_perplexity = sum(perplexity_scores) / len(perplexity_scores)

    # Print results
    print(f'BLEU: {average_bleu}')
    print(f'Rouge-L: {average_rouge_l}')
    print(f'BERTScore: {average_bert_f1}')
    print(f'Perplexity: {average_perplexity}')

    return average_bleu, average_rouge_l, average_bert_f1, average_perplexity

average_bleu, average_rouge_l, average_bert_f1, average_perplexity = evaluate_model_performance(test_dataset, generated_texts)


BLEU: 0.10637520896383895
Rouge-L: 0.28829402465887405
BERTScore: 0.8832031024826897
Perplexity: 15.695914568724456


In [8]:
top_k_values = [10, 25, 40, 75]
beam_sizes = [2, 4, 6, 8]
temperatures = [0.25, 0.5, 0.7, 1.0]

# Varying top_k while keeping beam_size and temperature fixed
fixed_beam_size = 1
fixed_temperature = 0.8

for top_k in top_k_values:
    key = f"top_k={top_k}, fixed_beam_size={fixed_beam_size}, fixed_temperature={fixed_temperature}"
    generated_output_2 = []  # Reset the generated_output_2 for each top_k value

    for i in tqdm(range(0, num_examples, batch_size), total=total_batches, desc=f"Generating text for {key}"):
        batch_indices = range(i, min(i + batch_size, num_examples))
        batch = test_dataset.select(batch_indices)
        prompts = [example['text'].split('\n\n### Response:\n')[0] for example in batch]
        results_2 = pipe(prompts, max_new_tokens=64, top_k=top_k, num_beams=fixed_beam_size, temperature=fixed_temperature, do_sample=True)

        for result in results_2:
            generated_text_2 = result[0]['generated_text']  # Access the first element of the inner list
            generated_output_2.append(generated_text_2)
    
    # generated_texts_2 = [x.split('\n\n### Response:\n')[1] for x in generated_output]
    generated_texts_2 = [x.split('\n\n### Response:\n')[1] if '\n\n### Response:\n' in x else "" for x in generated_output_2]
    
    average_bleu_2, average_rouge_l_2, average_bert_f1_2, average_perplexity_2 = evaluate_model_performance(test_dataset, generated_texts_2)

    # print(f"Results for top_k={top_k}, beam_size={fixed_beam_size}, temperature={fixed_temperature}: BLEU={average_bleu_2}, Rouge-L={average_rouge_l_2}, BERTScore={average_bert_f1_2}, Perplexity={average_perplexity_2}")
    print("----------")

Generating text for top_k=10, fixed_beam_size=1, fixed_temperature=0.8:   0%|          | 0/2 [00:00<?, ?it/s]

Generating text for top_k=10, fixed_beam_size=1, fixed_temperature=0.8: 100%|██████████| 2/2 [01:12<00:00, 36.33s/it]


BLEU: 0.07673658964472913
Rouge-L: 0.24900730021602321
BERTScore: 0.8739797782014918
Perplexity: 21.311663486339427
----------


Generating text for top_k=25, fixed_beam_size=1, fixed_temperature=0.8: 100%|██████████| 2/2 [01:03<00:00, 31.59s/it]


BLEU: 0.08314878179724342
Rouge-L: 0.3034843794880543
BERTScore: 0.8797346552213033
Perplexity: 21.065436274917037
----------


Generating text for top_k=40, fixed_beam_size=1, fixed_temperature=0.8: 100%|██████████| 2/2 [01:03<00:00, 31.56s/it]


BLEU: 0.08235278884122037
Rouge-L: 0.27582336176884187
BERTScore: 0.8798080439920779
Perplexity: 25.632464974014848
----------


Generating text for top_k=75, fixed_beam_size=1, fixed_temperature=0.8: 100%|██████████| 2/2 [01:03<00:00, 31.57s/it]


BLEU: 0.072184958782227
Rouge-L: 0.24343805213106812
BERTScore: 0.876941356394026
Perplexity: 23.972018082936604
----------


In [9]:
fixed_top_k = 50

for beam_size in beam_sizes:
    key = f"fixed_top_k={fixed_top_k}, beam_size={beam_size}, fixed_temperature={fixed_temperature}"
    generated_output_3 = []  # Reset the generated_output_2 for each top_k value

    for i in tqdm(range(0, num_examples, batch_size), total=total_batches, desc=f"Generating text for {key}"):
        batch_indices = range(i, min(i + batch_size, num_examples))
        batch = test_dataset.select(batch_indices)
        prompts = [example['text'].split('\n\n### Response:\n')[0] for example in batch]
        results_3 = pipe(prompts, max_new_tokens=64, top_k=fixed_top_k, num_beams=beam_size, temperature=fixed_temperature, do_sample=True)

        for result in results_3:
            generated_text_3 = result[0]['generated_text']  # Access the first element of the inner list
            generated_output_3.append(generated_text_3)
    
    # generated_texts_3 = [x.split('\n\n### Response:\n')[1] for x in generated_output]
    generated_texts_3 = [x.split('\n\n### Response:\n')[1] if '\n\n### Response:\n' in x else "" for x in generated_output_3]

    average_bleu_3, average_rouge_l_3, average_bert_f1_3, average_perplexity_3 = evaluate_model_performance(test_dataset, generated_texts_3)
    
    # print(f"Results for top_k={top_k}, beam_size={fixed_beam_size}, temperature={fixed_temperature}: BLEU={average_bleu_2}, Rouge-L={average_rouge_l_2}, BERTScore={average_bert_f1_2}, Perplexity={average_perplexity_2}")
    print("----------")

Generating text for fixed_top_k=50, beam_size=2, fixed_temperature=0.8:   0%|          | 0/2 [00:00<?, ?it/s]

Generating text for fixed_top_k=50, beam_size=2, fixed_temperature=0.8: 100%|██████████| 2/2 [01:11<00:00, 35.73s/it]


BLEU: 0.12227215271106785
Rouge-L: 0.3037296181004547
BERTScore: 0.8567791051334805
Perplexity: 10.405672470728556
----------


Generating text for fixed_top_k=50, beam_size=4, fixed_temperature=0.8: 100%|██████████| 2/2 [01:18<00:00, 39.26s/it]


BLEU: 0.11005320494944103
Rouge-L: 0.2944300662000093
BERTScore: 0.8545365907527782
Perplexity: 11.13361930847168
----------


Generating text for fixed_top_k=50, beam_size=6, fixed_temperature=0.8: 100%|██████████| 2/2 [01:31<00:00, 45.84s/it]


BLEU: 0.11375323415349615
Rouge-L: 0.2953740508937465
BERTScore: 0.8541421007227015
Perplexity: 10.460258104183056
----------


Generating text for fixed_top_k=50, beam_size=8, fixed_temperature=0.8: 100%|██████████| 2/2 [01:47<00:00, 53.84s/it]


BLEU: 0.11254707083161063
Rouge-L: 0.2975668631316125
BERTScore: 0.8577085490579959
Perplexity: 10.476649734708998
----------


In [10]:
for temperature in temperatures:
    key = f"fixed_top_k={fixed_top_k}, fixed_beam_size={fixed_beam_size}, temperature={temperature}"
    generated_output_4 = []  # Reset the generated_output_2 for each top_k value

    for i in tqdm(range(0, num_examples, batch_size), total=total_batches, desc=f"Generating text for {key}"):
        batch_indices = range(i, min(i + batch_size, num_examples))
        batch = test_dataset.select(batch_indices)
        prompts = [example['text'].split('\n\n### Response:\n')[0] for example in batch]
        results_4 = pipe(prompts, max_new_tokens=64, top_k=fixed_top_k, num_beams=fixed_beam_size, temperature=temperature, do_sample=True)

        for result in results_4:
            generated_text_4 = result[0]['generated_text']  # Access the first element of the inner list
            generated_output_4.append(generated_text_4)
    
    # generated_texts_3 = [x.split('\n\n### Response:\n')[1] for x in generated_output]
    generated_texts_4 = [x.split('\n\n### Response:\n')[1] if '\n\n### Response:\n' in x else "" for x in generated_output_4]

    average_bleu_4, average_rouge_l_4, average_bert_f1_4, average_perplexity_4 = evaluate_model_performance(test_dataset, generated_texts_4)
    
    # print(f"Results for top_k={top_k}, beam_size={fixed_beam_size}, temperature={fixed_temperature}: BLEU={average_bleu_2}, Rouge-L={average_rouge_l_2}, BERTScore={average_bert_f1_2}, Perplexity={average_perplexity_2}")
    print("----------")

Generating text for fixed_top_k=50, fixed_beam_size=1, temperature=0.25: 100%|██████████| 2/2 [01:03<00:00, 31.54s/it]


BLEU: 0.10193397174826552
Rouge-L: 0.28625010684537366
BERTScore: 0.8559713871390732
Perplexity: 12.9054487546285
----------


Generating text for fixed_top_k=50, fixed_beam_size=1, temperature=0.5: 100%|██████████| 2/2 [01:03<00:00, 31.51s/it]


BLEU: 0.08989354982086553
Rouge-L: 0.2787655079688409
BERTScore: 0.8827785739192257
Perplexity: 18.191687919475413
----------


Generating text for fixed_top_k=50, fixed_beam_size=1, temperature=0.7: 100%|██████████| 2/2 [01:03<00:00, 31.51s/it]


BLEU: 0.08830518129468479
Rouge-L: 0.26620119722945434
BERTScore: 0.8817243951338308
Perplexity: 22.254915873209637
----------


Generating text for fixed_top_k=50, fixed_beam_size=1, temperature=1.0: 100%|██████████| 2/2 [01:03<00:00, 31.56s/it]


BLEU: 0.08150223053665961
Rouge-L: 0.24301592360628205
BERTScore: 0.8716559299716243
Perplexity: 25.44630449789542
----------
