In [None]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig
from trl import SFTTrainer

In [2]:
# Load the dataset
whole_dataset = load_dataset("tatsu-lab/alpaca")

# Split the dataset into training and testing sets
split_datasets = whole_dataset['train'].train_test_split(test_size=0.0005, seed=42)

# Access the training and testing sets
train_dataset = split_datasets['train']
test_dataset = split_datasets['test']

In [None]:
# logging.set_verbosity(logging.CRITICAL)
model_path = 'Llama-2-7b-hf-fine-tuned'
finetuned_model = AutoModelForCausalLM.from_pretrained(model_path)
finetuned_tokenizer = AutoTokenizer.from_pretrained(model_path)

In [None]:
from tqdm.auto import tqdm

logging.set_verbosity_error()

# pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, device=0)
pipe = pipeline(
    task="text-generation",
    model=finetuned_model,
    tokenizer=finetuned_tokenizer,
    device=0,
    # top_k=50,  # Set top_k to your desired value
    # num_beams=5,  # Set beam_size to your desired value
    # temperature=1  # Set temperature to your desired value
)

batch_size = 20

num_examples = len(test_dataset)
print(num_examples)
total_batches = (num_examples + batch_size - 1) // batch_size
generated_output = []

for i in tqdm(range(0, num_examples, batch_size), total=total_batches, desc="Generating text"):
    batch_indices = range(i, min(i + batch_size, num_examples))
    batch = test_dataset.select(batch_indices)
    prompts = [example['text'].split('\n\n### Response:\n')[0] for example in batch]
    # print(prompts)
    # Generate text for the batch
    results = pipe(prompts, max_new_tokens=64)
    
    for result in results:
        generated_text = result[0]['generated_text']
        generated_output.append(generated_text)

        # Uncomment the following lines if you want to print the prompts and generated text
        prompt = prompts[results.index(result)]
        # print(f"Prompt: {prompt}")
        print(f"Generated Text: {generated_text}")
        print("------")

In [5]:
# generated_texts = [x.split('\n\n### Response:\n')[1] for x in generated_output]
generated_texts = [x.split('\n\n### Response:\n')[1] if '\n\n### Response:\n' in x else "" for x in generated_output]
#generated_texts[0]

In [6]:
def human_evaluation_print(dataset, generated_responses):

    # Make sure you have the correct number of responses
    assert len(dataset) == len(generated_responses), "The number of generated responses must match the number of dataset entries"

    for i in range(len(dataset)):
        reference_answer = dataset[i]['output']
        generated_answer = generated_responses[i]
        print(i)
        print(f"Reference answer:\n{reference_answer}\n\nGenerated answer:\n{generated_answer}\n")

human_evaluation_print(test_dataset, generated_texts)

0
Reference answer:
For someone with arthritis, the best type of exercise would be low-impact activities like yoga, swimming, or walking. These exercises provide the benefits of exercise without exacerbating the symptoms of arthritis.

Generated answer:
The best type of exercise for a person with arthritis is low-impact aerobic exercise, such as walking, swimming, and cycling. These activities can help to improve joint mobility and reduce pain. Additionally, strength training exercises can help to

1
Reference answer:
The atomic mass for lithium is 6.941 u (units). This is determined from the number of protons and neutrons in the nucleus of a lithium atom, which is 3 protons and 4 neutrons. Each proton and neutron has a mass of 1.007 u, resulting in a total mass of 6.941 u.

Generated answer:
The atomic mass for lithium is 6.941 amu. This is the average mass of one atom of lithium, which is the sum of the protons and neutrons in the nucleus of the atom. The atomic mass of an element is

**Scores for Each Sample:**

1. Exercise for Arthritis
   - Grammatical Correctness: 1 (No grammatical errors.)
   - Coherence: 1 (The answer is coherent and well-structured.)
   - Correctness of Answer: 1 (The answer is correct and matches the reference.)
   - Average: 1

2. Atomic Mass of Lithium
   - Grammatical Correctness: 1 (No grammatical errors.)
   - Coherence: 1 (The answer is coherent and well-structured.)
   - Correctness of Answer: 0.8 (The answer is partially correct but incomplete.)
   - Average: 0.93

3. ASCII Characters for Binary Code
   - Grammatical Correctness: 1 (No grammatical errors.)
   - Coherence: 0 (The answer is missing.)
   - Correctness of Answer: 0 (The answer is incorrect as it is missing.)
   - Average: 0.33

4. Trembling with Fear
   - Grammatical Correctness: 1 (No grammatical errors.)
   - Coherence: 1 (The answer is coherent and well-structured.)
   - Correctness of Answer: 1 (The answer is correct and matches the reference.)
   - Average: 1

5. Promoting a Skateboard
   - Grammatical Correctness: 1 (No grammatical errors.)
   - Coherence: 0.8 (The answer is coherent but lacks some details from the reference.)
   - Correctness of Answer: 0.8 (The answer is partially correct but misses some elements from the reference.)
   - Average: 0.87

6. Pick Up the Book
   - Grammatical Correctness: 1 (No grammatical errors.)
   - Coherence: 1 (The answer is coherent and matches the reference.)
   - Correctness of Answer: 1 (The answer is correct.)
   - Average: 1

7. Picnic Vacation
   - Grammatical Correctness: 1 (No grammatical errors.)
   - Coherence: 0.8 (The answer is coherent but lacks some details from the reference.)
   - Correctness of Answer: 0.8 (The answer is partially correct but misses some elements from the reference.)
   - Average: 0.87

8. Big Data
   - Grammatical Correctness: 1 (No grammatical errors.)
   - Coherence: 0.8 (The answer is coherent but lacks some depth from the reference.)
   - Correctness of Answer: 0.8 (The answer is partially correct but misses some important details.)
   - Average: 0.87

9. Wild Goose Chase
   - Grammatical Correctness: 1 (No grammatical errors.)
   - Coherence: 1 (The answer is coherent and well-structured.)
   - Correctness of Answer: 1 (The answer is correct and matches the reference.)
   - Average: 1

10. Online Learning Benefits
    - Grammatical Correctness: 1 (No grammatical errors.)
    - Coherence: 0.8 (The answer is coherent but lacks some structure.)
    - Correctness of Answer: 0.8 (The answer is partially correct but lacks specificity.)
    - Average: 0.87

11. Countries by GDP Per Capita
    - Grammatical Correctness: 1 (No grammatical errors.)
    - Coherence: 1 (The answer is coherent and well-structured.)
    - Correctness of Answer: 1 (The answer is correct.)
    - Average: 1

12. Sailboat
    - Grammatical Correctness: 1 (No grammatical errors.)
    - Coherence: 1 (The answer is coherent and well-structured.)
    - Correctness of Answer: 1 (The answer is correct.)
    - Average: 1

13. Importance of Humor
    - Grammatical Correctness: 1 (No grammatical errors.)
    - Coherence: 1 (The answer is coherent and well-structured.)
    - Correctness of Answer: 1 (The answer is correct.)
    - Average: 1

14. Length of Hypotenuse
    - Grammatical Correctness: 1 (No grammatical errors.)
    - Coherence: 0.8 (The answer is coherent but lacks some clarity.)
    - Correctness of Answer: 0 (The answer is incorrect.)
    - Average: 0.6

15. Symphony Orchestra Instruments
    - Grammatical Correctness: 1 (No grammatical errors.)
    - Coherence: 0.8 (The answer is coherent but lacks some detail.)
    - Correctness of Answer: 1 (The answer is correct.)
    - Average: 0.93

16. GPT
    - Grammatical Correctness: 1 (No grammatical errors.)
    - Coherence: 1 (The answer is coherent and well-structured.)
    - Correctness of Answer: 1 (The answer is correct.)
    - Average: 1

17. Paris
    - Grammatical Correctness: 1 (No grammatical errors.)
    - Coherence: 1 (The answer is coherent and well-structured.)
    - Correctness of Answer: 1 (The answer is correct.)
    - Average: 1

18. Understanding Customer's Needs
    - Grammatical Correctness: 1 (No grammatical errors.)
    - Coherence: 0.8 (The answer is coherent but lacks some depth compared to the reference.)
    - Correctness of Answer: 0.8 (The answer is partially correct but misses some elements from the reference.)
    - Average: 0.87

19. Cooking Pork Chops
    - Grammatical Correctness: 1 (No grammatical errors.)
    - Coherence: 0.8 (The answer is coherent but lacks some detail.)
    - Correctness of Answer: 1 (The answer is correct.)
    - Average: 0.93

20. Order Arrival Inquiry
    - Grammatical Correctness: 1 (No grammatical errors.)
    - Coherence: 0.8 (The answer is coherent but the structure is different from the reference.)
    - Correctness of Answer: 0.8 (The answer is partially correct but misses some elements from the reference.)
    - Average: 0.87

21. Arithmetic Mean
    - Grammatical Correctness: 1 (No grammatical errors.)
    - Coherence: 1 (The answer is coherent and well-structured.)
    - Correctness of Answer: 1 (The answer is correct.)
    - Average: 1

22. Detecting Spam Emails
    - Grammatical Correctness: 1 (No grammatical errors.)
    - Coherence: 0.8 (The answer is coherent but lacks some detail.)
    - Correctness of Answer: 0.8 (The answer is partially correct but misses some elements from the reference.)
    - Average: 0.87

23. Program for Summing Integers
    - Grammatical Correctness: 1 (No grammatical errors.)
    - Coherence: 0.8 (The answer is coherent but lacks some detail.)
    - Correctness of Answer: 0.8 (The answer is partially correct but misses some elements from the reference.)
    - Average: 0.87

24. Reducing Distractions When Studying
    - Grammatical Correctness: 1 (No grammatical errors.)
    - Coherence: 0.8 (The answer is coherent but lacks some detail.)
    - Correctness of Answer: 0.8 (The answer is partially correct but misses some elements from the reference.)
    - Average: 0.87

25. Algorithm for Summing Multiples of 3 or 5
    - Grammatical Correctness: 1 (No grammatical errors.)
    - Coherence: 0.8 (The answer is coherent but incomplete.)
    - Correctness of Answer: 0.8 (The answer is partially correct but incomplete.)
    - Average: 0.87

26. Government Regulations
    - Grammatical Correctness: 1 (No grammatical errors.)
    - Coherence: 0.8 (The answer is coherent but lacks some detail.)
    - Correctness of Answer: 0.8 (The answer is partially correct but misses some elements from the reference.)
    - Average: 0.87

27. Software Development Interview Questions
    - Grammatical Correctness: 1 (No grammatical errors.)
    - Coherence: 0.8 (The answer is coherent but lacks some detail.)
    - Correctness of Answer: 0.8 (The answer is partially correct but misses some elements from the reference.)
    - Average: 0.87

Overall Average Score for All Samples: 0.90

In [14]:
from sacrebleu import corpus_bleu
from rouge_score import rouge_scorer
from bert_score import score
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load GPT-2 model and tokenizer
model = GPT2LMHeadModel.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

def evaluate_model_performance(dataset, generated_responses):
    # Initialize metrics and lists to save answers
    bleu_scores = []
    rouge_l_scores = []
    bert_f1_scores = []
    perplexity_scores = []

    # Make sure you have the correct number of responses
    assert len(dataset) == len(generated_responses), "The number of generated responses must match the number of dataset entries"

    for i in range(len(dataset)):
        reference_answer = dataset[i]['output']
        generated_answer = generated_responses[i]
        
        # Compute BLEU score
        bleu_score = corpus_bleu([generated_answer], [[reference_answer]])
        bleu_score_normalized = bleu_score.score / 100.0
        bleu_scores.append(bleu_score_normalized)
        
        rouge_l_scores.append(rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True).score(reference_answer, generated_answer)['rougeL'].fmeasure)
        
        bert_f1_scores.append(score([generated_answer], [reference_answer], lang='en')[2].mean().item())

        # Calculate perplexity
        # encodings = tokenizer(generated_answer, return_tensors='pt')
        # with torch.no_grad():
        #     outputs = model(**encodings, labels=encodings['input_ids'])
        #     loss = outputs.loss
        #     perplexity = torch.exp(loss).item()
        # perplexity_scores.append(perplexity)
        # Check if generated_answer is not empty
        if len(generated_answer) > 0:
            encodings = tokenizer(generated_answer, return_tensors='pt')
            with torch.no_grad():
                outputs = model(**encodings, labels=encodings['input_ids'])
                loss = outputs.loss
                perplexity = torch.exp(loss).item()
            perplexity_scores.append(perplexity)
        else:
            # Handle empty generated_answer, e.g., by appending a default value or skipping
            perplexity_scores.append(0.0)


    # Calculate average scores
    average_bleu = sum(bleu_scores) / len(bleu_scores)
    average_rouge_l = sum(rouge_l_scores) / len(rouge_l_scores)
    average_bert_f1 = sum(bert_f1_scores) / len(bert_f1_scores)
    average_perplexity = sum(perplexity_scores) / len(perplexity_scores)

    # Print results
    print(f'BLEU: {average_bleu}')
    print(f'Rouge-L: {average_rouge_l}')
    print(f'BERTScore: {average_bert_f1}')
    print(f'Perplexity: {average_perplexity}')

    return average_bleu, average_rouge_l, average_bert_f1, average_perplexity

average_bleu, average_rouge_l, average_bert_f1, average_perplexity = evaluate_model_performance(test_dataset, generated_texts)




BLEU: 0.12009827853485579
Rouge-L: 0.30391537418198705
BERTScore: 0.8596110829600582
Perplexity: 16.868397995277686


In [15]:
top_k_values = [10, 25, 40, 75]
beam_sizes = [2, 4, 6, 8]
temperatures = [0.25, 0.5, 0.7, 1.0]

# Varying top_k while keeping beam_size and temperature fixed
fixed_beam_size = 1
fixed_temperature = 0.8

for top_k in top_k_values:
    key = f"top_k={top_k}, fixed_beam_size={fixed_beam_size}, fixed_temperature={fixed_temperature}"
    generated_output_2 = []  # Reset the generated_output_2 for each top_k value

    for i in tqdm(range(0, num_examples, batch_size), total=total_batches, desc=f"Generating text for {key}"):
        batch_indices = range(i, min(i + batch_size, num_examples))
        batch = test_dataset.select(batch_indices)
        prompts = [example['text'].split('\n\n### Response:\n')[0] for example in batch]
        results_2 = pipe(prompts, max_new_tokens=64, top_k=top_k, num_beams=fixed_beam_size, temperature=fixed_temperature, do_sample=True)

        for result in results_2:
            generated_text_2 = result[0]['generated_text']  # Access the first element of the inner list
            generated_output_2.append(generated_text_2)
    
    # generated_texts_2 = [x.split('\n\n### Response:\n')[1] for x in generated_output]
    generated_texts_2 = [x.split('\n\n### Response:\n')[1] if '\n\n### Response:\n' in x else "" for x in generated_output_2]
    
    average_bleu_2, average_rouge_l_2, average_bert_f1_2, average_perplexity_2 = evaluate_model_performance(test_dataset, generated_texts_2)

    # print(f"Results for top_k={top_k}, beam_size={fixed_beam_size}, temperature={fixed_temperature}: BLEU={average_bleu_2}, Rouge-L={average_rouge_l_2}, BERTScore={average_bert_f1_2}, Perplexity={average_perplexity_2}")
    print("----------")

Generating text for top_k=10, fixed_beam_size=1, fixed_temperature=0.8: 100%|██████████| 2/2 [01:01<00:00, 30.73s/it]


BLEU: 0.10995126933767653
Rouge-L: 0.30782161642246103
BERTScore: 0.8548505504926046
Perplexity: 22.84755692658601
----------


Generating text for top_k=25, fixed_beam_size=1, fixed_temperature=0.8: 100%|██████████| 2/2 [01:01<00:00, 30.68s/it]


BLEU: 0.08673264278956168
Rouge-L: 0.2836783423553812
BERTScore: 0.8778281454686765
Perplexity: 25.70988580915663
----------


Generating text for top_k=40, fixed_beam_size=1, fixed_temperature=0.8: 100%|██████████| 2/2 [01:01<00:00, 30.67s/it]


BLEU: 0.11154961761088465
Rouge-L: 0.29926618247003894
BERTScore: 0.8876879281467862
Perplexity: 16.17821228945697
----------


Generating text for top_k=75, fixed_beam_size=1, fixed_temperature=0.8: 100%|██████████| 2/2 [01:01<00:00, 30.66s/it]


BLEU: 0.11576353154933944
Rouge-L: 0.3113299846987313
BERTScore: 0.8910482879038211
Perplexity: 21.890236987007988
----------


In [16]:
fixed_top_k = 50

for beam_size in beam_sizes:
    key = f"fixed_top_k={fixed_top_k}, beam_size={beam_size}, fixed_temperature={fixed_temperature}"
    generated_output_3 = []  # Reset the generated_output_2 for each top_k value

    for i in tqdm(range(0, num_examples, batch_size), total=total_batches, desc=f"Generating text for {key}"):
        batch_indices = range(i, min(i + batch_size, num_examples))
        batch = test_dataset.select(batch_indices)
        prompts = [example['text'].split('\n\n### Response:\n')[0] for example in batch]
        results_3 = pipe(prompts, max_new_tokens=64, top_k=fixed_top_k, num_beams=beam_size, temperature=fixed_temperature, do_sample=True)

        for result in results_3:
            generated_text_3 = result[0]['generated_text']  # Access the first element of the inner list
            generated_output_3.append(generated_text_3)
    
    # generated_texts_3 = [x.split('\n\n### Response:\n')[1] for x in generated_output]
    generated_texts_3 = [x.split('\n\n### Response:\n')[1] if '\n\n### Response:\n' in x else "" for x in generated_output_3]

    average_bleu_3, average_rouge_l_3, average_bert_f1_3, average_perplexity_3 = evaluate_model_performance(test_dataset, generated_texts_3)
    
    # print(f"Results for top_k={top_k}, beam_size={fixed_beam_size}, temperature={fixed_temperature}: BLEU={average_bleu_2}, Rouge-L={average_rouge_l_2}, BERTScore={average_bert_f1_2}, Perplexity={average_perplexity_2}")
    print("----------")

Generating text for fixed_top_k=50, beam_size=2, fixed_temperature=0.8:   0%|          | 0/2 [00:00<?, ?it/s]

Generating text for fixed_top_k=50, beam_size=2, fixed_temperature=0.8: 100%|██████████| 2/2 [01:09<00:00, 34.82s/it]


BLEU: 0.11804278204423653
Rouge-L: 0.31610225260486735
BERTScore: 0.8597343608185097
Perplexity: 19.596147201679372
----------


Generating text for fixed_top_k=50, beam_size=4, fixed_temperature=0.8: 100%|██████████| 2/2 [01:16<00:00, 38.26s/it]


BLEU: 0.11140117538592836
Rouge-L: 0.3050686776975318
BERTScore: 0.8600855911219562
Perplexity: 20.457861529456245
----------


Generating text for fixed_top_k=50, beam_size=6, fixed_temperature=0.8: 100%|██████████| 2/2 [01:29<00:00, 44.85s/it]


BLEU: 0.10778630398033433
Rouge-L: 0.3028281045902536
BERTScore: 0.8579005312036585
Perplexity: 19.076842661257142
----------


Generating text for fixed_top_k=50, beam_size=8, fixed_temperature=0.8: 100%|██████████| 2/2 [01:45<00:00, 52.74s/it]


BLEU: 0.11669874904879159
Rouge-L: 0.3115077934273019
BERTScore: 0.8610804500403227
Perplexity: 19.665610595985694
----------


In [17]:
for temperature in temperatures:
    key = f"fixed_top_k={fixed_top_k}, fixed_beam_size={fixed_beam_size}, temperature={temperature}"
    generated_output_4 = []  # Reset the generated_output_2 for each top_k value

    for i in tqdm(range(0, num_examples, batch_size), total=total_batches, desc=f"Generating text for {key}"):
        batch_indices = range(i, min(i + batch_size, num_examples))
        batch = test_dataset.select(batch_indices)
        prompts = [example['text'].split('\n\n### Response:\n')[0] for example in batch]
        results_4 = pipe(prompts, max_new_tokens=64, top_k=fixed_top_k, num_beams=fixed_beam_size, temperature=temperature, do_sample=True)

        for result in results_4:
            generated_text_4 = result[0]['generated_text']  # Access the first element of the inner list
            generated_output_4.append(generated_text_4)
    
    # generated_texts_3 = [x.split('\n\n### Response:\n')[1] for x in generated_output]
    generated_texts_4 = [x.split('\n\n### Response:\n')[1] if '\n\n### Response:\n' in x else "" for x in generated_output_4]

    average_bleu_4, average_rouge_l_4, average_bert_f1_4, average_perplexity_4 = evaluate_model_performance(test_dataset, generated_texts_4)
    
    # print(f"Results for top_k={top_k}, beam_size={fixed_beam_size}, temperature={fixed_temperature}: BLEU={average_bleu_2}, Rouge-L={average_rouge_l_2}, BERTScore={average_bert_f1_2}, Perplexity={average_perplexity_2}")
    print("----------")

Generating text for fixed_top_k=50, fixed_beam_size=1, temperature=0.25:   0%|          | 0/2 [00:00<?, ?it/s]

Generating text for fixed_top_k=50, fixed_beam_size=1, temperature=0.25: 100%|██████████| 2/2 [01:01<00:00, 30.71s/it]


BLEU: 0.11375449331134665
Rouge-L: 0.30798041760736944
BERTScore: 0.8875004185570611
Perplexity: 15.907333497647885
----------


Generating text for fixed_top_k=50, fixed_beam_size=1, temperature=0.5: 100%|██████████| 2/2 [01:01<00:00, 30.62s/it]


BLEU: 0.12011921675159754
Rouge-L: 0.32098351202427794
BERTScore: 0.8887383937835693
Perplexity: 16.374557654062908
----------


Generating text for fixed_top_k=50, fixed_beam_size=1, temperature=0.7: 100%|██████████| 2/2 [01:01<00:00, 30.65s/it]


BLEU: 0.10622552192943695
Rouge-L: 0.3006743067267392
BERTScore: 0.8867903418011136
Perplexity: 15.78405722865352
----------


Generating text for fixed_top_k=50, fixed_beam_size=1, temperature=1.0: 100%|██████████| 2/2 [01:01<00:00, 30.65s/it]


BLEU: 0.08906679416907991
Rouge-L: 0.27673926707138546
BERTScore: 0.8765455197404932
Perplexity: 19.68990327693798
----------


Reference: https://www.datacamp.com/tutorial/fine-tuning-llama-2