In [None]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)
import os, torch
from datasets import load_dataset
from trl import SFTTrainer

In [2]:
# Load the dataset
whole_dataset = load_dataset("tatsu-lab/alpaca")

# Split the dataset into training and testing sets
split_datasets = whole_dataset['train'].train_test_split(test_size=0.0005, seed=42)

# Access the training and testing sets
train_dataset = split_datasets['train']
test_dataset = split_datasets['test']

In [None]:
# logging.set_verbosity(logging.CRITICAL)
model_path = 'Phi-2-fine-tuned'
finetuned_model = AutoModelForCausalLM.from_pretrained(model_path)
finetuned_tokenizer = AutoTokenizer.from_pretrained(model_path)

In [None]:
from tqdm.auto import tqdm

logging.set_verbosity_error()

# pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, device=0)
pipe = pipeline(
    task="text-generation",
    model=finetuned_model,
    tokenizer=finetuned_tokenizer,
    device=0,
    # top_k=50,  # Set top_k to your desired value
    # num_beams=5,  # Set beam_size to your desired value
    # temperature=1  # Set temperature to your desired value
)

batch_size = 20

num_examples = len(test_dataset)
print(num_examples)
total_batches = (num_examples + batch_size - 1) // batch_size
generated_output = []

for i in tqdm(range(0, num_examples, batch_size), total=total_batches, desc="Generating text"):
    batch_indices = range(i, min(i + batch_size, num_examples))
    batch = test_dataset.select(batch_indices)
    prompts = [example['text'].split('\n\n### Response:\n')[0] for example in batch]
    # print(prompts)
    # Generate text for the batch
    results = pipe(prompts, max_new_tokens=64)
    
    for result in results:
        generated_text = result[0]['generated_text']
        generated_output.append(generated_text)

        # Uncomment the following lines if you want to print the prompts and generated text
        prompt = prompts[results.index(result)]
        # print(f"Prompt: {prompt}")
        print(f"Generated Text: {generated_text}")
        print("------")

In [5]:
generated_texts = [x.split('\n\n### Response:\n')[1] for x in generated_output]
# generated_texts[0]

In [6]:
def human_evaluation_print(dataset, generated_responses):

    # Make sure you have the correct number of responses
    assert len(dataset) == len(generated_responses), "The number of generated responses must match the number of dataset entries"

    for i in range(len(dataset)):
        reference_answer = dataset[i]['output']
        generated_answer = generated_responses[i]
        print(i)
        print(f"Reference answer:\n{reference_answer}\n\nGenerated answer:\n{generated_answer}\n")

human_evaluation_print(test_dataset, generated_texts)

0
Reference answer:
For someone with arthritis, the best type of exercise would be low-impact activities like yoga, swimming, or walking. These exercises provide the benefits of exercise without exacerbating the symptoms of arthritis.

Generated answer:
Low-impact exercises such as swimming, cycling, and walking are great for people with arthritis. These exercises are gentle on the joints and can help to reduce pain and stiffness. Strength training exercises such as yoga and Pilates can also be beneficial for people with arthritis, as they can help to

1
Reference answer:
The atomic mass for lithium is 6.941 u (units). This is determined from the number of protons and neutrons in the nucleus of a lithium atom, which is 3 protons and 4 neutrons. Each proton and neutron has a mass of 1.007 u, resulting in a total mass of 6.941 u.

Generated answer:
The atomic mass for lithium is 6.94. This is calculated by adding the number of protons (3) and the number of neutrons (3) in the lithium ato

**Scores for Each Sample:**

1. **Sample 1 (Arthritis Exercise):**
   - Grammatical Correctness: 1
   - Coherence: 1
   - Correctness of Answer: 0.8 (The generated answer mentions cycling, which is not typically considered a low-impact exercise for arthritis.)
   - **Average: 0.93**

2. **Sample 2 (Atomic Mass of Lithium):**
   - Grammatical Correctness: 0.8 (The generated answer has a repetitive sentence.)
   - Coherence: 0.8 (The generated answer is not coherent due to the repetition.)
   - Correctness of Answer: 1
   - **Average: 0.87**

3. **Sample 3 (ASCII Characters):**
   - Grammatical Correctness: 1
   - Coherence: 1
   - Correctness of Answer: 0 (The generated answer is incorrect.)
   - **Average: 0.67**

4. **Sample 4 (Fear Description):**
   - Grammatical Correctness: 1
   - Coherence: 1
   - Correctness of Answer: 1
   - **Average: 1**

5. **Sample 5 (Skateboard Promotion):**
   - Grammatical Correctness: 1
   - Coherence: 1
   - Correctness of Answer: 0.9 (The generated answer adds an incomplete sentence about social media.)
   - **Average: 0.97**

6. **Sample 6 (Picnic Vacation):**
   - Grammatical Correctness: 1
   - Coherence: 0.8 (The generated answer is cut off, making it less coherent.)
   - Correctness of Answer: 1
   - **Average: 0.93**

7. **Sample 7 (Big Data):**
   - Grammatical Correctness: 1
   - Coherence: 0.8 (The generated answer is incomplete and less coherent.)
   - Correctness of Answer: 0.8 (The generated answer misses the aspect of customer interaction.)
   - **Average: 0.87**

8. **Sample 8 (Wild Goose Chase):**
   - Grammatical Correctness: 1
   - Coherence: 1
   - Correctness of Answer: 1
   - **Average: 1**

9. **Sample 9 (Journals):**
   - Grammatical Correctness: 1
   - Coherence: 1
   - Correctness of Answer: 1
   - **Average: 1**

10. **Sample 10 (Countries):**
    - Grammatical Correctness: 1
    - Coherence: 1
    - Correctness of Answer: 1
    - **Average: 1**

11. **Sample 11 (Sailboat):**
    - Grammatical Correctness: 1
    - Coherence: 1
    - Correctness of Answer: 0.8 (The generated answer adds unnecessary details.)
    - **Average: 0.93**

12. **Sample 12 (Humor Importance):**
    - Grammatical Correctness: 1
    - Coherence: 1
    - Correctness of Answer: 1
    - **Average: 1**

13. **Sample 13 (Hypotenuse Length):**
    - Grammatical Correctness: 1
    - Coherence: 1
    - Correctness of Answer: 0 (The generated answer provides an incorrect value.)
    - **Average: 0.67**

14. **Sample 14 (Symphony Orchestra):**
    - Grammatical Correctness: 1
    - Coherence: 1
    - Correctness of Answer: 0.8 (The generated answer adds unnecessary details.)
    - **Average: 0.93**

15. **Sample 15 (GPT Description):**
    - Grammatical Correctness: 1
    - Coherence: 1
    - Correctness of Answer: 1
    - **Average: 1**

16. **Sample 16 (Paris):**
    - Grammatical Correctness: 1
    - Coherence: 1
    - Correctness of Answer: 0.8 (The generated answer adds unnecessary details.)
    - **Average: 0.93**

17. **Sample 17 (Understanding Customer's Needs):**
    - Grammatical Correctness: 1
    - Coherence: 1
    - Correctness of Answer: 0.8 (The generated answer is less specific and comprehensive.)
    - **Average: 0.93**

18. **Sample 18 (Cooking Pork Chops):**
    - Grammatically Correctness: 1
    - Coherence: 1
    - Correctness of Answer: 1
    - **Average: 1**

19. **Sample 19 (Customer Service):**
    - Grammatical Correctness: 1
    - Coherence: 0.8 (The generated answer is a bit disjointed.)
    - Correctness of Answer: 0.8 (The generated answer doesn't address the original query about order arrival.)
    - **Average: 0.87**

20. **Sample 20 (Arithmetic Mean):**
    - Grammatical Correctness: 1
    - Coherence: 1
    - Correctness of Answer: 1
    - **Average: 1**

21. **Sample 21 (Detecting Spam Emails):**
    - Grammatical Correctness: 1
    - Coherence: 1
    - Correctness of Answer: 1
    - **Average: 1**

22. **Sample 22 (Sum Program):**
    - Grammatical Correctness: 1
    - Coherence: 1
    - Correctness of Answer: 1
    - **Average: 1**

23. **Sample 23 (Reducing Distractions):**
    - Grammatical Correctness: 1
    - Coherence: 1
    - Correctness of Answer: 1
    - **Average: 1**

24. **Sample 24 (Sum Algorithm):**
    - Grammatical Correctness: 1
    - Coherence: 1
    - Correctness of Answer: 1
    - **Average: 1**

25. **Sample 25 (Government):**
    - Grammatical Correctness: 1
    - Coherence: 1
    - Correctness of Answer: 1
    - **Average: 1**

26. **Sample 26 (Software Development Questions):**
    - Grammatical Correctness: 1
    - Coherence: 1
    - Correctness of Answer: 0.8 (The generated answer is less comprehensive.)
    - **Average: 0.93**

**Overall Average Score:** 0.94

In [7]:
from sacrebleu import corpus_bleu
from rouge_score import rouge_scorer
from bert_score import score
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load GPT-2 model and tokenizer
model = GPT2LMHeadModel.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

def evaluate_model_performance(dataset, generated_responses):
    # Initialize metrics and lists to save answers
    bleu_scores = []
    rouge_l_scores = []
    bert_f1_scores = []
    perplexity_scores = []

    # Make sure you have the correct number of responses
    assert len(dataset) == len(generated_responses), "The number of generated responses must match the number of dataset entries"

    for i in range(len(dataset)):
        reference_answer = dataset[i]['output']
        generated_answer = generated_responses[i]
        
        # Compute BLEU score
        bleu_score = corpus_bleu([generated_answer], [[reference_answer]])
        bleu_score_normalized = bleu_score.score / 100.0
        bleu_scores.append(bleu_score_normalized)
        
        rouge_l_scores.append(rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True).score(reference_answer, generated_answer)['rougeL'].fmeasure)
        
        bert_f1_scores.append(score([generated_answer], [reference_answer], lang='en')[2].mean().item())

        # Calculate perplexity
        # encodings = tokenizer(generated_answer, return_tensors='pt')
        # with torch.no_grad():
        #     outputs = model(**encodings, labels=encodings['input_ids'])
        #     loss = outputs.loss
        #     perplexity = torch.exp(loss).item()
        # perplexity_scores.append(perplexity)
        # Check if generated_answer is not empty
        if len(generated_answer) > 0:
            encodings = tokenizer(generated_answer, return_tensors='pt')
            with torch.no_grad():
                outputs = model(**encodings, labels=encodings['input_ids'])
                loss = outputs.loss
                perplexity = torch.exp(loss).item()
            perplexity_scores.append(perplexity)
        else:
            # Handle empty generated_answer, e.g., by appending a default value or skipping
            perplexity_scores.append(0.0)


    # Calculate average scores
    average_bleu = sum(bleu_scores) / len(bleu_scores)
    average_rouge_l = sum(rouge_l_scores) / len(rouge_l_scores)
    average_bert_f1 = sum(bert_f1_scores) / len(bert_f1_scores)
    average_perplexity = sum(perplexity_scores) / len(perplexity_scores)

    # Print results
    print(f'BLEU: {average_bleu}')
    print(f'Rouge-L: {average_rouge_l}')
    print(f'BERTScore: {average_bert_f1}')
    print(f'Perplexity: {average_perplexity}')

    return average_bleu, average_rouge_l, average_bert_f1, average_perplexity

average_bleu, average_rouge_l, average_bert_f1, average_perplexity = evaluate_model_performance(test_dataset, generated_texts)


BLEU: 0.38705383834704016
Rouge-L: 0.5804612348410141
BERTScore: 0.9290164444181654
Perplexity: 21.452082633972168


In [8]:
top_k_values = [10, 25, 40, 75]
beam_sizes = [2, 4, 6, 8]
temperatures = [0.25, 0.5, 0.7, 1.0]

# Varying top_k while keeping beam_size and temperature fixed
fixed_beam_size = 1
fixed_temperature = 0.8

for top_k in top_k_values:
    key = f"top_k={top_k}, fixed_beam_size={fixed_beam_size}, fixed_temperature={fixed_temperature}"
    generated_output_2 = []  # Reset the generated_output_2 for each top_k value

    for i in tqdm(range(0, num_examples, batch_size), total=total_batches, desc=f"Generating text for {key}"):
        batch_indices = range(i, min(i + batch_size, num_examples))
        batch = test_dataset.select(batch_indices)
        prompts = [example['text'].split('\n\n### Response:\n')[0] for example in batch]
        results_2 = pipe(prompts, max_new_tokens=64, top_k=top_k, num_beams=fixed_beam_size, temperature=fixed_temperature, do_sample=True)

        for result in results_2:
            generated_text_2 = result[0]['generated_text']  # Access the first element of the inner list
            generated_output_2.append(generated_text_2)
    
    # generated_texts_2 = [x.split('\n\n### Response:\n')[1] for x in generated_output]
    generated_texts_2 = [x.split('\n\n### Response:\n')[1] if '\n\n### Response:\n' in x else "" for x in generated_output_2]
    
    average_bleu_2, average_rouge_l_2, average_bert_f1_2, average_perplexity_2 = evaluate_model_performance(test_dataset, generated_texts_2)

    # print(f"Results for top_k={top_k}, beam_size={fixed_beam_size}, temperature={fixed_temperature}: BLEU={average_bleu_2}, Rouge-L={average_rouge_l_2}, BERTScore={average_bert_f1_2}, Perplexity={average_perplexity_2}")
    print("----------")

Generating text for top_k=10, fixed_beam_size=1, fixed_temperature=0.8:   0%|          | 0/2 [00:00<?, ?it/s]

Generating text for top_k=10, fixed_beam_size=1, fixed_temperature=0.8: 100%|██████████| 2/2 [01:03<00:00, 31.99s/it]


BLEU: 0.3510541020017856
Rouge-L: 0.5341504758440547
BERTScore: 0.9189764857292175
Perplexity: 22.09542437835976
----------


Generating text for top_k=25, fixed_beam_size=1, fixed_temperature=0.8: 100%|██████████| 2/2 [01:05<00:00, 32.88s/it]


BLEU: 0.33892633976197184
Rouge-L: 0.5354307816020911
BERTScore: 0.9207236060389766
Perplexity: 24.39620429498178
----------


Generating text for top_k=40, fixed_beam_size=1, fixed_temperature=0.8: 100%|██████████| 2/2 [01:04<00:00, 32.13s/it]


BLEU: 0.3477686370804529
Rouge-L: 0.5360981619219823
BERTScore: 0.9177809666704249
Perplexity: 22.665053650184912
----------


Generating text for top_k=75, fixed_beam_size=1, fixed_temperature=0.8: 100%|██████████| 2/2 [01:08<00:00, 34.22s/it]


BLEU: 0.35338924735824695
Rouge-L: 0.5443096117295361
BERTScore: 0.923617321032065
Perplexity: 19.356942353425204
----------


In [9]:
fixed_top_k = 50

for beam_size in beam_sizes:
    key = f"fixed_top_k={fixed_top_k}, beam_size={beam_size}, fixed_temperature={fixed_temperature}"
    generated_output_3 = []  # Reset the generated_output_2 for each top_k value

    for i in tqdm(range(0, num_examples, batch_size), total=total_batches, desc=f"Generating text for {key}"):
        batch_indices = range(i, min(i + batch_size, num_examples))
        batch = test_dataset.select(batch_indices)
        prompts = [example['text'].split('\n\n### Response:\n')[0] for example in batch]
        results_3 = pipe(prompts, max_new_tokens=64, top_k=fixed_top_k, num_beams=beam_size, temperature=fixed_temperature, do_sample=True)

        for result in results_3:
            generated_text_3 = result[0]['generated_text']  # Access the first element of the inner list
            generated_output_3.append(generated_text_3)
    
    # generated_texts_3 = [x.split('\n\n### Response:\n')[1] for x in generated_output]
    generated_texts_3 = [x.split('\n\n### Response:\n')[1] if '\n\n### Response:\n' in x else "" for x in generated_output_3]

    average_bleu_3, average_rouge_l_3, average_bert_f1_3, average_perplexity_3 = evaluate_model_performance(test_dataset, generated_texts_3)
    
    # print(f"Results for top_k={top_k}, beam_size={fixed_beam_size}, temperature={fixed_temperature}: BLEU={average_bleu_2}, Rouge-L={average_rouge_l_2}, BERTScore={average_bert_f1_2}, Perplexity={average_perplexity_2}")
    print("----------")

Generating text for fixed_top_k=50, beam_size=2, fixed_temperature=0.8:   0%|          | 0/2 [00:00<?, ?it/s]

Generating text for fixed_top_k=50, beam_size=2, fixed_temperature=0.8: 100%|██████████| 2/2 [01:07<00:00, 33.59s/it]


BLEU: 0.38100607461065433
Rouge-L: 0.5782171854689289
BERTScore: 0.9302632764533714
Perplexity: 18.543787055545383
----------


Generating text for fixed_top_k=50, beam_size=4, fixed_temperature=0.8: 100%|██████████| 2/2 [01:11<00:00, 35.79s/it]


BLEU: 0.4246186741839497
Rouge-L: 0.5945756920947856
BERTScore: 0.9354002210828993
Perplexity: 21.820143734967267
----------


Generating text for fixed_top_k=50, beam_size=6, fixed_temperature=0.8: 100%|██████████| 2/2 [01:12<00:00, 36.37s/it]


BLEU: 0.3855511581025339
Rouge-L: 0.5651251978481594
BERTScore: 0.9277476734585233
Perplexity: 20.6197026570638
----------


Generating text for fixed_top_k=50, beam_size=8, fixed_temperature=0.8: 100%|██████████| 2/2 [01:17<00:00, 38.55s/it]


BLEU: 0.42297149285432706
Rouge-L: 0.5941208357438542
BERTScore: 0.937205934966052
Perplexity: 21.74965833734583
----------


In [10]:
for temperature in temperatures:
    key = f"fixed_top_k={fixed_top_k}, fixed_beam_size={fixed_beam_size}, temperature={temperature}"
    generated_output_4 = []  # Reset the generated_output_2 for each top_k value

    for i in tqdm(range(0, num_examples, batch_size), total=total_batches, desc=f"Generating text for {key}"):
        batch_indices = range(i, min(i + batch_size, num_examples))
        batch = test_dataset.select(batch_indices)
        prompts = [example['text'].split('\n\n### Response:\n')[0] for example in batch]
        results_4 = pipe(prompts, max_new_tokens=64, top_k=fixed_top_k, num_beams=fixed_beam_size, temperature=temperature, do_sample=True)

        for result in results_4:
            generated_text_4 = result[0]['generated_text']  # Access the first element of the inner list
            generated_output_4.append(generated_text_4)
    
    # generated_texts_3 = [x.split('\n\n### Response:\n')[1] for x in generated_output]
    generated_texts_4 = [x.split('\n\n### Response:\n')[1] if '\n\n### Response:\n' in x else "" for x in generated_output_4]

    average_bleu_4, average_rouge_l_4, average_bert_f1_4, average_perplexity_4 = evaluate_model_performance(test_dataset, generated_texts_4)
    
    # print(f"Results for top_k={top_k}, beam_size={fixed_beam_size}, temperature={fixed_temperature}: BLEU={average_bleu_2}, Rouge-L={average_rouge_l_2}, BERTScore={average_bert_f1_2}, Perplexity={average_perplexity_2}")
    print("----------")

Generating text for fixed_top_k=50, fixed_beam_size=1, temperature=0.25:   0%|          | 0/2 [00:00<?, ?it/s]

Generating text for fixed_top_k=50, fixed_beam_size=1, temperature=0.25: 100%|██████████| 2/2 [01:01<00:00, 30.89s/it]


BLEU: 0.41189804190591417
Rouge-L: 0.595131729722499
BERTScore: 0.9346063159130238
Perplexity: 21.931470959274858
----------


Generating text for fixed_top_k=50, fixed_beam_size=1, temperature=0.5: 100%|██████████| 2/2 [01:03<00:00, 31.96s/it]


BLEU: 0.37135443452174927
Rouge-L: 0.5510741890311018
BERTScore: 0.9246968582824424
Perplexity: 42.8096085301152
----------


Generating text for fixed_top_k=50, fixed_beam_size=1, temperature=0.7: 100%|██████████| 2/2 [01:05<00:00, 32.56s/it]


BLEU: 0.31217096451696497
Rouge-L: 0.5130026279064706
BERTScore: 0.9176002939542135
Perplexity: 19.12071810828315
----------


Generating text for fixed_top_k=50, fixed_beam_size=1, temperature=1.0: 100%|██████████| 2/2 [01:03<00:00, 31.99s/it]


BLEU: 0.32099546072633484
Rouge-L: 0.5092024002859925
BERTScore: 0.9161249796549479
Perplexity: 24.89751501436587
----------
