In [1]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)
import os, torch
from datasets import load_dataset
from trl import SFTTrainer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Hugging Face Hub id of the pretrained checkpoint that gets fine-tuned.
base_model = "microsoft/phi-2"

# Directory the fine-tuned model and tokenizer are saved to after training.
new_model = "Phi-2-fine-tuned"

In [2]:
# Fetch the Alpaca instruction dataset.
whole_dataset = load_dataset("tatsu-lab/alpaca")

# Hold out a tiny slice (test_size=0.0005) of the 'train' split for evaluation;
# the fixed seed keeps the partition identical from run to run.
split_datasets = whole_dataset["train"].train_test_split(seed=42, test_size=0.0005)

# Expose both partitions under the names the rest of the notebook uses.
train_dataset, test_dataset = split_datasets["train"], split_datasets["test"]

In [4]:
# Quantization settings for the base model (Phi-2): 4-bit NF4 weights,
# bfloat16 compute dtype, double quantization switched off.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=False,
    load_in_4bit=True,
)

# Load the quantized base model and let `device_map="auto"` place it.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    trust_remote_code=True,
    device_map="auto",
    quantization_config=bnb_config,
)

# Config flags set before training.
model.config.use_cache = False
model.config.pretraining_tp = 1

# Tokenizer for the same checkpoint; the EOS token doubles as the padding token.
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

Loading checkpoint shards: 100%|██████████| 2/2 [00:05<00:00,  2.80s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
# Prepare the quantized model for k-bit training before adapters are attached.
model = prepare_model_for_kbit_training(model)

# LoRA settings: rank 16, alpha 16, dropout 0.05, no bias terms trained,
# adapters attached to the listed projection / MLP modules.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=["q_proj", "k_proj", "v_proj", "dense", "fc1", "fc2"],
)

# Wrap the model with the LoRA adapters, then display its memory footprint.
model = get_peft_model(model, peft_config)
model.get_memory_footprint()

2413459456

In [6]:


# Hyperparameters
training_arguments = TrainingArguments(
    # Output location and checkpointing
    output_dir="./phi-2-role-play",
    save_strategy="epoch",
    # Schedule and batch shape
    num_train_epochs=1,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=1,
    group_by_length=True,
    # Optimizer
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    # Mixed-precision flags (both left off)
    fp16=False,
    bf16=False,
    # Logging and progress reporting
    logging_strategy="steps",
    logging_steps=100,
    disable_tqdm=False,
    report_to="none",
)



In [7]:
# Supervised fine-tuning trainer: reads the "text" column of the training set,
# with packing disabled and max_seq_length set to 2048.
trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    train_dataset=train_dataset,
    dataset_text_field="text",
    max_seq_length=2048,
    packing=False,
    tokenizer=tokenizer,
    peft_config=peft_config,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [8]:
# Run fine-tuning with the trainer configured in the previous cell.
trainer.train()
# Save the trained model, then its tokenizer, into the `new_model` directory.
# The tokenizer call stays last so its return value (the written file paths)
# is what the cell displays.
trainer.model.save_pretrained(new_model)
trainer.tokenizer.save_pretrained(new_model)



Step,Training Loss
100,0.9617
200,0.7688
300,0.7484
400,0.7232
500,0.7603
600,0.6939
700,0.7536
800,0.7575
900,0.6921
1000,0.723


('Phi-2-fine-tuned/tokenizer_config.json',
 'Phi-2-fine-tuned/special_tokens_map.json',
 'Phi-2-fine-tuned/vocab.json',
 'Phi-2-fine-tuned/merges.txt',
 'Phi-2-fine-tuned/added_tokens.json',
 'Phi-2-fine-tuned/tokenizer.json')

In [3]:
# Reload the fine-tuned model and its tokenizer from the directory written after training.
model_path = "Phi-2-fine-tuned"
finetuned_model = AutoModelForCausalLM.from_pretrained(model_path)
finetuned_tokenizer = AutoTokenizer.from_pretrained(model_path)

Loading checkpoint shards: 100%|██████████| 2/2 [00:12<00:00,  6.32s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
from tqdm.auto import tqdm

# Silence transformers warnings/info during generation.
logging.set_verbosity_error()

# Text-generation pipeline around the fine-tuned model on device index 0.
# Decoding options (top_k, num_beams, temperature, ...) are passed per call
# by the cells that need them rather than being fixed here.
pipe = pipeline(
    task="text-generation",
    model=finetuned_model,
    tokenizer=finetuned_tokenizer,
    device=0,
)

batch_size = 20

num_examples = len(test_dataset)
print(num_examples)
total_batches = (num_examples + batch_size - 1) // batch_size  # ceiling division
generated_output = []  # full generated strings (prompt + completion), in test-set order

for i in tqdm(range(0, num_examples, batch_size), total=total_batches, desc="Generating text"):
    batch_indices = range(i, min(i + batch_size, num_examples))
    batch = test_dataset.select(batch_indices)
    # The prompt is everything before the response marker, so the marker itself
    # is not part of the prompt and has to be produced by the model.
    prompts = [example['text'].split('\n\n### Response:\n')[0] for example in batch]
    # Generate text for the batch
    results = pipe(prompts, max_new_tokens=64)

    for result in results:
        # Each result is a list of candidates; keep the first one.
        generated_text = result[0]['generated_text']
        generated_output.append(generated_text)

        print(f"Generated Text: {generated_text}")
        print("------")

27


Generating text:  50%|█████     | 1/2 [00:50<00:50, 50.95s/it]

Generated Text: Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
What would be the best type of exercise for a person who has arthritis?

### Response:
Low-impact exercises such as swimming, walking, and cycling are great for people with arthritis. These exercises are gentle on the joints and can help to reduce pain and stiffness. Strength training exercises such as yoga and Pilates can also be beneficial for people with arthritis, as they can help to
------
Generated Text: Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Calculate the atomic mass for lithium.

### Response:
The atomic mass for lithium is 6.94. This is calculated by adding the number of protons (1) and neutrons (6) in the lithium atom. The atomic mass of an element is the average mass of all the isotopes of that element, taking into account their relative
------
Generated

Generating text: 100%|██████████| 2/2 [01:11<00:00, 35.74s/it]

Generated Text: Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
What is arithmetic mean and how to calculate it?

### Response:
The arithmetic mean is a measure of central tendency that is calculated by adding up all the values in a set of data and then dividing the sum by the number of values in the set. It is also known as the average. To calculate the arithmetic mean, you need to add up all the
------
Generated Text: Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Describe a method for detecting spam emails.

### Response:
A method for detecting spam emails involves using a combination of techniques such as content analysis, machine learning, and natural language processing. Content analysis involves analyzing the content of the email to identify keywords and phrases that are commonly associated with spam. Machine learning algorithms




In [5]:
# Keep only the text after the response marker. The prompts end before the marker,
# so a generation is not guaranteed to contain it; such a generation maps to ""
# instead of raising IndexError (the same rule the sweep cells below apply).
generated_texts = [x.split('\n\n### Response:\n')[1] if '\n\n### Response:\n' in x else "" for x in generated_output]

In [6]:
def human_evaluation_print(dataset, generated_responses):
    """Print every reference answer next to its generated answer for manual review.

    Args:
        dataset: Indexable collection whose items expose an ``'output'`` entry
            holding the reference answer.
        generated_responses: Generated answers, aligned index-for-index with
            ``dataset``.

    Raises:
        AssertionError: If the two collections differ in length.
    """
    # Both collections must line up one-to-one.
    assert len(dataset) == len(generated_responses), "The number of generated responses must match the number of dataset entries"

    for idx, generated_answer in enumerate(generated_responses):
        reference_answer = dataset[idx]['output']
        print(idx)
        print(f"Reference answer:\n{reference_answer}\n\nGenerated answer:\n{generated_answer}\n")

# Show reference vs. generated answers for the held-out examples.
human_evaluation_print(dataset=test_dataset, generated_responses=generated_texts)

0
Reference answer:
For someone with arthritis, the best type of exercise would be low-impact activities like yoga, swimming, or walking. These exercises provide the benefits of exercise without exacerbating the symptoms of arthritis.

Generated answer:
Low-impact exercises such as swimming, walking, and cycling are great for people with arthritis. These exercises are gentle on the joints and can help to reduce pain and stiffness. Strength training exercises such as yoga and Pilates can also be beneficial for people with arthritis, as they can help to

1
Reference answer:
The atomic mass for lithium is 6.941 u (units). This is determined from the number of protons and neutrons in the nucleus of a lithium atom, which is 3 protons and 4 neutrons. Each proton and neutron has a mass of 1.007 u, resulting in a total mass of 6.941 u.

Generated answer:
The atomic mass for lithium is 6.94. This is calculated by adding the number of protons (1) and neutrons (6) in the lithium atom. The atomic 

**Scores for Each Sample:**

1. **Sample 1 (Arthritis Exercise):**
   - Grammatical Correctness: 1
   - Coherence: 1
   - Correctness of Answer: 0.8 (The generated answer mentions cycling, which is not typically considered a low-impact exercise for arthritis.)
   - **Average: 0.93**

2. **Sample 2 (Atomic Mass of Lithium):**
   - Grammatical Correctness: 0.8 (The generated answer has a repetitive sentence.)
   - Coherence: 0.8 (The generated answer is not coherent due to the repetition.)
   - Correctness of Answer: 1
   - **Average: 0.87**

3. **Sample 3 (ASCII Characters):**
   - Grammatical Correctness: 1
   - Coherence: 1
   - Correctness of Answer: 0 (The generated answer is incorrect.)
   - **Average: 0.67**

4. **Sample 4 (Fear Description):**
   - Grammatical Correctness: 1
   - Coherence: 1
   - Correctness of Answer: 1
   - **Average: 1**

5. **Sample 5 (Skateboard Promotion):**
   - Grammatical Correctness: 1
   - Coherence: 1
   - Correctness of Answer: 0.9 (The generated answer adds an incomplete sentence about social media.)
   - **Average: 0.97**

6. **Sample 6 (Picnic Vacation):**
   - Grammatical Correctness: 1
   - Coherence: 0.8 (The generated answer is cut off, making it less coherent.)
   - Correctness of Answer: 1
   - **Average: 0.93**

7. **Sample 7 (Big Data):**
   - Grammatical Correctness: 1
   - Coherence: 0.8 (The generated answer is incomplete and less coherent.)
   - Correctness of Answer: 0.8 (The generated answer misses the aspect of customer interaction.)
   - **Average: 0.87**

8. **Sample 8 (Wild Goose Chase):**
   - Grammatical Correctness: 1
   - Coherence: 1
   - Correctness of Answer: 1
   - **Average: 1**

9. **Sample 9 (Journals):**
   - Grammatical Correctness: 1
   - Coherence: 1
   - Correctness of Answer: 1
   - **Average: 1**

10. **Sample 10 (Countries):**
    - Grammatical Correctness: 1
    - Coherence: 1
    - Correctness of Answer: 1
    - **Average: 1**

11. **Sample 11 (Sailboat):**
    - Grammatical Correctness: 1
    - Coherence: 1
    - Correctness of Answer: 0.8 (The generated answer adds unnecessary details.)
    - **Average: 0.93**

12. **Sample 12 (Humor Importance):**
    - Grammatical Correctness: 1
    - Coherence: 1
    - Correctness of Answer: 1
    - **Average: 1**

13. **Sample 13 (Hypotenuse Length):**
    - Grammatical Correctness: 1
    - Coherence: 1
    - Correctness of Answer: 0 (The generated answer provides an incorrect value.)
    - **Average: 0.67**

14. **Sample 14 (Symphony Orchestra):**
    - Grammatical Correctness: 1
    - Coherence: 1
    - Correctness of Answer: 0.8 (The generated answer adds unnecessary details.)
    - **Average: 0.93**

15. **Sample 15 (GPT Description):**
    - Grammatical Correctness: 1
    - Coherence: 1
    - Correctness of Answer: 1
    - **Average: 1**

16. **Sample 16 (Paris):**
    - Grammatical Correctness: 1
    - Coherence: 1
    - Correctness of Answer: 0.8 (The generated answer adds unnecessary details.)
    - **Average: 0.93**

17. **Sample 17 (Understanding Customer's Needs):**
    - Grammatical Correctness: 1
    - Coherence: 1
    - Correctness of Answer: 0.8 (The generated answer is less specific and comprehensive.)
    - **Average: 0.93**

18. **Sample 18 (Cooking Pork Chops):**
    - Grammatical Correctness: 1
    - Coherence: 1
    - Correctness of Answer: 1
    - **Average: 1**

19. **Sample 19 (Customer Service):**
    - Grammatical Correctness: 1
    - Coherence: 0.8 (The generated answer is a bit disjointed.)
    - Correctness of Answer: 0.8 (The generated answer doesn't address the original query about order arrival.)
    - **Average: 0.87**

20. **Sample 20 (Arithmetic Mean):**
    - Grammatical Correctness: 1
    - Coherence: 1
    - Correctness of Answer: 1
    - **Average: 1**

21. **Sample 21 (Detecting Spam Emails):**
    - Grammatical Correctness: 1
    - Coherence: 1
    - Correctness of Answer: 1
    - **Average: 1**

22. **Sample 22 (Sum Program):**
    - Grammatical Correctness: 1
    - Coherence: 1
    - Correctness of Answer: 1
    - **Average: 1**

23. **Sample 23 (Reducing Distractions):**
    - Grammatical Correctness: 1
    - Coherence: 1
    - Correctness of Answer: 1
    - **Average: 1**

24. **Sample 24 (Sum Algorithm):**
    - Grammatical Correctness: 1
    - Coherence: 1
    - Correctness of Answer: 1
    - **Average: 1**

25. **Sample 25 (Government):**
    - Grammatical Correctness: 1
    - Coherence: 1
    - Correctness of Answer: 1
    - **Average: 1**

26. **Sample 26 (Software Development Questions):**
    - Grammatical Correctness: 1
    - Coherence: 1
    - Correctness of Answer: 0.8 (The generated answer is less comprehensive.)
    - **Average: 0.93**

**Overall Average Score:** 0.94

In [7]:
from sacrebleu import corpus_bleu
from rouge_score import rouge_scorer
from bert_score import score
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# GPT-2 model and tokenizer used for the perplexity metric below.
# NOTE: this rebinds the notebook-level names `model` and `tokenizer`.
model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

def evaluate_model_performance(dataset, generated_responses):
    """Compute, print and return average automatic metrics for generated responses.

    Every generated response is compared with the ``'output'`` entry of the
    dataset item at the same index. BLEU and ROUGE-L (F-measure) are computed
    per sample and averaged. BERTScore F1 is obtained from one batched call.
    Perplexity is ``exp(loss)`` of the module-level GPT-2 ``model`` /
    ``tokenizer`` evaluated on the generated text alone.

    Args:
        dataset: Indexable collection whose items expose an ``'output'`` entry
            holding the reference answer.
        generated_responses: Generated answers, aligned index-for-index with
            ``dataset``.

    Returns:
        Tuple ``(average_bleu, average_rouge_l, average_bert_f1,
        average_perplexity)``. The perplexity average only covers responses
        that tokenize to at least two GPT-2 tokens and is NaN when there is
        no such response.

    Raises:
        AssertionError: If the two collections differ in length.
        ValueError: If ``dataset`` is empty.
    """
    # Make sure you have the correct number of responses
    assert len(dataset) == len(generated_responses), "The number of generated responses must match the number of dataset entries"
    if len(dataset) == 0:
        raise ValueError("Cannot evaluate an empty dataset")

    references = [dataset[i]['output'] for i in range(len(dataset))]
    candidates = list(generated_responses)

    # The scorer holds no per-sample state, so build it once instead of per iteration.
    rouge_l = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

    bleu_scores = []
    rouge_l_scores = []
    perplexity_scores = []

    for reference_answer, generated_answer in zip(references, candidates):
        bleu_scores.append(corpus_bleu([generated_answer], [[reference_answer]]).score)
        rouge_l_scores.append(rouge_l.score(reference_answer, generated_answer)['rougeL'].fmeasure)

        # Calculate perplexity
        encodings = tokenizer(generated_answer, return_tensors='pt')
        # With fewer than two tokens there is no next-token target for the
        # language-modelling loss, so the sample is left out of the average.
        if encodings['input_ids'].shape[-1] < 2:
            continue
        with torch.no_grad():
            outputs = model(**encodings, labels=encodings['input_ids'])
        perplexity_scores.append(torch.exp(outputs.loss).item())

    # One batched call returns one F1 value per pair, avoiding a BERTScore
    # model load for every single sample.
    bert_f1_scores = score(candidates, references, lang='en')[2].tolist()

    # Calculate average scores
    average_bleu = sum(bleu_scores) / len(bleu_scores)
    average_rouge_l = sum(rouge_l_scores) / len(rouge_l_scores)
    average_bert_f1 = sum(bert_f1_scores) / len(bert_f1_scores)
    if perplexity_scores:
        average_perplexity = sum(perplexity_scores) / len(perplexity_scores)
    else:
        average_perplexity = float('nan')

    # Print results
    print(f'BLEU: {average_bleu}')
    print(f'Rouge-L: {average_rouge_l}')
    print(f'BERTScore: {average_bert_f1}')
    print(f'Perplexity: {average_perplexity}')

    return average_bleu, average_rouge_l, average_bert_f1, average_perplexity

# Score the generations produced with the pipeline's default decoding settings.
(
    average_bleu,
    average_rouge_l,
    average_bert_f1,
    average_perplexity,
) = evaluate_model_performance(dataset=test_dataset, generated_responses=generated_texts)


BLEU: 38.65230941540421
Rouge-L: 0.577791674336474
BERTScore: 0.927201858273259
Perplexity: 21.4041663452431


In [8]:
# Values held constant whenever a sweep varies a different decoding parameter.
fixed_temperature = 0.8
fixed_beam_size = 1

# Candidate values explored by the decoding-parameter sweeps.
temperatures = [0.25, 0.5, 0.7, 1.0]
beam_sizes = [2, 4, 6, 8]
top_k_values = [10, 25, 40, 75]

# Sweep top_k (sampling enabled) while beam size and temperature stay fixed.
# Metrics of every setting are kept so they can be compared after the loop.
top_k_results = {}  # top_k -> (BLEU, ROUGE-L, BERTScore F1, perplexity) averages

for top_k in top_k_values:
    key = f"top_k={top_k}, fixed_beam_size={fixed_beam_size}, fixed_temperature={fixed_temperature}"
    generated_output_2 = []  # Reset the generated_output_2 for each top_k value

    # do_sample=True draws random numbers; re-seeding per setting makes a rerun
    # of this cell reproduce the same generations and metrics.
    torch.manual_seed(42)

    for i in tqdm(range(0, num_examples, batch_size), total=total_batches, desc=f"Generating text for {key}"):
        batch_indices = range(i, min(i + batch_size, num_examples))
        batch = test_dataset.select(batch_indices)
        # Prompt = everything before the response marker.
        prompts = [example['text'].split('\n\n### Response:\n')[0] for example in batch]
        results_2 = pipe(prompts, max_new_tokens=64, top_k=top_k, num_beams=fixed_beam_size, temperature=fixed_temperature, do_sample=True)

        for result in results_2:
            generated_text_2 = result[0]['generated_text']  # Access the first element of the inner list
            generated_output_2.append(generated_text_2)

    # Keep the text after the response marker; "" when the model never emitted it.
    generated_texts_2 = [x.split('\n\n### Response:\n')[1] if '\n\n### Response:\n' in x else "" for x in generated_output_2]

    average_bleu_2, average_rouge_l_2, average_bert_f1_2, average_perplexity_2 = evaluate_model_performance(test_dataset, generated_texts_2)
    top_k_results[top_k] = (average_bleu_2, average_rouge_l_2, average_bert_f1_2, average_perplexity_2)

    print("----------")

Generating text for top_k=10, fixed_beam_size=1, fixed_temperature=0.8:   0%|          | 0/2 [00:00<?, ?it/s]

Generating text for top_k=10, fixed_beam_size=1, fixed_temperature=0.8: 100%|██████████| 2/2 [01:04<00:00, 32.46s/it]


BLEU: 37.63149152914763
Rouge-L: 0.5559819586921096
BERTScore: 0.9282636002258018
Perplexity: 19.96817938486735
----------


Generating text for top_k=25, fixed_beam_size=1, fixed_temperature=0.8: 100%|██████████| 2/2 [01:04<00:00, 32.25s/it]


BLEU: 36.18796545255251
Rouge-L: 0.5396540050324894
BERTScore: 0.9271815926940353
Perplexity: 19.60500263284754
----------


Generating text for top_k=40, fixed_beam_size=1, fixed_temperature=0.8: 100%|██████████| 2/2 [01:05<00:00, 32.78s/it]


BLEU: 36.687082103450734
Rouge-L: 0.5515713485611292
BERTScore: 0.9291376206609938
Perplexity: 20.79656198289659
----------


Generating text for top_k=75, fixed_beam_size=1, fixed_temperature=0.8: 100%|██████████| 2/2 [01:04<00:00, 32.11s/it]


BLEU: 36.34214353756424
Rouge-L: 0.5405646073525433
BERTScore: 0.9239199912106549
Perplexity: 23.381509339367902
----------


In [9]:
fixed_top_k = 50

# Sweep the number of beams (sampling enabled) while top_k and temperature stay fixed.
# Metrics of every setting are kept so they can be compared after the loop.
beam_size_results = {}  # beam_size -> (BLEU, ROUGE-L, BERTScore F1, perplexity) averages

for beam_size in beam_sizes:
    key = f"fixed_top_k={fixed_top_k}, beam_size={beam_size}, fixed_temperature={fixed_temperature}"
    generated_output_3 = []  # Reset the generated_output_3 for each beam_size value

    # do_sample=True draws random numbers; re-seeding per setting makes a rerun
    # of this cell reproduce the same generations and metrics.
    torch.manual_seed(42)

    for i in tqdm(range(0, num_examples, batch_size), total=total_batches, desc=f"Generating text for {key}"):
        batch_indices = range(i, min(i + batch_size, num_examples))
        batch = test_dataset.select(batch_indices)
        # Prompt = everything before the response marker.
        prompts = [example['text'].split('\n\n### Response:\n')[0] for example in batch]
        results_3 = pipe(prompts, max_new_tokens=64, top_k=fixed_top_k, num_beams=beam_size, temperature=fixed_temperature, do_sample=True)

        for result in results_3:
            generated_text_3 = result[0]['generated_text']  # Access the first element of the inner list
            generated_output_3.append(generated_text_3)

    # Keep the text after the response marker; "" when the model never emitted it.
    generated_texts_3 = [x.split('\n\n### Response:\n')[1] if '\n\n### Response:\n' in x else "" for x in generated_output_3]

    average_bleu_3, average_rouge_l_3, average_bert_f1_3, average_perplexity_3 = evaluate_model_performance(test_dataset, generated_texts_3)
    beam_size_results[beam_size] = (average_bleu_3, average_rouge_l_3, average_bert_f1_3, average_perplexity_3)

    print("----------")

Generating text for fixed_top_k=50, beam_size=2, fixed_temperature=0.8:   0%|          | 0/2 [00:00<?, ?it/s]

Generating text for fixed_top_k=50, beam_size=2, fixed_temperature=0.8: 100%|██████████| 2/2 [01:07<00:00, 33.55s/it]


BLEU: 39.67188808929785
Rouge-L: 0.5848735498976347
BERTScore: 0.9295942385991415
Perplexity: 20.6656516039813
----------


Generating text for fixed_top_k=50, beam_size=4, fixed_temperature=0.8: 100%|██████████| 2/2 [01:10<00:00, 35.09s/it]


BLEU: 42.57277846736364
Rouge-L: 0.5977791788493919
BERTScore: 0.9361406454333553
Perplexity: 21.49953254063924
----------


Generating text for fixed_top_k=50, beam_size=6, fixed_temperature=0.8: 100%|██████████| 2/2 [01:14<00:00, 37.15s/it]


BLEU: 38.98229855918357
Rouge-L: 0.5742071191754885
BERTScore: 0.9285258143036453
Perplexity: 21.440537523340296
----------


Generating text for fixed_top_k=50, beam_size=8, fixed_temperature=0.8: 100%|██████████| 2/2 [01:16<00:00, 38.20s/it]


BLEU: 39.768879848971125
Rouge-L: 0.5881864172496409
BERTScore: 0.931098425829852
Perplexity: 20.473666703259504
----------


In [10]:
# Sweep the sampling temperature while top_k and beam size stay fixed.
# Metrics of every setting are kept so they can be compared after the loop.
temperature_results = {}  # temperature -> (BLEU, ROUGE-L, BERTScore F1, perplexity) averages

for temperature in temperatures:
    key = f"fixed_top_k={fixed_top_k}, fixed_beam_size={fixed_beam_size}, temperature={temperature}"
    generated_output_4 = []  # Reset the generated_output_4 for each temperature value

    # do_sample=True draws random numbers; re-seeding per setting makes a rerun
    # of this cell reproduce the same generations and metrics.
    torch.manual_seed(42)

    for i in tqdm(range(0, num_examples, batch_size), total=total_batches, desc=f"Generating text for {key}"):
        batch_indices = range(i, min(i + batch_size, num_examples))
        batch = test_dataset.select(batch_indices)
        # Prompt = everything before the response marker.
        prompts = [example['text'].split('\n\n### Response:\n')[0] for example in batch]
        results_4 = pipe(prompts, max_new_tokens=64, top_k=fixed_top_k, num_beams=fixed_beam_size, temperature=temperature, do_sample=True)

        for result in results_4:
            generated_text_4 = result[0]['generated_text']  # Access the first element of the inner list
            generated_output_4.append(generated_text_4)

    # Keep the text after the response marker; "" when the model never emitted it.
    generated_texts_4 = [x.split('\n\n### Response:\n')[1] if '\n\n### Response:\n' in x else "" for x in generated_output_4]

    average_bleu_4, average_rouge_l_4, average_bert_f1_4, average_perplexity_4 = evaluate_model_performance(test_dataset, generated_texts_4)
    temperature_results[temperature] = (average_bleu_4, average_rouge_l_4, average_bert_f1_4, average_perplexity_4)

    print("----------")

Generating text for fixed_top_k=50, fixed_beam_size=1, temperature=0.25:   0%|          | 0/2 [00:00<?, ?it/s]

Generating text for fixed_top_k=50, fixed_beam_size=1, temperature=0.25: 100%|██████████| 2/2 [01:04<00:00, 32.05s/it]


BLEU: 38.79935954940628
Rouge-L: 0.5841967669626138
BERTScore: 0.9279305647920679
Perplexity: 21.20727077236882
----------


Generating text for fixed_top_k=50, fixed_beam_size=1, temperature=0.5: 100%|██████████| 2/2 [01:04<00:00, 32.11s/it]


BLEU: 37.18471176330375
Rouge-L: 0.5560767714169326
BERTScore: 0.9256197456960324
Perplexity: 22.537801371680366
----------


Generating text for fixed_top_k=50, fixed_beam_size=1, temperature=0.7: 100%|██████████| 2/2 [01:03<00:00, 31.52s/it]


BLEU: 36.2094992745774
Rouge-L: 0.5576470343600868
BERTScore: 0.9268559173301414
Perplexity: 22.557431839130544
----------


Generating text for fixed_top_k=50, fixed_beam_size=1, temperature=1.0: 100%|██████████| 2/2 [01:06<00:00, 33.15s/it]


BLEU: 33.941399942768655
Rouge-L: 0.5316848877072834
BERTScore: 0.9161695131549129
Perplexity: 25.383931513185853
----------


Reference: https://www.kaggle.com/code/kingabzpro/fine-tuning-phi-2