In [1]:
!pip install -q  torch peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7 accelerate

import torch
import datasets
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline
)
from peft import LoraConfig
from trl import SFTTrainer

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.5/92.5 MB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.4/77.4 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.0/280.0 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m536.7/536.7 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━

## **Create HuggingFace dataset from json**

In [7]:
import json
json_file_path = '/content/drive/MyDrive/policy_AI/Policy_QA.json'

# Reading the JSON file
with open(json_file_path, 'r') as file:
    data = json.load(file)


In [8]:
import pandas as pd
data_list = [{'input': item['input'], 'output': item['output']} for item in data.values()]
df = pd.DataFrame(data_list)
df

Unnamed: 0,input,output
0,What are the twin goals of Singapore’s NAIS 2....,One is excellence: Singapore will selectively ...
1,"To achieve the vision and goals, Singapore wil...",System 1: Activity Drivers (Enablers: Industry...
2,What are the 15 Actions that Singapore will un...,(1) Anchor new AI Centres of Excellence (CoEs)...
3,How will Singapore contribute to international...,(1) Anchoring key bilateral relationships with...
4,How will Singapore intensify the promotion of ...,(1) Make available tools that enterprises can ...
...,...,...
89,What is the World Bank's Policy on Access to I...,The policy outlines the World Bank's commitmen...
90,Why does the World Bank restrict access to cer...,The policy includes exceptions to protect sens...
91,How can someone request information from the W...,Individuals can request information through th...
92,What are the types of information classified u...,Information is classified into categories such...


In [9]:
from datasets import Dataset, DatasetDict, load_dataset
# Create a Dataset object from your data
qa_dataset = Dataset.from_pandas(df)

# Optionally, split your dataset into training and testing sets
qa_dataset = DatasetDict({
    'train': qa_dataset.train_test_split(test_size=0.2)['train'],
    'test': qa_dataset.train_test_split(test_size=0.2)['test']
})

# Push your dataset to the Hugging Face Hub
#qa_dataset.push_to_hub("jiuyuan/policy_AI")


## **Load dataset from HuggingFace**

In [10]:
# Dataset
data_name = "jiuyuan/policy_AI"
training_data = load_dataset(data_name, split="train")
test_data = load_dataset(data_name, split="test")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/445 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/31.2k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/8.78k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/75 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/19 [00:00<?, ? examples/s]

In [11]:
# Model and tokenizer names
base_model_name = "NousResearch/Llama-2-7b-chat-hf"
refined_model = "llama-2-7b-mlabonne-enhanced"

# Tokenizer
llama_tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
llama_tokenizer.pad_token = llama_tokenizer.eos_token
llama_tokenizer.padding_side = "right"  # Fix for fp16

# Quantization Config
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False
)

# Model
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=quant_config,
    device_map={"": 0}
)
base_model.config.use_cache = False
base_model.config.pretraining_tp = 1

tokenizer_config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/179 [00:00<?, ?B/s]

In [None]:
# #Don't use this, extremely slow
# query = """Given my Neo4j database schema:[<Record nodes=[<Node element_id='-8' labels=frozenset({'conference'}) properties={'name': 'conference', 'indexes': [], 'constraints': []}>, <Node element_id='-6' labels=frozenset({'paper'}) properties={'name': 'paper', 'indexes': [], 'constraints': []}>, <Node element_id='-9' labels=frozenset({'affiliation'}) properties={'name': 'affiliation', 'indexes': [], 'constraints': []}>, <Node element_id='-7' labels=frozenset({'author'}) properties={'name': 'author', 'indexes': [], 'constraints': []}>, <Node element_id='-10' labels=frozenset({'domain'}) properties={'name': 'domain', 'indexes': [], 'constraints': []}>] relationships=[<Relationship element_id='-7' nodes=(<Node element_id='-7' labels=frozenset({'author'}) properties={'name': 'author', 'indexes': [], 'constraints': []}>, <Node element_id='-6' labels=frozenset({'paper'}) properties={'name': 'paper', 'indexes': [], 'constraints': []}>) type='author_write_paper' properties={'name': 'author_write_paper'}>, <Relationship element_id='-8' nodes=(<Node element_id='-6' labels=frozenset({'paper'}) properties={'name': 'paper', 'indexes': [], 'constraints': []}>, <Node element_id='-6' labels=frozenset({'paper'}) properties={'name': 'paper', 'indexes': [], 'constraints': []}>) type='paper_cite_paper' properties={'name': 'paper_cite_paper'}>, <Relationship element_id='-10' nodes=(<Node element_id='-6' labels=frozenset({'paper'}) properties={'name': 'paper', 'indexes': [], 'constraints': []}>, <Node element_id='-8' labels=frozenset({'conference'}) properties={'name': 'conference', 'indexes': [], 'constraints': []}>) type='paper_in_venue' properties={'name': 'paper_in_venue'}>, <Relationship element_id='-6' nodes=(<Node element_id='-7' labels=frozenset({'author'}) properties={'name': 'author', 'indexes': [], 'constraints': []}>, <Node element_id='-9' labels=frozenset({'affiliation'}) properties={'name': 'affiliation', 'indexes': [], 'constraints': []}>) type='author_in_affiliation' properties={'name': 'author_in_affiliation'}>, <Relationship element_id='-9' nodes=(<Node element_id='-6' labels=frozenset({'paper'}) properties={'name': 'paper', 'indexes': [], 'constraints': []}>, <Node element_id='-10' labels=frozenset({'domain'}) properties={'name': 'domain', 'indexes': [], 'constraints': []}>) type='paper_in_domain' properties={'name': 'paper_in_domain'}>]>] . Generate cypher query for the question: What are the top 5 main research areas in the 'ACL' conference? Only give cypher query, without any other words."""
# text_gen = pipeline(task="text-generation", model=base_model,return_full_text=False, tokenizer=llama_tokenizer, max_length=1024)
# output = text_gen(f"<s>[INST] {query} [/INST]")
# print(output[0]['generated_text'])

In [12]:
def formatting_prompts_func(example):
    output_texts = []
    for i in range(len(example['input'])):
        text = f"### Question: {example['input'][i]}\n ### Answer: {example['output'][i]}</s>"
        output_texts.append(text)
    return output_texts


## **Fine tune**

下边两个cell 二选一

In [None]:

# Training Params
train_params = TrainingArguments(
    output_dir="./results_modified",
    num_train_epochs=30,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_steps=100,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    report_to="tensorboard"
)

hyperparameter_combinations = [
    #{'lora_alpha': 16, 'lora_dropout': 0.1, 'r': 8, 'bias': 'none'},
    #{'lora_alpha': 16, 'lora_dropout': 0.2, 'r': 8, 'bias': 'none'},
    #{'lora_alpha': 16, 'lora_dropout': 0.1, 'r': 12, 'bias': 'none'},
    #{'lora_alpha': 32, 'lora_dropout': 0.1, 'r': 8, 'bias': 'none'},
    {'lora_alpha': 32, 'lora_dropout': 0.2, 'r': 8, 'bias': 'none'},
    {'lora_alpha': 32, 'lora_dropout': 0.1, 'r': 12, 'bias': 'none'},

]


results = []

for i, params in enumerate(hyperparameter_combinations):
    peft_parameters = LoraConfig(
        lora_alpha=params['lora_alpha'],
        lora_dropout=params['lora_dropout'],
        r=params['r'],
        bias=params['bias'],
        task_type="CAUSAL_LM"
    )

    fine_tuning = SFTTrainer(
        model=base_model,
        train_dataset=training_data,
        formatting_func=formatting_prompts_func,
        peft_config=peft_parameters,
        tokenizer=llama_tokenizer,
        args=train_params
    )
    fine_tuning.train()

    model_save_path = f"/content/drive/MyDrive/policy_AI/HPT_models/LORA/model_{i}"
    fine_tuning.model.save_pretrained(model_save_path)

    results.append({
        'lora_alpha': params['lora_alpha'],
        'lora_dropout': params['lora_dropout'],
        'r': params['r'],
        'bias': params['bias'],
        'training_loss': fine_tuning.state.log_history[-1]['train_loss']
    })

results_df = pd.DataFrame(results)
results_df.to_csv('/content/drive/MyDrive/policy_AI/HPT_models/LORA/hyperparameter_tuning_results.csv', index=False)

NameError: name 'TrainingArguments' is not defined

In [13]:
# LoRA Config
peft_parameters = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=8,
    bias="none",
    task_type="CAUSAL_LM"
)

# Training Params
train_params = TrainingArguments(
    output_dir="./results_modified",
    num_train_epochs=30,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_steps=100,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    report_to="tensorboard"
)

# Trainer
fine_tuning = SFTTrainer(
    model=base_model,
    train_dataset=training_data,
    formatting_func=formatting_prompts_func, #hereeee
    peft_config=peft_parameters,
    # dataset_text_field="text",
    tokenizer=llama_tokenizer,
    args=train_params
)

# Training
fine_tuning.train()

# Save Model
fine_tuning.model.save_pretrained(refined_model)



Map:   0%|          | 0/75 [00:00<?, ? examples/s]

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
25,2.3671


KeyboardInterrupt: 

In [None]:
fine_tuning.state.log_history[-1]['train_loss']

0.5157599896715398

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
fine_tuning.model.save_pretrained("/content/drive/MyDrive/94812/llama-2-7b-policyAI")

In [None]:
# fine_tuning is the trainer
model_path = "/content/drive/MyDrive/94812/llama-2-7b-policyAI"
fine_tuning.save_model(model_path)
fine_tuning.model.config.save_pretrained(model_path+"config.json")

In [None]:
# model_path = "/content/drive/MyDrive/94812/llama-2-7b-policyAI"

# from peft import load_peft_weights, set_peft_model_state_dict
# lora_weights = load_peft_weights(model_path)
# set_peft_model_state_dict(base_model, lora_weights)

## **Generations & Evaluation**

In [None]:
df_test_all=test_data.to_pandas()
df_train=training_data.to_pandas()
df_test_all

Unnamed: 0,input,output
0,Why is international cooperation on AI important?,International cooperation on AI is crucial due...
1,Can you describe the approach taken by the res...,The research team prepared generic interview p...
2,What concerns do critics have regarding the EU...,"Critics, including EU businesses, have express..."
3,How does the European Union classify AI system...,The European Union's AI Act classifies AI syst...
4,What are the Universal Guidelines for Artifici...,The Universal Guidelines include rights to tra...
5,What role do whistleblowers and complaints pla...,Whistleblowers and individual complaints are c...
6,How's UK's approach different from that of the...,"Unlike the European Union (EU), the UK’s appro..."
7,How does Japan's approach to AI regulation dif...,"Japan focuses on a risk-based, agile, and mult..."
8,"What are foundational AI models, and why are t...","Foundational AI models, such as large language..."
9,What additional measures are needed for intern...,The article suggests that more ambitious trade...


In [None]:
df_test_all['output'].apply(lambda x:len(x.split())).describe(), df_train['output'].apply(lambda x:len(x.split())).describe()

(count    19.000000
 mean     40.631579
 std      11.591235
 min      17.000000
 25%      33.000000
 50%      41.000000
 75%      48.500000
 max      61.000000
 Name: output, dtype: float64,
 count      75.000000
 mean       70.000000
 std       135.589205
 min        17.000000
 25%        30.000000
 50%        39.000000
 75%        52.000000
 max      1020.000000
 Name: output, dtype: float64)

In [None]:
from tqdm.notebook import tqdm
import gc
generations =[]
for i in tqdm(range(len(df_test_all)), "generating..."):
  prompt = f"### Question: {df_test_all['input'][i]}\n ### Answer: </s>"
  # Generate predictions
  inputs = llama_tokenizer(prompt, return_tensors='pt')
  inputs = inputs.to("cuda")
  output = fine_tuning.model.generate(**inputs, max_new_tokens=100,temperature=0.2)
  response = llama_tokenizer.decode(output[0].tolist())
  # print(response)
  # break
  generations.append(response)
  del inputs, output
  gc.collect()
  torch.cuda.empty_cache()

generations

generating...:   0%|          | 0/19 [00:00<?, ?it/s]



["<s> ### Question: Why is international cooperation on AI important?\n ### Answer:</s>s International cooperation on AI is crucial due to the collaborative nature of AI research, the large scale of AI systems, and the need for standards and regulations to ensure the safety and security of these systems. It's important for promoting economic growth, competitiveness, and innovation on a global scale</s>\n\nInternational cooperation on AI is crucial due to the collaborative nature of AI research, the large scale of AI",
 '<s> ### Question: Can you describe the approach taken by the research team in preparing for the interviews with academia and industry experts in the AI assessment?\n ### Answer:</s>s the research team prepared for the interviews, they first reviewed the job descriptions and position descriptions of potential interviewees to identify key individuals who could provide insights into the implementation and impact of AI systems in government. They then developed an interview

In [None]:
df_test_all['fine_tuned_llama_generations']=generations
df_test_all.to_csv('results.csv')

In [None]:
# Model
raw_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=quant_config,
    device_map={"": 0}
)
raw_model.config.use_cache = False
raw_model.config.pretraining_tp = 1

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
from tqdm.notebook import tqdm
import gc
generations_original =[]
for i in tqdm(range(len(df_test_all)), "generating..."):
  prompt = f"### Question: {df_test_all['input'][i]}\n ### Answer: </s>"
  # Generate predictions
  inputs = llama_tokenizer(prompt, return_tensors='pt')
  inputs = inputs.to("cuda")
  output = raw_model.generate(**inputs, max_new_tokens=100,temperature=0.2)
  response = llama_tokenizer.decode(output[0].tolist())
  # print(response)
  # break
  generations_original.append(response)
  del inputs, output
  gc.collect()
  torch.cuda.empty_cache()

generations_original

generating...:   0%|          | 0/19 [00:00<?, ?it/s]



['<s> ### Question: Why is international cooperation on AI important?\n ### Answer:</s><s> nobody knows\n\nInternational cooperation on AI is important for several reasons:\n\n1. **Sharing knowledge and resources**: By working together, countries can share their expertise and resources in AI, leading to faster progress and more innovative solutions.\n2. **Addressing ethical and social implications**: AI raises complex ethical and social implications, such as privacy, bias, and job displacement. International cooperation can help address',
 '<s> ### Question: Can you describe the approach taken by the research team in preparing for the interviews with academia and industry experts in the AI assessment?\n ### Answer:</s>: Sure! The research team took a multi-faceted approach to preparing for the interviews with academia and industry experts in the AI assessment. Here are some of the key steps we took:\n\n1. Literature Review: We conducted a comprehensive literature review to identify the

In [None]:
df_test_all['original_llama_generations']=generations_original
df_test_all

Unnamed: 0,input,output,original_llama_generations,fine_tuned_llama_generations
0,Why is international cooperation on AI important?,International cooperation on AI is crucial due...,<s> ### Question: Why is international coopera...,<s> ### Question: Why is international coopera...
1,Can you describe the approach taken by the res...,The research team prepared generic interview p...,<s> ### Question: Can you describe the approac...,<s> ### Question: Can you describe the approac...
2,What concerns do critics have regarding the EU...,"Critics, including EU businesses, have express...",<s> ### Question: What concerns do critics hav...,<s> ### Question: What concerns do critics hav...
3,How does the European Union classify AI system...,The European Union's AI Act classifies AI syst...,<s> ### Question: How does the European Union ...,<s> ### Question: How does the European Union ...
4,What are the Universal Guidelines for Artifici...,The Universal Guidelines include rights to tra...,<s> ### Question: What are the Universal Guide...,<s> ### Question: What are the Universal Guide...
5,What role do whistleblowers and complaints pla...,Whistleblowers and individual complaints are c...,<s> ### Question: What role do whistleblowers ...,<s> ### Question: What role do whistleblowers ...
6,How's UK's approach different from that of the...,"Unlike the European Union (EU), the UK’s appro...",<s> ### Question: How's UK's approach differen...,<s> ### Question: How's UK's approach differen...
7,How does Japan's approach to AI regulation dif...,"Japan focuses on a risk-based, agile, and mult...",<s> ### Question: How does Japan's approach to...,<s> ### Question: How does Japan's approach to...
8,"What are foundational AI models, and why are t...","Foundational AI models, such as large language...",<s> ### Question: What are foundational AI mod...,<s> ### Question: What are foundational AI mod...
9,What additional measures are needed for intern...,The article suggests that more ambitious trade...,<s> ### Question: What additional measures are...,<s> ### Question: What additional measures are...


In [None]:
df_test_all['original_llama_generations']= df_test_all['original_llama_generations'].apply(lambda x: x.split('Answer:', 1)[-1] if 'Answer:' in x else x)
df_test_all['original_llama_generations']= df_test_all['original_llama_generations'].apply(lambda x: x.replace('</s>', '').replace('<s>', '').replace('</s>s', '').replace('<s/>', '').replace('s ', ''))
df_test_all['fine_tuned_llama_generations']= df_test_all['fine_tuned_llama_generations'].apply(lambda x: x.split('Answer:', 1)[-1] if 'Answer:' in x else x)
df_test_all['fine_tuned_llama_generations']= df_test_all['fine_tuned_llama_generations'].apply(lambda x: x.replace('</s>', '').replace('<s>', '').replace('</s>s', '').replace('<s/>', '').replace('s ', ''))
df_test_all

Unnamed: 0,input,output,original_llama_generations,fine_tuned_llama_generations
0,Why is international cooperation on AI important?,International cooperation on AI is crucial due...,nobody knows\n\nInternational cooperation on ...,International cooperation on AI icrucial due t...
1,Can you describe the approach taken by the res...,The research team prepared generic interview p...,: Sure! The research team took a multi-faceted...,"the research team prepared for the interviews,..."
2,What concerns do critics have regarding the EU...,"Critics, including EU businesses, have express...",nobody likea know-it-all\n\nThe European Unio...,"Critics, including EU businesses, have express..."
3,How does the European Union classify AI system...,The European Union's AI Act classifies AI syst...,nobody\n\nThe European Union'AI Act classifie...,the European Union'(EU) first comprehensive le...
4,What are the Universal Guidelines for Artifici...,The Universal Guidelines include rights to tra...,01. Transparency: AI systemshould be transpare...,"safeguarding the rightof individuals, effectiv..."
5,What role do whistleblowers and complaints pla...,Whistleblowers and individual complaints are c...,"nobody likea tattle-tale, but whistleblowerar...",Whistleblowerand complaintplay a crucial role ...
6,How's UK's approach different from that of the...,"Unlike the European Union (EU), the UK’s appro...",...the UK'approach to data protection idiffere...,the European Union’(EU) approach to AI ifocuse...
7,How does Japan's approach to AI regulation dif...,"Japan focuses on a risk-based, agile, and mult...","nobody\n\nJapan, the European Commission, and...","Japan focuseon a risk-based, agile, and multis..."
8,"What are foundational AI models, and why are t...","Foundational AI models, such as large language...",nobody\n\nFoundational AI modelare the basic ...,"Foundational AI models, such alarge language m..."
9,What additional measures are needed for intern...,The article suggests that more ambitious trade...,... international cooperation on foundational ...,The implementation of explicit ethical conside...


In [None]:
df_test_all.to_csv('results.csv')

## **Metrics**

In [None]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"
!pip install nltk rouge -q
from nltk.translate.bleu_score import sentence_bleu
from rouge import Rouge


In [None]:
import numpy as np
import warnings
warnings.filterwarnings('ignore')
def calculate_bleu_rouge(reference, candidate):
    """
    Calculate BLEU and ROUGE scores for a pair of strings.

    :param reference: The reference string (ground truth).
    :param candidate: The candidate string (generated text).
    :return: A dictionary containing BLEU and ROUGE scores.
    """
    # Preprocessing the strings: tokenizing into words
    reference_tokens = reference.split()
    candidate_tokens = candidate.split()

    # Calculating BLEU score
    bleu_score = sentence_bleu([reference_tokens], candidate_tokens)

    # Calculating ROUGE score
    rouge = Rouge()
    rouge_score = rouge.get_scores(candidate, reference)[0]

    return np.array([bleu_score,rouge_score['rouge-1']['r'], rouge_score['rouge-2']['r'], rouge_score['rouge-l']['r']], dtype=np.float32)

# Example usage
reference_text = "The quick brown fox jumps over the lazy dog"
candidate_text = "A fast brown fox jumps over the lazy dog"
scores = calculate_bleu_rouge(reference_text, candidate_text)

print(scores)


[0.7259795 0.7777778 0.75      0.7777778]


In [None]:
raw_model_metrics = df_test_all.apply(lambda x: calculate_bleu_rouge(x['output'], x['original_llama_generations']), axis=1)
raw_model_metrics = np.array(raw_model_metrics).mean(-1)
raw_model_metrics

array([0.00672832, 0.21382335, 0.03586512, 0.17670654], dtype=float32)

In [None]:
fine_tuned_model_metrics = df_test_all.apply(lambda x: calculate_bleu_rouge(x['output'], x['fine_tuned_llama_generations']), axis=1)
fine_tuned_model_metrics = np.array(fine_tuned_model_metrics).mean(-1)
fine_tuned_model_metrics

array([0.12160552, 0.39575085, 0.21214108, 0.35765964], dtype=float32)

In [None]:
df_metrics = pd.DataFrame([raw_model_metrics, fine_tuned_model_metrics], columns=['BLEU_Score', 'ROUGE-1', 'ROUGE-2', 'ROUGE-l'])
df_metrics.index  = ['Raw_llama2', 'Fine_tuned_llama2']
df_metrics

Unnamed: 0,BLEU_Score,ROUGE-1,ROUGE-2,ROUGE-l
Raw_llama2,0.006728,0.213823,0.035865,0.176707
Fine_tuned_llama2,0.121606,0.395751,0.212141,0.35766
