## Install required libraries


In [None]:
!pip install transformers datasets evaluate bert_score sacrebleu rouge_score numpy

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
import evaluate
import numpy as np


## Load the model and tokenizer






In [None]:
import os

from huggingface_hub import login

# Do not paste an access token into the notebook: it would be stored with the
# file and could leak when the notebook is shared. Read the token from the
# HF_TOKEN environment variable instead; when it is unset or empty, pass None
# so that login() asks for the token interactively.
login(token=os.environ.get("HF_TOKEN") or None)

In [None]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Name (or path) handed to from_pretrained; replace it with your own model.
model_name = "user/model_name"

# Load the causal language model, then move it to the selected device.
model = AutoModelForCausalLM.from_pretrained(model_name)
model = model.to(device)

# The tokenizer is loaded from the same name so it matches the model.
tokenizer = AutoTokenizer.from_pretrained(model_name)

## Load the test dataset

In [None]:
# Name (or path) of the evaluation dataset; replace it with your own.
dataset_name = "user/dataset"

# No split is requested here; later cells index the result with 'test'.
dataset = load_dataset(path=dataset_name)

## Prepare the dataset

In [None]:
# Change this part according to your own dataset format.
#
# Each 'human' turn is paired with the 'gpt' turn that follows it, so that
# input_texts[i] and references[i] always belong together. Collecting the two
# lists independently would silently misalign them as soon as one conversation
# has an unanswered prompt or a reply without a prompt.
input_texts = []
references = []
for data in dataset['test']:
    pending_prompt = None
    for conv in data['conversations']:
        if conv['from'] == 'human':
            # Remember the prompt until its reply shows up.
            pending_prompt = conv['value']
        elif conv['from'] == 'gpt' and pending_prompt is not None:
            input_texts.append(pending_prompt)
            # Each reference is wrapped in a list (one list of references per
            # prediction), which is the shape the BLEU cell passes through.
            references.append([conv['value']])
            pending_prompt = None


## Generate predictions

In [None]:
def generate_predictions(model, tokenizer, input_texts, max_length=256):
    """Generate one response per prompt and return only the generated text.

    Args:
        model: Generation model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer exposing ``encode`` and ``decode``.
        input_texts: Iterable of prompt strings, processed one at a time.
        max_length: Forwarded unchanged to ``model.generate``. For
            decoder-only models this limit counts the prompt tokens as well
            as the generated ones.

    Returns:
        A list with one decoded string per prompt, in input order. The prompt
        itself is not part of the returned text.
    """
    # Decoder-only (causal) models return the prompt tokens followed by the
    # continuation. Decoding the whole sequence would prepend the prompt to
    # every prediction and distort the metrics computed against the
    # references. Encoder-decoder models return only the new tokens, so
    # nothing must be cut off for them.
    config = getattr(model, "config", None)
    output_includes_prompt = not getattr(config, "is_encoder_decoder", False)

    predictions = []
    for text in input_texts:
        # Place the inputs on the model's own device instead of relying on a
        # module-level variable that may not match where the model lives.
        inputs = tokenizer.encode(text, return_tensors='pt').to(model.device)
        outputs = model.generate(inputs, max_length=max_length)

        generated = outputs[0]
        if output_includes_prompt:
            prompt_length = inputs.shape[-1]
            generated = generated[prompt_length:]

        predictions.append(tokenizer.decode(generated, skip_special_tokens=True))
    return predictions


In [None]:
# Generate a response for every prompt in input_texts (default max_length).
predictions = generate_predictions(
    model=model,
    tokenizer=tokenizer,
    input_texts=input_texts,
)

## Initialize evaluation metrics


In [None]:
# Load the three metrics by name, in the same order as the names on the left.
bertscore, bleu, rouge = (
    evaluate.load(metric_name)
    for metric_name in ("bertscore", "sacrebleu", "rouge")
)

## Compute BERTScore


In [None]:
# Each entry of `references` is a list; pass its first item so that every
# prediction is scored against a single reference string.
bertscore_results = bertscore.compute(
    predictions=predictions,
    references=[candidates[0] for candidates in references],
    lang="en",
)

## Compute the BLEU score

In [None]:
# Here `references` is passed unchanged: one list of references per prediction.
bleu_results = bleu.compute(
    references=references,
    predictions=predictions,
)

## Compute ROUGE-1, ROUGE-2, ROUGE-L

In [None]:
# Each entry of `references` is a list; score against its first item.
rouge_results = rouge.compute(
    predictions=predictions,
    references=[candidates[0] for candidates in references],
)

In [None]:
# Average the BERTScore precision, recall and F1 values over all examples.
avg_precision, avg_recall, avg_f1 = (
    np.mean(bertscore_results[field])
    for field in ('precision', 'recall', 'f1')
)

# Report every metric
print(f"BERTScore - Precision: {avg_precision:.4f}, Recall: {avg_recall:.4f}, F1: {avg_f1:.4f}")
print(f"BLEU Score: {bleu_results['score']:.4f}")
print("ROUGE-1:", rouge_results['rouge1'])
print("ROUGE-2:", rouge_results['rouge2'])
print("ROUGE-L:", rouge_results['rougeL'])


## Save the inference results to an Excel file

In [None]:
# Mount Google Drive at /content/drive; the export cell below writes its
# Excel file to '/content/drive/My Drive/'. The google.colab module is
# imported here, so this cell presumably targets Google Colab.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import os

# One row per evaluated prompt. Each entry of `references` is a list, so its
# first item is stored: the spreadsheet column should contain the reference
# text itself, not a Python list.
df = pd.DataFrame({
    'Human Prompt': input_texts,
    'Generated Response': predictions,
    'Reference Response': [ref[0] for ref in references],
})

# Ensure the directory exists (no error when it is already there)
path = '/content/drive/My Drive/'
os.makedirs(path, exist_ok=True)

# Save the DataFrame to an Excel file in the specified directory
file_path = os.path.join(path, 'excel_file.xlsx')
df.to_excel(file_path, index=False)

print(f'File saved to {file_path}')
