In [1]:
# Mount Google Drive into the Colab filesystem; the model path used later in
# this notebook lives under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [27]:
# Path (on the mounted Drive) of the saved model directory. It is passed as
# `model_name` when the model is loaded; note that the loading cell then
# rebinds `model` to the loaded model object.
model = "/content/drive/MyDrive/mistral_model"

In [None]:
!pip install rouge_score
!pip install meteor_score

In [4]:
%%capture
import torch
major_version, minor_version = torch.cuda.get_device_capability()
# Must install separately since Colab has torch 2.2.1, which breaks packages
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
if major_version >= 8:
    # Use this for new GPUs like Ampere, Hopper GPUs (RTX 30xx, RTX 40xx, A100, H100, L40)
    !pip install --no-deps packaging ninja einops flash-attn xformers trl peft accelerate bitsandbytes
else:
    # Use this for older GPUs (V100, Tesla T4, RTX 20xx)
    !pip install --no-deps xformers trl peft accelerate bitsandbytes
pass

In [5]:
import pandas as pd
from datasets import load_metric
from transformers import AutoTokenizer
from unsloth import FastLanguageModel


In [22]:
# Options handed to FastLanguageModel.from_pretrained when the model is loaded.
load_in_4bit = True
# None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
dtype = None
max_seq_length = 2048

In [28]:
# Load the saved model and tokenizer and switch the model to inference mode.
#
# `model` starts out as the checkpoint path (a str) and is rebound below to the
# loaded model object. The path is remembered under its own name so that this
# cell can be re-run without passing an already-loaded model as `model_name`.
from unsloth import FastLanguageModel

if isinstance(model, str):
    model_path = model

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_path, # YOUR MODEL YOU USED FOR TRAINING
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

config.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

==((====))==  Unsloth: Fast Mistral patching release 2024.4
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. Xformers = 0.0.25.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Unused kwargs: ['quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


model.safetensors:   0%|          | 0.00/4.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/971 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/438 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

In [29]:
import pandas as pd
from datasets import load_metric
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import numpy as np

# Evaluation: generate a recipe for the first few rows of the test set, score
# each generation against the reference text (ingredients + steps) with BLEU,
# METEOR, BERTScore and ROUGE, and print the mean of every metric.

NUM_EVAL_ROWS = 5       # number of test rows that are generated and scored
MAX_NEW_TOKENS = 512    # upper bound on generated tokens per recipe

# Load the dataset
test_df = pd.read_csv('NLP_Recipe_test.csv')

# Load the metrics
bleu_metric = load_metric('bleu')
meteor_metric = load_metric('meteor')
bertscore_metric = load_metric('bertscore', lang="en")
rouge_metric = load_metric('rouge')

# Lists to store the per-example scores
bleu_scores = []
meteor_scores = []
bertscore_f1_scores = []
rouge_1_scores = []
rouge_2_scores = []
rouge_l_scores = []

# Iterate over the first NUM_EVAL_ROWS rows and compute scores
for _, row in test_df.head(NUM_EVAL_ROWS).iterrows():
    ner_input = row['ner']
    true_output = row['ingredients'] + " " + row['steps']

    # NOTE(review): the lines of this f-string carry the loop's 4-space
    # indentation, so the prompt sent to the model contains those leading
    # spaces. Kept unchanged here; confirm it matches the training prompt.
    recipe_prompt = f"""Given the following key ingredients, generate the full ingredient list with quantities and cooking steps:

    ### Key Ingredients:
    {ner_input}

    ### Full Ingredients and Steps:
    """

    inputs = tokenizer(
        [recipe_prompt],
        return_tensors="pt"
    ).to('cuda')

    outputs = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS, use_cache=True)

    # The returned sequence starts with the prompt tokens. Decode only what
    # follows them instead of decoding everything and splitting on the section
    # header: the split raised IndexError whenever the decoded text did not
    # reproduce the header exactly, and silently discarded everything after a
    # second header emitted by the model.
    prompt_length = inputs['input_ids'].shape[1]
    generated_recipe = tokenizer.decode(
        outputs[0][prompt_length:], skip_special_tokens=True
    ).strip()

    # Compute metrics. BLEU works on whitespace tokens, the others on raw text.
    reference = [true_output.split()]
    candidate = generated_recipe.split()

    if candidate:
        bleu_score = bleu_metric.compute(predictions=[candidate], references=[reference])['bleu']
    else:
        # An empty candidate can make the BLEU brevity penalty divide by zero;
        # an empty generation shares no n-grams with the reference, so score 0.
        bleu_score = 0.0
    meteor_score = meteor_metric.compute(predictions=[generated_recipe], references=[true_output])['meteor']
    bertscore_results = bertscore_metric.compute(predictions=[generated_recipe], references=[true_output], lang='en')
    bertscore_f1 = np.mean(bertscore_results['f1'])  # 'f1' is a list with one entry per prediction
    rouge_score = rouge_metric.compute(predictions=[generated_recipe], references=[true_output])

    bleu_scores.append(bleu_score)
    meteor_scores.append(meteor_score)
    bertscore_f1_scores.append(bertscore_f1)
    rouge_1_scores.append(rouge_score['rouge1'].mid.fmeasure)
    rouge_2_scores.append(rouge_score['rouge2'].mid.fmeasure)
    rouge_l_scores.append(rouge_score['rougeL'].mid.fmeasure)

# Compute averages (mean of the per-example scores)
avg_bleu = np.mean(bleu_scores)
avg_meteor = np.mean(meteor_scores)
avg_bertscore_f1 = np.mean(bertscore_f1_scores)
avg_rouge_1 = np.mean(rouge_1_scores)
avg_rouge_2 = np.mean(rouge_2_scores)
avg_rouge_l = np.mean(rouge_l_scores)

# Print average scores
print("Average BLEU Score:", avg_bleu)
print("Average METEOR Score:", avg_meteor)
print("Average BERTScore F1 Score:", avg_bertscore_f1)
print("Average ROUGE-1 Score:", avg_rouge_1)
print("Average ROUGE-2 Score:", avg_rouge_2)
print("Average ROUGE-L Score:", avg_rouge_l)



You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argu

Average BLEU Score: 0.09988397488871129
Average METEOR Score: 0.3257070815241569
Average BERTScore F1 Score: 0.8683696866035462
Average ROUGE-1 Score: 0.4177358046283942
Average ROUGE-2 Score: 0.24378998780986993
Average ROUGE-L Score: 0.35558934878319626
