In [None]:
#%pip install langchain llama-cpp-python 
#CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python

### Load Packages

In [None]:
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.chains import LLMChain
from langchain.llms import LlamaCpp
from langchain.prompts import PromptTemplate
import os
import time

### Define System Prompt and load Llama2 Model

In [None]:
# Prompt delimiters: B_INST / E_INST open and close the instruction block that
# get_prompt() builds, and B_SYS / E_SYS open and close the system-prompt section.
B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"

In [None]:
#DEFAULT PROMPT
# General-purpose system prompt text. It contains no <<SYS>> markers itself;
# they are added when SYSTEM_PROMPT is assembled on the last line.
# NOTE: the following cells reassign SYSTEM_PROMPT, so the value built here is
# only in effect until one of those cells runs.
DEFAULT_SYSTEM_PROMPT = """\
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."""
SYSTEM_PROMPT = B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS

In [None]:
#Chatbot prompt for later use case
# The prompt text deliberately carries no <<SYS>> / <</SYS>> markers of its own:
# B_SYS and E_SYS add them exactly once when SYSTEM_PROMPT is assembled below.
# (Embedding them in the text as well would send the markers to the model twice.)
SYSTEM_PROMPT_USECASE = """\
You are a chatbot that helps people learn a new language on a platform. You always give the answer to the question in an academic language style.
If you don't know the answer to a question, you tell them truthfully that you don't know and don't give false information. You are a helpful, respectful and honest assistant.
Your answers should not contain harmful, unethical, racist, sexist, toxic, dangerous or illegal content. Please make sure that your answers are socially unbiased and positive in nature.
If a question does not make sense or is not factually coherent, please explain why, rather than answering something incorrectly."""
SYSTEM_PROMPT = B_SYS + SYSTEM_PROMPT_USECASE + E_SYS

In [None]:
#general prompt to test llm for testing questions
# The prompt text deliberately carries no <<SYS>> / <</SYS>> markers of its own:
# B_SYS and E_SYS add them exactly once when SYSTEM_PROMPT is assembled below.
# (Embedding them in the text as well would send the markers to the model twice.)
SYSTEM_PROMPT_EVAL = """\
You will be presented with a question and four answer options where only one is correct. You will respond with the letter that represents the correct answer option.
Do not add any answer options, only choose the correct one from the four options presented because one of them is correct.
If you don't know the answer, be honest and state that you don't know."""
SYSTEM_PROMPT = B_SYS + SYSTEM_PROMPT_EVAL + E_SYS

In [None]:
def get_prompt(instruction):
    """Build the full prompt string for a single instruction.

    The result is the concatenation, with nothing inserted in between, of the
    opening instruction marker, the currently active ``SYSTEM_PROMPT``, the
    given ``instruction`` and the closing instruction marker.

    Args:
        instruction: The user-facing text (a ``str``) to send to the model.

    Returns:
        The assembled prompt string.
    """
    pieces = (B_INST, SYSTEM_PROMPT, instruction, E_INST)
    return "".join(pieces)

# List the contents of the local weights directory (shown as the cell output).
# NOTE(review): this is a machine-specific absolute path; the cell fails on any
# machine where the directory does not exist.
os.listdir("/Users/josi/Llama2_weights")

In [None]:
# Location of the GGUF weights file. Set the LLAMA2_MODEL_PATH environment
# variable to run the notebook on another machine; when it is unset, the
# original local path is used.
MODEL_PATH = os.environ.get(
    "LLAMA2_MODEL_PATH",
    "/Users/josi/Llama2_weights/llama-2-7b-chat.Q4_K_M.gguf",
)

llm = LlamaCpp(
    model_path=MODEL_PATH,
    temperature=0,
    max_tokens=48,  # answers are expected to be short (an option letter / option text)
    top_p=1,
    #callback_manager=callback_manager,
    verbose=True,  # Verbose is required to pass to the callback manager
)

### Create Dictionaries with test questions (English)

In [None]:
##QUESTIONS WITH ANSWER OPTIONS
# Multiple-choice test set. Each entry holds:
#   "question": the question text,
#   "choices":  the answer options keyed by option letter,
#   "correct":  the expected answer written as "<letter>: <option text>".
# NOTE(review): in every entry the correct option is "a", so a model that always
# picks the first option would be scored as fully correct. Consider shuffling the
# option order before drawing conclusions from the accuracy.
questions_data = [
    {
        "question": "Choose the correct option that represents an informal greeting.",
        "choices": {"a": "Hi", "b": "Good Day", "c": "Greetings", "d": "Dear Mr./Mrs."},
        "correct": "a: Hi"
    },
    {
        "question": "Choose the correct option that represents an informal response to the question: 'How's it going?'",
        "choices": {"a": "Hi, good hbu?", "b": "Greetings, everything is fine, and you?", "c": "Good day, I'm well, thank you. How are you?", "d": "Hello, everything's fine"},
        "correct": "a: Hi, good hbu?"
    },
    {
        "question": "Choose the correct option that represents an informal email closing.",
        "choices": {"a": "Cheers", "b": "Sincerely", "c": "Best regards", "d": "I am awaiting your response"},
        "correct": "a: Cheers"
    },
    {
        "question": "Choose the correct option that represents a formal greeting.",
        "choices": {"a": "Good Day", "b": "Hi", "c": "Hey", "d": "Yo"},
        "correct": "a: Good Day"
    },
    {
        "question": "Choose the correct option that represents a formal response to the question: 'How are you doing?",
        "choices": {"a": "Hello, I am doing well. How about you?", "b": "Hi, good you?", "c": "Not too bad, you?", "d": "good, you?"},
        "correct": "a: Hello, I am doing well. How about you?"
    },
    {
        "question": "Choose the correct option that represents a professional email closing.",
        "choices": {"a": "Sincerely", "b": "Cheers", "c": "See you!", "d": "Until next time."},
        "correct": "a: Sincerely"
    },
    {
        "question": "Choose the correct translation for 'hello' in Spanish.",
        "choices": {"a":"Hola", "b": "Hello", "c":"Hallo", "d": "Bonjour"},
        "correct": "a: Hola"
    },
    {
        "question": "Choose the correct translation for 'Thank you' in German.",
        "choices": {"a": "Danke", "b": "Thanks", "c": "Gracias", "d": "Auf Wiedersehen"},
        "correct": "a: Danke"
    },
    {
        "question": "Choose the correct translation for 'book' in italian.",
        "choices": {"a": "Libro", "b": "Buch", "c": "Plant", "d": "Libre"},
        "correct": "a: Libro"
    },
    {
        "question": "Choose the correct answer that is the capital of France?",
        "choices": {"a": "Paris", "b": "Berlin", "c": "Madrid", "d": "New York"},
        "correct": "a: Paris"
    },
    {
        "question": "Choose the correct answer for the following math problem: 3x + 5 = 20.",
        "choices": {"a": "x = 5", "b": "x = 15", "c": "x = 10", "d": "x = 2"},
        "correct": "a: x = 5"
    },
    {
        "question": "Choose the correct answer for the following statement: If 'cat' is to 'kitten,' what is 'dog' to?",
        "choices": {"a": "'Dog' is to 'puppy'", "b": "'Dog' is to 'kitten'", "c": "'Dog' is to 'cub'", "d": "'Dog' is to 'kitty'"},
        "correct": "a: 'Dog' is to 'puppy'"
    },
    {
        "question": "Choose the correct answer for the following statement: Identify the named Person in the sentence 'Elon Musk is the CEO of a company that produces cars.'",
        "choices": {"a": "Elon Musk", "b": "CEO", "c": "company", "d": "cars"},
        "correct": "a: Elon Musk"
    },
    {
        "question": "Choose the correct answer for the following statement: Extract the date mentioned in the text 'The conference is scheduled for January 25, 2023.'",
        "choices": {"a": "January 25, 2023", "b": "May 25-27, 2023.", "c": "January 27, 2023.", "d": "January 2, 2025"},
        "correct": "a: January 25, 2023"
    },
]


# Echo the whole test set (question, options, expected answer) so it can be
# checked by eye; entries are separated by a dashed rule.
for question in questions_data:
    print(f"Question: {question['question']}")
    print("Answer Options:")
    for option, answer in question["choices"].items():
        print(option, answer, sep=": ")
    print(f"Correct Answer: {question['correct']}")
    print("", "-" * 50, "", sep="\n")

### Evaluate Model based on Accuracy

In [None]:
# Notebook-level log of every evaluated question; evaluate_model() appends to it,
# so running the evaluation again adds further entries.
model_answers = []

def evaluate_model(model, questions_data):
    """Ask ``model`` every multiple-choice question and report exact-match accuracy.

    For each entry a prompt is built from the question and its answer options
    (via ``get_prompt``), the model is called with it, and the stripped reply is
    compared for exact equality with the stripped ``"correct"`` string. Per-question
    results and an overall summary are printed, and one record per question is
    appended to the notebook-level ``model_answers`` list.

    Args:
        model: Callable that takes the prompt string and returns the answer string.
        questions_data: Sequence of dicts with the keys ``"question"`` (str),
            ``"choices"`` (dict mapping option letter to option text) and
            ``"correct"`` (expected answer string).

    Returns:
        None. Results are printed and recorded in ``model_answers``.
    """
    correct_answers = 0
    total_questions = len(questions_data)

    for question_data in questions_data:
        question = question_data["question"]
        choices = question_data["choices"]
        correct_option = question_data["correct"]

        # Get model's answer for current question
        options_text = "".join(
            f"{option}: {answer}\n" for option, answer in choices.items()
        )
        prompt = get_prompt(f"{question}\nAnswer Options:\n{options_text}")
        model_answer = model(prompt)

        # Exact comparison: only a reply that reproduces the full
        # "<letter>: <text>" string (ignoring surrounding whitespace) is correct.
        if model_answer.strip() == correct_option.strip():
            correct_answers += 1

        # Print results for each question
        print(f"Question: {question}")
        print("Answer Options:")
        for option, answer in choices.items():
            print(f"{option}: {answer}")
        print(f"Correct Answer: {correct_option}")
        print(f"Model Answer: {model_answer}")
        print("------------")

        # Record this question's answer. The model's reply is stored, not the
        # log list itself (which would make every entry reference the whole log).
        model_answers.append({
            "question": question,
            "choices": choices,
            "correct_option": correct_option,
            "selected_option": model_answer,
        })

    # An empty question set has no meaningful accuracy; report 0 instead of
    # failing with a division by zero.
    accuracy = correct_answers / total_questions * 100 if total_questions else 0.0

    # print overall evaluation results
    print(f"Total Questions: {total_questions}")
    print(f"Correct Answers: {correct_answers}")
    print(f"Accuracy: {accuracy:.2f}%")

In [None]:
#run method to get accuracy score
# Prints the per-question results followed by the overall accuracy for `llm`.
evaluate_model(llm, questions_data)

### Evaluate Model based on Rouge (Recall-Oriented Understudy for Gisting Evaluation)

A set of metrics used for evaluating the quality of summaries. It compares the generated summary with one or more reference summaries and calculates precision, recall, and F1-score 

- ROUGE-N (quantifies the overlap of N-grams, contiguous sequences of N items (typically words or characters), between the system-generated summary and the reference summary) 
- ROUGE-1 (unigram overlap): This metric measures the overlap of unigrams (single words) between the generated summary and the reference summary. It focuses on the recall of unigrams.
- ROUGE-2 (bigram overlap): Similar to ROUGE-1, but it measures the overlap of bigrams (pairs of consecutive words) instead of unigrams. It evaluates the recall of bigrams.
- ROUGE-L (longest common subsequence): Instead of measuring word overlap, ROUGE-L focuses on the longest common subsequence (LCS) between the generated and reference summaries. It evaluates the recall of the longest common subsequence

- A high precision value suggests that the words or phrases produced by the machine translation or summarization model are mostly accurate.
- A high recall value, ideally close to 1, implies that the content of the machine-generated output aligns closely with the human-made reference. It signifies the model's proficiency in capturing relevant information.



In [None]:
# Imported here so this demo also runs on a fresh kernel: the notebook's other
# `from rouge import Rouge` only appears in a later cell.
from rouge import Rouge

# Example pair: a generated summary (hypothesis) and the text it is scored
# against (reference).
hypothesis = "the #### transcript is a written version of each day 's cnn student news program use this transcript to he    lp students with reading comprehension and vocabulary use the weekly newsquiz to test your knowledge of storie s you     saw on cnn student news"

reference = "this page includes the show transcript use the transcript to help students with reading comprehension and     vocabulary at the bottom of the page , comment for a chance to be mentioned on cnn student news . you must be a teac    her or a student age # # or older to request a mention on the cnn student news roll call . the weekly newsquiz tests     students ' knowledge of even ts in the news"

rouge = Rouge()
scores = rouge.get_scores(hypothesis, reference)
print(scores)

In [None]:
#also check if rouge-score library better

#%pip install rouge
from rouge import Rouge

##Rouge() has rouge-1, rouge-2 and rouge-l as option

# NOTE: this definition replaces the accuracy-only evaluate_model defined in an
# earlier cell; once this cell has run, the name refers to this version.
def evaluate_model(model, questions):
    """Evaluate ``model`` on multiple-choice questions with accuracy and ROUGE-1.

    For each entry a prompt is built from the question and its answer options
    (via ``get_prompt``) and sent to the model. The stripped reply is compared
    for exact equality with the stripped ``"correct"`` string, and its ROUGE-1
    recall / precision / F1 against that string are accumulated. Per-question
    results and an overall summary (including per-metric averages) are printed.

    Args:
        model: Callable that takes the prompt string and returns the answer string.
        questions: Sequence of dicts with the keys ``"question"`` (str),
            ``"choices"`` (dict mapping option letter to option text) and
            ``"correct"`` (expected answer string).

    Returns:
        ``(model_answers, rouge_scores)`` where ``model_answers`` is a list with
        one record per question and ``rouge_scores`` is
        ``{"rouge-1": {"recall": ..., "precision": ..., "f1-score": ...}}`` holding
        the sums over all questions (the averages are printed). If an unexpected
        error occurs it is printed and ``(None, None)`` is returned.
    """
    # Names used in this notebook's result dict -> keys used by the rouge
    # package in its score dicts ("r" = recall, "p" = precision, "f" = F1).
    metric_keys = {"recall": "r", "precision": "p", "f1-score": "f"}

    try:
        correct_answers = 0
        total_questions = len(questions)
        model_answers = []
        rouge_scores = {"rouge-1": {metric: 0 for metric in metric_keys}}
        rouge = Rouge()  # one scorer is enough for the whole run

        for question_data in questions:
            question = question_data["question"]
            choices = question_data["choices"]
            correct_option = question_data["correct"]

            # Get model's answer for current question
            prompt = f"{question}\nAnswer Options:\n"
            for option, answer in choices.items():
                prompt += f"{option}: {answer}\n"
            prompt = get_prompt(prompt)
            model_answer = model(prompt)

            # Check if selected option in model's answer matches the correct option
            model_answer = model_answer.strip()
            correct_option = correct_option.strip()

            if model_answer == correct_option:
                correct_answers += 1

            # Get Rouge Score: the model's answer is the hypothesis, the expected
            # answer is the reference (argument order: hypothesis first).
            try:
                scores = rouge.get_scores(model_answer, correct_option)
            except ValueError as rouge_error:
                # Raised for text that cannot be scored (e.g. an empty answer).
                # Such an answer contributes 0 to every metric instead of
                # aborting the whole evaluation.
                print(f"Rouge score skipped: {rouge_error}")
                scores = []
            print(scores)
            if scores:
                rouge_1 = scores[0].get("rouge-1", {})
                for metric, key in metric_keys.items():
                    rouge_scores["rouge-1"][metric] += rouge_1.get(key, 0)

            # Print results for each question
            print(f"Question: {question}")
            print("Answer Options:")
            for option, answer in choices.items():
                print(f"{option}: {answer}")
            print(f"Correct Answer: {correct_option}")
            print(f"Model Answer: {model_answer}")
            print("------------")

            # Store the model's answer and the corresponding question
            model_answers.append({
                "question": question,
                "choices": choices,
                "correct_option": correct_option,
                "selected_option": model_answer,
            })

        # Averages over all questions; an empty question set reports zeros
        # instead of dividing by zero.
        if total_questions:
            accuracy = correct_answers / total_questions * 100
            avg_rouge = {
                metric: total / total_questions
                for metric, total in rouge_scores["rouge-1"].items()
            }
        else:
            accuracy = 0.0
            avg_rouge = dict(rouge_scores["rouge-1"])

        # Print overall evaluation results
        print(f"Total Questions: {total_questions}")
        print(f"Correct Answers: {correct_answers}")
        print(f"Accuracy: {accuracy:.2f}%")
        print("Average Rouge Score: ", avg_rouge)

        return model_answers, rouge_scores

    except Exception as e:
        # Kept for callers that unpack the (None, None) failure result.
        print(f"An error occurred: {e}")
        return None, None

In [None]:
# Run the ROUGE-aware evaluation. This rebinds the notebook-level model_answers
# to the list returned by this call (both values are None if the run failed).
model_answers, rouge_scores = evaluate_model(llm, questions_data)

In [None]:
##test rouge library - https://pypi.org/project/rouge/
## -> not clear which rouge metric is meant: there are rouge-1/2/l - the package only references the paper
##Note: "f" stands for f1_score, "p" stands for precision, "r" stands for recall.


##1 sentence
# The hypothesis is the reference without its leading "Hello,", so the printed
# scores show how a partially matching answer is rated.
hypothesis = "I am doing well. How about you?"
reference = "Hello, I am doing well. How about you?"
#reference = "Good day! I'm doing well, thank you for asking. How about you?"

rouge = Rouge()
scores = rouge.get_scores(hypothesis, reference)
print(scores)

In [None]:
##multiple sentences
import json

# Load some sentences: a JSON list of {"hyp": ..., "ref": ...} objects.
# The demo is skipped (instead of failing) when the sample file is not present.
data_path = './tests/data.json'
if os.path.exists(data_path):
    with open(data_path) as f:
        data = json.load(f)

    if data:
        hyps, refs = map(list, zip(*[[d['hyp'], d['ref']] for d in data]))
        rouge = Rouge()
        scores = rouge.get_scores(hyps, refs)
        # or
        scores = rouge.get_scores(hyps, refs, avg=True)
    else:
        print(f"Skipping multi-sentence demo: {data_path} contains no sentence pairs")
else:
    print(f"Skipping multi-sentence demo: {data_path} not found")


##two files
from rouge import FilesRouge

# TODO: point these at the hypothesis / reference text files to compare.
# They are placeholders; the original cell used the names without defining them.
hyp_path = './tests/hyp.txt'
ref_path = './tests/ref.txt'

if os.path.exists(hyp_path) and os.path.exists(ref_path):
    files_rouge = FilesRouge()
    scores = files_rouge.get_scores(hyp_path, ref_path)
    # or
    scores = files_rouge.get_scores(hyp_path, ref_path, avg=True)
else:
    print(f"Skipping file-based demo: {hyp_path} and/or {ref_path} not found")

### Correctness Evaluation

In [None]:
%pip install bitsandbytes
%pip install datasets

### Performance Evaluation - Perplexity: measure that quantifies how well the model predicts a sample of text. Lower perplexity values indicate better performance 

In [None]:
import torch
from transformers import AutoModelForCausalLM, LlamaTokenizer
import bitsandbytes as bnb
from datasets import load_dataset
from tqdm import tqdm

model_name = '/kaggle/input/llama-2/pytorch/7b-chat-hf/1'

tokenizer = LlamaTokenizer.from_pretrained(model_name)

# Per-GPU memory budget: the currently free memory of the active GPU (whole GB)
# minus 2 GB of headroom, applied to every visible GPU.
free_in_GB = int(torch.cuda.mem_get_info()[0]/1024**3)
max_memory_per_gpu = f'{free_in_GB-2}GB'

n_gpus = torch.cuda.device_count()
max_memory = {i: max_memory_per_gpu for i in range(n_gpus)}

model = AutoModelForCausalLM.from_pretrained(
  model_name,
  device_map='auto',
  load_in_4bit=True,
  max_memory=max_memory,
  do_sample=True,  # generation setting; the loss computed below does not sample
  torch_dtype="auto"
)

# Tokenize the whole test split as one long sequence.
test = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
encodings = tokenizer("\n\n".join(test["text"]), return_tensors="pt")

# Sliding-window perplexity: each window spans up to `max_length` tokens and
# advances by `stride`; only the tokens not scored by the previous window count
# towards the loss, the rest of the window serves as context.
# The window size must be the model's context length. `config.max_length` is a
# text-generation setting (default 20), which would shrink every window to 20
# tokens, leave most of the text unscored and mask no context at all.
max_length = model.config.max_position_embeddings
stride = 512
seq_len = encodings.input_ids.size(1)
device = "cuda"

nlls = []
prev_end_loc = 0
for begin_loc in tqdm(range(0, seq_len, stride)):
    end_loc = min(begin_loc + max_length, seq_len)
    trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
    input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
    target_ids = input_ids.clone()
    # -100 marks the context tokens (already scored by an earlier window) as
    # ignored by the loss.
    target_ids[:, :-trg_len] = -100

    with torch.no_grad():
        outputs = model(input_ids, labels=target_ids)

        # loss is calculated using CrossEntropyLoss which averages over valid labels
        # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels
        # to the left by 1.
        neg_log_likelihood = outputs.loss

    nlls.append(neg_log_likelihood)

    prev_end_loc = end_loc
    if end_loc == seq_len:
        break

# Perplexity = exp(mean negative log-likelihood). The mean is taken over the
# per-window averages, i.e. every window has the same weight.
ppl = torch.exp(torch.stack(nlls).mean())
print(f"Perplexity: {ppl.item():.2f}")