In [None]:
%pip install langchain llama-cpp-python 
#CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python

### Load Packages

In [None]:
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.chains import LLMChain
from langchain.llms import LlamaCpp
from langchain.prompts import PromptTemplate
import os

### Define System Prompt and load Llama2 Model

In [None]:
# Delimiters of the Llama 2 chat prompt: [INST] ... [/INST] wraps one instruction,
# <<SYS>> ... <</SYS>> wraps the system prompt placed inside it.
B_INST = "[INST]"
E_INST = "[/INST]"
B_SYS = "<<SYS>>\n"
E_SYS = "\n<</SYS>>\n\n"
# Generic assistant system prompt, kept here (disabled) for reference:
# DEFAULT_SYSTEM_PROMPT = "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information"

In [None]:
# Chatbot prompt for the later language-learning use case.
# The literal holds only the instruction text: the <<SYS>> / <</SYS>> delimiters are
# added exactly once, via B_SYS / E_SYS, when SYSTEM_PROMPT is assembled. Keeping
# the tags inside the literal as well would put every tag into the prompt twice.
DEFAULT_SYSTEM_PROMPT = """
You are a chatbot that helps people learn a new language. You always give the answer to the question in a formal manner.
If you don't know the answer to a question, you tell them truthfully that you don't know and don't give false information. You are a helpful, respectful and honest assistant.
Your answers should not contain harmful, unethical, racist, sexist, toxic, dangerous or illegal content. Please make sure that your answers are socially unbiased and positive.
If a question does not make sense or is not factually coherent, please explain why, rather than answering something incorrectly.
"""
# strip() removes the newlines that surround the triple-quoted text so that the
# instruction text sits directly between the opening and closing tag.
SYSTEM_PROMPT = B_SYS + DEFAULT_SYSTEM_PROMPT.strip() + E_SYS

In [None]:
# General prompt used when running the test questions against the LLM.
# The literal holds only the instruction text: the <<SYS>> / <</SYS>> delimiters are
# added exactly once, via B_SYS / E_SYS, when SYSTEM_PROMPT is assembled. Keeping
# the tags inside the literal as well would put every tag into the prompt twice.
# NOTE(review): the prompt text contains typos ("alwys", "a dot and the end");
# they are left untouched here so the wording sent to the model stays the same.
SYSTEM_PROMPT_2 = """
You are alwys giving the answer to the question in a short and very simple manner and you answer in the language style that is asked from you. You leave out unnecessary words and if you have several options you would like to present only name one. If the answer is only one word you don't put a dot and the end. If you are asked to translate something you translate it.
If you don't know the answer to a question, you tell them truthfully that you don't know and don't give false information. You are a helpful, respectful and honest assistant.
Your answers should not contain harmful, unethical, racist, sexist, toxic, dangerous or illegal content. Please make sure that your answers are socially unbiased and positive.
If a question does not make sense or is not factually coherent, please explain why, rather than answering something incorrectly.
"""
# Rebinds SYSTEM_PROMPT: whichever prompt cell ran last decides what get_prompt() uses.
# strip() removes the newlines that surround the triple-quoted text.
SYSTEM_PROMPT = B_SYS + SYSTEM_PROMPT_2.strip() + E_SYS

In [None]:
def get_prompt(instruction):
    """Wrap a user instruction in the Llama 2 chat template.

    Args:
        instruction: The user's question or instruction as plain text.

    Returns:
        The complete prompt string ``[INST] <system block><instruction> [/INST]``,
        where the system block is the module-level ``SYSTEM_PROMPT``.
    """
    # The reference Llama 2 chat template puts a single space after [INST] and
    # before [/INST], and strips stray whitespace from the user text.
    return f"{B_INST} {SYSTEM_PROMPT}{instruction.strip()} {E_INST}"

# Sanity check: list the files present in the local Llama 2 weights directory.
# NOTE(review): absolute, machine-specific path — consider a configurable base directory.
os.listdir("/Users/josi/Llama2_weights")

In [None]:
# Build the LangChain LlamaCpp wrapper around a local GGUF checkpoint.
# NOTE(review): the model path is absolute and machine-specific.
llm = LlamaCpp(
    # Alternative checkpoint, currently disabled:
    #model_path="/Users/josi/Llama2_weights/llama-2-7b.Q4_K_M.gguf",
    model_path = "/Users/josi/Llama2_weights/llama-2-7b-chat.Q5_K_M.gguf",
    # NOTE(review): temperature > 0 presumably makes answers vary between runs,
    # which would affect the exact-match scoring below — confirm.
    temperature=0.75,
    max_tokens=2048,
    top_p=1,
    # Disabled: no `callback_manager` object is created in the visible cells.
    # callback_manager=callback_manager,
    verbose=True,  # Verbose is required to pass to the callback manager
)

### Create Dictionaries with test questions (English)

In [None]:
# Test questions per category. Every dictionary maps the question text to its
# expected answer, which is the shape evaluate_model() iterates over with
# `for question, expected_answer in questions.items()`.
#
# Do NOT write entries as `"question": ..., "answer": ...`: repeated keys inside
# one dict literal overwrite each other, so a category would collapse to a single
# pair and the literal strings "question" / "answer" would be sent as prompts.
informal_questions = {
    "Write an informal greeting.": "Hey",
    "Write an informal response to the question: 'How's it going?'": "Hi, going good, hbu?",
    "Write an informal message inviting someone to dinner tonight": "Hey, wanna join us for dinner tonight?",
    "Compose an informal email closing.": "Cheers, [Your Name]",
}

formal_questions = {
    "Write a formal greeting.": "Hello",
    "Write a formal response to the question: 'How is it going?'": "Hello, I am doing well. How about you?",
    "Write a formal message inviting someone to dinner tonight": "Hello, would you like to join us for dinner tonight?",
    "Compose a professional email closing.": "Sincerely, [Your Name]",
}

academic_questions = {
    "Describe the french revolution in two sentences using academic language.": "The historical event of the French Revolution was marked by widespread social upheaval...",
    "Define gravity in two sentences using academic language.": "Gravity is a force that attracts two objects with mass",
}

translation_questions = {
    "Translate 'hello' to Spanish.": "Hola",
    "Translate 'good morning' to French.": "Bonjour",
    "Translate 'Thank you' to German.": "Danke",
    "How do you say 'hello' in Japanese?": "Konnichiwa (こんにちは)",
    "Provide the Italian translation for 'book'.": "Libro",
}

general_knowledge_questions = {
    "What is the capital of France?": "The capital of France is Paris",
    "What is the square root of 25?": "The square root of 25 is 5",
    "What is the circumference of the earth?": "The circumference of the earth is approximately 40,075 kilometers",
    "Explain the concept of photosynthesis.": "Photosynthesis is the process by which green plants and some other organisms use sunlight to synthesize foods with the help of chlorophyll.",
}

textual_entailment = {
    "Given the statement 'The sun rises in the east,' is the statement 'The earth is flat' likely true or false?": "False",
    # Modus tollens: from "A implies B" and "not B" it follows that A is false.
    "If 'A implies B' and 'B is false,' what can you conclude about A?": "A is false",
}

ambiguity_handling = {
    "Provide two different interpretations of the phrase 'bank account.'": "1. An account with a financial institution. 2. The land alongside a river.",
    "What could the word 'bat' mean in the context of baseball and in the context of an animal?": "In baseball, a 'bat' is a piece of equipment used to hit the ball. In the context of an animal, a 'bat' is a flying mammal.",
}

reasoning_problem_solving = {
    "If a car travels at 60 miles per hour, how long will it take to cover 120 miles?": "2 hours",
    "Solve the following math problem: 3x + 5 = 20.": "x = 5",
}

analogical_reasoning = {
    "If 'cat' is to 'kitten,' what is 'dog' to?": "'Dog' is to 'puppy'",
    "Identify the relationship between 'pen' and 'ink' and apply the same relationship to 'keyboard.'": "'Keyboard' is to 'electricity'",
}

commonsense_reasoning = {
    "What might be a common reaction to receiving a gift?": "A common reaction to receiving a gift is expressing gratitude and appreciation.",
    "Predict a possible consequence of leaving food out in the sun for too long.": "Leaving food out in the sun for too long may lead to spoilage and bacterial growth.",
}

# Named Entity Recognition (NER)
ner = {
    "Identify the named entities in the sentence 'Elon Musk is the CEO of SpaceX.'": "Named entities: Elon Musk, SpaceX.",
    "Extract the dates mentioned in the text 'The conference is scheduled for January 25-27, 2023.'": "Dates: January 25-27, 2023.",
}

# Combine all categories into one question -> expected-answer mapping.
# All question texts are distinct, so no entry is lost in the merge.
test_questions = {
    **informal_questions,
    **formal_questions,
    **academic_questions,
    **translation_questions,
    **general_knowledge_questions,
    **textual_entailment,
    **ambiguity_handling,
    **reasoning_problem_solving,
    **analogical_reasoning,
    **commonsense_reasoning,
    **ner,
}


### Evaluate Model based on Accuracy

In [None]:
# Shared log of evaluated questions; evaluate_model() appends one entry per
# question, so entries accumulate across repeated calls.
model_answers = []


def _normalize_answer(text):
    """Return ``text`` without surrounding whitespace or trailing dots (case kept)."""
    return text.strip().rstrip('.').strip()


def evaluate_model(model, questions):
    """Score a model by exact-match accuracy and print a per-question report.

    An answer counts as correct when it equals the expected answer after both
    have been trimmed of surrounding whitespace and trailing dots and compared
    case-insensitively.

    Args:
        model: Callable that takes a prompt string and returns the answer string.
        questions: Mapping of question text -> expected answer text.

    Returns:
        None. Results are printed, and one dict per question (``question``,
        ``expected_answer``, ``model_answer``) is appended to the module-level
        ``model_answers`` list.
    """
    correct_answers = 0
    total_questions = len(questions)

    for question, expected_answer in questions.items():
        # Get the model's answer for the current question
        prompt = get_prompt(question)
        # Whitespace is trimmed before the dots: rstrip('.') only removes dots that
        # are the very last characters, so a trailing newline would shield them.
        model_answer = _normalize_answer(model(prompt))

        # The expected answer goes through the same normalisation; otherwise a
        # reference ending in "." could never match the dot-stripped model answer.
        if model_answer.lower() == _normalize_answer(expected_answer).lower():
            correct_answers += 1

        # Print the results for each question
        print(f"Question: {question}")
        print(f"Expected Answer: {expected_answer}")
        print(f"Model Answer: {model_answer}")
        print("------------")

        # Store the model's answer and the corresponding question
        model_answers.append({
            "question": question,
            "expected_answer": expected_answer,
            "model_answer": model_answer,
        })

    # An empty question set reports 0% instead of raising ZeroDivisionError.
    accuracy = correct_answers / total_questions * 100 if total_questions else 0.0

    # Print overall evaluation results
    print(f"Total Questions: {total_questions}")
    print(f"Correct Answers: {correct_answers}")
    print(f"Accuracy: {accuracy:.2f}%")


In [None]:
# Run the exact-match evaluation over the combined question set; results are printed per question.
evaluate_model(llm, test_questions)

### Evaluate Model based on ROUGE (Recall-Oriented Understudy for Gisting Evaluation)

In [None]:
### ROUGE (Recall-Oriented Understudy for Gisting Evaluation): a set of metrics used for evaluating the quality of summaries. It compares the generated summary with one or more reference summaries and calculates precision, recall, and F1-score.

#%pip install rouge
from rouge import Rouge

def evaluate_model(model, questions):
    try:
        correct_answers = 0
        total_questions = len(questions)
        model_answers = []
        rouge_scores = {"rouge-1": {"precision": 0, "recall": 0, "f1-score": 0}}

        for question, expected_answer in questions.items():
            # Get the model's answer for the current question
            prompt = get_prompt(question)
            model_answer = model(prompt)

            # format answer to match properly 
            model_answer = model_answer.rstrip('.')
            #and calculate accuracy (which is currently defined as approximate match where only any word has to match the correct answer.)
            if any(word.lower() in model_answer.lower() for word in expected_answer.split()):
                correct_answers += 1

            # Concatenate model's answer and expected answer into strings for Rouge evaluation
            candidate_summary = " ".join(model_answer.split())
            reference_summary = " ".join(expected_answer.split())

            # Update Rouge scores
            rouge = Rouge()
            scores = rouge.get_scores(candidate_summary, reference_summary)

            # Check if Rouge calculation was successful - WIP
            if scores:
                scores = scores[0]
                for metric in ["precision", "recall", "f1-score"]:
                    rouge_scores[f"rouge-1"][metric] += scores.get(f"rouge-1", {}).get(metric, 0)

            # store model answers in array
            model_answers.append({
                "question": question,
                "expected_answer": expected_answer,
                "model_answer": model_answer,
            })

            #print each model answer with score
            print(f"Question: {question}")
            print(f"Expected Answer: {expected_answer}")
            print(f"Model Answer: {model_answer}")
            print("------------")


        # Calculate average Rouge scores
        for metric in ["precision", "recall", "f1-score"]:
            rouge_scores[f"rouge-1"][metric] /= total_questions

        # Print overall evaluation results
        print(f"Total Questions: {total_questions}")
        print(f"Correct Answers: {correct_answers}")
        print(f"Accuracy: {correct_answers / total_questions * 100:.2f}%")
        print("Rouge Scores:")
        print(rouge_scores)

        return model_answers, rouge_scores

    except Exception as e:
        print(f"An error occurred: {e}")
        return None, None

In [None]:

# Run the ROUGE evaluation; keeps the per-question answers and the averaged scores it returns.
model_answers, rouge_scores = evaluate_model(llm, test_questions)


# Output recorded from a previous run, kept for reference:
#Total Questions: 19
#Correct Answers: 16
#Accuracy: 84.21%
#Rouge Scores:
#{'rouge-1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0}}


In [None]:
###Correctness Evaluation

In [None]:
%pip install bitsandbytes
%pip install datasets

In [None]:
###Performance Evaluation - Perplexity: measure that quantifies how well the model predicts a sample of text. Lower perplexity values indicate better performance 
import torch
from transformers import AutoModelForCausalLM, LlamaTokenizer
import bitsandbytes as bnb

# Local checkpoint directory of the Llama 2 7B chat model (Kaggle input path).
model_name = '/kaggle/input/llama-2/pytorch/7b-chat-hf/1'

tokenizer = LlamaTokenizer.from_pretrained(model_name)

# torch.cuda.mem_get_info() returns (free_bytes, total_bytes) for the current
# CUDA device; query it once and convert the free part to whole GiB.
free_in_GB = int(torch.cuda.mem_get_info()[0] / 1024**3)

# Give every visible GPU the same budget: the measured free memory minus 2 GB of headroom.
n_gpus = torch.cuda.device_count()
max_memory = {i: f'{free_in_GB - 2}GB' for i in range(n_gpus)}

# Load the model 4-bit quantized and let `device_map='auto'` place it within the budget.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map='auto',
    load_in_4bit=True,
    max_memory=max_memory,
    do_sample=True,
    torch_dtype="auto"
)

from datasets import load_dataset

# Load the test split of the raw WikiText-2 dataset and tokenize it as one long
# sequence, joining the individual text rows with blank lines.
test = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
encodings = tokenizer("\n\n".join(test["text"]), return_tensors="pt")


import torch
from tqdm import tqdm

# Sliding-window perplexity over the tokenized test set.
#
# The window size must be the model's context length. `config.max_length` is the
# generation-length setting, not the context window; a window shorter than the
# stride would skip the tokens between consecutive windows and leave the -100
# masking below without effect. Lower this value if GPU memory is tight.
max_length = model.config.max_position_embeddings
stride = 512
seq_len = encodings.input_ids.size(1)
device = "cuda"

nlls = []
prev_end_loc = 0
for begin_loc in tqdm(range(0, seq_len, stride)):
    end_loc = min(begin_loc + max_length, seq_len)
    trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
    input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
    target_ids = input_ids.clone()
    # Tokens already scored in an earlier window serve as context only:
    # a label of -100 excludes them from the loss.
    target_ids[:, :-trg_len] = -100

    with torch.no_grad():
        outputs = model(input_ids, labels=target_ids)

        # loss is calculated using CrossEntropyLoss which averages over valid labels
        # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels
        # to the left by 1.
        neg_log_likelihood = outputs.loss

    nlls.append(neg_log_likelihood)

    prev_end_loc = end_loc
    if end_loc == seq_len:
        break

# Perplexity = exp(mean negative log-likelihood over all windows).
ppl = torch.exp(torch.stack(nlls).mean())
# Show the result; the value was previously computed but never displayed.
print(f"Perplexity: {ppl.item():.2f}")