In [75]:
#%pip install langchain llama-cpp-python 
#CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python

### Load Packages

In [76]:
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.chains import LLMChain
from langchain.llms import LlamaCpp
from langchain.prompts import PromptTemplate
import os

### Define System Prompt and load Llama2 Model

In [77]:
B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"

In [78]:
#DEFAULT PROMPT
DEFAULT_SYSTEM_PROMPT = """\
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. 
If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."""
SYSTEM_PROMPT = B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS

In [79]:
#Chatbot prompt for later use case
SYSTEM_PROMPT_USECASE = """
<<SYS>>
You are a chatbot that helps people learn a new language on a platform. You always give the answer to the question in an academic language style.
If you don't know the answer to a question, you tell them truthfully that you don't know and don't give false information. You are a helpful, respectful and honest assistant.
Your answers should not contain harmful, unethical, racist, sexist, toxic, dangerous or illegal content. Please make sure that your answers are socially unbiased and positive if nature#.
If a question does not make sense or is not factually coherent, please explain why, rather than answering something incorrectly.<</SYS>>
"""
SYSTEM_PROMPT = B_SYS + SYSTEM_PROMPT_USECASE +E_SYS

In [91]:
#general prompt to test llm for testing questions
SYSTEM_PROMPT_EVAL_ACCURACY = """
<<SYS>>
You will be presented with a question and several answer options where only one is correct. You will respond with the letter that represents the correct answer option.
Do not add any answer options, only choose the correct one from the for options presented because one of them is correct.
If you don't know the answer, be honest and state that you don't know.
<</SYS>>
"""
SYSTEM_PROMPT = B_SYS + SYSTEM_PROMPT_EVAL_ACCURACY +E_SYS

In [82]:
def get_prompt(instruction):
    return B_INST + SYSTEM_PROMPT + instruction + E_INST

os.listdir("/Users/josi/Llama2_weights")

['.DS_Store', 'llama-2-7b.Q4_K_M.gguf', 'llama-2-7b-chat.Q4_K_M.gguf']

In [92]:
llm = LlamaCpp(
    model_path= "/Users/josi/Llama2_weights/llama-2-7b-chat.Q4_K_M.gguf",
    temperature=0,
    max_tokens=48,
    top_p=1,
    #callback_manager=callback_manager,
    verbose=True,  # Verbose is required to pass to the callback manager
)

llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from /Users/josi/Llama2_weights/llama-2-7b-chat.Q4_K_M.gguf (version GGUF V2)
llama_model_loader: - tensor    0:                token_embd.weight q4_K     [  4096, 32000,     1,     1 ]
llama_model_loader: - tensor    1:           blk.0.attn_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    2:            blk.0.ffn_down.weight q6_K     [ 11008,  4096,     1,     1 ]
llama_model_loader: - tensor    3:            blk.0.ffn_gate.weight q4_K     [  4096, 11008,     1,     1 ]
llama_model_loader: - tensor    4:              blk.0.ffn_up.weight q4_K     [  4096, 11008,     1,     1 ]
llama_model_loader: - tensor    5:            blk.0.ffn_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    6:              blk.0.attn_k.weight q4_K     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    7:         blk.0.attn_output.weight q4_K     [  4096,  40

### Create Dictionaries with test questions (English)

In [95]:
##Questions with Answer options for ACCURACY
questions_accuracy = [

    #Informal Style
    {
        "question": "Choose the correct option that represents an informal greeting.",
        "choices": {"a": "Hi", "b": "Good Day", "c": "Greetings", "d": "Dear Mr./Mrs."},
        "correct": "a: Hi"
    },
    {
        "question": "Choose the correct option that represents an informal response to the question: 'How's it going?'",
        "choices": {"a": "Hi, good hbu?", "b": "Greetings, everything is fine, and you?", "c": "Good day, I'm well, thank you. How are you?", "d": "Hello, everything's fine"},
        "correct": "a: Hi, good hbu?"
    },
    {
        "question": "Choose the correct option that represents an informal email closing.",
        "choices": {"a": "Cheers", "b": "Sincerely", "c": "Best regards", "d": "I am awaiting your response"},
        "correct": "a: Cheers"
    },

    #Formal Style
    {
        "question": "Choose the correct option that represents a formal greeting.",
        "choices": {"a": "Good Day", "b": "Hi", "c": "Hey", "d": "Yo"},
        "correct": "a: Good Day"
    },
    {
        "question": "Choose the correct option that represents a formal response to the question: 'How are you doing?",
        "choices": {"a": "Hello, I am doing well. How about you?", "b": "Hi, good you?", "c": "Not too bad, you?", "d": "good, you?"},
        "correct": "a: Hello, I am doing well. How about you?"
    },
    {
        "question": "Choose the correct option that represents a professional email closing.",
        "choices": {"a": "Sincerely", "b": "Cheers", "c": "See you!", "d": "Until next time."},
        "correct": "a: Sincerely"
    },

    #Academic Style
      {
        "question": "Choose the correct option that represents an academic paper introduction.",
        "choices": {"a": "Today we present our topic XYZ and we will also focus on ABC.", "b": "In this paper XYZ is presented with a focus on ABC.", "c": "Hi! In our paper we write about XYZ and ABC.", "d": "I show you something about XYZ in this paper."},
        "correct": "b: In this paper XYZ is presented with a focus on ABC."
    },


    #Detect correct language style
    {
        "question": "The improvements canʼt be introduced due to funding restrictions.",
        "choices": {"a": "Formal Language Style", "b": "Informal Language Style", "c": "Academic Language Style"},
        "correct": "b: Informal Language Style"
    },
    {
        "question": " It was raining cats and dogs.",
        "choices": {"a": "Formal Language Style", "b": "Informal Language Style", "c": "Academic Language Style"},
        "correct": "b: Informal Language Style"
    },
    {
        "question": "Improvements cannot be introduced due to funding restrictions.",
        "choices": {"a": "Formal Language Style", "b": "Informal Language Style", "c": "Academic Language Style"},
        "correct": "a: Formal Language Style"
    },
    {
        "question": "During the interview, students were asked about their experiences.",
        "choices": {"a": "Formal Language Style", "b": "Informal Language Style", "c": "Academic Language Style"},
        "correct": "a: Formal Language Style"
    },
    {
        "question": "New results in this area are produced by the research group.",
        "choices": {"a": "Formal Language Style", "b": "Informal Language Style", "c": "Academic Language Style"},
        "correct": "c: Academic Language Style"
    },

   


    #Translation
    {
        "question": "Choose the correct translation for 'hello' in Spanish.",
        "choices": {"a":"Hola", "b": "Hello", "c":"Hallo", "d": "Bonjour"},
        "correct": "a: Hola"
    },
    {
        "question": "Choose the correct translation for 'Thank you' in German.",
        "choices": {"a": "Danke", "b": "Thanks", "c": "Gracias", "d": "Auf Wiedersehen"},
        "correct": "a: Danke"
    },
    {
        "question": "Choose the correct translation for 'book' in italian.",
        "choices": {"a": "Libro", "b": "Buch", "c": "Plant", "d": "Libre"},
        "correct": "a: Libro"
    },
    {
        "question": "Choose the correct answer that is the capital of France?",
        "choices": {"a": "Paris", "b": "Berlin", "c": "Madrid", "d": "New York"},
        "correct": "a: Paris"
    },

    #General
    {
        "question": "Choose the correct answer for the following math problem: 3x + 5 = 20.",
        "choices": {"a": "x = 5", "b": "x = 15", "c": "x = 10", "d": "x = 2"},
        "correct": "a: x = 5"
    },
    {
        "question": "Choose the correct answer for the following statement: If 'cat' is to 'kitten,' what is 'dog' to?",
        "choices": {"a": "'Dog' is to 'puppy'", "b": "'Dog' is to 'kitten'", "c": "'Dog' is to 'cub'", "d": "'Dog' is to 'kitty'"},
        "correct": "a: 'Dog' is to 'puppy'"
    },

    #NER
    {
        "question": "Choose the correct answer for the following statement: Identify the named Person in the sentence 'Elon Musk is the CEO of a company that produces cars.'",
        "choices": {"a": "Elon Musk", "b": "CEO", "c": "company", "d": "cars"},
        "correct": "a: Elon Musk"
    },
    {
        "question": "Choose the correct answer for the following statement: Extract the date mentioned in the text 'The conference is scheduled for January 25, 2023.'",
        "choices": {"a": "January 25, 2023", "b": "May 25-27, 2023.", "c": "January 27, 2023.", "d": "January 2, 2025"},
        "correct": "a: January 25, 2023"
    },
]


for question in questions_accuracy:
    print("Question:", question["question"])
    print("Answer Options:")
    for option, answer in question["choices"].items():
        print(f"{option}: {answer}")
    print("Correct Answer:", f"{question['correct']}")
    print("\n" + "-"*50 + "\n")

Question: Choose the correct option that represents an informal greeting.
Answer Options:
a: Hi
b: Good Day
c: Greetings
d: Dear Mr./Mrs.
Correct Answer: a: Hi

--------------------------------------------------

Question: Choose the correct option that represents an informal response to the question: 'How's it going?'
Answer Options:
a: Hi, good hbu?
b: Greetings, everything is fine, and you?
c: Good day, I'm well, thank you. How are you?
d: Hello, everything's fine
Correct Answer: a: Hi, good hbu?

--------------------------------------------------

Question: Choose the correct option that represents an informal email closing.
Answer Options:
a: Cheers
b: Sincerely
c: Best regards
d: I am awaiting your response
Correct Answer: a: Cheers

--------------------------------------------------

Question: Choose the correct option that represents a formal greeting.
Answer Options:
a: Good Day
b: Hi
c: Hey
d: Yo
Correct Answer: a: Good Day

--------------------------------------------------


### Evaluate Model based on Accuracy

In [96]:
model_answers = []

def evaluate_model_accuracy(model, questions_accuracy):
    try:
        correct_answers = 0
        total_questions = len(questions_accuracy)

        for question_data in questions_accuracy:
            question = question_data["question"]
            choices = question_data["choices"]
            correct_option = question_data["correct"]

            # Get model's answer for current question
            prompt = f"{question}\nAnswer Options:\n"
            for option, answer in choices.items():
                prompt += f"{option}: {answer}\n"
            prompt = get_prompt(prompt)
            #print(prompt)
            model_answer = model(prompt)

            # Check if selected option in model's answer matches the correct option
            if model_answer.strip() == correct_option.strip():
                correct_answers += 1

            # Print results for each question
            print(f"Question: {question}")
            print("Answer Options:")
            for option, answer in choices.items():
                print(f"{option}: {answer}")
            print(f"Correct Answer: {correct_option}")
            print(f"Model Answer: {model_answer}")
            print("------------")

            # Store the model's answer and the corresponding question in the array
            model_answers.append({
                "question": question,
                "choices": choices,
                "correct_option": correct_option,
                "selected_option": model_answers,
            })

        # print overall evaluation results
        print(f"Total Questions: {total_questions}")
        print(f"Correct Answers: {correct_answers}")
        print(f"Accuracy: {correct_answers / total_questions * 100:.2f}%")

    except Exception as e:
            print(f"An error occurred: {e}")
            return None, None
    
#run method to get accuracy score
evaluate_model_accuracy(llm, questions_accuracy)

Llama.generate: prefix-match hit


Question: Choose the correct option that represents an informal greeting.
Answer Options:
a: Hi
b: Good Day
c: Greetings
d: Dear Mr./Mrs.
Correct Answer: a: Hi
Model Answer:   a: Hi
------------



llama_print_timings:        load time =   13434.66 ms
llama_print_timings:      sample time =       1.47 ms /     5 runs   (    0.29 ms per token,  3399.05 tokens per second)
llama_print_timings: prompt eval time =    5280.95 ms /    43 tokens (  122.81 ms per token,     8.14 tokens per second)
llama_print_timings:        eval time =     565.37 ms /     4 runs   (  141.34 ms per token,     7.08 tokens per second)
llama_print_timings:       total time =    5877.17 ms
Llama.generate: prefix-match hit


Question: Choose the correct option that represents an informal response to the question: 'How's it going?'
Answer Options:
a: Hi, good hbu?
b: Greetings, everything is fine, and you?
c: Good day, I'm well, thank you. How are you?
d: Hello, everything's fine
Correct Answer: a: Hi, good hbu?
Model Answer:   c: Good day, I'm well, thank you. How are you?
------------



llama_print_timings:        load time =   13434.66 ms
llama_print_timings:      sample time =       5.15 ms /    19 runs   (    0.27 ms per token,  3690.04 tokens per second)
llama_print_timings: prompt eval time =    8207.84 ms /    71 tokens (  115.60 ms per token,     8.65 tokens per second)
llama_print_timings:        eval time =    2706.46 ms /    18 runs   (  150.36 ms per token,     6.65 tokens per second)
llama_print_timings:       total time =   10997.63 ms
Llama.generate: prefix-match hit


Question: Choose the correct option that represents an informal email closing.
Answer Options:
a: Cheers
b: Sincerely
c: Best regards
d: I am awaiting your response
Correct Answer: a: Cheers
Model Answer:   b: Sincerely
------------



llama_print_timings:        load time =   13434.66 ms
llama_print_timings:      sample time =       2.05 ms /     7 runs   (    0.29 ms per token,  3419.64 tokens per second)
llama_print_timings: prompt eval time =    4180.72 ms /    36 tokens (  116.13 ms per token,     8.61 tokens per second)
llama_print_timings:        eval time =     901.72 ms /     6 runs   (  150.29 ms per token,     6.65 tokens per second)
llama_print_timings:       total time =    5116.39 ms
Llama.generate: prefix-match hit


Question: Choose the correct option that represents a formal greeting.
Answer Options:
a: Good Day
b: Hi
c: Hey
d: Yo
Correct Answer: a: Good Day
Model Answer:   b: Hi
------------



llama_print_timings:        load time =   13434.66 ms
llama_print_timings:      sample time =       1.42 ms /     5 runs   (    0.28 ms per token,  3518.65 tokens per second)
llama_print_timings: prompt eval time =    3733.87 ms /    32 tokens (  116.68 ms per token,     8.57 tokens per second)
llama_print_timings:        eval time =     587.34 ms /     4 runs   (  146.84 ms per token,     6.81 tokens per second)
llama_print_timings:       total time =    4345.02 ms
Llama.generate: prefix-match hit


Question: Choose the correct option that represents a formal response to the question: 'How are you doing?
Answer Options:
a: Hello, I am doing well. How about you?
b: Hi, good you?
c: Not too bad, you?
d: good, you?
Correct Answer: a: Hello, I am doing well. How about you?
Model Answer:   The correct answer is (a): "Hello, I am doing well. How about you?"
------------



llama_print_timings:        load time =   13434.66 ms
llama_print_timings:      sample time =       6.57 ms /    21 runs   (    0.31 ms per token,  3197.81 tokens per second)
llama_print_timings: prompt eval time =    7084.40 ms /    58 tokens (  122.14 ms per token,     8.19 tokens per second)
llama_print_timings:        eval time =    4325.87 ms /    20 runs   (  216.29 ms per token,     4.62 tokens per second)
llama_print_timings:       total time =   11511.44 ms
Llama.generate: prefix-match hit


Question: Choose the correct option that represents a professional email closing.
Answer Options:
a: Sincerely
b: Cheers
c: See you!
d: Until next time.
Correct Answer: a: Sincerely
Model Answer:   d: Until next time
------------



llama_print_timings:        load time =   13434.66 ms
llama_print_timings:      sample time =       2.02 ms /     7 runs   (    0.29 ms per token,  3472.22 tokens per second)
llama_print_timings: prompt eval time =    4871.71 ms /    36 tokens (  135.33 ms per token,     7.39 tokens per second)
llama_print_timings:        eval time =    1141.10 ms /     6 runs   (  190.18 ms per token,     5.26 tokens per second)
llama_print_timings:       total time =    6047.54 ms
Llama.generate: prefix-match hit


Question: Choose the correct option that represents an academic paper introduction.
Answer Options:
a: Today we present our topic XYZ and we will also focus on ABC.
b: In this paper XYZ is presented with a focus on ABC.
c: Hi! In our paper we write about XYZ and ABC.
d: I show you something about XYZ in this paper.
Correct Answer: b: In this paper XYZ is presented with a focus on ABC.
Model Answer:   The correct answer is (b): "In this paper XYZ is presented with a focus on ABC."
------------



llama_print_timings:        load time =   13434.66 ms
llama_print_timings:      sample time =       7.28 ms /    24 runs   (    0.30 ms per token,  3297.16 tokens per second)
llama_print_timings: prompt eval time =   11474.04 ms /    82 tokens (  139.93 ms per token,     7.15 tokens per second)
llama_print_timings:        eval time =    4022.55 ms /    23 runs   (  174.89 ms per token,     5.72 tokens per second)
llama_print_timings:       total time =   15605.92 ms
Llama.generate: prefix-match hit


Question: The improvements canʼt be introduced due to funding restrictions.
Answer Options:
a: Formal Language Style
b: Informal Language Style
c: Academic Language Style
Correct Answer: b: Informal Language Style
Model Answer:   c: Academic Language Style
------------



llama_print_timings:        load time =   13434.66 ms
llama_print_timings:      sample time =       2.20 ms /     8 runs   (    0.27 ms per token,  3641.33 tokens per second)
llama_print_timings: prompt eval time =    5895.32 ms /    43 tokens (  137.10 ms per token,     7.29 tokens per second)
llama_print_timings:        eval time =    1178.55 ms /     7 runs   (  168.36 ms per token,     5.94 tokens per second)
llama_print_timings:       total time =    7112.96 ms
Llama.generate: prefix-match hit

llama_print_timings:        load time =   13434.66 ms
llama_print_timings:      sample time =       2.35 ms /     8 runs   (    0.29 ms per token,  3398.47 tokens per second)
llama_print_timings: prompt eval time =    5382.18 ms /    39 tokens (  138.00 ms per token,     7.25 tokens per second)
llama_print_timings:        eval time =    1186.43 ms /     7 runs   (  169.49 ms per token,     5.90 tokens per second)
llama_print_timings:       total time =    6608.68 ms
Llama.generate: prefix-

Question:  It was raining cats and dogs.
Answer Options:
a: Formal Language Style
b: Informal Language Style
c: Academic Language Style
Correct Answer: b: Informal Language Style
Model Answer:   b: Informal Language Style
------------



llama_print_timings:        load time =   13434.66 ms
llama_print_timings:      sample time =       2.33 ms /     8 runs   (    0.29 ms per token,  3427.59 tokens per second)
llama_print_timings: prompt eval time =    5683.43 ms /    42 tokens (  135.32 ms per token,     7.39 tokens per second)
llama_print_timings:        eval time =    1155.16 ms /     7 runs   (  165.02 ms per token,     6.06 tokens per second)
llama_print_timings:       total time =    6879.42 ms
Llama.generate: prefix-match hit


Question: Improvements cannot be introduced due to funding restrictions.
Answer Options:
a: Formal Language Style
b: Informal Language Style
c: Academic Language Style
Correct Answer: a: Formal Language Style
Model Answer:   c: Academic Language Style
------------
Question: During the interview, students were asked about their experiences.
Answer Options:
a: Formal Language Style
b: Informal Language Style
c: Academic Language Style
Correct Answer: a: Formal Language Style
Model Answer:   c: Academic Language Style
------------



llama_print_timings:        load time =   13434.66 ms
llama_print_timings:      sample time =       2.24 ms /     8 runs   (    0.28 ms per token,  3565.06 tokens per second)
llama_print_timings: prompt eval time =    5765.39 ms /    42 tokens (  137.27 ms per token,     7.28 tokens per second)
llama_print_timings:        eval time =    1210.68 ms /     7 runs   (  172.95 ms per token,     5.78 tokens per second)
llama_print_timings:       total time =    7014.49 ms
Llama.generate: prefix-match hit


Question: New results in this area are produced by the research group.
Answer Options:
a: Formal Language Style
b: Informal Language Style
c: Academic Language Style
Correct Answer: c: Academic Language Style
Model Answer:   c: Academic Language Style
------------



llama_print_timings:        load time =   13434.66 ms
llama_print_timings:      sample time =       2.21 ms /     8 runs   (    0.28 ms per token,  3616.64 tokens per second)
llama_print_timings: prompt eval time =    5248.69 ms /    42 tokens (  124.97 ms per token,     8.00 tokens per second)
llama_print_timings:        eval time =    1188.12 ms /     7 runs   (  169.73 ms per token,     5.89 tokens per second)
llama_print_timings:       total time =    6474.72 ms
Llama.generate: prefix-match hit


Question: Choose the correct translation for 'hello' in Spanish.
Answer Options:
a: Hola
b: Hello
c: Hallo
d: Bonjour
Correct Answer: a: Hola
Model Answer:   a: Hola
------------



llama_print_timings:        load time =   13434.66 ms
llama_print_timings:      sample time =       1.76 ms /     6 runs   (    0.29 ms per token,  3416.86 tokens per second)
llama_print_timings: prompt eval time =    4876.03 ms /    40 tokens (  121.90 ms per token,     8.20 tokens per second)
llama_print_timings:        eval time =     902.57 ms /     6 runs   (  150.43 ms per token,     6.65 tokens per second)
llama_print_timings:       total time =    5812.78 ms
Llama.generate: prefix-match hit


Question: Choose the correct translation for 'Thank you' in German.
Answer Options:
a: Danke
b: Thanks
c: Gracias
d: Auf Wiedersehen
Correct Answer: a: Danke
Model Answer:   a: Danke
------------



llama_print_timings:        load time =   13434.66 ms
llama_print_timings:      sample time =       1.78 ms /     6 runs   (    0.30 ms per token,  3365.11 tokens per second)
llama_print_timings: prompt eval time =    4372.16 ms /    36 tokens (  121.45 ms per token,     8.23 tokens per second)
llama_print_timings:        eval time =     852.37 ms /     5 runs   (  170.47 ms per token,     5.87 tokens per second)
llama_print_timings:       total time =    5255.12 ms
Llama.generate: prefix-match hit

llama_print_timings:        load time =   13434.66 ms
llama_print_timings:      sample time =       1.75 ms /     6 runs   (    0.29 ms per token,  3430.53 tokens per second)
llama_print_timings: prompt eval time =    4757.94 ms /    32 tokens (  148.69 ms per token,     6.73 tokens per second)
llama_print_timings:        eval time =    1023.54 ms /     6 runs   (  170.59 ms per token,     5.86 tokens per second)
llama_print_timings:       total time =    5815.72 ms
Llama.generate: prefix-

Question: Choose the correct translation for 'book' in italian.
Answer Options:
a: Libro
b: Buch
c: Plant
d: Libre
Correct Answer: a: Libro
Model Answer:   a: Libro
------------
Question: Choose the correct answer that is the capital of France?
Answer Options:
a: Paris
b: Berlin
c: Madrid
d: New York
Correct Answer: a: Paris
Model Answer:   a: Paris
------------



llama_print_timings:        load time =   13434.66 ms
llama_print_timings:      sample time =       1.49 ms /     5 runs   (    0.30 ms per token,  3355.70 tokens per second)
llama_print_timings: prompt eval time =    5011.81 ms /    34 tokens (  147.41 ms per token,     6.78 tokens per second)
llama_print_timings:        eval time =     702.68 ms /     4 runs   (  175.67 ms per token,     5.69 tokens per second)
llama_print_timings:       total time =    5744.23 ms
Llama.generate: prefix-match hit


Question: Choose the correct answer for the following math problem: 3x + 5 = 20.
Answer Options:
a: x = 5
b: x = 15
c: x = 10
d: x = 2
Correct Answer: a: x = 5
Model Answer:   Sure, I'm ready to help! The correct answer for the math problem 3x + 5 = 20 is:
d: x = 2
So, the answer is (d) x = 
------------



llama_print_timings:        load time =   13434.66 ms
llama_print_timings:      sample time =      13.54 ms /    48 runs   (    0.28 ms per token,  3546.10 tokens per second)
llama_print_timings: prompt eval time =    9543.95 ms /    56 tokens (  170.43 ms per token,     5.87 tokens per second)
llama_print_timings:        eval time =    9029.46 ms /    47 runs   (  192.12 ms per token,     5.21 tokens per second)
llama_print_timings:       total time =   18772.41 ms
Llama.generate: prefix-match hit


Question: Choose the correct answer for the following statement: If 'cat' is to 'kitten,' what is 'dog' to?
Answer Options:
a: 'Dog' is to 'puppy'
b: 'Dog' is to 'kitten'
c: 'Dog' is to 'cub'
d: 'Dog' is to 'kitty'
Correct Answer: a: 'Dog' is to 'puppy'
Model Answer:   b: 'Dog' is to 'kitten'.
------------



llama_print_timings:        load time =   13434.66 ms
llama_print_timings:      sample time =       3.04 ms /    14 runs   (    0.22 ms per token,  4611.33 tokens per second)
llama_print_timings: prompt eval time =   10734.35 ms /    80 tokens (  134.18 ms per token,     7.45 tokens per second)
llama_print_timings:        eval time =    2337.18 ms /    14 runs   (  166.94 ms per token,     5.99 tokens per second)
llama_print_timings:       total time =   13143.74 ms
Llama.generate: prefix-match hit


Question: Choose the correct answer for the following statement: Identify the named Person in the sentence 'Elon Musk is the CEO of a company that produces cars.'
Answer Options:
a: Elon Musk
b: CEO
c: company
d: cars
Correct Answer: a: Elon Musk
Model Answer:   b: CEO
------------



llama_print_timings:        load time =   13434.66 ms
llama_print_timings:      sample time =       1.96 ms /     6 runs   (    0.33 ms per token,  3069.05 tokens per second)
llama_print_timings: prompt eval time =    7572.69 ms /    53 tokens (  142.88 ms per token,     7.00 tokens per second)
llama_print_timings:        eval time =     881.87 ms /     5 runs   (  176.37 ms per token,     5.67 tokens per second)
llama_print_timings:       total time =    8497.09 ms
Llama.generate: prefix-match hit


Question: Choose the correct answer for the following statement: Extract the date mentioned in the text 'The conference is scheduled for January 25, 2023.'
Answer Options:
a: January 25, 2023
b: May 25-27, 2023.
c: January 27, 2023.
d: January 2, 2025
Correct Answer: a: January 25, 2023
Model Answer:   a: January 25, 2023
------------
Total Questions: 20
Correct Answers: 8
Accuracy: 40.00%



llama_print_timings:        load time =   13434.66 ms
llama_print_timings:      sample time =       4.33 ms /    14 runs   (    0.31 ms per token,  3230.27 tokens per second)
llama_print_timings: prompt eval time =   16650.84 ms /    90 tokens (  185.01 ms per token,     5.41 tokens per second)
llama_print_timings:        eval time =    4555.21 ms /    13 runs   (  350.40 ms per token,     2.85 tokens per second)
llama_print_timings:       total time =   21289.25 ms


### Evaluate Model based on Rouge (Recall-Oriented Understudy for Gissing Evaluation)

A set of metrics used for evaluating the quality of summaries. It compares the generated summary with one or more reference summaries and calculates precision, recall, and F1-score 

- ROUGE-N quantifies the overlap of N-grams, contiguous sequences of N items (typically words or characters), between the system-generated summary and the reference summary) 
    - ROUGE-1 (unigram overlap): This metric measures the overlap of unigrams (single words) between the generated summary and the reference summary. It focuses on the recall of unigrams.
    - ROUGE-2 (bigram overlap): Similar to ROUGE-1, but it measures the overlap of bigrams (pairs of consecutive words) instead of unigrams. It evaluates the recall of bigrams.
- ROUGE-L (longest common subsequence): Instead of measuring word overlap, ROUGE-L focuses on the longest common subsequence (LCS) between the generated and reference summaries. It evaluates the recall of the longest common subsequence

- high precision value suggests that the words or phrases churned out by the machine translation or submodel are primarily accurate
- high recall value, ideally close to 1, implies that the content of the machine-generated output aligns closely with the human-made reference. It signifies the model's proficiency in capturing relevant information



In [106]:
#general prompt to test llm for testing questions
SYSTEM_PROMPT_EVAL_ROUGE = """
<<SYS>>
You will be presented with a question where you are expected to respond only with the correct answer in a short and simple manner. Use one sentence maximum to answer the question. 
Only respond with the correct answer itself and leave out fill words like "sure" or "certainly".
If you don't know the answer, be honest and state that you don't know. Don't give any false information.
<</SYS>>
"""
SYSTEM_PROMPT = B_SYS + SYSTEM_PROMPT_EVAL_ROUGE + E_SYS

Prompt:"""
<<SYS>>
You will be presented with a question where you are expected to respond only with the correct answer in a short and simple manner. Use one sentence maximum to answer the question. 
Only respond with the correct answer itself and leave out fill words like "sure" or "certainly".
If you don't know the answer, be honest and state that you don't know. Don't give any false information.
<</SYS>>
"""


results: Average Recall: 0.376
Average Precision: 0.582
Average F1-score: 0.416

In [115]:
# Define test questions with different categories for ROUGE
textual_entailment = {
    "If 'A implies B' and 'B is false,' what can you conclude about A?":"Nothing definitive can be concluded about A.",
}

reasoning_problem_solving = {
    "If a car travels at 60 miles per hour, how long will it take to cover 120 miles?": "2 hours",
}

analogical_reasoning = {
    "If 'cat' is to 'kitten,' what is 'dog' to?":"'Dog' is to 'puppy'",
}

# Named Entity Recognition (NER)
ner = {
    "Identify the named person in the sentence 'Elon Musk is the CEO of SpaceX.'": "Elon Musk",
    "Extract the date mentioned in the text 'The conference is scheduled for January 25, 2023.'": "January 25, 2023.",
}

identification ={
    "Given the following user review, answer the question. User review: The CO-1T is a great pair of headphones! The sound quality is the best out there, and I can hear every detail of my music. Question: Why is the CO-1T a great wireless headphone?": "Because the audio experience is unrivaled",
    "How many objects are named in the following sequence: The dog was running towards the tree with a ball in his mouth.": "One object is named, a dog.",
    "Given the statement, answer the following question. Statement: The sky is blue and the building is grey. Question: What color does the sky have?": "Blue"
}

languageStyles = {
  "What language style is used in the following Statement: Improvements cannot be introduced due to funding restrictions." : "Formal.",
  "What language style is used in the following Statement: New results in this area are produced by the research group." : "Academic.",
  "What language style is used in the following Statement: I donʼt believe that the results are accurate." : "Informal",
  "What language style is used in the following Statement: The results are not believed to be accurate." : "Formal",
  "What language style is described by the following statement: The style is more casual and spontaneous and is used to communicate with friends and family either in writing or conversations." : "Informal",
  "What language style is described by the following statement: The style is used when writing or speaking for professional purpose and does not use colloquialism, contractions or first-person pronouns." : "Formal",
  "What language style is described by the following statement: The style is used for higher school purposes like thesis or research papers and does not use first-person pronouns and is not personal but factual" : "Academic",
}

# Combine all test questions into a single dictionary
questions_rouge = {**textual_entailment,**reasoning_problem_solving,**analogical_reasoning,**ner,**identification,**languageStyles}
#print(questions_rouge)

In [87]:
#%pip install rouge - ##test rouge library - https://pypi.org/project/rouge/ 
from rouge import Rouge

##example calc for rouge
hypothesis = "the #### transcript is a written version of each day 's cnn student news program use this transcript to help students with reading comprehension and vocabulary use the weekly newsquiz to test your knowledge of storie s you     saw on cnn student news"

reference = "this page includes the show transcript use the transcript to help students with reading comprehension and vocabulary at the bottom of the page , comment for a chance to be mentioned on cnn student news . you must be a teac    her or a student age # # or older to request a mention on the cnn student news roll call . the weekly newsquiz tests     students ' knowledge of even ts in the news"

rouge = Rouge()
scores = rouge.get_scores(hypothesis, reference)
print(scores)

[{'rouge-1': {'r': 0.42857142857142855, 'p': 0.5833333333333334, 'f': 0.49411764217577864}, 'rouge-2': {'r': 0.18571428571428572, 'p': 0.3170731707317073, 'f': 0.23423422957552154}, 'rouge-l': {'r': 0.3877551020408163, 'p': 0.5277777777777778, 'f': 0.44705881864636676}}]


In [120]:
#maybe check if rouge-score library better

def evaluate_model_rouge(model, questions):
    try:
        total_questions = len(questions)
        model_answers = []
        rouge_scores_list_r1 = []
        rouge_scores_list_r2 = []
        rouge_scores_list_rl= []
        rouge_scores = {"rouge": {"recall": 0, "precision": 0, "f1-score": 0}} 

        for question, expected_answer in questions_rouge.items():
            prompt = f"{question}\nAnswer Options:\n"
            prompt = get_prompt(prompt)
            #print(prompt)
            model_answer = model(prompt)

            model_answer = model_answer.strip()
            expected_answer = expected_answer.strip()

            # Get Rouge Score - Rouge() hat rouge-1, rouge-2 and rouge-l as option 
            rouge = Rouge()
            scores = rouge.get_scores(expected_answer, model_answer) 

            #save rouge-1, rouge-2 and rouge-l values for each question in list
            scores_list_r1 = rouge.get_scores(expected_answer, model_answer)[0]
            rouge_scores_list_r1.append({
                "r": scores_list_r1.get('rouge-1', {}).get('r', 0),
                "p": scores_list_r1.get('rouge-1', {}).get('p', 0),
                "f": scores_list_r1.get('rouge-1', {}).get('f', 0),
            })
            scores_list_r2 = rouge.get_scores(expected_answer, model_answer)[0]
            rouge_scores_list_r2.append({
                "r": scores_list_r2.get('rouge-2', {}).get('r', 0),
                "p": scores_list_r2.get('rouge-2', {}).get('p', 0),
                "f": scores_list_r2.get('rouge-2', {}).get('f', 0),
            })
            scores_list_rl = rouge.get_scores(expected_answer, model_answer)[0]
            rouge_scores_list_rl.append({
                "r": scores_list_rl.get('rouge-l', {}).get('r', 0),
                "p": scores_list_rl.get('rouge-l', {}).get('p', 0),
                "f": scores_list_rl.get('rouge-l', {}).get('f', 0),
            })
            
            
            # Print the results for each question
            print(f"Question: {question}")
            print(f"Expected Answer: {expected_answer}")
            print(f"Model Answer: {model_answer}")
            print(f"ROUGE Scores: {scores}")
            print("------------")

            # Store the model's answer and the corresponding question in the array
            model_answers.append({
                "question": question,
                "expected_answer": expected_answer,
                "model_answer": model_answer,
            })

        # Calculate the average 'r', 'p', and 'f' values for rouge-1, rouge-2 and rouge-l
        average_r_r1 = round(sum(entry["r"] for entry in rouge_scores_list_r1) / total_questions,3) 
        average_p_r1 = round(sum(entry["p"] for entry in rouge_scores_list_r1) / total_questions,3)
        average_f_r1 = round(sum(entry["f"] for entry in rouge_scores_list_r1) / total_questions,3)

        average_r_r2 = round(sum(entry["r"] for entry in rouge_scores_list_r2) / total_questions,3) 
        average_p_r2 = round(sum(entry["p"] for entry in rouge_scores_list_r2) / total_questions,3)
        average_f_r2 = round(sum(entry["f"] for entry in rouge_scores_list_r2) / total_questions,3)
        
        average_r_rl = round(sum(entry["r"] for entry in rouge_scores_list_rl) / total_questions,3) 
        average_p_rl = round(sum(entry["p"] for entry in rouge_scores_list_rl) / total_questions,3)
        average_f_rl = round(sum(entry["f"] for entry in rouge_scores_list_rl) / total_questions,3)

        print(f"Questions total: {total_questions}")
        print(f"Average Recall (ROUGE-1): {average_r_r1}")
        print(f"Average Precision (ROUGE-1): {average_p_r1}")
        print(f"Average F1-score (ROUGE-1): {average_f_r1}")
        print(f"Average Recall (ROUGE-2): {average_r_r2}")
        print(f"Average Precision (ROUGE-2): {average_p_r2}")
        print(f"Average F1-score (ROUGE-2): {average_f_r2}")
        print(f"Average Recall (ROUGE-L): {average_r_rl}")
        print(f"Average Precision (ROUGE-L): {average_p_rl}")
        print(f"Average F1-score (ROUGE-L): {average_f_rl}")

        return model_answers, rouge_scores

    except Exception as e:
        print(f"An error occurred: {e}")
        return None, None
    
#run method
model_answers, rouge_scores = evaluate_model_rouge(llm, questions_rouge)



Llama.generate: prefix-match hit


Question: If 'A implies B' and 'B is false,' what can you conclude about A?
Expected Answer: Nothing definitive can be concluded about A.
Model Answer: A cannot be concluded about.
ROUGE Scores: [{'rouge-1': {'r': 0.8, 'p': 0.5714285714285714, 'f': 0.6666666618055556}, 'rouge-2': {'r': 0.5, 'p': 0.3333333333333333, 'f': 0.39999999520000007}, 'rouge-l': {'r': 0.6, 'p': 0.42857142857142855, 'f': 0.499999995138889}}]
------------



llama_print_timings:        load time =   13434.66 ms
llama_print_timings:      sample time =       2.68 ms /     8 runs   (    0.33 ms per token,  2988.42 tokens per second)
llama_print_timings: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_print_timings:        eval time =    2057.70 ms /     8 runs   (  257.21 ms per token,     3.89 tokens per second)
llama_print_timings:       total time =    2095.71 ms
Llama.generate: prefix-match hit


Question: If a car travels at 60 miles per hour, how long will it take to cover 120 miles?
Expected Answer: 2 hours
Model Answer: The correct answer is: 2 hours
ROUGE Scores: [{'rouge-1': {'r': 0.3333333333333333, 'p': 1.0, 'f': 0.4999999962500001}, 'rouge-2': {'r': 0.2, 'p': 1.0, 'f': 0.33333333055555564}, 'rouge-l': {'r': 0.3333333333333333, 'p': 1.0, 'f': 0.4999999962500001}}]
------------



llama_print_timings:        load time =   13434.66 ms
llama_print_timings:      sample time =       2.85 ms /    10 runs   (    0.28 ms per token,  3510.00 tokens per second)
llama_print_timings: prompt eval time =    5532.32 ms /    34 tokens (  162.72 ms per token,     6.15 tokens per second)
llama_print_timings:        eval time =    1442.08 ms /     9 runs   (  160.23 ms per token,     6.24 tokens per second)
llama_print_timings:       total time =    7022.37 ms
Llama.generate: prefix-match hit


Question: If 'cat' is to 'kitten,' what is 'dog' to?
Expected Answer: 'Dog' is to 'puppy'
Model Answer: Puppy
ROUGE Scores: [{'rouge-1': {'r': 0.0, 'p': 0.0, 'f': 0.0}, 'rouge-2': {'r': 0.0, 'p': 0.0, 'f': 0.0}, 'rouge-l': {'r': 0.0, 'p': 0.0, 'f': 0.0}}]
------------



llama_print_timings:        load time =   13434.66 ms
llama_print_timings:      sample time =       1.16 ms /     4 runs   (    0.29 ms per token,  3460.21 tokens per second)
llama_print_timings: prompt eval time =    3193.78 ms /    24 tokens (  133.07 ms per token,     7.51 tokens per second)
llama_print_timings:        eval time =     816.22 ms /     4 runs   (  204.06 ms per token,     4.90 tokens per second)
llama_print_timings:       total time =    4032.60 ms
Llama.generate: prefix-match hit


Question: Identify the named person in the sentence 'Elon Musk is the CEO of SpaceX.'
Expected Answer: Elon Musk
Model Answer: Elon Musk
ROUGE Scores: [{'rouge-1': {'r': 1.0, 'p': 1.0, 'f': 0.999999995}, 'rouge-2': {'r': 1.0, 'p': 1.0, 'f': 0.999999995}, 'rouge-l': {'r': 1.0, 'p': 1.0, 'f': 0.999999995}}]
------------



llama_print_timings:        load time =   13434.66 ms
llama_print_timings:      sample time =       1.63 ms /     6 runs   (    0.27 ms per token,  3678.72 tokens per second)
llama_print_timings: prompt eval time =    3785.21 ms /    30 tokens (  126.17 ms per token,     7.93 tokens per second)
llama_print_timings:        eval time =     790.84 ms /     5 runs   (  158.17 ms per token,     6.32 tokens per second)
llama_print_timings:       total time =    4602.61 ms
Llama.generate: prefix-match hit

llama_print_timings:        load time =   13434.66 ms
llama_print_timings:      sample time =       7.29 ms /    23 runs   (    0.32 ms per token,  3155.44 tokens per second)
llama_print_timings: prompt eval time =    3996.18 ms /    34 tokens (  117.53 ms per token,     8.51 tokens per second)
llama_print_timings:        eval time =    3480.77 ms /    22 runs   (  158.22 ms per token,     6.32 tokens per second)
llama_print_timings:       total time =    7566.08 ms
Llama.generate: prefix-

Question: Extract the date mentioned in the text 'The conference is scheduled for January 25, 2023.'
Expected Answer: January 25, 2023.
Model Answer: Sure! The correct answer is:
January 25, 2023.
ROUGE Scores: [{'rouge-1': {'r': 0.375, 'p': 1.0, 'f': 0.5454545414876033}, 'rouge-2': {'r': 0.2857142857142857, 'p': 1.0, 'f': 0.4444444409876544}, 'rouge-l': {'r': 0.375, 'p': 1.0, 'f': 0.5454545414876033}}]
------------



llama_print_timings:        load time =   13434.66 ms
llama_print_timings:      sample time =      11.99 ms /    48 runs   (    0.25 ms per token,  4003.34 tokens per second)
llama_print_timings: prompt eval time =    8171.73 ms /    71 tokens (  115.09 ms per token,     8.69 tokens per second)
llama_print_timings:        eval time =    7595.61 ms /    47 runs   (  161.61 ms per token,     6.19 tokens per second)
llama_print_timings:       total time =   15949.11 ms
Llama.generate: prefix-match hit


Question: Given the following user review, answer the question. User review: The CO-1T is a great pair of headphones! The sound quality is the best out there, and I can hear every detail of my music. Question: Why is the CO-1T a great wireless headphone?
Expected Answer: Because the audio experience is unrivaled
Model Answer: Great answer! Here's the next question:
User review: The new iPhone XS Max has an amazing camera! It takes incredible photos and videos. Question: What makes the iPhone XS Max camera so good?
ROUGE Scores: [{'rouge-1': {'r': 0.034482758620689655, 'p': 0.16666666666666666, 'f': 0.05714285430204096}, 'rouge-2': {'r': 0.0, 'p': 0.0, 'f': 0.0}, 'rouge-l': {'r': 0.034482758620689655, 'p': 0.16666666666666666, 'f': 0.05714285430204096}}]
------------
Question: How many objects are named in the following sequence: The dog was running towards the tree with a ball in his mouth.
Expected Answer: One object is named, a dog.
Model Answer: One object is named in the sequence: 


llama_print_timings:        load time =   13434.66 ms
llama_print_timings:      sample time =       4.11 ms /    13 runs   (    0.32 ms per token,  3163.79 tokens per second)
llama_print_timings: prompt eval time =    3858.95 ms /    32 tokens (  120.59 ms per token,     8.29 tokens per second)
llama_print_timings:        eval time =    2314.17 ms /    13 runs   (  178.01 ms per token,     5.62 tokens per second)
llama_print_timings:       total time =    6230.55 ms
Llama.generate: prefix-match hit

llama_print_timings:        load time =   13434.66 ms
llama_print_timings:      sample time =       0.93 ms /     3 runs   (    0.31 ms per token,  3232.76 tokens per second)
llama_print_timings: prompt eval time =    5074.27 ms /    40 tokens (  126.86 ms per token,     7.88 tokens per second)
llama_print_timings:        eval time =     459.65 ms /     3 runs   (  153.22 ms per token,     6.53 tokens per second)
llama_print_timings:       total time =    5556.53 ms
Llama.generate: prefix-

Question: Given the statement, answer the following question. Statement: The sky is blue and the building is grey. Question: What color does the sky have?
Expected Answer: Blue
Model Answer: Blue
ROUGE Scores: [{'rouge-1': {'r': 1.0, 'p': 1.0, 'f': 0.999999995}, 'rouge-2': {'r': 0.0, 'p': 0.0, 'f': 0.0}, 'rouge-l': {'r': 1.0, 'p': 1.0, 'f': 0.999999995}}]
------------
Question: What language style is used in the following Statement: Improvements cannot be introduced due to funding restrictions.
Expected Answer: Formal.
Model Answer: The language style used in the statement is Formal.
ROUGE Scores: [{'rouge-1': {'r': 0.1111111111111111, 'p': 1.0, 'f': 0.1999999982}, 'rouge-2': {'r': 0.0, 'p': 0.0, 'f': 0.0}, 'rouge-l': {'r': 0.1111111111111111, 'p': 1.0, 'f': 0.1999999982}}]
------------



llama_print_timings:        load time =   13434.66 ms
llama_print_timings:      sample time =       3.89 ms /    13 runs   (    0.30 ms per token,  3345.34 tokens per second)
llama_print_timings: prompt eval time =    3902.61 ms /    32 tokens (  121.96 ms per token,     8.20 tokens per second)
llama_print_timings:        eval time =    1920.55 ms /    12 runs   (  160.05 ms per token,     6.25 tokens per second)
llama_print_timings:       total time =    5882.09 ms
Llama.generate: prefix-match hit


Question: What language style is used in the following Statement: New results in this area are produced by the research group.
Expected Answer: Academic.
Model Answer: The language style of the statement is: Technical or Academic.
ROUGE Scores: [{'rouge-1': {'r': 0.1, 'p': 1.0, 'f': 0.18181818016528928}, 'rouge-2': {'r': 0.0, 'p': 0.0, 'f': 0.0}, 'rouge-l': {'r': 0.1, 'p': 1.0, 'f': 0.18181818016528928}}]
------------



llama_print_timings:        load time =   13434.66 ms
llama_print_timings:      sample time =       4.83 ms /    16 runs   (    0.30 ms per token,  3315.38 tokens per second)
llama_print_timings: prompt eval time =    2641.67 ms /    21 tokens (  125.79 ms per token,     7.95 tokens per second)
llama_print_timings:        eval time =    2565.18 ms /    15 runs   (  171.01 ms per token,     5.85 tokens per second)
llama_print_timings:       total time =    5268.38 ms
Llama.generate: prefix-match hit


Question: What language style is used in the following Statement: I donʼt believe that the results are accurate.
Expected Answer: Informal
Model Answer: The language style of the statement is Informal.
ROUGE Scores: [{'rouge-1': {'r': 0.125, 'p': 1.0, 'f': 0.2222222202469136}, 'rouge-2': {'r': 0.0, 'p': 0.0, 'f': 0.0}, 'rouge-l': {'r': 0.125, 'p': 1.0, 'f': 0.2222222202469136}}]
------------



llama_print_timings:        load time =   13434.66 ms
llama_print_timings:      sample time =       3.42 ms /    12 runs   (    0.28 ms per token,  3508.77 tokens per second)
llama_print_timings: prompt eval time =    2256.16 ms /    20 tokens (  112.81 ms per token,     8.86 tokens per second)
llama_print_timings:        eval time =    1699.29 ms /    11 runs   (  154.48 ms per token,     6.47 tokens per second)
llama_print_timings:       total time =    4000.97 ms
Llama.generate: prefix-match hit


Question: What language style is used in the following Statement: The results are not believed to be accurate.
Expected Answer: Formal
Model Answer: The language style of the statement is Formal.
ROUGE Scores: [{'rouge-1': {'r': 0.125, 'p': 1.0, 'f': 0.2222222202469136}, 'rouge-2': {'r': 0.0, 'p': 0.0, 'f': 0.0}, 'rouge-l': {'r': 0.125, 'p': 1.0, 'f': 0.2222222202469136}}]
------------



llama_print_timings:        load time =   13434.66 ms
llama_print_timings:      sample time =       3.45 ms /    12 runs   (    0.29 ms per token,  3475.24 tokens per second)
llama_print_timings: prompt eval time =    2041.35 ms /    18 tokens (  113.41 ms per token,     8.82 tokens per second)
llama_print_timings:        eval time =    1584.85 ms /    11 runs   (  144.08 ms per token,     6.94 tokens per second)
llama_print_timings:       total time =    3670.45 ms
Llama.generate: prefix-match hit


Question: What language style is described by the following statement: The style is more casual and spontaneous and is used to communicate with friends and family either in writing or conversations.
Expected Answer: Informal
Model Answer: Informal
ROUGE Scores: [{'rouge-1': {'r': 1.0, 'p': 1.0, 'f': 0.999999995}, 'rouge-2': {'r': 0.0, 'p': 0.0, 'f': 0.0}, 'rouge-l': {'r': 1.0, 'p': 1.0, 'f': 0.999999995}}]
------------



llama_print_timings:        load time =   13434.66 ms
llama_print_timings:      sample time =       1.16 ms /     4 runs   (    0.29 ms per token,  3451.25 tokens per second)
llama_print_timings: prompt eval time =    4415.41 ms /    40 tokens (  110.39 ms per token,     9.06 tokens per second)
llama_print_timings:        eval time =     670.69 ms /     4 runs   (  167.67 ms per token,     5.96 tokens per second)
llama_print_timings:       total time =    5111.53 ms
Llama.generate: prefix-match hit


Question: What language style is described by the following statement: The style is used when writing or speaking for professional purpose and does not use colloquialism, contractions or first-person pronouns.
Expected Answer: Formal
Model Answer: The language style described in the statement is: Formal
ROUGE Scores: [{'rouge-1': {'r': 0.1111111111111111, 'p': 1.0, 'f': 0.1999999982}, 'rouge-2': {'r': 0.0, 'p': 0.0, 'f': 0.0}, 'rouge-l': {'r': 0.1111111111111111, 'p': 1.0, 'f': 0.1999999982}}]
------------



llama_print_timings:        load time =   13434.66 ms
llama_print_timings:      sample time =       3.39 ms /    13 runs   (    0.26 ms per token,  3840.47 tokens per second)
llama_print_timings: prompt eval time =    4741.20 ms /    37 tokens (  128.14 ms per token,     7.80 tokens per second)
llama_print_timings:        eval time =    2085.69 ms /    12 runs   (  173.81 ms per token,     5.75 tokens per second)
llama_print_timings:       total time =    6884.63 ms
Llama.generate: prefix-match hit


Question: What language style is described by the following statement: The style is used for higher school purposes like thesis or research papers and does not use first-person pronouns and is not personal but factual
Expected Answer: Academic
Model Answer: A. Academic
ROUGE Scores: [{'rouge-1': {'r': 0.5, 'p': 1.0, 'f': 0.6666666622222223}, 'rouge-2': {'r': 0.0, 'p': 0.0, 'f': 0.0}, 'rouge-l': {'r': 0.5, 'p': 1.0, 'f': 0.6666666622222223}}]
------------
Questions total: 15
Average Recall (ROUGE-1): 0.408
Average Precision (ROUGE-1): 0.827
Average F1-score (ROUGE-1): 0.469
Average Recall (ROUGE-2): 0.149
Average Precision (ROUGE-2): 0.249
Average F1-score (ROUGE-2): 0.166
Average Recall (ROUGE-L): 0.394
Average Precision (ROUGE-L): 0.817
Average F1-score (ROUGE-L): 0.458



llama_print_timings:        load time =   13434.66 ms
llama_print_timings:      sample time =       1.40 ms /     6 runs   (    0.23 ms per token,  4301.08 tokens per second)
llama_print_timings: prompt eval time =    4233.26 ms /    36 tokens (  117.59 ms per token,     8.50 tokens per second)
llama_print_timings:        eval time =     729.23 ms /     5 runs   (  145.85 ms per token,     6.86 tokens per second)
llama_print_timings:       total time =    4991.00 ms


In [89]:
##multiple sentences
import json

# Load some sentences
with open('./tests/data.json') as f:
  data = json.load(f)

hyps, refs = map(list, zip(*[[d['hyp'], d['ref']] for d in data]))
rouge = Rouge()
scores = rouge.get_scores(hyps, refs)
# or
scores = rouge.get_scores(hyps, refs, avg=True)


##two files
from rouge import FilesRouge

files_rouge = FilesRouge()
scores = files_rouge.get_scores(hyp_path, ref_path)
# or
scores = files_rouge.get_scores(hyp_path, ref_path, avg=True)

FileNotFoundError: [Errno 2] No such file or directory: './tests/data.json'

### Correctness Evaluation

In [None]:
#%pip install bitsandbytes
#%pip install datasets

### Performance Evaluation - Perplexity: measure that quantifies how well the model predicts a sample of text. Lower perplexity values indicate better performance 

In [None]:
import torch
from transformers import AutoModelForCausalLM, LlamaTokenizer
import bitsandbytes as bnb

model_name = '/kaggle/input/llama-2/pytorch/7b-chat-hf/1'

tokenizer = LlamaTokenizer.from_pretrained(model_name)

free_in_GB = int(torch.cuda.mem_get_info()[0]/1024**3)
max_memory = f'{int(torch.cuda.mem_get_info()[0]/1024**3)-2}GB'

n_gpus = torch.cuda.device_count()
max_memory = {i: max_memory for i in range(n_gpus)}

model = AutoModelForCausalLM.from_pretrained(
  model_name,
  device_map='auto',
  load_in_4bit=True,
  max_memory=max_memory,
  do_sample=True,
  torch_dtype="auto"
)

from datasets import load_dataset

test = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
encodings = tokenizer("\n\n".join(test["text"]), return_tensors="pt")


import torch
from tqdm import tqdm

max_length = model.config.max_length
stride = 512
seq_len = encodings.input_ids.size(1)
device = "cuda"

nlls = []
prev_end_loc = 0
for begin_loc in tqdm(range(0, seq_len, stride)):
    end_loc = min(begin_loc + max_length, seq_len)
    trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
    input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
    target_ids = input_ids.clone()
    target_ids[:, :-trg_len] = -100

    with torch.no_grad():
        outputs = model(input_ids, labels=target_ids)

        # loss is calculated using CrossEntropyLoss which averages over valid labels
        # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels
        # to the left by 1.
        neg_log_likelihood = outputs.loss

    nlls.append(neg_log_likelihood)

    prev_end_loc = end_loc
    if end_loc == seq_len:
        break

ppl = torch.exp(torch.stack(nlls).mean())

In [None]:
##check Langchains QAEvalChain
#https://github.com/langchain-ai/langchain/blob/a6b39afe0e34621fafac885af05bbbb445ea5ac0/langchain/evaluation/qa/eval_chain.py#L42C1-L42C1