In [None]:
#%pip install langchain llama-cpp-python 
#CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python

### Load Packages

In [None]:
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.chains import LLMChain
from langchain.llms import LlamaCpp
from langchain.prompts import PromptTemplate
import os

### Define System Prompt and load Llama2 Model

In [None]:
# Llama-2 chat markers: instruction wrapper and system-prompt wrapper.
B_INST = "[INST]"
E_INST = "[/INST]"
B_SYS = "<<SYS>>\n"
E_SYS = "\n<</SYS>>\n\n"

In [None]:
# Default system prompt; SYSTEM_PROMPT is the same text wrapped in B_SYS / E_SYS.
DEFAULT_SYSTEM_PROMPT = (
    "You are a helpful, respectful and honest assistant. "
    "Always answer as helpfully as possible, while being safe. \n"
    "If a question does not make any sense, or is not factually coherent, "
    "explain why instead of answering something not correct. "
    "If you don't know the answer to a question, please don't share false information."
)
SYSTEM_PROMPT = "".join((B_SYS, DEFAULT_SYSTEM_PROMPT, E_SYS))

In [None]:
# Chatbot system prompt for the later language-learning use case.
# Keep this text free of <<SYS>> markers: B_SYS / E_SYS below add them exactly
# once, so embedding them here as well would duplicate the system delimiters.
SYSTEM_PROMPT_USECASE = """\
You are a chatbot that helps people learn a new language on a platform. You always give the answer to the question in an academic language style.
If you don't know the answer to a question, you tell them truthfully that you don't know and don't give false information. You are a helpful, respectful and honest assistant.
Your answers should not contain harmful, unethical, racist, sexist, toxic, dangerous or illegal content. Please make sure that your answers are socially unbiased and positive in nature.
If a question does not make sense or is not factually coherent, please explain why, rather than answering something incorrectly."""
SYSTEM_PROMPT = B_SYS + SYSTEM_PROMPT_USECASE + E_SYS

In [None]:
# System prompt for the multiple-choice accuracy evaluation.
# Keep this text free of <<SYS>> markers: B_SYS / E_SYS below add them exactly
# once, so embedding them here as well would duplicate the system delimiters.
# The wording does not fix the number of options because the question set
# contains both three-option and four-option items.
SYSTEM_PROMPT_EVAL_ACCURACY = """\
You will be presented with a question and several answer options where only one is correct. You will respond with the letter that represents the correct answer option.
Do not add any answer options, only choose the correct one from the options presented because one of them is correct.
If you don't know the answer, be honest and state that you don't know."""
SYSTEM_PROMPT = B_SYS + SYSTEM_PROMPT_EVAL_ACCURACY + E_SYS

In [None]:
def get_prompt(instruction, system_prompt=None):
    """Wrap an instruction in the chat markers, preceded by a system block.

    Args:
        instruction: User text placed between the system block and ``E_INST``.
        system_prompt: Complete system block (already wrapped in ``B_SYS`` /
            ``E_SYS``). When omitted, the module-level ``SYSTEM_PROMPT`` is
            read at call time.

    Returns:
        ``B_INST + system block + instruction + E_INST`` as a single string.
    """
    # SYSTEM_PROMPT is reassigned by several cells; passing system_prompt
    # explicitly makes the result independent of which cell ran last.
    if system_prompt is None:
        system_prompt = SYSTEM_PROMPT
    return B_INST + system_prompt + instruction + E_INST

# Sanity check: list the contents of the local weights directory.
# NOTE(review): this directory ("Llama2_weights") differs from the one the
# model is loaded from below ("llama_weights") — confirm which is intended.
os.listdir("/Users/josi/Llama2_weights")

In [None]:
# Location of the GGUF weights. Set the LLAMA_MODEL_PATH environment variable
# to run the notebook on another machine; the fallback is the original local path.
MODEL_PATH = os.environ.get(
    "LLAMA_MODEL_PATH",
    "/Users/josi/llama_weights/llama-2-7b-chat.Q4_K_M.gguf",
)

llm = LlamaCpp(
    model_path=MODEL_PATH,
    # NOTE(review): a non-zero temperature presumably means sampled output, so
    # evaluation scores may differ between runs — confirm this is intended.
    temperature=0.3,
    max_tokens=96,
    top_p=1,
    #callback_manager=callback_manager,
    verbose=True,  # Verbose is required to pass to the callback manager
)

### Create Dictionaries with test questions (English)

In [None]:
# Multiple-choice questions for the ACCURACY evaluation.
# Each entry has:
#   "question": text shown to the model,
#   "choices":  option letter -> option text,
#   "correct":  expected answer written as "<letter>: <option text>".
questions_accuracy = [

    # Informal style
    {
        "question": "Choose the correct option that represents an Informal greeting.",
        "choices": {"a": "Hi", "b": "Good Day", "c": "Greetings", "d": "Dear Mr./Mrs."},
        "correct": "a: Hi"
    },
    {
        "question": "Choose the correct option that represents an Informal response to the question: 'How's it going?'",
        "choices": {"a": "Hi, good hbu?", "b": "Greetings, everything is fine, and you?", "c": "Good day, I'm well, thank you. How are you?", "d": "Hello, everything's fine"},
        "correct": "a: Hi, good hbu?"
    },
    {
        "question": "Choose the correct option that represents an informal email closing.",
        "choices": {"a": "Cheers", "b": "Sincerely", "c": "Best regards", "d": "I am awaiting your response"},
        "correct": "a: Cheers"
    },

    # Formal style
    {
        "question": "Choose the correct option that represents a formal greeting.",
        "choices": {"a": "Good Day", "b": "Hi", "c": "Hey", "d": "Yo"},
        "correct": "a: Good Day"
    },
    {
        "question": "Choose the correct option that represents a formal response to the question: 'How are you doing?'",
        "choices": {"a": "Hello, I am doing well. How about you?", "b": "Hi, good you?", "c": "Not too bad, you?", "d": "good, you?"},
        "correct": "a: Hello, I am doing well. How about you?"
    },
    {
        "question": "Choose the correct option that represents a professional email closing.",
        "choices": {"a": "Sincerely", "b": "Cheers", "c": "See you!", "d": "Until next time."},
        "correct": "a: Sincerely"
    },

    # Academic style
    {
        "question": "Choose the correct option that represents an academic paper introduction.",
        "choices": {"a": "Today we present our topic XYZ and we will also focus on ABC.", "b": "In this paper XYZ is presented with a focus on ABC.", "c": "Hi! In our paper we write about XYZ and ABC.", "d": "I show you something about XYZ in this paper."},
        "correct": "b: In this paper XYZ is presented with a focus on ABC."
    },

    # Detect the language style of a bare statement.
    # NOTE(review): these entries contain only the statement; the task itself
    # is implied by the answer options — confirm this is intended.
    {
        "question": "The improvements canʼt be introduced due to funding restrictions.",
        "choices": {"a": "Formal Language Style", "b": "Informal Language Style", "c": "Academic Language Style"},
        "correct": "b: Informal Language Style"
    },
    {
        "question": "It was raining cats and dogs.",
        "choices": {"a": "Formal Language Style", "b": "Informal Language Style", "c": "Academic Language Style"},
        "correct": "b: Informal Language Style"
    },
    {
        "question": "Improvements cannot be introduced due to funding restrictions.",
        "choices": {"a": "Formal Language Style", "b": "Informal Language Style", "c": "Academic Language Style"},
        "correct": "a: Formal Language Style"
    },
    {
        "question": "During the interview, students were asked about their experiences.",
        "choices": {"a": "Formal Language Style", "b": "Informal Language Style", "c": "Academic Language Style"},
        "correct": "a: Formal Language Style"
    },
    {
        "question": "New results in this area are produced by the research group.",
        "choices": {"a": "Formal Language Style", "b": "Informal Language Style", "c": "Academic Language Style"},
        "correct": "c: Academic Language Style"
    },

    # Translation
    {
        "question": "Choose the correct translation for 'hello' in Spanish.",
        "choices": {"a":"Hola", "b": "Hello", "c":"Hallo", "d": "Bonjour"},
        "correct": "a: Hola"
    },
    {
        "question": "Choose the correct translation for 'Thank you' in German.",
        "choices": {"a": "Danke", "b": "Thanks", "c": "Gracias", "d": "Auf Wiedersehen"},
        "correct": "a: Danke"
    },
    {
        "question": "Choose the correct translation for 'book' in italian.",
        "choices": {"a": "Libro", "b": "Buch", "c": "Plant", "d": "Libre"},
        "correct": "a: Libro"
    },

    # General
    {
        "question": "Choose the correct answer that is the capital of France?",
        "choices": {"a": "Paris", "b": "Berlin", "c": "Madrid", "d": "New York"},
        "correct": "a: Paris"
    },
    {
        "question": "Choose the correct answer for the following math problem: 3x + 5 = 20.",
        "choices": {"a": "x = 5", "b": "x = 15", "c": "x = 10", "d": "x = 2"},
        "correct": "a: x = 5"
    },
    {
        "question": "Choose the correct answer for the following statement: If 'cat' is to 'kitten,' what is 'dog' to?",
        "choices": {"a": "'Dog' is to 'puppy'", "b": "'Dog' is to 'kitten'", "c": "'Dog' is to 'cub'", "d": "'Dog' is to 'kitty'"},
        "correct": "a: 'Dog' is to 'puppy'"
    },

    # NER
    {
        "question": "Choose the correct answer for the following statement: Identify the named Person in the sentence 'Elon Musk is the CEO of a company that produces cars.'",
        "choices": {"a": "Elon Musk", "b": "CEO", "c": "company", "d": "cars"},
        "correct": "a: Elon Musk"
    },
    {
        "question": "Choose the correct answer for the following statement: Extract the date mentioned in the text 'The conference is scheduled for January 25, 2023.'",
        "choices": {"a": "January 25, 2023", "b": "May 25-27, 2023.", "c": "January 27, 2023.", "d": "January 2, 2025"},
        "correct": "a: January 25, 2023"
    },

    # Language style detection: informal statements
    {
        "question": "What language style is used in the following statement: Can you believe this weather? Do you think it'll rain later?",
        "choices": {"a": "Formal Language Style", "b": "Informal Language Style", "c": "Academic Language Style"},
        "correct": "b: Informal Language Style"
    },
    {
        "question": "What language style is used in the following statement: Hey, wanna grab a bite to eat later?",
        "choices": {"a": "Formal Language Style", "b": "Informal Language Style", "c": "Academic Language Style"},
        "correct": "b: Informal Language Style"
    },
    {
        "question": "What language style is used in the following statement: What's up with that new movie everyone's talking about? Have you seen the trailer?",
        "choices": {"a": "Formal Language Style", "b": "Informal Language Style", "c": "Academic Language Style"},
        "correct": "b: Informal Language Style"
    },
    {
        "question": "What language style is used in the following statement: Do you know if Sarah's coming to the party tonight? I heard she might not make it.",
        "choices": {"a": "Formal Language Style", "b": "Informal Language Style", "c": "Academic Language Style"},
        "correct": "b: Informal Language Style"
    },
    {
        "question": "What language style is used in the following statement: So, what's the deal with that new song everyone's listening to?",
        "choices": {"a": "Formal Language Style", "b": "Informal Language Style", "c": "Academic Language Style"},
        "correct": "b: Informal Language Style"
    },
    {
        "question": "What language style is used in the following statement: Did you catch the game last night?",
        "choices": {"a": "Formal Language Style", "b": "Informal Language Style", "c": "Academic Language Style"},
        "correct": "b: Informal Language Style"
    },
    {
        "question": "What language style is used in the following statement: Are you free tomorrow afternoon? Wanna hang out and grab some coffee?",
        "choices": {"a": "Formal Language Style", "b": "Informal Language Style", "c": "Academic Language Style"},
        "correct": "b: Informal Language Style"
    },
    {
        "question": "What language style is used in the following statement: What do you reckon we should do this weekend? Any fun ideas?",
        "choices": {"a": "Formal Language Style", "b": "Informal Language Style", "c": "Academic Language Style"},
        "correct": "b: Informal Language Style"
    },
    {
        "question": "What language style is used in the following statement: Did you see that meme I sent you earlier? It had me cracking up!",
        "choices": {"a": "Formal Language Style", "b": "Informal Language Style", "c": "Academic Language Style"},
        "correct": "b: Informal Language Style"
    },
    {
        "question": "What language style is used in the following statement: I donʼt believe that the results are accurate.",
        "choices": {"a": "Formal Language Style", "b": "Informal Language Style", "c": "Academic Language Style"},
        "correct": "b: Informal Language Style"
    },
    {
        # Built from adjacent literals so that source indentation cannot leak
        # into the question text.
        "question": (
            "What language style is used in the following e-mail: Hi Beth, Just a quick message to thank you for dinner last night. "
            "Absolutely delicious, as always, and I really enjoyed meeting your friend Alice. "
            "She’s a laugh, isn’t she? I’m hoping to get tickets for the film festival next week, so I’ll be in touch to see if there’s anything you fancy seeing. "
            "Give my regards to Conrad when he gets back from Poland, and once again, thanks for last night. Love Sophie"
        ),
        "choices": {"a": "Formal Language Style", "b": "Informal Language Style", "c": "Academic Language Style"},
        "correct": "b: Informal Language Style"
    },

    # Language style detection: formal statements
    {
        "question": "What language style is used in the following statement: Improvements cannot be introduced due to funding restrictions.",
        "choices": {"a": "Formal Language Style", "b": "Informal Language Style", "c": "Academic Language Style"},
        "correct": "a: Formal Language Style"
    },
    {
        "question": "What language style is used in the following statement: Could you please provide an update on the progress of the project by the end of the day?",
        "choices": {"a": "Formal Language Style", "b": "Informal Language Style", "c": "Academic Language Style"},
        "correct": "a: Formal Language Style"
    },
    {
        "question": "What language style is used in the following statement: I would like to inquire about the availability of the conference room for our meeting next week.",
        "choices": {"a": "Formal Language Style", "b": "Informal Language Style", "c": "Academic Language Style"},
        "correct": "a: Formal Language Style"
    },
    {
        "question": "What language style is used in the following statement: Per our previous discussion, I am following up on the status of the pending tasks.",
        "choices": {"a": "Formal Language Style", "b": "Informal Language Style", "c": "Academic Language Style"},
        "correct": "a: Formal Language Style"
    },
    {
        "question": "What language style is used in the following statement: Kindly confirm the receipt of this email and let me know if any further action is required.",
        "choices": {"a": "Formal Language Style", "b": "Informal Language Style", "c": "Academic Language Style"},
        "correct": "a: Formal Language Style"
    },
    {
        "question": "What language style is used in the following statement: In accordance with company policy, employees are required to complete the mandatory training module.",
        "choices": {"a": "Formal Language Style", "b": "Informal Language Style", "c": "Academic Language Style"},
        "correct": "a: Formal Language Style"
    },
    {
        "question": "What language style is used in the following statement: Please be advised that the deadline for submitting the report has been extended to next Friday.",
        "choices": {"a": "Formal Language Style", "b": "Informal Language Style", "c": "Academic Language Style"},
        "correct": "a: Formal Language Style"
    },
    {
        "question": "What language style is used in the following statement: I am writing to formally request an extension on the deadline for the project proposal.",
        "choices": {"a": "Formal Language Style", "b": "Informal Language Style", "c": "Academic Language Style"},
        "correct": "a: Formal Language Style"
    },
    {
        "question": "What language style is used in the following statement: Per your request, attached is the revised version of the contract for your review.",
        "choices": {"a": "Formal Language Style", "b": "Informal Language Style", "c": "Academic Language Style"},
        "correct": "a: Formal Language Style"
    },
    {
        "question": "What language style is used in the following statement: I apologize for any inconvenience caused and appreciate your understanding in this matter.",
        "choices": {"a": "Formal Language Style", "b": "Informal Language Style", "c": "Academic Language Style"},
        "correct": "a: Formal Language Style"
    },
    {
        "question": "What language style is used in the following statement: It has been proved that the arguments so far are without foundation.",
        "choices": {"a": "Formal Language Style", "b": "Informal Language Style", "c": "Academic Language Style"},
        "correct": "a: Formal Language Style"
    },
    {
        "question": "What language style is used in the following statement: It is possible to consider the results from a different viewpoint.",
        "choices": {"a": "Formal Language Style", "b": "Informal Language Style", "c": "Academic Language Style"},
        "correct": "a: Formal Language Style"
    },

    # Language style detection: academic statements
    {
        "question": "What language style is used in the following statement: New results in this area are produced by the research group.",
        "choices": {"a": "Formal Language Style", "b": "Informal Language Style", "c": "Academic Language Style"},
        "correct": "c: Academic Language Style"
    },
    {
        "question": "What language style is used in the following statement: How do socioeconomic factors influence voting behavior in democratic societies?",
        "choices": {"a": "Formal Language Style", "b": "Informal Language Style", "c": "Academic Language Style"},
        "correct": "c: Academic Language Style"
    },
    {
        "question": "What language style is used in the following statement: What is the significance of the findings reported in the latest research paper on climate change?",
        "choices": {"a": "Formal Language Style", "b": "Informal Language Style", "c": "Academic Language Style"},
        "correct": "c: Academic Language Style"
    },
    {
        "question": "What language style is used in the following statement: How do cognitive biases affect decision-making processes in individuals and organizations?",
        "choices": {"a": "Formal Language Style", "b": "Informal Language Style", "c": "Academic Language Style"},
        "correct": "c: Academic Language Style"
    },
    {
        "question": "What language style is used in the following statement: The study investigates the correlation between socioeconomic status and educational attainment in urban communities.",
        "choices": {"a": "Formal Language Style", "b": "Informal Language Style", "c": "Academic Language Style"},
        "correct": "c: Academic Language Style"
    },
    {
        "question": "What language style is used in the following statement: The analysis explores the impact of climate change on agricultural productivity in developing countries.",
        "choices": {"a": "Formal Language Style", "b": "Informal Language Style", "c": "Academic Language Style"},
        "correct": "c: Academic Language Style"
    },
    {
        "question": "What language style is used in the following statement: The research paper examines the role of cultural factors in shaping consumer behavior in global markets.",
        "choices": {"a": "Formal Language Style", "b": "Informal Language Style", "c": "Academic Language Style"},
        "correct": "c: Academic Language Style"
    },
    {
        "question": "What language style is used in the following statement: The literature review synthesizes existing studies on the effects of social media use on mental health outcomes.",
        "choices": {"a": "Formal Language Style", "b": "Informal Language Style", "c": "Academic Language Style"},
        "correct": "c: Academic Language Style"
    },
    {
        "question": "What language style is used in the following statement: The theoretical framework draws upon sociological theories to analyze patterns of social inequality in urban neighborhoods.",
        "choices": {"a": "Formal Language Style", "b": "Informal Language Style", "c": "Academic Language Style"},
        "correct": "c: Academic Language Style"
    },
    {
        "question": "What language style is used in the following statement: The analysis examines demographic trends in urbanization.",
        "choices": {"a": "Formal Language Style", "b": "Informal Language Style", "c": "Academic Language Style"},
        "correct": "c: Academic Language Style"
    },

]


# Echo every test item: question, lettered options, expected answer, divider.
for item in questions_accuracy:
    print(f"Question: {item['question']}")
    print("Answer Options:")
    for letter, text in item["choices"].items():
        print(f"{letter}:", text)
    print(f"Correct Answer: {item['correct']}")
    print("\n" + 50 * "-" + "\n")

### Evaluate Model based on Accuracy

In [None]:
# Per-question records appended by evaluate_model_accuracy.
model_answers = []


def _option_letter(answer, choices):
    """Return the key of ``choices`` that ``answer`` starts with, or ``None``.

    Recognises a bare letter ("a"), a letter followed by ":", ")" or "."
    ("a: Hi", "a) Hi", "a. Hi") and a parenthesised letter ("(a) Hi"),
    ignoring case and surrounding whitespace.
    """
    text = answer.strip().lower()
    for letter in choices:
        key = str(letter).lower()
        prefixes = (key + ":", key + ")", key + ".", "(" + key + ")")
        if text == key or text.startswith(prefixes):
            return letter
    return None


def evaluate_model_accuracy(model, questions_accuracy):
    """Score a model on multiple-choice questions and print a report.

    Args:
        model: Callable that takes the prompt string and returns the answer
            text.
        questions_accuracy: Sequence of dicts with the keys "question",
            "choices" (option letter -> option text) and "correct"
            ("<letter>: <option text>").

    Returns:
        ``(correct_answers, accuracy_percent)``. If an exception is raised
        during evaluation it is printed and ``(None, None)`` is returned.

    Side effects:
        Prints every question together with the model's answer and appends
        one record per question to the module-level ``model_answers`` list.
    """
    try:
        correct_answers = 0
        total_questions = len(questions_accuracy)

        for question_data in questions_accuracy:
            question = question_data["question"]
            choices = question_data["choices"]
            correct_option = question_data["correct"]

            # Build the prompt: question text followed by the lettered options.
            prompt = f"{question}\nAnswer Options:\n"
            for option, answer in choices.items():
                prompt += f"{option}: {answer}\n"
            prompt = get_prompt(prompt)
            model_answer = model(prompt)

            # The system prompt asks for the option letter only, so compare
            # letters; an exact match of the full "<letter>: <text>" string
            # still counts as before.
            expected_letter = _option_letter(correct_option, choices)
            is_correct = model_answer.strip() == correct_option.strip() or (
                expected_letter is not None
                and _option_letter(model_answer, choices) == expected_letter
            )
            if is_correct:
                correct_answers += 1

            # Print results for each question
            print(f"Question: {question}")
            print("Answer Options:")
            for option, answer in choices.items():
                print(f"{option}: {answer}")
            print(f"Correct Answer: {correct_option}")
            print(f"Model Answer: {model_answer}")
            print("------------")

            # Store this question's raw model answer (not the results list itself).
            model_answers.append({
                "question": question,
                "choices": choices,
                "correct_option": correct_option,
                "selected_option": model_answer,
                "is_correct": is_correct,
            })

        # An empty question list yields 0% instead of a division by zero.
        accuracy = correct_answers / total_questions * 100 if total_questions else 0.0

        # print overall evaluation results
        print(f"Total Questions: {total_questions}")
        print(f"Correct Answers: {correct_answers}")
        print(f"Accuracy: {accuracy:.2f}%")
        return correct_answers, accuracy

    except Exception as e:
        # Best effort: report the failure instead of aborting the notebook run.
        print(f"An error occurred: {e}")
        return None, None
    
# Run the accuracy evaluation of the loaded model on the multiple-choice
# questions; per-question results and the overall accuracy are printed.
evaluate_model_accuracy(llm, questions_accuracy)

### Evaluate Model based on ROUGE (Recall-Oriented Understudy for Gisting Evaluation)

A set of metrics used for evaluating the quality of summaries. It compares the generated summary with one or more reference summaries and calculates precision, recall, and F1-score 

- ROUGE-N quantifies the overlap of N-grams (contiguous sequences of N items, typically words or characters) between the system-generated summary and the reference summary.
    - ROUGE-1 (unigram overlap): This metric measures the overlap of unigrams (single words) between the generated summary and the reference summary. It focuses on the recall of unigrams.
    - ROUGE-2 (bigram overlap): Similar to ROUGE-1, but it measures the overlap of bigrams (pairs of consecutive words) instead of unigrams. It evaluates the recall of bigrams.
- ROUGE-L (longest common subsequence): Instead of measuring word overlap, ROUGE-L focuses on the longest common subsequence (LCS) between the generated and reference summaries. It evaluates the recall of the longest common subsequence

- A high precision value means that most of the words or phrases in the machine-generated output also occur in the reference, i.e. the output contains little irrelevant content.
- A high recall value, ideally close to 1, means that most of the content of the human-made reference is covered by the machine-generated output. It signifies the model's proficiency in capturing relevant information.



In [None]:
# System prompt for the ROUGE evaluation: short, single-sentence answers.
# Keep this text free of <<SYS>> markers: B_SYS / E_SYS below add them exactly
# once, so embedding them here as well would duplicate the system delimiters.
SYSTEM_PROMPT_EVAL_ROUGE = """\
You will be presented with a question where you are expected to respond only with the correct answer in a short and simple manner. Use one sentence maximum to answer the question.
Only respond with the correct answer itself and leave out fill words like "sure" or "certainly".
If you don't know the answer, be honest and state that you don't know. Don't give any false information."""
SYSTEM_PROMPT = B_SYS + SYSTEM_PROMPT_EVAL_ROUGE + E_SYS

##CURRENT OUTPUT (extraction of language style)
#Average Recall: 0.871
#Average Precision: 0.871
#Average F1-score: 0.871

##CURRENT OUTPUT (without extraction of language style)
#Average Recall: 0.112
#Average Precision: 0.843
#Average F1-score: 0.177

In [None]:
# Prompt used in llm_service GRAMMAR AGENT (but without language style defined).
# Keep this text free of <<SYS>> markers: B_SYS / E_SYS below add them exactly
# once. Each backslash continuation is preceded by a single space so that no
# source indentation ends up inside the prompt.
# NOTE(review): the truncated "You" and the spelling of "hallucinate" are
# repaired here — apply the same wording in llm_service if both prompts are
# meant to stay identical.
SYSTEM_PROMPT_EVAL_ROUGE_USECASE_GRAMMAR = """\
You are a friendly language teacher for the language english. You nicely correct and analyse the grammar and semantical mistakes of the sentences provided by the student. You do not hallucinate or interpret the text of the user. \
If a message does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer, please don't share false information. \
If words in the provided message are grammatically correct still analyse the semantic meaning of the sentence and correct it if needed. Think about the correction and analysis step by step."""
SYSTEM_PROMPT = B_SYS + SYSTEM_PROMPT_EVAL_ROUGE_USECASE_GRAMMAR + E_SYS




In [None]:
# Prompt used in llm_service CONVERSATION AGENT (but without language style defined).
# Keep this text free of <<SYS>> markers: B_SYS / E_SYS below add them exactly
# once, so embedding them here as well would duplicate the system delimiters.
SYSTEM_PROMPT_EVAL_ROUGE_USECASE_CONVO = """\
The following is a friendly conversation between a human and an AI. The AI answers precisely and only talks in english. No word is other than english. If the AI does not know the answer to a question, it truthfully says it does not know."""
SYSTEM_PROMPT = B_SYS + SYSTEM_PROMPT_EVAL_ROUGE_USECASE_CONVO + E_SYS

##CURRENT OUTPUT (extraction of language style)
#Average Recall: 0.
#Average Precision: 0.
#Average F1-score: 0.

##CURRENT OUTPUT (without extraction of language style)
#Average Recall: 0.
#Average Precision: 0.
#Average F1-score: 0.

In [None]:
# Define test questions with different categories for ROUGE.
# Every dictionary maps a question (key) to its reference answer (value).

# Logical entailment: "A implies B" together with "B is false" entails that
# A is false (modus tollens), so a definite conclusion about A does exist.
textual_entailment = {
    "If 'A implies B' and 'B is false,' what can you conclude about A?": "A is false.",
}

# Simple arithmetic / word problems.
reasoning_problem_solving = {
    "If a car travels at 60 miles per hour, how long will it take to cover 120 miles?": "2 hours",
}

# Analogy completion.
analogical_reasoning = {
    "If 'cat' is to 'kitten,' what is 'dog' to?": "'Dog' is to 'puppy'",
}

# Named-entity recognition.
ner = {
    "Identify the named person in the sentence 'Elon Musk is the CEO of SpaceX.'": "Elon Musk",
    "Extract the date mentioned in the text 'The conference is scheduled for January 25, 2023.'": "January 25, 2023.",
}

# Identification of objects / facts in a short statement.
# NOTE(review): the first sentence also mentions a tree, a ball and a mouth;
# confirm that "One object is named, a dog." is the intended reference answer.
identification = {
    "How many objects are named in the following sequence: The dog was running towards the tree with a ball in his mouth.": "One object is named, a dog.",
    "Given the statement, answer the following question. Statement: The sky is blue and the building is grey. Question: What color does the sky have?": "Blue",
}

# Language-style classification questions: question text -> expected style label
# ("Informal", "Formal" or "Academic").
# Every key must be unique: a repeated key in a dict literal silently replaces the
# earlier entry. Long keys are split with adjacent string literals so that no
# source indentation ends up inside the question text.
languageStyles = {

    # Informal Style
    "What language style is used in the following Statement: Can you believe this weather? Do you think it'll rain later?": "Informal",
    "What language style is used in the following Statement: Hey, wanna grab a bite to eat later?": "Informal",
    "What language style is used in the following Statement: What's up with that new movie everyone's talking about? Have you seen the trailer?": "Informal",
    "What language style is used in the following Statement: Do you know if Sarah's coming to the party tonight? I heard she might not make it.": "Informal",
    "What language style is used in the following Statement: So, what's the deal with that new song everyone's listening to?": "Informal",
    "What language style is used in the following Statement: Did you catch the game last night?": "Informal",
    "What language style is used in the following Statement: Are you free tomorrow afternoon? Wanna hang out and grab some coffee?": "Informal",
    "What language style is used in the following Statement: What do you reckon we should do this weekend? Any fun ideas?": "Informal",
    "What language style is used in the following Statement: Did you see that meme I sent you earlier? It had me cracking up!": "Informal",
    "What language style is used in the following Statement: I donʼt believe that the results are accurate.": "Informal",
    "What language style is used in the following Statement: Hi, Helen! How’s it going?": "Informal",
    "What language style is used in the following Statement: Thanks. See you later.": "Informal",
    "What language style is used in the following Statement: No, I wasn’t at Steve’s party.": "Informal",
    "What language style is used in the following Statement: Sure, let’s go get a drink.": "Informal",
    "What language style is used in the following Statement: What are you up to this evening?": "Informal",
    "What language style is used in the following Statement: Oh, that’s a shame.": "Informal",
    "What language style is used in the following Statement: That didn’t make sense to me.": "Informal",
    "What language style is used in the following Statement: Seriously, you’ve got to be kidding me!?": "Informal",
    "What language style is used in the following Statement: He’ll have to do another five tests before he can stop the experiment.": "Informal",
    "What language style is described by the following statement: The style is more casual and spontaneous and is used to communicate with friends and family either in writing or conversations.": "Informal",
    (
        "What language style is used in the following e-mail: Hi Beth, Just a quick message to thank you for dinner last night. Absolutely delicious, as always, and I really enjoyed meeting your friend Alice. "
        "She’s a laugh, isn’t she? I’m hoping to get tickets for the film festival next week, so I’ll be in touch to see if there’s anything you fancy seeing. Give my regards to Conrad when he gets back from Poland, and once again, thanks for last night. Love Sophie"
    ): "Informal",

    # Formal Style
    "What language style is used in the following Statement: Improvements cannot be introduced due to funding restrictions.": "Formal",
    "What language style is used in the following Statement: Could you please provide an update on the progress of the project by the end of the day?": "Formal",
    "What language style is used in the following Statement: I would like to inquire about the availability of the conference room for our meeting next week.": "Formal",
    "What language style is used in the following Statement: Per our previous discussion, I am following up on the status of the pending tasks.": "Formal",
    "What language style is used in the following Statement: Kindly confirm the receipt of this email and let me know if any further action is required.": "Formal",
    "What language style is used in the following Statement: In accordance with company policy, employees are required to complete the mandatory training module.": "Formal",
    "What language style is used in the following Statement: Please be advised that the deadline for submitting the report has been extended to next Friday.": "Formal",
    "What language style is used in the following Statement: I am writing to formally request an extension on the deadline for the project proposal.": "Formal",
    "What language style is used in the following Statement: Per your request, attached is the revised version of the contract for your review.": "Formal",
    "What language style is used in the following Statement: I apologize for any inconvenience caused and appreciate your understanding in this matter.": "Formal",
    "What language style is used in the following Statement: Good morning, Mrs. Smith, how are you doing?": "Formal",
    "What language style is used in the following Statement: Only food purchased here may be eaten on the premises.": "Formal",
    "What language style is described by the following statement: The style is used when writing or speaking for professional purpose and does not use colloquialism, contractions or first-person pronouns.": "Formal",
    (
        "What language style is used in the following e-mail: Dear Sir or Madam, I am writing in response to your advertisement about job opportunities for graduates. I have just completed a degree in Economics at Durham University, and "
        "I would be grateful if you could send me further details of the graduate training schemes you mention. I am available for interview at any time. I look forward to hearing from you. Yours faithfully"
    ): "Formal",
    "What language style is used in the following Statement: Please respect our no radio playing policy.": "Formal",
    "What language style is used in the following Statement: This could be an effective plan.": "Formal",
    "What language style is used in the following Statement: There were two different methods of research.": "Formal",
    "What language style is used in the following Statement: The project will be completed next year.": "Formal",
    "What language style is used in the following Statement: Five more tests will be necessary before the experiment can be concluded.": "Formal",
    "What language style is used in the following Statement: It is possible to consider the results from a different viewpoint.": "Formal",
    "What language style is used in the following Statement: It has been proved that the arguments so far are without foundation.": "Formal",
    "What language style is used in the following Statement: There are several reasons why the questionnaire should be revised.": "Formal",
    "What language style is used in the following Statement: I look forward to meeting you next week.": "Formal",

    # Academic Style
    "What language style is described by the following statement: The style is used for higher school purposes like thesis or research papers and does not use first-person pronouns and is not personal but factual": "Academic",
    "What language style is used in the following Statement: New results in this area are produced by the research group.": "Academic",
    "What language style is used in the following Statement: How do socioeconomic factors influence voting behavior in democratic societies?": "Academic",
    "What language style is used in the following Statement: What is the significance of the findings reported in the latest research paper on climate change?": "Academic",
    "What language style is used in the following Statement: How do cognitive biases affect decision-making processes in individuals and organizations?": "Academic",
    "What language style is used in the following Statement: The study investigates the correlation between socioeconomic status and educational attainment in urban communities.": "Academic",
    "What language style is used in the following Statement: The analysis explores the impact of climate change on agricultural productivity in developing countries.": "Academic",
    "What language style is used in the following Statement: The research paper examines the role of cultural factors in shaping consumer behavior in global markets.": "Academic",
    "What language style is used in the following Statement: The literature review synthesizes existing studies on the effects of social media use on mental health outcomes.": "Academic",
    "What language style is used in the following Statement: The theoretical framework draws upon sociological theories to analyze patterns of social inequality in urban neighborhoods.": "Academic",
    "What language style is used in the following Statement: The analysis examines demographic trends in urbanization.": "Academic",
    "What language style is used in the following Statement: The findings highlight disparities in access to healthcare services.": "Academic",
    "What language style is used in the following Statement: The literature review synthesizes recent advancements in machine learning algorithms.": "Academic",
    "What language style is used in the following Statement: The analysis underscores the importance of data privacy in digital healthcare systems.": "Academic",
    "What language style is used in the following Statement: These findings indicate that the model is valid.": "Academic",
    "What language style is used in the following Statement: The following section will show how the model is validated by the findings.": "Academic",
    "What language style is used in the following Statement: As Halliday(1973) shows, language is intrinsically social.": "Academic",
    "What language style is used in the following Statement: Many researchers have utilised X to measure Y.": "Academic",
    "What language style is used in the following Statement: A number of techniques have been developed to...": "Academic",
    "What language style is used in the following Statement: Different authors have measured X in a variety of ways.": "Academic",
    "What language style is used in the following Statement: Several methods currently exist for the measurement of X.": "Academic",
    "What language style is used in the following Statement: There are three main types of study design used to identify X.": "Academic",
    "What language style is used in the following Statement: In most recent studies, X has been measured in four different ways.": "Academic",
    "What language style is used in the following Statement: In recent years, two different approaches have attempted to account for X.": "Academic",
    "What language style is used in the following Statement: A variety of methods are used to assess X. Each has its advantages and drawbacks.": "Academic",
    "What language style is used in the following Statement: Two of the most common methods for estimating X are the use of Y and the measurement of Z.": "Academic",
}

# Grammar-correction test set: faulty learner message -> reference correction
# that the grammar agent is expected to produce.
chat_messages_grammar = {
    "Todai i taked a nap.": "There are a few mistakes in your sentence. The correct sentence should be: Today i took a nap.",
    "Why is tis spici": "There are a few mistakes in your sentence. The correct sentence should be: Why is this spicy?",
    "wen will the wether be better?": "There are a few mistakes in your sentence. The correct phrase should be when will the weather be better?",
    "when i went home i eat": "There is a small mistake in your sentence. The correct phrase should be When I am home i will eat.",
    "Tere is a man walkin down the stares.": "There is a small mistake in your sentence. The correct sentence should be There is a man walking down the stairs.",
    "I broked my pencil.": "There is a mistake in your sentence. The correct phrase should be I broke my pencil.",
    "Whos going to the party?": "There is a mistake in your sentence. The correct question should be Who is going to the party?",
    "Im going too the store.": "There is a mistake in your sentence. The correct phrase should be I'm going to the store.",
    "Their is a cat outside.": "There is a mistake in your sentence. The correct phrase should be There is a cat outside.",
    "She seen the movie already.": "There is a mistake in your sentence. The correct phrase should be She has seen the movie already.",
    "Can you please send me youre address?": "There are a few mistakes in your sentence. The correct sentence should be Can you please send me your address?",
    "Im going too cook dinner.": "There is a mistake in your sentence. The correct phrase should be I'm going to cook dinner.",
    "I went too the gym yesterday.": "There is a mistake in your sentence. The correct phrase should be I went to the gym yesterday.",
    "The dog chased it is tail.": "There is a mistake in your sentence. The correct phrase should be The dog chased its tail.",
    "We should of left earlier.": "There is a mistake in your sentence. The correct phrase should be We should have left earlier.",
    "I seen that movie before.": "There is a mistake in your sentence. The correct phrase should be I have seen that movie before.",
    "The books are laying on the table.": "There is a mistake in your sentence. The correct phrase should be The books are lying on the table.",
    "I would of gone if you asked me.": "There is a mistake in your sentence. The correct phrase should be I would have gone if you asked me.",
    "Whos shoes are those?": "There is a mistake in your sentence. The correct question should be Whose shoes are those?",
    "I can't find my keys no where.": "There is a mistake in your sentence. The correct phrase should be I can't find my keys anywhere.",
    "I goed to the store yesterday.": "There is a mistake in your sentence. The correct phrase should be I went to the store yesterday.",
    "I could of done better on the test.": "There is a mistake in your sentence. The correct phrase should be I could have done better on the test.",
    "The sun shins brigt.": "There is a mistake in your sentence. The correct phrase should be The sun is shining bright.",
    "Your always so kind to me.": "There is a mistake in your sentence. The correct phrase should be You're always so kind to me.",
    "He's been here since two hour.": "There is a mistake in your sentence. The correct phrase should be He's been here for two hours.",
    "She doesn't needs any help.": "There is a mistake in your sentence. The correct phrase should be She doesn't need any help.",
    "I should of listened to your advice.": "There is a mistake in your sentence. The correct phrase should be I should have listened to your advice.",
    "I feel gooder today.": "There is a mistake in your sentence. The correct phrase should be I feel better today.",
    "They're are going to the movies tonight.": "There is a mistake in your sentence. The correct phrase should be They are going to the movies tonight.",
    "Her and me are best friends.": "There is a mistake in your sentence. The correct phrase should be She and I are best friends.",
}

# Choose the question set that the ROUGE evaluation iterates over.
# Combination of all categories (currently disabled):
#questions_rouge = {**textual_entailment,**reasoning_problem_solving,**analogical_reasoning,**ner,**identification,**languageStyles}
# The two assignments are alternatives: the second one replaces the first, so
# only the grammar chat messages are active after this cell has run.
questions_rouge = dict(languageStyles)
questions_rouge = dict(chat_messages_grammar)


In [None]:
#%pip install rouge ##- test rouge library - https://pypi.org/project/rouge/ 
from rouge import Rouge

## Example calculation for ROUGE on a single hypothesis/reference pair.
# `hypothesis` is the text being scored, `reference` the text it is compared with.
# The irregular spacing inside both strings is part of the example data and is
# left untouched.
hypothesis = "the #### transcript is a written version of each day 's cnn student news program use this transcript to help students with reading comprehension and vocabulary use the weekly newsquiz to test your knowledge of storie s you     saw on cnn student news"

reference = "this page includes the show transcript use the transcript to help students with reading comprehension and vocabulary at the bottom of the page , comment for a chance to be mentioned on cnn student news . you must be a teac    her or a student age # # or older to request a mention on the cnn student news roll call . the weekly newsquiz tests     students ' knowledge of even ts in the news"

# get_scores is called with the hypothesis first and the reference second.
rouge = Rouge()
scores = rouge.get_scores(hypothesis, reference)
print(scores)

In [None]:

#Function that extracts only the language style named 
def extract_language_style(input_string):
    """Return the first language-style keyword found in ``input_string``.

    The text is split on whitespace; every word is stripped of surrounding
    punctuation (ASCII punctuation plus typographic quotes) and lower-cased, so
    answers such as ``'"Informal".'``, ``'Style: **Academic**'`` or
    ``'(formal)'`` are recognised as well as the bare word.

    Args:
        input_string: Raw model answer.

    Returns:
        ``"informal"``, ``"formal"`` or ``"academic"`` for the first keyword in
        reading order; the sentinel string ``"False"`` when no keyword occurs;
        ``""`` when ``input_string`` is empty.
    """
    import string  # function-scope import keeps this cell runnable on its own

    # Check if the input string is empty
    if not input_string:
        return ""

    allowed_words = {"informal", "formal", "academic"}
    # Characters removed from both ends of a word before it is compared.
    surrounding = string.punctuation + "“”‘’"

    for raw_word in input_string.split():
        word = raw_word.strip(surrounding).lower()
        if word in allowed_words:
            return word
    return "False"


In [None]:
#maybe check if rouge-score library better

#ROUGE WITH extraction of language style
def evaluate_model_rouge(model, questions, scorer=None, prompt_builder=None, answer_extractor=None):
    """Evaluate a model with ROUGE after reducing each answer to a language-style keyword.

    For every question the model is prompted, its answer is stripped and passed
    through ``answer_extractor``, and the result is scored against the
    lower-cased reference answer. ROUGE-1, ROUGE-2 and ROUGE-L are averaged
    over all questions and printed.

    Args:
        model: Callable that receives the final prompt string and returns the
            answer text.
        questions: Mapping of question -> expected (reference) answer.
        scorer: Optional object with ``get_scores(hypothesis, reference)`` that
            returns a list whose first item maps ``"rouge-1"``, ``"rouge-2"``
            and ``"rouge-l"`` to ``{"r", "p", "f"}`` dicts. Defaults to
            ``Rouge()``.
        prompt_builder: Optional callable that turns the question text into the
            final prompt. Defaults to ``get_prompt``.
        answer_extractor: Optional callable applied to the stripped model
            answer before scoring. Defaults to ``extract_language_style``.

    Returns:
        ``(model_answers, rouge_scores)``. ``model_answers`` is a list of dicts
        with the keys ``"question"``, ``"expected_answer"`` and
        ``"model_answer"``. ``rouge_scores`` maps ``"rouge-1"``, ``"rouge-2"``
        and ``"rouge-l"`` to ``{"recall", "precision", "f1-score"}`` averages
        (rounded to 3 digits); the legacy key ``"rouge"`` mirrors the ROUGE-1
        averages. ``(None, None)`` is returned when an unexpected error occurs
        (the error is printed).
    """
    metric_labels = (("rouge-1", "ROUGE-1"), ("rouge-2", "ROUGE-2"), ("rouge-l", "ROUGE-L"))
    stat_labels = (("r", "recall", "Recall"), ("p", "precision", "Precision"), ("f", "f1-score", "F1-score"))

    try:
        # Resolve the defaults lazily so that a missing notebook helper is
        # reported through the same error path as any other failure.
        if scorer is None:
            scorer = Rouge()
        if prompt_builder is None:
            prompt_builder = get_prompt
        if answer_extractor is None:
            answer_extractor = extract_language_style

        total_questions = len(questions)
        model_answers = []
        per_question_scores = []  # one {metric: {"r", "p", "f"}} dict per question

        # Iterate over the mapping that was passed in, not over a notebook global.
        for question, expected_answer in questions.items():
            prompt = prompt_builder(f"{question}\nAnswer Options:\n")

            # Remove leading & trailing spaces, then keep only the language style.
            model_answer = answer_extractor(model(prompt).strip())
            expected_answer = expected_answer.strip().lower()

            # The hypothesis (model answer) goes first and the reference second;
            # otherwise recall and precision are reported the wrong way round.
            try:
                scores = scorer.get_scores(model_answer, expected_answer)
            except ValueError:
                # An answer that cannot be scored (e.g. empty text) counts as a
                # complete miss instead of aborting the whole evaluation.
                scores = [{metric: {"r": 0.0, "p": 0.0, "f": 0.0} for metric, _ in metric_labels}]
            per_question_scores.append(scores[0])

            # Print the results for each question
            print(f"Question: {question}")
            print(f"Expected Answer: {expected_answer}")
            print(f"Model Answer: {model_answer}")
            print(f"ROUGE Scores: {scores}")
            print("------------")

            # Store the model's answer together with the corresponding question
            model_answers.append({
                "question": question,
                "expected_answer": expected_answer,
                "model_answer": model_answer,
            })

        # Average recall, precision and F1 per ROUGE variant.
        rouge_scores = {}
        for metric, _ in metric_labels:
            averages = {}
            for stat, key, _ in stat_labels:
                total = sum(entry.get(metric, {}).get(stat, 0) for entry in per_question_scores)
                averages[key] = round(total / total_questions, 3) if total_questions else 0
            rouge_scores[metric] = averages
        # Legacy key kept for callers that read rouge_scores["rouge"].
        rouge_scores["rouge"] = dict(rouge_scores["rouge-1"])

        print(f"Questions total: {total_questions}")
        for metric, label in metric_labels:
            for _, key, title in stat_labels:
                print(f"Average {title} ({label}): {rouge_scores[metric][key]}")

        return model_answers, rouge_scores

    except Exception as e:
        print(f"An error occurred: {e}")
        return None, None
    
# Run the evaluation WITH language-style extraction on the active question set.
# `llm` and `questions_rouge` must already be defined by earlier cells.
# NOTE(review): answers without a style keyword are reduced to "False" by
# extract_language_style, so this run presumably only makes sense when
# questions_rouge holds the language-style questions -- confirm.
model_answers, rouge_scores = evaluate_model_rouge(llm, questions_rouge)



In [None]:
#ROUGE WITHOUT extraction of language style
def evaluate_model_rouge_(model, questions, scorer=None, prompt_builder=None):
    """Evaluate a model with ROUGE on its full answers (no language-style extraction).

    For every question the model is prompted and its complete, stripped answer
    is scored against the reference answer. Both texts are lower-cased for
    scoring so that capitalisation alone never counts as a mismatch. ROUGE-1,
    ROUGE-2 and ROUGE-L are averaged over all questions and printed.

    Args:
        model: Callable that receives the final prompt string and returns the
            answer text.
        questions: Mapping of question -> expected (reference) answer.
        scorer: Optional object with ``get_scores(hypothesis, reference)`` that
            returns a list whose first item maps ``"rouge-1"``, ``"rouge-2"``
            and ``"rouge-l"`` to ``{"r", "p", "f"}`` dicts. Defaults to
            ``Rouge()``.
        prompt_builder: Optional callable that turns the question text into the
            final prompt. Defaults to ``get_prompt``.

    Returns:
        ``(model_answers, rouge_scores)``. ``model_answers`` is a list of dicts
        with the keys ``"question"``, ``"expected_answer"`` (lower-cased) and
        ``"model_answer"`` (stripped, original casing). ``rouge_scores`` maps
        ``"rouge-1"``, ``"rouge-2"`` and ``"rouge-l"`` to
        ``{"recall", "precision", "f1-score"}`` averages (rounded to 3 digits);
        the legacy key ``"rouge"`` mirrors the ROUGE-1 averages.
        ``(None, None)`` is returned when an unexpected error occurs (the error
        is printed).
    """
    metric_labels = (("rouge-1", "ROUGE-1"), ("rouge-2", "ROUGE-2"), ("rouge-l", "ROUGE-L"))
    stat_labels = (("r", "recall", "Recall"), ("p", "precision", "Precision"), ("f", "f1-score", "F1-score"))

    try:
        # Resolve the defaults lazily so that a missing notebook helper is
        # reported through the same error path as any other failure.
        if scorer is None:
            scorer = Rouge()
        if prompt_builder is None:
            prompt_builder = get_prompt

        total_questions = len(questions)
        model_answers = []
        per_question_scores = []  # one {metric: {"r", "p", "f"}} dict per question

        # Iterate over the mapping that was passed in, not over a notebook global.
        for question, expected_answer in questions.items():
            prompt = prompt_builder(f"{question}\nAnswer Options:\n")

            # Remove leading & trailing spaces; the answer is kept in full.
            model_answer = model(prompt).strip()
            expected_answer = expected_answer.strip().lower()

            # The hypothesis (model answer) goes first and the reference second;
            # otherwise recall and precision are reported the wrong way round.
            # The hypothesis is lower-cased like the reference because the
            # comparison is case-sensitive.
            try:
                scores = scorer.get_scores(model_answer.lower(), expected_answer)
            except ValueError:
                # An answer that cannot be scored (e.g. empty text) counts as a
                # complete miss instead of aborting the whole evaluation.
                scores = [{metric: {"r": 0.0, "p": 0.0, "f": 0.0} for metric, _ in metric_labels}]
            per_question_scores.append(scores[0])

            # Print the results for each question
            print(f"Question: {question}")
            print(f"Expected Answer: {expected_answer}")
            print(f"Model Answer: {model_answer}")
            print(f"ROUGE Scores: {scores}")
            print("------------")

            # Store the model's answer together with the corresponding question
            model_answers.append({
                "question": question,
                "expected_answer": expected_answer,
                "model_answer": model_answer,
            })

        # Average recall, precision and F1 per ROUGE variant.
        rouge_scores = {}
        for metric, _ in metric_labels:
            averages = {}
            for stat, key, _ in stat_labels:
                total = sum(entry.get(metric, {}).get(stat, 0) for entry in per_question_scores)
                averages[key] = round(total / total_questions, 3) if total_questions else 0
            rouge_scores[metric] = averages
        # Legacy key kept for callers that read rouge_scores["rouge"].
        rouge_scores["rouge"] = dict(rouge_scores["rouge-1"])

        print(f"Questions total: {total_questions}")
        for metric, label in metric_labels:
            for _, key, title in stat_labels:
                print(f"Average {title} ({label}): {rouge_scores[metric][key]}")

        return model_answers, rouge_scores

    except Exception as e:
        print(f"An error occurred: {e}")
        return None, None
    
# Run the evaluation WITHOUT language-style extraction on the active question set.
# `llm` and `questions_rouge` must already be defined by earlier cells.
# The results are stored in `model_answers` and `rouge_scores`, the same names
# the run of evaluate_model_rouge assigns to.
model_answers, rouge_scores = evaluate_model_rouge_(llm, questions_rouge)



In [None]:
## Scoring multiple sentences at once
# NOTE(review): this cell reads './tests/data.json' and uses `hyp_path` /
# `ref_path`, which are not defined anywhere in the visible notebook -- it looks
# like example code and will fail on a fresh kernel unless those exist. Confirm
# whether it should be kept.
import json

# Load some sentences; each item is read through its 'hyp' and 'ref' keys.
with open('./tests/data.json') as f:
  data = json.load(f)

# Split the records into two parallel lists: hypotheses and references.
hyps, refs = map(list, zip(*[[d['hyp'], d['ref']] for d in data]))
rouge = Rouge()
# Called without avg ...
scores = rouge.get_scores(hyps, refs)
# ... or with avg=True. This second call overwrites `scores`.
scores = rouge.get_scores(hyps, refs, avg=True)


## Scoring two files
from rouge import FilesRouge

files_rouge = FilesRouge()
# `hyp_path` / `ref_path` must be set before this line can run (see the note above).
scores = files_rouge.get_scores(hyp_path, ref_path)
# ... or with avg=True. This call overwrites `scores` again.
scores = files_rouge.get_scores(hyp_path, ref_path, avg=True)

### Correctness Evaluation

In [None]:
#%pip install bitsandbytes
#%pip install datasets

### Performance Evaluation - Perplexity: a measure that quantifies how well the model predicts a sample of text. Lower perplexity values indicate better performance.

In [None]:
import torch
from transformers import AutoModelForCausalLM, LlamaTokenizer
import bitsandbytes as bnb
from datasets import load_dataset
from tqdm import tqdm

# --- Load the tokenizer and the 4-bit quantised model ---------------------------
model_name = '/kaggle/input/llama-2/pytorch/7b-chat-hf/1'

tokenizer = LlamaTokenizer.from_pretrained(model_name)

# Use the memory that is currently free on the GPU minus 2 GB of headroom as the
# per-device budget, and apply that budget to every visible GPU.
free_in_GB = int(torch.cuda.mem_get_info()[0]/1024**3)
per_gpu_budget = f'{free_in_GB - 2}GB'

n_gpus = torch.cuda.device_count()
max_memory = {i: per_gpu_budget for i in range(n_gpus)}

model = AutoModelForCausalLM.from_pretrained(
  model_name,
  device_map='auto',
  load_in_4bit=True,
  max_memory=max_memory,
  do_sample=True,
  torch_dtype="auto"
)

# --- Load and tokenize the evaluation text --------------------------------------
test = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
encodings = tokenizer("\n\n".join(test["text"]), return_tensors="pt")

# --- Sliding-window perplexity --------------------------------------------------
# The window size must be the model's context length. `config.max_length` is the
# default *generation* length, not the context window, so using it would score
# only a short slice of every stride with almost no context.
max_length = model.config.max_position_embeddings
stride = 512
seq_len = encodings.input_ids.size(1)
device = "cuda"

nlls = []          # mean negative log-likelihood of every window
nll_sum = 0.0      # summed negative log-likelihood over all scored tokens
n_tokens = 0       # number of scored tokens
prev_end_loc = 0
for begin_loc in tqdm(range(0, seq_len, stride)):
    end_loc = min(begin_loc + max_length, seq_len)
    trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
    input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
    target_ids = input_ids.clone()
    # Tokens that were already scored by an earlier window only serve as context.
    target_ids[:, :-trg_len] = -100

    with torch.no_grad():
        outputs = model(input_ids, labels=target_ids)

        # loss is calculated using CrossEntropyLoss which averages over valid labels
        # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels
        # to the left by 1.
        neg_log_likelihood = outputs.loss

    # Labels that are actually scored after the internal shift by one position.
    num_loss_tokens = (target_ids[:, 1:] != -100).sum().item()

    nlls.append(neg_log_likelihood)
    # Weight every window by its number of scored tokens so that a short final
    # window does not count as much as a full one.
    nll_sum += neg_log_likelihood.float() * num_loss_tokens
    n_tokens += num_loss_tokens

    prev_end_loc = end_loc
    if end_loc == seq_len:
        break

# Perplexity = exp(mean negative log-likelihood per scored token).
ppl = torch.exp(nll_sum / n_tokens)
print(f"Perplexity: {ppl.item():.3f}")

In [None]:
##check Langchains QAEvalChain
#https://github.com/langchain-ai/langchain/blob/a6b39afe0e34621fafac885af05bbbb445ea5ac0/langchain/evaluation/qa/eval_chain.py#L42C1-L42C1