<a href="https://colab.research.google.com/github/joshuashing1/MCNN_Simulation/blob/main/MLC_MMLU_Eval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets
!pip install langchain
!pip install langchain_openai
!pip install openai
!pip install numpy
!pip install --pre -U -f https://mlc.ai/wheels mlc-llm-nightly mlc-ai-nightly
!pip install git-lfs

In [None]:
import os
import json
import numpy as np
import pandas as pd
import mlc_llm; print(mlc_llm)

In [None]:
from datasets import load_dataset
from langchain_openai import ChatOpenAI
from langchain.prompts import HumanMessagePromptTemplate
from mlc_llm import MLCEngine

In [None]:
# Number of MMLU questions to evaluate; kept small so the demo runs quickly.
NUM_SAMPLES = 2

# Raw dataset: the 'dev' split of the combined ('all') MMLU configuration.
# NOTE(review): trust_remote_code=True lets the dataset repository run its own
# loading script on this machine -- confirm it is still required.
mmlu_dataset = load_dataset(
    path = 'cais/mmlu',
    name = 'all',
    trust_remote_code = True,
    split = 'dev'
)
print(mmlu_dataset)

# pandas version of dataset
df_mmlu_full = mmlu_dataset.to_pandas()

# Take an explicit copy of the first rows: later cells add and overwrite columns
# on df_mmlu, and writing to a bare .head() slice can trigger pandas'
# SettingWithCopyWarning and leaves it unclear which frame is modified.
df_mmlu = df_mmlu_full.head(NUM_SAMPLES).copy()
print(df_mmlu['choices'])

In [None]:
## Data wrangling for the ground-truth answers
# Letter labels of the correct answers, filled in below from the rows of df_mmlu.
true_choices = []

# Convert each integer answer index to its letter label and collect the letters in true_choices
def true_value_conversion(x):
    """Convert a zero-based answer index into its option letter.

    Args:
        x: Answer index; 0, 1, 2 and 3 map to 'A', 'B', 'C' and 'D'.

    Returns:
        The single-character option letter for ``x``.

    Raises:
        ValueError: If ``x`` does not equal 0, 1, 2 or 3. Such values used to be
            labelled 'D' silently, which would corrupt the ground truth
            without any visible error.
    """
    # Compare with == (as the original chain did) so integer-like values such
    # as numpy integers keep working.
    for index, letter in enumerate('ABCD'):
        if x == index:
            return letter
    raise ValueError(f"answer index must be 0, 1, 2 or 3, got {x!r}")

# Walk the answer column in row order and store the letter label of every row.
true_choices.extend(
    true_value_conversion(answer_index) for answer_index in df_mmlu['answer']
)

In [None]:
## Data wrangling for the LLM answers
# Letter answers returned by the model, appended one per evaluated question.
llm_choices = []

# Keep every 'choices' entry as the original sequence of option texts. It used
# to be cast to str here, which made add_prefixes() pair the letter prefixes
# with the first characters of that string ("[", "'", ...) instead of with the
# individual options. Formatting an entry inside an f-string should still
# produce its text form, so prompts built from this column are unaffected.
print(df_mmlu['choices'])

# Letter labels, one per answer option, in order.
prefixes = ['A. ', 'B. ', 'C. ', 'D. ']

# Helper that labels each item of a sequence with its prefix
def add_prefixes(lst, prefixes):
    """Return the items of ``lst`` with the matching entry of ``prefixes`` prepended.

    Items and prefixes are paired by position; pairing stops at the end of the
    shorter of the two sequences.
    """
    labelled = []
    for prefix, item in zip(prefixes, lst):
        labelled.append("{}{}".format(prefix, item))
    return labelled

# Label the options of every question and keep the result in a new column.
def _with_letter_labels(options):
    """Prefix one row's options using the module-level ``prefixes``."""
    return add_prefixes(options, prefixes)

labelled_choices = df_mmlu['choices'].apply(_with_letter_labels)
df_mmlu['choices_with_prefix'] = labelled_choices

print(df_mmlu['choices_with_prefix'])

In [None]:
# Create the MLC engine for the model identified by the string below; the later
# cells send their chat requests through engine.chat.completions.create.
# NOTE(review): the "HF://" prefix presumably points at a Hugging Face
# repository -- confirm against the MLC documentation.
model = "HF://mlc-ai/Phi-3-mini-128k-instruct-q4f16_2-MLC"
engine = MLCEngine(model)

In [None]:
# Worked examples sent ahead of every real question. They never change, so they
# are built once instead of on every loop iteration.
# NOTE(review): every example is a "user" turn with its answer letter appended
# to the question text -- confirm whether separate assistant turns were intended.
# NOTE(review): the real question below is sent with the raw 'choices' column,
# not the lettered 'choices_with_prefix' column -- confirm which one is intended.
few_shot_messages = [
    {"role": "user", "content": "What is 1+1? Think step-by-step to select the best option. The options are A. 2, B. 3, C. 4, D. 5. A"},
    {"role": "user", "content": "Which colour is primary colour? Think step-by-step to select the best option. The options are A. yellow, B. cyan, C. red, D. pink. C"},
    {"role": "user", "content": "What is the capital city of China? Think step-by-step to select the best option. The options are A. Hong Kong, B. Beijing, C. Shanghai, D. Tianjin. B"},
    {"role": "user", "content": "She is ___ home. Think step-by-step to select the best option. The options are A. at, B. in, C. on, D. from. A"},
    {"role": "user", "content": "60 minutes is ___? Think step-by-step to select the best option. The options are A. one hour, B. half an hour, C. two hours, D. three hours. A"},
]

# Start from an empty list so that re-running this cell does not duplicate answers.
llm_choices = []

for i in range(len(df_mmlu)):
    # Read the values into locals first: reusing double quotes inside an
    # f-string replacement field is a SyntaxError before Python 3.12.
    question = df_mmlu.loc[i, "question"]
    options = df_mmlu.loc[i, "choices"]
    prompt = f"{question}. Choose the best option without explaination. {options}."

    response = engine.chat.completions.create(
        messages=few_shot_messages + [{"role": "user", "content": prompt}],
        model=model,
    )

    # Keep only the first visible character of the reply as the chosen letter.
    # Stripping whitespace and slicing (instead of indexing [0]) avoids picking
    # up a leading space/newline and avoids an IndexError on an empty reply.
    reply_text = (response.choices[0].message.content or "").strip()
    llm_choices.append(reply_text[:1])

print(llm_choices)

In [None]:
# Streaming check: send one hand-written multiple-choice question and print the
# reply piece by piece as it is generated.
# NOTE(review): the prompt text ends with the answer letter "A" -- confirm that
# giving the answer away is intended here.
for response in engine.chat.completions.create(
    messages=[
        {"role": "user", "content": "She is ___ home. Choose the best option without explaination. The options are A. at, B. in, C. on, D. from. A"},
    ],
    model=model,
    stream=True,
):
    for choice in response.choices:
        print(choice.delta.content, end="", flush=True)
print("\n")

# The streamed reply is only displayed. A literal "\n" used to be appended to
# llm_choices at this point, which added an entry that is not a model answer
# and left the list out of step with true_choices.

# Shut the engine down; this runs once, after the stream has finished.
engine.terminate()

In [None]:
# Score the model: 1 where its letter equals the reference letter, otherwise 0.
correct_choices = [
    1 if llm_choices[position] == true_choices[position] else 0
    for position in range(len(true_choices))
]

# Accuracy is the mean of the 0/1 scores.
llm_accuracy = np.mean(correct_choices)
print(llm_accuracy)