# Install and Import Libraries

In [None]:
# Quietly (-q) install or upgrade (-U) the third-party packages imported in the next cell.
# NOTE(review): versions are not pinned, so each run picks up the latest releases.
!pip install -q -U bitsandbytes
!pip install -q -U transformers
!pip install -q -U peft
!pip install -q -U accelerate
!pip install -q -U datasets
!pip install -q -U trl

In [None]:
# Import necessary libraries
import os
import time
from time import time

import pandas as pd
import torch
import transformers
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

# Import Hugging Face libraries
# NOTE: `datasets` is imported after `torch.utils.data` on purpose so that the
# name `Dataset` refers to `datasets.Dataset` (the notebook calls
# `Dataset.from_pandas`, which the torch class does not provide).
from datasets import load_dataset, Dataset, DatasetDict
from huggingface_hub import notebook_login
from peft import LoraConfig, PeftConfig, PeftModel, prepare_model_for_kbit_training
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from trl import SFTTrainer, setup_chat_format

In [None]:
# Login to Hugging Face Hub
# Later cells load "meta-llama/..." checkpoints by repo id.
# NOTE(review): presumably the login is needed because those repos are gated — confirm.
notebook_login()

# Evaluation TuringQ

In [None]:
# Load the dataset
# No `split` argument is passed to `load_dataset` here.
# NOTE(review): the name `dataset` is re-assigned by later cells (fine-tuning and MATH sampling),
# so this binding is only valid until those cells run.
dataset = load_dataset("llm-lab/TuringQ")

In [None]:
# Load the desired model to be evaluated
# NOTE: `model` is the Hub repo id (a string) in this cell, not a loaded model object.
model = "meta-llama/Llama-2-7b-chat-hf"

# Load the tokenizer and pipeline
# `token=True` (use the stored login token) replaces the deprecated `use_auth_token=True`.
tokenizer = AutoTokenizer.from_pretrained(model, token=True)

# Hand the tokenizer loaded above to the pipeline so the two cannot diverge
# (previously the pipeline loaded its own copy and `tokenizer` went unused).
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    device_map="auto",
)

def query_model(
    user_message,
    temperature=0.7,
    max_length=1024
):
    """
    Query the model with a user message and return the generated response.

    The question is embedded in a fixed Chain-of-Thoughts instruction prompt,
    wrapped with the tokenizer's chat template and sent to the module-level
    ``pipeline``. The generated text is printed and returned.

    Args:
        user_message (str): The message from the user.
        temperature (float): Sampling temperature for text generation. A value
            of 0 or less switches to greedy decoding instead of sampling.
        max_length (int): Maximum number of *new* tokens to generate (passed
            to the pipeline as ``max_new_tokens``).

    Returns:
        str: The generated response from the model (prompt not included).
    """

    # Define the prompt with Chain of Thoughts approach
    prompt = """
    You are a knowledgeable AI assistant specialized in Theory of Computation and Complexity. You will be answering questions related to this domain.

    To provide a clear and structured response, you will follow the Chain of Thoughts approach:

    Chain of Thoughts:

    1. Analyze the question and identify core concepts, algorithms or problems.
    2. Build a step-by-step solution approach, stating assumptions, defining variables/notations, and listing intermediate steps.
    3. For proofs or complex calculations, show work explicitly, using relevant theorems, lemmas, or properties.
    4. For true/false statements, provide clear justification or counterexample.
    5. Review your Chain of Thoughts for logical soundness and completeness.

    Use clear and concise language, avoiding unnecessary jargon.

    Question: """ + user_message + """

    Final Answer:
    """
    messages = [
        {"role": "user", "content": prompt},
    ]

    tok = pipeline.tokenizer
    chat_prompt = tok.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    # Stop on EOS, plus "<|eot_id|>" only when the tokenizer really has it.
    # For vocabularies without that token, convert_tokens_to_ids returns the
    # unk id (or None), which must not be used as a stop token.
    terminators = [tok.eos_token_id]
    eot_id = tok.convert_tokens_to_ids("<|eot_id|>")
    if eot_id is not None and eot_id != tok.unk_token_id and eot_id not in terminators:
        terminators.append(eot_id)

    # Some model configs store several EOS ids; padding needs a single id.
    pad_id = pipeline.model.config.eos_token_id
    if isinstance(pad_id, (list, tuple)):
        pad_id = pad_id[0]

    generation_kwargs = {
        "eos_token_id": terminators,
        "max_new_tokens": max_length,
        "return_full_text": False,
        "pad_token_id": pad_id,
    }
    if temperature > 0:
        generation_kwargs.update(do_sample=True, top_p=0.9, temperature=temperature)
    else:
        # Sampling with a non-positive temperature is rejected by generate();
        # fall back to greedy decoding.
        generation_kwargs["do_sample"] = False

    sequences = pipeline(chat_prompt, **generation_kwargs)

    answer = sequences[0]['generated_text']
    print(answer)
    return answer

# Automatic Evaluation

In [None]:
# Load your desired grader model; based on experiments,
# meta-llama/Meta-Llama-3-8B-Instruct is the best for evaluation.
# NOTE: `model` is the Hub repo id (a string) in this cell, not a loaded model object.
model = "meta-llama/Meta-Llama-3-8B-Instruct"

# Load the tokenizer and pipeline
# `token=True` (use the stored login token) replaces the deprecated `use_auth_token=True`.
tokenizer = AutoTokenizer.from_pretrained(model, token=True)

# Hand the tokenizer loaded above to the pipeline so the two cannot diverge
# (previously the pipeline loaded its own copy and `tokenizer` went unused).
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    device_map="auto",
)

def query_model(
    masked_answer,
    masked_solution,
    temperature=0.7,
    max_length=2048
):
    """
    Ask the grader model to score an answer against a reference solution.

    The answer and solution are embedded in a fixed rubric prompt (levels 1-4),
    wrapped with the tokenizer's chat template and sent to the module-level
    ``pipeline``.

    Args:
        masked_answer (str): The answer to be evaluated.
        masked_solution (str): The correct solution for comparison.
        temperature (float): Sampling temperature for text generation. A value
            of 0 or less switches to greedy decoding instead of sampling.
        max_length (int): Maximum number of *new* tokens to generate (passed
            to the pipeline as ``max_new_tokens``).

    Returns:
        str: The raw text generated by the grader. The numeric score is not
        extracted here; the caller has to parse it out of the text.
    """
    prompt="""
    You are an automated grading system for evaluating answers in the field of theory of computation and complexity. Your task is to assign a score (1, 2, 3, or 4) to a given answer based on its correctness and alignment with the provided solution, following the rubrics outlined below.
    Rubrics:
    Level 4 (Excellent):
    - Answer is completely correct and aligns perfectly with the provided solution.
    - Proofs, descriptions, true/false justifications, and calculations match the solution with no errors or omissions.
    - Demonstrates a comprehensive understanding of the concepts.
    Level 3 (Good):
    - Answer is mostly correct, with only minor deviations or omissions compared to the provided solution.
    - Proofs, descriptions, justifications, and calculations are largely accurate but may have a few minor flaws
    - Shows a strong grasp of the key concepts.
    Level 2 (Flawed):
    - Answer has some significant differences or incorrect elements compared to the provided solution.
    - Proofs, descriptions, justifications, and calculations contain several errors or omissions, but the core approach is partially valid.
    - Demonstrates a basic understanding of the concepts but lacks depth.
    Level 1 (Poor):
    - Answer deviates substantially from the provided solution.
    - Proofs, descriptions, justifications, and calculations are mostly incorrect or entirely missing crucial components.
    - Exhibits a lack of understanding of the fundamental concepts.
    Please note that the length of the answer should not be a factor in determining the score. The focus should be solely on the correctness and alignment with the provided solution.
    Given Answer: """ + masked_answer + """\n Solution: """+ masked_solution + """ \n Based on the rubrics and the provided solution, assign a score (1, 2, 3, or 4) to the given answer
    """

    messages = [
        {"role": "user", "content": prompt},
    ]

    tok = pipeline.tokenizer
    chat_prompt = tok.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    # Define terminators for the generated text: EOS, plus "<|eot_id|>" only
    # when the tokenizer really has it. For vocabularies without that token,
    # convert_tokens_to_ids returns the unk id (or None), which must not be
    # used as a stop token.
    terminators = [tok.eos_token_id]
    eot_id = tok.convert_tokens_to_ids("<|eot_id|>")
    if eot_id is not None and eot_id != tok.unk_token_id and eot_id not in terminators:
        terminators.append(eot_id)

    # Some model configs store several EOS ids; padding needs a single id.
    pad_id = pipeline.model.config.eos_token_id
    if isinstance(pad_id, (list, tuple)):
        pad_id = pad_id[0]

    generation_kwargs = {
        "eos_token_id": terminators,
        "max_new_tokens": max_length,
        "return_full_text": False,
        "pad_token_id": pad_id,
    }
    if temperature > 0:
        generation_kwargs.update(do_sample=True, top_p=0.9, temperature=temperature)
    else:
        # Sampling with a non-positive temperature is rejected by generate();
        # fall back to greedy decoding (also gives reproducible grades).
        generation_kwargs["do_sample"] = False

    # Generate the response from the model
    sequences = pipeline(chat_prompt, **generation_kwargs)

    answer = sequences[0]['generated_text']
    return answer


# Fine-tuning Process

In [None]:
# Checkpoint to fine-tune, and the dtype bitsandbytes computes in
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
compute_dtype = torch.bfloat16

# 4-bit NF4 quantization with double quantization enabled
bnb_config = BitsAndBytesConfig(
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

# Model configuration for the checkpoint.
# NOTE(review): `max_new_tokens` is a generation setting; storing it on the
# model config looks unintended — confirm it is actually needed here.
model_config = AutoConfig.from_pretrained(
    model_id,
    max_new_tokens=1024,
    trust_remote_code=True,
)

# Quantized model placed on the available device(s), plus its tokenizer
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map='auto',
    quantization_config=bnb_config,
    config=model_config,
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# The notebook imports a `Dataset` from both `datasets` and `torch.utils.data`.
# Bind the Hugging Face one explicitly so `from_pandas` is always available,
# regardless of which of the two the bare name `Dataset` currently refers to.
from datasets import Dataset as HFDataset

# Load training data from TuringQ file
train_df = pd.read_excel("training.xlsx")
train_dataset = HFDataset.from_pandas(train_df)

# Load validation data from TuringQ file
test_df = pd.read_excel("validation.xlsx")
test_dataset = HFDataset.from_pandas(test_df)

def format_qa_template(example):
    """
    Build the supervised fine-tuning text for one TuringQ row.

    The trainer in this notebook is configured with ``dataset_text_field="text"``,
    so only the ``"text"`` column is trained on. The answer therefore has to be
    part of that text; with the question alone the model never sees an answer.
    The ``"Question: ... Answer: ..."`` layout mirrors the prompt built at
    inference time (``"Question: " + user_message + " Answer:"``).

    Args:
        example (dict): A row with ``"Question"`` and ``"Answer"`` keys.

    Returns:
        dict: ``"text"`` holds the question followed by its answer;
        ``"labels"`` holds the answer unchanged (kept for backward compatibility).
    """
    question = example["Question"]
    answer = example["Answer"]
    return {"text": f"Question: {question} Answer: {answer}", "labels": answer}

# Add the formatted columns to both splits
train_dataset, test_dataset = (
    split.map(format_qa_template) for split in (train_dataset, test_dataset)
)

# Bundle the splits; the validation data is stored under the "test" key
dataset = DatasetDict(train=train_dataset, test=test_dataset)

# Setup chat format for model and tokenizer
# The Llama-3-Instruct tokenizer already ships a chat template, and newer TRL
# releases refuse to overwrite an existing one (ValueError). Clear it first so
# the call behaves the same on old and new TRL versions.
tokenizer.chat_template = None
model, tokenizer = setup_chat_format(model, tokenizer)

# Prepare the quantized model for k-bit training; kept after the chat-format step, as before.
model = prepare_model_for_kbit_training(model)

# LoRA adapter settings (PEFT): rank 4, alpha 64, dropout 0.05, no bias terms,
# applied to the attention projections and the MLP projections listed below.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=4,
    lora_alpha=64,
    lora_dropout=0.05,
    bias="none",
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
)

# Define training arguments
import inspect

# transformers renamed `evaluation_strategy` to `eval_strategy`, and newer
# releases reject the old name. Use whichever one the installed version accepts.
_eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_arguments = TrainingArguments(
        output_dir="./results_llama3_sft/",
        do_eval=True,
        optim="paged_adamw_8bit",
        per_device_train_batch_size=4,
        gradient_accumulation_steps=2,
        per_device_eval_batch_size=4,
        log_level="info",
        save_strategy="steps",
        # save_steps equals eval_steps so load_best_model_at_end can pair
        # every checkpoint with an evaluation.
        save_steps=500,
        save_total_limit=2,
        logging_steps=50,
        learning_rate=5e-6,
        eval_steps=500,
        # NOTE: a positive max_steps overrides num_train_epochs, so training
        # length is governed by the 4000 steps, not the 3 epochs.
        max_steps=4000,
        num_train_epochs=3,
        warmup_steps=100,
        lr_scheduler_type="cosine",
        weight_decay=0.01,
        # NOTE(review): the quantization config computes in bfloat16 while mixed
        # precision here is fp16 — confirm this combination is intended.
        fp16=True,
        report_to="none",
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        **{_eval_strategy_kwarg: "steps"},
)

# Build the supervised fine-tuning trainer: the model with the LoRA config,
# the train/validation splits, and the training arguments defined earlier.
# Only the "text" column of each split is used as training text.
trainer = SFTTrainer(
    args=training_arguments,
    model=model,
    tokenizer=tokenizer,
    peft_config=peft_config,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    dataset_text_field="text",
    max_seq_length=800,
)

# Run the fine-tuning loop
trainer.train()

# Inference Llama3-8B-ft-TuringQ

In [None]:
# 1. Load the base model (4-bit NF4 quantized)
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True
)
base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

# 2. Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)

# 3. Set up chat format
# The Llama-3-Instruct tokenizer already ships a chat template, and newer TRL
# releases refuse to overwrite an existing one (ValueError). Clear it first so
# the call behaves the same on old and new TRL versions.
tokenizer.chat_template = None
base_model, tokenizer = setup_chat_format(base_model, tokenizer)

# 4. Load the PEFT adapter on top of the base model
peft_model_id = "PardisSzah/PEFT_TuringQ_llama3_FT"
peft_config = PeftConfig.from_pretrained(peft_model_id)
model = PeftModel.from_pretrained(base_model, peft_model_id)

# 5. Merge the base model and PEFT adapter
# NOTE(review): the base model is loaded in 4-bit, so the merge goes into
# quantized weights — confirm the resulting rounding is acceptable.
model = model.merge_and_unload()

# 6. Set the model to evaluation mode
model.eval()


In [None]:
# Load the dataset
dataset = load_dataset("lighteval/MATH", split="test")
df = pd.DataFrame(dataset)

# Number of problems to evaluate on, and the fraction of the split that represents
sample_size = 500
sample_fraction = sample_size / len(df)

# Create a stratified sample (stratified jointly on difficulty level and problem type).
# Passing the absolute count as `train_size` gives exactly `sample_size` rows,
# with no dependence on how the float fraction rounds.
sample, _ = train_test_split(
    df,
    stratify=df[['level', 'type']],
    train_size=sample_size,
    random_state=42,
)

# Work on an independent frame: a later cell adds an answer column to `sample`,
# and writing into a slice of `df` can trigger pandas' SettingWithCopy warning.
sample = sample.copy()


In [None]:
# Text-generation pipeline around the merged fine-tuned model and its tokenizer.
# Here `model` is an already-loaded model object, not a repo id string.
pipeline = transformers.pipeline(
    task="text-generation",
    tokenizer=tokenizer,
    model=model,
    device_map="auto",
    torch_dtype=torch.float16,
)

def query_model(system_message, user_message, temperature=0.7, max_length=1024):
    """
    Generate an answer to a question with the module-level ``pipeline``.

    The question is wrapped as ``"Question: <user_message> Answer:"``, combined
    with the system message through the tokenizer's chat template, and sent to
    the pipeline.

    Args:
        system_message (str): Instructions placed in the system turn.
        user_message (str): The question to answer.
        temperature (float): Sampling temperature for text generation. A value
            of 0 or less switches to greedy decoding instead of sampling.
        max_length (int): Maximum number of *new* tokens to generate (passed
            to the pipeline as ``max_new_tokens``).

    Returns:
        str: The generated text (prompt not included).
    """
    user_message = "Question: " + user_message + " Answer:"
    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_message},
    ]

    tok = pipeline.tokenizer
    prompt = tok.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    # Stop on EOS, plus "<|eot_id|>" only when the tokenizer really has it.
    # For vocabularies without that token, convert_tokens_to_ids returns the
    # unk id (or None), which must not be used as a stop token.
    terminators = [tok.eos_token_id]
    eot_id = tok.convert_tokens_to_ids("<|eot_id|>")
    if eot_id is not None and eot_id != tok.unk_token_id and eot_id not in terminators:
        terminators.append(eot_id)

    # Some model configs store several EOS ids; padding needs a single id.
    pad_id = pipeline.model.config.eos_token_id
    if isinstance(pad_id, (list, tuple)):
        pad_id = pad_id[0]

    generation_kwargs = {
        "eos_token_id": terminators,
        "max_new_tokens": max_length,
        "return_full_text": False,
        "pad_token_id": pad_id,
    }
    if temperature > 0:
        generation_kwargs.update(do_sample=True, top_p=0.9, temperature=temperature)
    else:
        # Sampling with a non-positive temperature is rejected by generate();
        # fall back to greedy decoding.
        generation_kwargs["do_sample"] = False

    sequences = pipeline(prompt, **generation_kwargs)
    answer = sequences[0]['generated_text']
    return answer

# System prompt shared by every MATH question
system_message = """
You are an AI assistant designed to answer math questions.
Please provide a step-by-step solution to the problem.
"""

# Generate answers using Llama 3 8B model.
# The column is created empty first and filled one row at a time, so answers
# produced before an interruption stay in `sample`.
ANSWER_COLUMN = 'llama3_peft_answer_math'
sample[ANSWER_COLUMN] = ''
for idx, question in tqdm(sample['problem'].items(), total=len(sample)):
    answer = query_model(system_message, question, temperature=0.1, max_length=600)
    print(answer)
    sample.at[idx, ANSWER_COLUMN] = answer
