<a href="https://colab.research.google.com/github/jonesLevin/Data-Science-Competitions/blob/main/Eedi_Mining_Misconceptions_in_Mathematics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Eedi Misconception Mining

In [1]:
!pip install -q peft transformers accelerate bitsandbytes
!pip install -q flash-attn==0.2.4

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.4/122.4 MB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
!unzip eedi-mining-misconceptions-in-mathematics.zip

Archive:  eedi-mining-misconceptions-in-mathematics.zip
  inflating: misconception_mapping.csv  
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


In [3]:
import os
import re
import gc
import argparse
import json

import numpy as np
import pandas as pd

from tqdm import tqdm
from google.colab import userdata

import torch
import torch.nn.functional as F
from torch import Tensor
import peft
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, AutoModel

os.environ['HF_TOKEN'] = userdata.get('HF_TOKEN')
pd.set_option('display.max_rows', 300)

In [4]:
IS_SUBMISSION = True

model_name = 'microsoft/Phi-3.5-mini-instruct'
df_train = pd.read_csv('train.csv').fillna(-1)
df_test = pd.read_csv('test.csv')
df_misconceptions = pd.read_csv('misconception_mapping.csv')

## First Retrieval

In [5]:
if not IS_SUBMISSION:
    df_ret = df_train.copy()
else:
    df_ret = df_test.copy()

In [6]:
df_ret

Unnamed: 0,QuestionId,ConstructId,ConstructName,SubjectId,SubjectName,CorrectAnswer,QuestionText,AnswerAText,AnswerBText,AnswerCText,AnswerDText
0,1869,856,Use the order of operations to carry out calcu...,33,BIDMAS,A,\[\n3 \times 2+4-5\n\]\nWhere do the brackets ...,\( 3 \times(2+4)-5 \),\( 3 \times 2+(4-5) \),\( 3 \times(2+4-5) \),Does not need brackets
1,1870,1612,Simplify an algebraic fraction by factorising ...,1077,Simplifying Algebraic Fractions,D,"Simplify the following, if possible: \( \frac{...",\( m+1 \),\( m+2 \),\( m-1 \),Does not simplify
2,1871,2774,Calculate the range from a list of data,339,Range and Interquartile Range from a List of Data,B,Tom and Katie are discussing the \( 5 \) plant...,Only\nTom,Only\nKatie,Both Tom and Katie,Neither is correct


In [7]:
TEMPLATE_INPUT = '{QUESTION}\nCorrect answer: {CORRECT_ANSWER}\nStudent wrong answer: {STUDENT_WRONG_ANSWER}'

def format_input(row, wrong_choice):
  assert wrong_choice in "ABCD"

  question_text = row.get('QuestionText', 'No question text provided')
  subject_name = row.get("SubjectName", "Unknown subject")
  construct_name = row.get("ConstructName", "Unknown construct")
  correct_answer = row.get("CorrectAnswer", "Unknown")

  assert wrong_choice != correct_answer
  correct_answer_text = row.get(f"Answer{correct_answer}Text", "No correct answer text available")
  wrong_answer_text = row.get(f"Answer{wrong_choice}Text", "No wrong answer text available")

  formatted_question = f"""Question: {question_text}
  SubjectName: {subject_name} ConstructName: {construct_name}"""

  # Return the extracted data
  ret = {
      "QUESTION": formatted_question,
      "CORRECT_ANSWER": correct_answer_text,
      "STUDENT_WRONG_ANSWER": wrong_answer_text,
      "MISCONCEPTION_ID": row.get('Misconception{wrong_choice}Id'),
  }

  ret["PROMPT"] = TEMPLATE_INPUT.format(**ret)

  return ret

In [8]:
items = []
target_ids = []

for _, row in tqdm(df_ret.iterrows(), total=len(df_ret)):
  for choice in ['A', 'B', 'C', 'D']:
    if choice == row["CorrectAnswer"]:
      continue
    if not IS_SUBMISSION and row[f'Misconception{choice}Id'] == -1:
      continue

    correct_col = f"Answer{row['CorrectAnswer']}Text"
    item = {'QuestionId_Answer': '{}_{}'.format(row['QuestionId'], choice)}
    item['Prompt'] = format_input(row, choice)['PROMPT']
    items.append(item)
    target_ids.append(int(row.get(f'Misconception{choice}Id', -1)))

df_input = pd.DataFrame(items)

100%|██████████| 3/3 [00:00<00:00, 1713.83it/s]


In [9]:
df_input

Unnamed: 0,QuestionId_Answer,Prompt
0,1869_B,Question: \[\n3 \times 2+4-5\n\]\nWhere do the...
1,1869_C,Question: \[\n3 \times 2+4-5\n\]\nWhere do the...
2,1869_D,Question: \[\n3 \times 2+4-5\n\]\nWhere do the...
3,1870_A,"Question: Simplify the following, if possible:..."
4,1870_B,"Question: Simplify the following, if possible:..."
5,1870_C,"Question: Simplify the following, if possible:..."
6,1871_A,Question: Tom and Katie are discussing the \( ...
7,1871_C,Question: Tom and Katie are discussing the \( ...
8,1871_D,Question: Tom and Katie are discussing the \( ...


In [10]:
def get_detailed_instruct(task_description: str, query: str) -> str:
  return f'<instruct>{task_description}\n<query>{query}'

def get_detailed_example(task_description: str, query: str, response: str) -> str:
  return f'<instruct>{task_description}\n<query>{query}\n<response>{response}'

def get_new_queries(queries, query_max_len, examples_prefix, tokenizer):
  inputs = tokenizer(
      queries,
      max_length=query_max_len - len(tokenizer('<s>', add_special_tokens=False)['input_ids']) - len(
          tokenizer('\n<response></s>', add_special_tokens=False)['input_ids']),
      return_token_type_ids=False,
      truncation=True,
      return_tensors=None,
      add_special_tokens=False
    )

  prefix_ids = tokenizer(examples_prefix, add_special_tokens=False)['input_ids']
  suffix_ids = tokenizer('\n<response>', add_special_tokens=False)['input_ids']
  new_max_length = (len(prefix_ids) + len(suffix_ids) + query_max_len + 8) // 8 * 8 + 8
  new_queries = tokenizer.batch_decode(inputs['input_ids'])

  for i in range(len(new_queries)):
    new_queries[i] = examples_prefix + new_queries[i] + '\n<response>'

  return new_max_length, new_queries

In [11]:
task =  "Given a math multiple-choice problem with a student's wrong answer, retrieve the math misconceptions behind the wrong answers the student chose"
queries = [
    get_detailed_instruct(task, q) for q in df_input['Prompt']
]
documents = df_misconceptions['MisconceptionName'].tolist()
query_max_len, doc_max_len = 320, 48
# LORA_PATH = '/content/lora'
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
examples_prefix = ''
new_query_max_len, new_queries = get_new_queries(queries, query_max_len, examples_prefix, tokenizer)

tokenizer_config.json:   0%|          | 0.00/3.98k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [12]:
import json
with open('data.json', 'w') as f:
  data = {'texts': new_queries + documents}
  f.write(json.dumps(data))

In [13]:
%%writefile run_embed.py

MAX_LENGTH = 512

def last_token_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
  left_padding = attention_mask[:, -1].sum() == attention_mask.shape[0]
  if left_padding:
    return last_hidden_states[:, -1]
  else:
    sequence_lengths = attention_mask.sum(dim=1) - 1
    batch_size = last_hidden_states.shape[0]
    return last_hidden_states[
        torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths
    ]

def get_embeddings_in_batches(model, tokenizer, texts, max_length, batch_size=32):
  embeddings = []
  for i in tqdm(range(0, len(texts), batch_size), desc="Embedding"):
    batch_texts = texts[i : i + batch_size]
    batch_dict = tokenizer(
      batch_texts,
      max_length=max_length,
      padding=True,
      truncation=True,
      return_tensors="pt",
    ).to("cuda")

    with torch.inference_mode():
      outputs = model(**batch_dict)
      batch_embeddings = last_token_pool(
          outputs.last_hidden_state, batch_dict["attention_mask"]
      )
      batch_embeddings = F.normalize(batch_embeddings, p=2, dim=1).cpu()
      embeddings.append(batch_embeddings)

  return torch.cat(embeddings, dim=0)

Writing run_embed.py


In [14]:
def load_model_and_tokenizer(load_in_4bit=True):
  model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map=0,
        torch_dtype=torch.float16,
        load_in_4bit=load_in_4bit,
        trust_remote_code=True,
    )
  tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
  model.resize_token_embeddings(len(tokenizer))

  return model, tokenizer

In [15]:
def main(args):
  output_file = args.input_text.replace(
        ".json", ".pt.fold.{}.{}.embed".format(*args.fold)
  )
  if os.path.exists(output_file):
    print(f"Output file {output_file} already exists. Skipping...")
    return
  model, tokenizer = load_model_and_tokenizer(load_in_4bit=args.load_in_4bit)
  texts = json.load(open(args.input_text))["texts"][args.fold[0] :: args.fold[1]]
  embeddings = get_embeddings_in_batches(
    model,
    tokenizer,
    texts,
    max_length=MAX_LENGTH,
    batch_size=4,
  )
  text2embeds = {text: emb for text, emb in zip(texts, embeddings)}
  torch.save(text2embeds, output_file)

In [None]:
# if __name__ == "__main__":
#   parser = argparse.ArgumentParser()
#   parser.add_argument(
#       "--base_model",
#       type=str,
#       default="Qwen/Qwen2.5-7B",
#       help="Path to the base model",
#   )
#   parser.add_argument(
#       "--input_text",
#       type=str,
#       default=".cache/data.json",
#   )
#   parser.add_argument(
#       "--load_in_4bit",
#       action="store_true",
#       help="Load model in 4-bit mode",
#   )
#   parser.add_argument("--fold", nargs=2, type=int, default=[0, 1])
#   args = parser.parse_args()
#   main(args)

if __name__ == "__main__":
  parser = argparse.ArgumentParser()
  parser.add_argument(
      "--base_model",
      type=str,
      default="Qwen/Qwen2.5-7B",
      help="Path to the base model",
  )
  parser.add_argument(
      "--input_text",
      type=str,
      default=".cache/data.json",
  )
  parser.add_argument(
      "--load_in_4bit",
      action="store_true",
      help="Load model in 4-bit mode",
  )
  parser.add_argument("--fold", nargs=2, type=int, default=[0, 1])
  # Instead of parser.parse_args(), provide the arguments as a list
  #  The code will parse these arguments and continue without error.
  args = parser.parse_args(['--input_text', 'data.json']) # Replace with your desired arguments
  main(args)

config.json:   0%|          | 0.00/3.45k [00:00<?, ?B/s]

configuration_phi3.py:   0%|          | 0.00/11.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3.5-mini-instruct:
- configuration_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_phi3.py:   0%|          | 0.00/73.8k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3.5-mini-instruct:
- modeling_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json:   0%|          | 0.00/16.3k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]