# Import

In [10]:
!pip install -r /workspace/requirements.txt


[0m

In [11]:
!which python


/opt/conda/envs/eedi/bin/python


In [12]:
import pandas as pd
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity

ModuleNotFoundError: No module named 'numpy.strings'

# Data Load

In [None]:
# Read the competition CSVs from a directory relative to the working directory.
train                 = pd.read_csv("./eedi-mining-misconceptions-in-mathematics/train.csv")
test                  = pd.read_csv("./eedi-mining-misconceptions-in-mathematics/test.csv")

# misconception_mapping is the lookup table used later to turn a MisconceptionId
# into its MisconceptionName.
misconception_mapping = pd.read_csv("./eedi-mining-misconceptions-in-mathematics/misconception_mapping.csv")
sample_submission     = pd.read_csv("./eedi-mining-misconceptions-in-mathematics/sample_submission.csv")

In [None]:
misconception_mapping

In [None]:
test

In [None]:
# Build one "<answer text>***<misconception id>" column per answer option.
# Options that have no labelled misconception get None, so those rows can be
# dropped once the frame has been melted to long format.
for option in "ABCD":
    misconception_id = train[f"Misconception{option}Id"]
    train[f"{option}_answer_misconception"] = np.where(
        misconception_id.notna(),
        train[f"Answer{option}Text"] + "***" + misconception_id.astype(str),
        None,
    )
# test['A_answer_misconception'] = np.where(test['MisconceptionAId'].notna(),
#                                            test['AnswerAText'] + '***' + test['MisconceptionAId'].astype(str),
#                                            None)

# test['B_answer_misconception'] = np.where(test['MisconceptionBId'].notna(),
#                                            test['AnswerBText'] + '***' + test['MisconceptionBId'].astype(str),
#                                            None)

# test['C_answer_misconception'] = np.where(test['MisconceptionCId'].notna(),
#                                            test['AnswerCText'] + '***' + test['MisconceptionCId'].astype(str),
#                                            None)

# test['D_answer_misconception'] = np.where(test['MisconceptionDId'].notna(),
#                                            test['AnswerDText'] + '***' + test['MisconceptionDId'].astype(str),
#                                            None)


In [None]:
train.iloc[1]['A_answer_misconception']

# Preprocess

In [None]:
def make_all_question_text(df: pd.DataFrame) -> pd.DataFrame:
    """Add ``all_question_text`` = ``ConstructName`` + " " + ``QuestionText``.

    The new column is written onto ``df`` itself and that same frame is
    returned, so callers may either reassign the result or ignore it.
    """
    construct_with_space = df["ConstructName"] + " "
    df["all_question_text"] = construct_with_space + df["QuestionText"]
    return df

test = make_all_question_text(test)
train = make_all_question_text(train)

In [None]:
print(test.shape)
print(test.columns)

In [None]:
def wide_to_long(df: pd.DataFrame) -> pd.DataFrame:
    """Melt the four ``Answer?Text`` columns into one row per answer option.

    ``CorrectAnswerText`` (the text of the option named by ``CorrectAnswer``)
    is first added to ``df`` in place. The returned frame has the columns
    ``QuestionId``, ``all_question_text``, ``CorrectAnswer``,
    ``CorrectAnswerText``, ``Answer`` (source column name) and ``value``
    (the answer text).
    """
    def _correct_answer_text(row):
        # CorrectAnswer holds the option letter, e.g. "B" -> column "AnswerBText".
        return row[f"Answer{row['CorrectAnswer']}Text"]

    df['CorrectAnswerText'] = df.apply(_correct_answer_text, axis=1)

    key_columns = ["QuestionId", "all_question_text", "CorrectAnswer", "CorrectAnswerText"]
    answer_columns = ["AnswerAText", "AnswerBText", "AnswerCText", "AnswerDText"]

    return df[key_columns + answer_columns].melt(
        id_vars=key_columns,
        var_name='Answer',
        value_name='value',
    )

def wide_to_long_train(df: pd.DataFrame) -> pd.DataFrame:
    """Melt the ``?_answer_misconception`` columns into one row per labelled wrong answer.

    ``CorrectAnswerText`` is first added to ``df`` in place. Each
    ``?_answer_misconception`` cell is either None/NaN or a string of the form
    ``"<answer text>***<misconception id>"``; only the string cells survive.

    Returns a new frame with the id columns plus ``Answer`` (source column
    name), ``value`` (the raw combined string), ``AnswerText`` and an integer
    ``Misconception_ID``.
    """
    df['CorrectAnswerText'] = df.apply(lambda row: row[f"Answer{row['CorrectAnswer']}Text"], axis=1)

    id_columns = ["QuestionId", "all_question_text", "CorrectAnswer", "CorrectAnswerText"]
    pair_columns = [
        "A_answer_misconception",
        "B_answer_misconception",
        "C_answer_misconception",
        "D_answer_misconception",
    ]
    long_df = pd.melt(
        df[id_columns + pair_columns],
        id_vars    = id_columns,
        var_name   = 'Answer',
        value_name = 'value'
    )

    # Split on the LAST literal '***' only, so an answer text that itself
    # contains '***' still yields exactly two parts.
    parts = long_df['value'].str.rsplit('***', n=1)
    long_df['AnswerText'] = parts.str[0]
    long_df['Misconception_ID'] = parts.str[1]

    # Keep only options that carry a misconception id. copy() makes the result
    # an independent frame, so the cast below does not write into a slice.
    long_df = long_df[long_df['Misconception_ID'].notna()].copy()
    # Ids arrive as float-formatted strings such as "1672.0".
    long_df['Misconception_ID'] = long_df['Misconception_ID'].astype(float).astype(int)
    return long_df

test_long = wide_to_long(test)
train_long = wide_to_long_train(train)


In [None]:
test_long

In [None]:
print(test_long.columns)

In [None]:
def make_all_text(df: pd.DataFrame) -> pd.DataFrame:
    """Add ``all_text`` = ``all_question_text`` + " " + ``value``.

    The column is written onto ``df`` itself and the same frame is returned.
    """
    question_with_space = df["all_question_text"] + " "
    df["all_text"] = question_with_space + df["value"]
    return df

test_long = make_all_text(test_long)
test_long

In [None]:
test_long = test_long.sort_values(["QuestionId", "Answer"]).reset_index(drop=True)
test_long

# Training gemma7B


In [None]:
import os, random
import pandas as pd
import numpy as np
# from string import Template
from pathlib import Path

from torch import nn
# Transformer
from accelerate import Accelerator
import transformers
from transformers import (pipeline, AutoTokenizer, AutoModelForCausalLM, 
                          BitsAndBytesConfig, AutoConfig, TrainingArguments)
# Supervised fine-tuning trainer
from datasets import Dataset
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
from peft import LoraConfig, get_peft_model, TaskType, PeftConfig, PeftModel
# Split data into training and test (valid) dataset
from sklearn.model_selection import train_test_split

# For quantization
import bitsandbytes, accelerate
from tqdm.notebook import tqdm
from sklearn.metrics.pairwise import cosine_similarity


In [None]:
import ctypes, gc
import torch

libc = ctypes.CDLL("libc.so.6")
# Seed every random number generator (Python, NumPy, PyTorch) with the same value
def seed_everything(seed=42):
    """Seed Python, NumPy and PyTorch RNGs and put cuDNN into deterministic mode.

    Args:
        seed: Integer seed applied to ``random``, ``numpy.random`` and ``torch``.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark mode lets cuDNN auto-tune and pick different kernels from run
    # to run, which defeats the determinism requested above; keep it off.
    torch.backends.cudnn.benchmark = False
    
def clear_memory():
    """Free unreachable Python objects, then cached CUDA memory, then trim the C heap."""
    # Collect first: tensors released by the garbage collector only become
    # reclaimable by empty_cache() after they have actually been freed.
    gc.collect()
    torch.cuda.empty_cache()
    # Ask glibc to hand free heap pages back to the operating system.
    libc.malloc_trim(0)

SEED = 42
seed_everything(SEED)
# Use the GPU when one is available, otherwise fall back to the CPU
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
class CFG:
    # Settings read by load_model() and train_model().
    model_name = 'gemma_7b'
    # Short model name -> identifier passed to from_pretrained().
    model_paths = {'gemma_7b': "google/gemma-7b"}
    model_path = model_paths[model_name]
    
    # Model training argument
    # CSV that train_model() reads when it is not given a DataFrame.
    data_path = '/kaggle/input/gemma-rewrite-nbroad/nbroad-v2.csv'
    # Directory the trained adapter and tokenizer are saved to.
    model_save_path =  f'{model_name}_adapter'
    max_length=512 # Maximum sequence length (presumably tokens, not words - TODO confirm); kept small to avoid OOM issues.
    NROWS = 1000 # Number of rows read from data_path
    batch_size = 1
    lr = 2e-4

In [None]:
def formatting_func(row):
    """Render one training row as a single-element list holding the prompt text.

    Args:
        row: Mapping/Series with ``all_question_text``, ``CorrectAnswerText``,
            ``Misconception_ID`` and either ``AnswerText`` or ``value``.

    Returns:
        ``[text]`` where ``text`` is "Question:\\n<question>\\n\\nAnswer:\\n<misconception name>".

    Raises:
        IndexError: if ``Misconception_ID`` is not present in ``misconception_mapping``.
    """
    # In the long training frame ``value`` is "<answer>***<misconception id>".
    # Prefer the clean ``AnswerText`` column so the label id never leaks into
    # the prompt; fall back to ``value`` for frames that lack it.
    wrong_answer = row['AnswerText'] if 'AnswerText' in row else row['value']
    question = f"{row['all_question_text']} The correct answer is: {row['CorrectAnswerText']}. The wrong answer is: {wrong_answer}. What is the misconception here?"
    answer = misconception_mapping.loc[misconception_mapping['MisconceptionId'] == row['Misconception_ID'], 'MisconceptionName'].iloc[0]
    template = f"Question:\n{question}\n\nAnswer:\n{answer}"
    return [template]

def train_model(model, tokenizer, training_df = None):
    """Fine-tune ``model`` with LoRA on prompts built by ``formatting_func``.

    Args:
        model: Causal LM to fine-tune (passed straight to ``SFTTrainer``).
        tokenizer: Tokenizer used to encode the formatted prompts; it is also
            saved next to the adapter.
        training_df: Frame with the columns ``formatting_func`` reads. When
            None, the first ``CFG.NROWS`` rows of ``CFG.data_path`` are used.

    Side effects:
        Trains the model, then writes the adapter and tokenizer to
        ``CFG.model_save_path``. The caller's ``training_df`` is not modified.
    """
    if training_df is None:
        training_df = pd.read_csv(CFG.data_path, nrows=CFG.NROWS)
    else:
        # Work on a copy so the helper column below is not added to the caller's frame.
        training_df = training_df.copy()

    # formatting_func returns a one-element list per row; unwrap it so the
    # column holds plain strings, which is what a batched tokenizer call expects.
    formatted = training_df.apply(formatting_func, axis=1)
    training_df['formatted_text'] = formatted.map(
        lambda text: text[0] if isinstance(text, (list, tuple)) else text
    )

    training_ds = Dataset.from_pandas(training_df)

    # Tokenize, truncating to CFG.max_length so long prompts cannot blow up memory.
    training_ds = training_ds.map(
        lambda samples: tokenizer(samples["formatted_text"],
                                  truncation=True,
                                  max_length=CFG.max_length),
        batched=True,
    )
    # Add PEFT (lora) layer
    lora_config = LoraConfig(r=32, # Rank
                             lora_alpha=32,
                             target_modules=["q_proj", "o_proj", "k_proj", 
                                             "v_proj", "gate_proj", "up_proj", "down_proj"],
                             lora_dropout=0.05,
                             bias="none",
                             task_type=TaskType.CAUSAL_LM)
    # Training arguments.
    # NOTE(review): max_steps is set alongside num_train_epochs; in the
    # transformers Trainer a positive max_steps takes precedence - confirm 100
    # optimizer steps is the intended budget.
    # NOTE(review): fp16=True here while load_model() uses a bfloat16 compute
    # dtype - confirm the mixed-precision settings are meant to differ.
    args = TrainingArguments(
            num_train_epochs=1,
            per_device_train_batch_size=CFG.batch_size,
            gradient_accumulation_steps=16,
            warmup_steps=5,
            max_steps=100,
            learning_rate=CFG.lr,
            fp16=True,
            logging_steps=1,
            output_dir="outputs",
            optim="paged_adamw_8bit",
            report_to="none"
        )
    # Create a trainer (supervised fine-tuned trainer)
    trainer = SFTTrainer(model=model,
                         train_dataset=training_ds,
                         args=args,
                         peft_config=lora_config)
    trainer.train()
    # Save the model
    trainer.save_model(CFG.model_save_path)
    tokenizer.save_pretrained(CFG.model_save_path)
    print(f"Save the model to {CFG.model_save_path}")


In [None]:
def load_model():
    """Load the tokenizer and the 4-bit quantized causal LM named by ``CFG.model_path``.

    Returns:
        ``(model, tokenizer)``, where the model has been passed through
        ``Accelerator.prepare``.
    """
    accelerator = Accelerator()

    # 4-bit NF4 weights with double quantization and bfloat16 compute, to cut memory use.
    nf4_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,
    )

    tokenizer = AutoTokenizer.from_pretrained(CFG.model_path)

    quantized_model = AutoModelForCausalLM.from_pretrained(
        CFG.model_path,
        device_map="auto",
        trust_remote_code=True,
        quantization_config=nf4_config,
    )
    return accelerator.prepare(quantized_model), tokenizer

In [None]:
model, tokenizer = load_model()


In [None]:
train_long.columns

In [None]:
train_model(model, tokenizer, train_long)

In [None]:
stop

# Training gemma 2B keras

In [None]:
# Install Keras 3 last. See https://keras.io/getting_started/ for more details.
# !pip install -q -U keras-nlp
# !pip install -q -U keras>=3


import os

os.environ["KERAS_BACKEND"] = "jax"  # Or "torch" or "tensorflow".
# Avoid memory fragmentation on JAX backend.
os.environ["XLA_PYTHON_CLIENT_MEM_FRACTION"]="1.00"

import keras
import keras_nlp

In [None]:
%%time
#importing 7b takes too much memory and causes the kaggle notebook to restart.

gemma_lm = keras_nlp.models.GemmaCausalLM.from_preset("gemma_2b_en")

In [None]:
# Limit the input sequence length to 512 (to control memory usage).
# Cap the preprocessor's sequence length at 512 to bound memory use.
gemma_lm.preprocessor.sequence_length = 512

# AdamW, with weight decay skipped for variables named "bias" or "scale"
# (bias terms and layer-norm scales).
optimizer = keras.optimizers.AdamW(learning_rate=5e-5, weight_decay=0.01, beta_1=0.9, beta_2=0.999)
optimizer.exclude_from_weight_decay(var_names=["bias", "scale"])

# The model emits logits, so the loss applies the softmax itself.
token_loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
token_accuracy = keras.metrics.SparseCategoricalAccuracy()

gemma_lm.compile(
    loss=token_loss,
    optimizer=optimizer,
    weighted_metrics=[token_accuracy],
)

In [None]:
# Enable LoRA for the model and set the LoRA rank to 64.
gemma_lm.backbone.enable_lora(rank=64)
# gemma_lm.backbone.load_lora_weights('/kaggle/input/gemma_lora_math/tensorflow2/default/1/model.lora.h5')


In [None]:
# Resolve misconception ids to names once, instead of scanning the mapping
# frame for every training row. drop_duplicates keeps the first name per id,
# matching the previous `.iloc[0]` lookup.
misconception_names = (
    misconception_mapping
    .drop_duplicates(subset="MisconceptionId")
    .set_index("MisconceptionId")["MisconceptionName"]
    .to_dict()
)

training_dataset = []
questions = []
for index, row in train_long.iterrows():
    # NOTE: the inference cell must build its prompt with exactly this string
    # layout, otherwise the fine-tuned model sees a format it was not trained on.
    question = row['all_question_text'] + 'The correct answer is: ' + row['CorrectAnswerText'] + 'The wrong answer is: ' + row['AnswerText'] +'. What is the misconception here? '
    answer = misconception_names[row['Misconception_ID']]

    template = (f"Question:\n{question}\n\nAnswer:\n{answer}")
    training_dataset.append(template)
    questions.append(question)

gemma_lm.fit(training_dataset, epochs=5, batch_size=1)
gemma_lm.backbone.save_lora_weights('/kaggle/working/model.lora.h5')


In [None]:
test_long

In [None]:
import re

predicted_answer = []
answer_id = []

# Evaluate on the first 100 training rows. The answer letter is derived here,
# from column names such as "A_answer_misconception", so this cell does not
# rely on a column that is only created further down the notebook.
eval_rows = train_long[:100].copy()
eval_rows['answer_alphabet'] = eval_rows["Answer"].str.extract(r'([A-Z])_answer_misconception$', expand=False)

for index, row in eval_rows.iterrows():
    if row['CorrectAnswer'] != row['answer_alphabet']:
        ID = str(int(row['QuestionId'])) + '_' + str(row['answer_alphabet'])

        # Same layout as the training prompt. Use AnswerText, not `value`:
        # `value` still carries the "***<misconception id>" suffix.
        question = row['all_question_text'] + 'The correct answer is: ' + row['CorrectAnswerText'] + 'The wrong answer is: ' + row['AnswerText'] +'. What is the misconception here? '

        template = (f"Question:\n{question}\n\nAnswer:\n")
        # NOTE(review): max_length presumably bounds prompt + completion; prompts
        # close to 256 tokens may leave little room to generate - confirm.
        response = gemma_lm.generate(template, max_length=256)

        # Keep the first line that follows "Answer:"; if the marker is missing,
        # fall back to the whole response.
        match = re.search(r"Answer:\s*(.*)", response)
        if match:
            answer_text = match.group(1)
        else:
            answer_text = response

        answer_id.append(ID)
        predicted_answer.append(answer_text)

In [None]:
len(answer_id)

In [None]:
# MODEL_LORA_WT_PATH = '/kaggle/working/model.lora.h5'
# gemma_lm.backbone.load_lora_weights(MODEL_LORA_WT_PATH)


## Loading the model and tokenizer for embedding generation

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch


# Run the embedding model on the first GPU when one is visible; otherwise fall
# back to the CPU instead of failing in `model.to(device)`.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# NOTE: `tokenizer` and `model` are rebound here to the BGE sentence-embedding
# encoder; from this point on they no longer refer to the Gemma objects.
tokenizer = AutoTokenizer.from_pretrained('/kaggle/input/bge-small-en-v1.5/transformers/bge/2')
model     = AutoModel.from_pretrained('/kaggle/input/bge-small-en-v1.5/transformers/bge/2')
model.eval()
model.to(device)
print("finish")

In [None]:
from tqdm import tqdm
MisconceptionName = list(misconception_mapping['MisconceptionName'].values)
per_gpu_batch_size = 8


def prepare_inputs(text, tokenizer, device, max_length=1024):
    """Tokenize a batch of strings and move the tensors to ``device``.

    Args:
        text: List of strings to encode as one padded batch.
        tokenizer: Callable tokenizer returning ``input_ids`` and
            ``attention_mask`` as PyTorch tensors.
        device: Target device for the returned tensors.
        max_length: Upper bound on the encoded length. It is further capped by
            ``tokenizer.model_max_length`` when the tokenizer declares one, so
            inputs never exceed what the encoder can accept.

    Returns:
        Dict with ``input_ids`` and ``attention_mask`` tensors on ``device``.
    """
    model_limit = getattr(tokenizer, "model_max_length", None)
    if isinstance(model_limit, int) and model_limit > 0:
        max_length = min(max_length, model_limit)

    encoded = tokenizer(
        text,
        padding        = True,
        return_tensors = 'pt',
        max_length     = max_length,
        truncation     = True
    )
    return {
        'input_ids': encoded["input_ids"].to(device),
        'attention_mask': encoded["attention_mask"].to(device),
    }


# Embed every misconception name in batches: take the first-token ([CLS])
# vector of the last hidden state and L2-normalise it.
all_ctx_vector = []
with torch.no_grad():  # inference only; skip building the autograd graph
    for mini_batch in tqdm(range(0, len(MisconceptionName), per_gpu_batch_size)):
        mini_context          = MisconceptionName[mini_batch:mini_batch + per_gpu_batch_size]
        encoded_input         = prepare_inputs(mini_context, tokenizer, device)
        sentence_embeddings   = model(**encoded_input)[0][:, 0]
        sentence_embeddings   = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)
        all_ctx_vector.append(sentence_embeddings.detach().cpu().numpy())

all_ctx_vector = np.concatenate(all_ctx_vector, axis=0)
# Report the shape of the full matrix, not just the last mini-batch.
print("Sentence embeddings:", all_ctx_vector.shape)

In [None]:
MisconceptionName[0]

In [None]:
len(all_ctx_vector)

In [None]:
# Embed the generated misconception texts the same way as the misconception
# names: first-token vector of the last hidden state, L2-normalised.
all_text_vector = []
per_gpu_batch_size = 8

with torch.no_grad():  # inference only; skip building the autograd graph
    for mini_batch in tqdm(range(0, len(predicted_answer), per_gpu_batch_size)):
        mini_context = predicted_answer[mini_batch:mini_batch + per_gpu_batch_size]
        encoded_input = prepare_inputs(mini_context, tokenizer, device)
        sentence_embeddings = model(**encoded_input)[0][:, 0]
        sentence_embeddings = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)

        all_text_vector.append(sentence_embeddings.detach().cpu().numpy())

all_text_vector = np.concatenate(all_text_vector, axis=0)
print(all_text_vector.shape)

# Predict

In [None]:
# Similarity matrix: one row per generated answer, one column per misconception name.
test_cos_sim_arr = cosine_similarity(all_text_vector, all_ctx_vector)
# Negate so argsort's ascending order yields the most similar column positions first.
test_sorted_indices = np.argsort(-test_cos_sim_arr, axis=1)

In [None]:
test_sorted_indices[:, :25]

# Make Submit File

In [None]:
train.head(18)

In [None]:
TOP_K = 25  # number of ranked misconceptions written per row

# test_sorted_indices holds row POSITIONS within misconception_mapping (the
# order in which the names were embedded). Translate them to the actual
# MisconceptionId values rather than assuming id == row position.
misconception_ids = misconception_mapping["MisconceptionId"].to_numpy()
top_k_ids = misconception_ids[test_sorted_indices[:, :TOP_K]]

res = pd.DataFrame({
    "QuestionId_Answer": answer_id,
    "MisconceptionId": [' '.join(map(str, ids)) for ids in top_k_ids],
})

submission = res[["QuestionId_Answer", "MisconceptionId"]].reset_index(drop=True)
# Extract QuestionId and Answer from the 'QuestionId_Answer' column
submission[['QuestionId', 'Answer']] = submission['QuestionId_Answer'].str.split('_', expand=True)

# Convert 'QuestionId' to integer for proper numerical sorting
submission['QuestionId'] = submission['QuestionId'].astype(int)

# Sort by 'QuestionId' first and then by 'Answer' alphabetically
submission_sorted = submission.sort_values(by=['QuestionId', 'Answer']).reset_index(drop=True)

# Display the result
print(submission_sorted[['QuestionId_Answer', 'MisconceptionId']].head(10))


In [None]:
# Derive the answer letter from the melted column name,
# e.g. "A_answer_misconception" -> "A".
train_long['answer_alphabet'] = train_long["Answer"].str.extract(r'([A-Z])_answer_misconception$')


In [None]:
train_long

In [None]:
submission.head(10)

In [None]:
submission.dtypes

In [None]:
sample_submission.head(10)

In [None]:
# Write only the two columns present in sample_submission, in sorted order;
# `submission` also carries the helper columns QuestionId and Answer.
submission_sorted[["QuestionId_Answer", "MisconceptionId"]].to_csv("submission.csv", index=False)

In [None]:
sample_submission

In [None]:
sample_submission.dtypes

In [None]:
test