In [209]:
# !kaggle competitions download -c eedi-mining-misconceptions-in-mathematics

In [210]:
import pandas as pd

# Read the four competition CSVs in one pass; the order of the paths matches
# the order of the names being unpacked.
train_df, test_df, misconception_df, sample_submission_df = map(
    pd.read_csv,
    (
        'dataset/train.csv',
        'dataset/test.csv',
        'dataset/misconception_mapping.csv',
        'dataset/sample_submission.csv',
    ),
)

In [211]:
from sklearn.model_selection import train_test_split

# Work on a fixed 20-row subsample of the training data, then hold out 10% of
# it for evaluation and keep the remaining 90% for training. Both steps use
# random_state=42.
# NOTE: this cell rebinds train_df, so re-running it without first re-running
# the load cell operates on the already-reduced frame.
train_df, eval_df = train_test_split(
    train_df.sample(n=20, random_state=42),
    test_size=0.1,
    random_state=42,
)


In [212]:
import pandas as pd

def preprocess_dataframe(df, misconception_df=None, train_flag=False):
    """Reshape a wide question table into one row per incorrect answer option.

    For each option A-D the columns ``Answer{option}Text`` and, when present,
    ``Misconception{option}Id`` are renamed to ``AnswerText`` and
    ``MisconceptionId``, and the option letter is stored in ``Option``. Rows
    whose ``Option`` equals ``CorrectAnswer`` are removed.

    Args:
        df: Frame with the identifier columns listed in ``id_vars`` plus the
            ``Answer{A..D}Text`` columns; ``Misconception{A..D}Id`` columns are
            optional. The input frame is not modified.
        misconception_df: Optional lookup frame joined on ``MisconceptionId``.
            It is only used when ``train_flag`` is True.
        train_flag: When True (and ``misconception_df`` is given), left-merge
            the lookup frame and drop rows without a ``MisconceptionName``.
            Otherwise a ``MisconceptionName`` column filled with None is added.

    Returns:
        A new DataFrame sorted by ``QuestionId`` and ``Option`` with a fresh
        0..n-1 index.
    """
    options = ['A', 'B', 'C', 'D']

    # Columns carried over unchanged for every option row
    id_vars = ['QuestionId', 'QuestionText', 'ConstructId', 'ConstructName', 'SubjectId', 'SubjectName', 'CorrectAnswer']

    per_option_frames = []
    for option in options:
        answer_col = f'Answer{option}Text'
        misconception_col = f'Misconception{option}Id'

        if misconception_col in df.columns:
            # rename() returns a new frame, so the caller's df is never touched
            option_df = df[id_vars + [misconception_col, answer_col]].rename(columns={
                misconception_col: 'MisconceptionId',
                answer_col: 'AnswerText'
            })
        else:
            # No misconception column for this option: keep the answer text and
            # add an empty MisconceptionId so every per-option frame has it.
            option_df = df[id_vars + [answer_col]].rename(columns={
                answer_col: 'AnswerText'
            })
            option_df['MisconceptionId'] = None

        option_df['Option'] = option
        per_option_frames.append(option_df)

    df_combined = pd.concat(per_option_frames, ignore_index=True)

    # Keep only the incorrect options. The explicit copy() makes the result an
    # independent frame, so adding a column below is not a write onto a slice
    # (which would trigger SettingWithCopyWarning).
    is_incorrect_option = df_combined['Option'] != df_combined['CorrectAnswer']
    df_combined = df_combined.loc[is_incorrect_option].copy()

    if train_flag and misconception_df is not None and 'MisconceptionId' in df_combined.columns:
        df_combined = df_combined.merge(misconception_df, on='MisconceptionId', how='left')

        # Training rows without a resolved misconception name are dropped
        if 'MisconceptionName' in df_combined.columns:
            df_combined = df_combined.dropna(subset=['MisconceptionName'])
    else:
        # No lookup performed: add a placeholder so the column always exists
        df_combined['MisconceptionName'] = None

    return df_combined.sort_values(["QuestionId", "Option"]).reset_index(drop=True)


In [213]:
# The train and eval splits get misconception names merged in (train_flag=True);
# the test split is reshaped only.
train_procressed_df, eval_procressed_df = (
    preprocess_dataframe(split_df, misconception_df, train_flag=True)
    for split_df in (train_df, eval_df)
)
test_procressed_df = preprocess_dataframe(test_df, misconception_df)


In [214]:
import pandas as pd

def _single_line(value):
    """Return ``value`` as text with newlines replaced by spaces ('' if missing)."""
    # pd.isna covers both None and the float NaN pandas uses for missing cells
    if pd.isna(value):
        return ''
    return str(value).replace('\n', ' ')


def process_row_for_qa_token(row, train_flag=False):
    """Build the prompt (and, for training, the target text) for one row.

    Args:
        row: Mapping-like row (for example a pandas Series) with the keys
            ``ConstructName``, ``SubjectName``, ``QuestionText`` and
            ``AnswerText``. ``MisconceptionName`` is read only when
            ``train_flag`` is True.
        train_flag: When True, ``answer_text`` holds the misconception name
            ('' if it is missing); when False, ``answer_text`` is None.

    Returns:
        A one-row DataFrame with the columns ``question_text`` and
        ``answer_text``.
    """
    # Flatten newlines in every field that goes into the prompt; missing
    # values (None/NaN) become '' instead of raising on ``.replace``.
    construct_name = _single_line(row['ConstructName'])
    subject_name = _single_line(row['SubjectName'])
    question_text = _single_line(row['QuestionText'])
    answer_text = _single_line(row['AnswerText'])

    question_prompt = (f"Given the following context:\n"
                       f"Construct: {construct_name}, Subject: {subject_name}.\n"
                       f"Question: {question_text}\n"
                       f"Answer: {answer_text}\n"
                       f"Please predict the misconception.")

    # The target is only looked up for training rows
    misconception_name = _single_line(row['MisconceptionName']) if train_flag else ''

    return pd.DataFrame({
        'question_text': [question_prompt],
        'answer_text': [misconception_name] if train_flag else [None]  # None during inference
    })

# Process the entire DataFrame for QA preparation
def process_dataframe_for_qa_token(df, train_flag=False):
    """Apply ``process_row_for_qa_token`` to every row and stack the results.

    Args:
        df: Frame whose rows are passed one at a time to
            ``process_row_for_qa_token``.
        train_flag: Forwarded unchanged to ``process_row_for_qa_token``.

    Returns:
        A DataFrame with a fresh 0..n-1 index built from the per-row frames.
        For an input with no rows, an empty frame with the columns
        ``question_text`` and ``answer_text`` is returned.
    """
    processed_rows = [
        process_row_for_qa_token(row, train_flag)
        for _, row in df.iterrows()
    ]

    if not processed_rows:
        # pd.concat raises ValueError on an empty list, so return an empty
        # frame with the same columns the per-row helper produces.
        return pd.DataFrame(columns=['question_text', 'answer_text'])

    return pd.concat(processed_rows, ignore_index=True)


In [215]:
# CSV file paths for the prompt/target tables of each split. Despite the _DF
# suffix these are path strings, not DataFrames.
TRAIN_DF = 'data/train.csv'
EVAL_DF = 'data/eval.csv'
TEST_DF = 'data/test.csv'

In [216]:
from pathlib import Path

# to_csv does not create missing directories, so make sure each target folder
# exists before writing (a fresh checkout may not have it yet).
for csv_path in (TRAIN_DF, EVAL_DF, TEST_DF):
    Path(csv_path).parent.mkdir(parents=True, exist_ok=True)

# Train and eval rows carry the misconception name as target; test rows do not.
process_dataframe_for_qa_token(train_procressed_df, train_flag=True).to_csv(TRAIN_DF, index=False)
process_dataframe_for_qa_token(eval_procressed_df, train_flag=True).to_csv(EVAL_DF, index=False)
process_dataframe_for_qa_token(test_procressed_df, train_flag=False).to_csv(TEST_DF, index=False)


In [217]:
import pandas as pd
from datasets import Dataset

# Load the prompt/target CSVs. An empty target is written to CSV as an empty
# field and read back as NaN, so missing values are normalised to '' for every
# split to keep the text columns as strings.
train_df = pd.read_csv(TRAIN_DF).fillna('')  # Contains 'question_text' and 'answer_text'
eval_df = pd.read_csv(EVAL_DF).fillna('')    # Optional evaluation data
test_df = pd.read_csv(TEST_DF).fillna('')    # Same columns; 'answer_text' is blank

# Convert pandas DataFrames to Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_df)
# Always bind eval_dataset (None when there is no eval data) so later
# references to the name cannot raise NameError.
eval_dataset = Dataset.from_pandas(eval_df) if not eval_df.empty else None
test_dataset = Dataset.from_pandas(test_df)


In [218]:
from transformers import AutoTokenizer

# Load the tokenizer for the "google/flan-t5-small" checkpoint. It is stored in
# a module-level name because preprocess_function reads it as a global.
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")

# Tokenization function
def preprocess_function(examples):
    """Tokenize a batch of prompts and, when present, their target texts.

    Uses the module-level ``tokenizer``.

    Args:
        examples: Batch mapping with a ``question_text`` list and optionally an
            ``answer_text`` list of the same length.

    Returns:
        The tokenizer output for the prompts (padded/truncated to 128 tokens).
        When ``answer_text`` is present, a ``labels`` entry is added holding
        the target token ids, with padding positions set to -100.
    """
    model_inputs = tokenizer(
        list(examples["question_text"]),
        max_length=128,
        truncation=True,
        padding="max_length",
    )

    if "answer_text" in examples:
        # A missing target arrives as None; tokenize it as an empty string
        targets = ["" if answer is None else answer for answer in examples["answer_text"]]

        # text_target replaces the deprecated as_target_tokenizer() context
        labels = tokenizer(
            text_target=targets,
            max_length=128,
            truncation=True,
            padding="max_length",
        )

        # Labels are padded to max_length; swap the pad id for -100 (the index
        # the cross-entropy loss ignores) so padding does not count in the loss.
        pad_id = tokenizer.pad_token_id
        model_inputs["labels"] = [
            [-100 if token_id == pad_id else token_id for token_id in label_ids]
            for label_ids in labels["input_ids"]
        ]

    return model_inputs

# Apply the preprocessing. The raw text columns are removed so only the
# tokenizer outputs (and labels) remain.
train_dataset = train_dataset.map(preprocess_function, batched=True, remove_columns=train_dataset.column_names)
# Only tokenize the eval split when it has rows; with an empty eval_df there is
# no eval dataset to map.
if not eval_df.empty:
    eval_dataset = eval_dataset.map(preprocess_function, batched=True, remove_columns=eval_dataset.column_names)
test_dataset = test_dataset.map(preprocess_function, batched=True, remove_columns=test_dataset.column_names)



Map: 100%|██████████| 41/41 [00:00<00:00, 902.59 examples/s]

Map: 100%|██████████| 4/4 [00:00<00:00, 642.85 examples/s]

Map: 100%|██████████| 9/9 [00:00<00:00, 1762.23 examples/s]


In [219]:
from transformers import AutoModelForSeq2SeqLM

# Load the pretrained seq2seq model from the "google/flan-t5-small" checkpoint
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")


In [220]:
from transformers import TrainingArguments

# Training configuration passed to the Trainer.
training_args = TrainingArguments(
    output_dir="./results",
    # Evaluate once per epoch only when the eval split has rows
    evaluation_strategy="epoch" if not eval_df.empty else "no",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    # Limits how many checkpoints are kept under output_dir
    save_total_limit=2,
    push_to_hub=False,
)




In [221]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    # Uses the same emptiness check as evaluation_strategy: with an empty eval
    # split no evaluation dataset is passed (and the name is never evaluated).
    eval_dataset=eval_dataset if not eval_df.empty else None,
    tokenizer=tokenizer,
)


In [222]:
# Run fine-tuning with the model, arguments and datasets held by `trainer`
trainer.train()



[A
[A
[A