In [76]:
# !pip install kaggle scikit-learn datasets transformers torch accelerate

In [77]:
# !kaggle competitions download -c eedi-mining-misconceptions-in-mathematics -p ./datasets/eedi
# !unzip ./datasets/eedi/eedi-mining-misconceptions-in-mathematics.zip -d ./datasets/eedi

In [78]:
import torch

# Check if a GPU is available and move model to the GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [79]:
import pandas as pd

train_df = pd.read_csv('datasets/eedi/train.csv')
test_df = pd.read_csv('datasets/eedi/test.csv')
misconception_df = pd.read_csv('datasets/eedi/misconception_mapping.csv')
sample_submission_df = pd.read_csv('datasets/eedi/sample_submission.csv')

In [80]:
from sklearn.model_selection import train_test_split

train_df = train_df.sample(n=200, random_state=42)
# Splitting 10% for evaluation, 90% for training
train_df, eval_df = train_test_split(train_df, test_size=0.1, random_state=42)


In [81]:
import pandas as pd

def preprocess_dataframe(df, misconception_df=None, train_flag=False):
    # List of options
    options = ['A', 'B', 'C', 'D']

    # List of columns to keep
    id_vars = ['QuestionId', 'QuestionText', 'ConstructId', 'ConstructName', 'SubjectId', 'SubjectName', 'CorrectAnswer']

    # Initialize an empty list to collect data
    data_list = []

    # Loop over each option to collect data
    for option in options:
        answer_col = f'Answer{option}Text'
        misconception_col = f'Misconception{option}Id'
        
        # Check if the misconception column exists
        if misconception_col in df.columns:
            temp_df = df[id_vars + [misconception_col, answer_col]].copy()
            temp_df.rename(columns={
                misconception_col: 'MisconceptionId',
                answer_col: 'AnswerText'
            }, inplace=True)
        else:
            # Only include the answer column if misconception column doesn't exist
            temp_df = df[id_vars + [answer_col]].copy()
            temp_df['MisconceptionId'] = None  # Assign None to MisconceptionId
            temp_df.rename(columns={
                answer_col: 'AnswerText'
            }, inplace=True)
        
        temp_df['Option'] = option
        data_list.append(temp_df)

    # Concatenate all the data into a single DataFrame
    df_combined = pd.concat(data_list, ignore_index=True)

    # Exclude the rows where the option matches the correct answer
    df_combined = df_combined[df_combined['Option'] != df_combined['CorrectAnswer']]

    # If train_flag is True, merge with 'misconception_df' on 'MisconceptionId'
    if train_flag and misconception_df is not None and 'MisconceptionId' in df_combined.columns:
        df_combined = df_combined.merge(misconception_df, on='MisconceptionId', how='left')
        
        # Drop rows with missing 'MisconceptionName' (only for training data)
        if 'MisconceptionName' in df_combined.columns:
            df_combined = df_combined.dropna(subset=['MisconceptionName'])
    else:
        # For testing data, add a placeholder for 'MisconceptionName'
        df_combined['MisconceptionName'] = None

    # Sort and reset index if desired
    df_combined = df_combined.sort_values(["QuestionId", "Option"]).reset_index(drop=True)

    return df_combined


In [82]:
train_procressed_df = preprocess_dataframe(train_df, misconception_df, train_flag=True)
eval_procressed_df = preprocess_dataframe(eval_df, misconception_df, train_flag=True)
test_procressed_df = preprocess_dataframe(test_df, misconception_df)


In [83]:
import pandas as pd

def process_row_for_qa_token(row, train_flag=False):
    # Replace newlines in all relevant fields
    construct_name = row['ConstructName'].replace('\n', ' ')
    subject_name = row['SubjectName'].replace('\n', ' ')
    question_text = row['QuestionText'].replace('\n', ' ')
    answer_text = row['AnswerText'].replace('\n', ' ')
    
    # Create a prompt for the question_text
    question_prompt = (f"Given the following context:\n"
                       f"Construct: {construct_name}, Subject: {subject_name}.\n"
                       f"Question: {question_text}\n"
                       f"Answer: {answer_text}\n"
                       f"Please predict the misconception.")

    # Set answer_text as the misconception for training
    if train_flag and row['MisconceptionName'] is not None:
        misconception_name = row['MisconceptionName'].replace('\n', ' ')
    else:
        misconception_name = ''

    # Return a DataFrame with question_text (prompt) and answer_text (misconception)
    return pd.DataFrame({
        'question_text': [question_prompt],
        'answer_text': [misconception_name] if train_flag else [None]  # Use None during inference
    })

# Process the entire DataFrame for QA preparation
def process_dataframe_for_qa_token(df, train_flag=False):
    processed_rows = []
    
    # Loop through each row in the input DataFrame
    for _, row in df.iterrows():
        processed_row = process_row_for_qa_token(row, train_flag)
        processed_rows.append(processed_row)
    
    # Concatenate the results into a final DataFrame
    final_df = pd.concat(processed_rows, ignore_index=True)
    
    return final_df


In [84]:
TRAIN_DF = 'data/train.csv'
EVAL_DF = 'data/eval.csv'
TEST_DF = 'data/test.csv'

In [85]:
process_dataframe_for_qa_token(train_procressed_df, train_flag=True).to_csv(TRAIN_DF, index=False)
process_dataframe_for_qa_token(eval_procressed_df, train_flag=True).to_csv(EVAL_DF, index=False)
process_dataframe_for_qa_token(test_procressed_df, train_flag=False).to_csv(TEST_DF, index=False)


In [86]:
import pandas as pd
from datasets import Dataset

# Load your data
train_df = pd.read_csv(TRAIN_DF)  # Contains 'question_text' and 'answer_text'
eval_df = pd.read_csv(EVAL_DF)    # Optional evaluation data
test_df = pd.read_csv(TEST_DF).fillna('')    # Contains 'question_text' only

# Convert pandas DataFrames to Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_df)
if not eval_df.empty:
    eval_dataset = Dataset.from_pandas(eval_df)
test_dataset = Dataset.from_pandas(test_df) 


In [87]:
from transformers import AutoTokenizer

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")

# Tokenization function
def preprocess_function(examples):
    inputs = [q for q in examples["question_text"]]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")

    # For training data, add labels
    if "answer_text" in examples:
        with tokenizer.as_target_tokenizer():
            labels = tokenizer(examples["answer_text"], max_length=128, truncation=True, padding="max_length")
        model_inputs["labels"] = labels["input_ids"]

    return model_inputs

# Apply the preprocessing
train_dataset = train_dataset.map(preprocess_function, batched=True, remove_columns=train_dataset.column_names)
eval_dataset = eval_dataset.map(preprocess_function, batched=True, remove_columns=eval_dataset.column_names)
test_dataset = test_dataset.map(preprocess_function, batched=True, remove_columns=test_dataset.column_names)


loading file spiece.model from cache at /root/.cache/huggingface/hub/models--google--flan-t5-small/snapshots/0fc9ddf78a1e988dac52e2dac162b0ede4fd74ab/spiece.model
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--google--flan-t5-small/snapshots/0fc9ddf78a1e988dac52e2dac162b0ede4fd74ab/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--google--flan-t5-small/snapshots/0fc9ddf78a1e988dac52e2dac162b0ede4fd74ab/special_tokens_map.json
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--google--flan-t5-small/snapshots/0fc9ddf78a1e988dac52e2dac162b0ede4fd74ab/tokenizer_config.json
Map:   0%|          | 0/419 [00:00<?, ? examples/s]

Map: 100%|██████████| 419/419 [00:00<00:00, 2349.11 examples/s]
Map: 100%|██████████| 47/47 [00:00<00:00, 2669.00 examples/s]
Map: 100%|██████████| 9/9 [00:00<00:00, 1261.70 examples/s]


In [88]:
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")
model.to(device)


loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--google--flan-t5-small/snapshots/0fc9ddf78a1e988dac52e2dac162b0ede4fd74ab/config.json
Model config T5Config {
  "_name_or_path": "google/flan-t5-small",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 1024,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 8,
  "num_heads": 6,
  "num_layers": 8,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 20

All model checkpoint weights were used when initializing T5ForConditionalGeneration.

All the weights of T5ForConditionalGeneration were initialized from the model checkpoint at google/flan-t5-small.
If your task is similar to the task the model of the checkpoint was trained on, you can already use T5ForConditionalGeneration for predictions without further training.
loading configuration file generation_config.json from cache at /root/.cache/huggingface/hub/models--google--flan-t5-small/snapshots/0fc9ddf78a1e988dac52e2dac162b0ede4fd74ab/generation_config.json
Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0
}



T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=384, bias=False)
              (k): Linear(in_features=512, out_features=384, bias=False)
              (v): Linear(in_features=512, out_features=384, bias=False)
              (o): Linear(in_features=384, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 6)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=512, out_features=1024, bias=False)
              (wi_1): Linear(in_features=512, out_features=1024, bias=False)
              (wo): 

In [89]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch" if not eval_df.empty else "no",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    save_total_limit=2,
    push_to_hub=False,
    logging_dir='./logs',  # Directory for storing logs
    logging_steps=50,  # Log every 50 steps
    log_level='info',  # Logging level
)


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [90]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)


In [91]:
trainer.train()


***** Running training *****
  Num examples = 419
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 135
  Number of trainable parameters = 76,961,152


Epoch,Training Loss,Validation Loss
1,No log,18.765438
2,22.744600,6.876162
3,22.744600,5.144644
4,6.802500,4.681398
5,6.802500,4.561172



***** Running Evaluation *****
  Num examples = 47
  Batch size = 16

***** Running Evaluation *****
  Num examples = 47
  Batch size = 16

***** Running Evaluation *****
  Num examples = 47
  Batch size = 16

***** Running Evaluation *****
  Num examples = 47
  Batch size = 16
Saving model checkpoint to ./results/checkpoint-135
Configuration saved in ./results/checkpoint-135/config.json
Configuration saved in ./results/checkpoint-135/generation_config.json
Model weights saved in ./results/checkpoint-135/model.safetensors
tokenizer config file saved in ./results/checkpoint-135/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-135/special_tokens_map.json

***** Running Evaluation *****
  Num examples = 47
  Batch size = 16


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=135, training_loss=12.312799524377894, metrics={'train_runtime': 26.0947, 'train_samples_per_second': 80.284, 'train_steps_per_second': 5.173, 'total_flos': 97360151838720.0, 'train_loss': 12.312799524377894, 'epoch': 5.0})

In [93]:
# Convert input_ids from list to torch tensor
import torch

input_ids_tensor = torch.tensor(test_dataset['input_ids']).to(device)

# Generate answers using the model's generate() function
generated_outputs = model.generate(input_ids_tensor)

# Decode the generated sequences into readable text
generated_answers = tokenizer.batch_decode(generated_outputs, skip_special_tokens=True)

# Print the results
for i, answer in enumerate(generated_answers):
    print(f"Question: {test_df['question_text'][i]}")
    print(f"Generated Answer: {answer}\n")



Question: Given the following context:
Construct: Use the order of operations to carry out calculations involving powers, Subject: BIDMAS.
Question: \[ 3 \times 2+4-5 \] Where do the brackets need to go to make the answer equal \( 13 \) ?
Answer: \( 3 \times 2+(4-5) \)
Please predict the misconception.
Generated Answer: the ability to predict

Question: Given the following context:
Construct: Use the order of operations to carry out calculations involving powers, Subject: BIDMAS.
Question: \[ 3 \times 2+4-5 \] Where do the brackets need to go to make the answer equal \( 13 \) ?
Answer: \( 3 \times(2+4-5) \)
Please predict the misconception.
Generated Answer: the ability to predict

Question: Given the following context:
Construct: Use the order of operations to carry out calculations involving powers, Subject: BIDMAS.
Question: \[ 3 \times 2+4-5 \] Where do the brackets need to go to make the answer equal \( 13 \) ?
Answer: Does not need brackets
Please predict the misconception.
Gener