In [1]:
import pandas as pd

In [2]:
train_df = pd.read_csv('/kaggle/input/eedi-mining-misconceptions-in-mathematics/train.csv')
test_df = pd.read_csv('/kaggle/input/eedi-mining-misconceptions-in-mathematics/test.csv')
misconception_mapping = pd.read_csv('/kaggle/input/eedi-mining-misconceptions-in-mathematics/misconception_mapping.csv')

In [3]:
display(train_df)

Unnamed: 0,QuestionId,ConstructId,ConstructName,SubjectId,SubjectName,CorrectAnswer,QuestionText,AnswerAText,AnswerBText,AnswerCText,AnswerDText,MisconceptionAId,MisconceptionBId,MisconceptionCId,MisconceptionDId
0,0,856,Use the order of operations to carry out calcu...,33,BIDMAS,A,\[\n3 \times 2+4-5\n\]\nWhere do the brackets ...,\( 3 \times(2+4)-5 \),\( 3 \times 2+(4-5) \),\( 3 \times(2+4-5) \),Does not need brackets,,,,1672.0
1,1,1612,Simplify an algebraic fraction by factorising ...,1077,Simplifying Algebraic Fractions,D,"Simplify the following, if possible: \( \frac{...",\( m+1 \),\( m+2 \),\( m-1 \),Does not simplify,2142.0,143.0,2142.0,
2,2,2774,Calculate the range from a list of data,339,Range and Interquartile Range from a List of Data,B,Tom and Katie are discussing the \( 5 \) plant...,Only\nTom,Only\nKatie,Both Tom and Katie,Neither is correct,1287.0,,1287.0,1073.0
3,3,2377,Recall and use the intersecting diagonals prop...,88,Properties of Quadrilaterals,C,The angles highlighted on this rectangle with ...,acute,obtuse,\( 90^{\circ} \),Not enough information,1180.0,1180.0,,1180.0
4,4,3387,Substitute positive integer values into formul...,67,Substitution into Formula,A,The equation \( f=3 r^{2}+3 \) is used to find...,\( 30 \),\( 27 \),\( 51 \),\( 24 \),,,,1818.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1864,1864,2774,Calculate the range from a list of data,339,Range and Interquartile Range from a List of Data,C,What is the range of the following numbers?\n\...,\( 5 \),\( 11 \),\( 23 \),\( 16 \),2456.0,691.0,,1349.0
1865,1865,2695,"Describe an enlargement, with no centre of enl...",90,Length Scale Factors in Similar Shapes,B,Shape \( Q \) is an enlargement of shape \( P ...,\( 3 \div 11 \),\( 11 \div 3 \),\( 3 \times 11 \),\( 11-3 \),1500.0,,2442.0,1258.0
1866,1866,854,Use the order of operations to carry out calcu...,33,BIDMAS,B,What does the following equal?\n\[\n8-7+10 \ti...,\( 36 \),\( 31 \),\( -29 \),\( 33 \),,,2306.0,1507.0
1867,1867,2634,Distinguish between congruency and similarity,274,Congruency in Other Shapes,B,Tom and Katie are discussing congruence and si...,Only\nTom,Only Katie,Both Tom and Katie,Neither is correct,2312.0,,2312.0,2312.0


In [4]:
import numpy as np
import re

mis_len = len(misconception_mapping)

def preprocess_text(x):
    x = x.lower()                 # Convert words to lowercase
    x = re.sub(r"@\w+", '',x)      # Delete strings starting with @
    #x = re.sub(r"\d+", '',x)      # Delete Numbers
    x = re.sub(r"http\w+", '',x)   # Delete URL
    x = re.sub(r"\\\(", " ", x)
    x = re.sub(r"\\\)", " ", x)
    x = re.sub(r"[ ]{1,}", " ", x)
    x = re.sub(r"\.+", ".", x)    # Replace consecutive commas and periods with one comma and period character
    x = x.strip()                 # Remove empty characters at the beginning and end
    return x

# Combine question text and answer options for training
train_data = pd.DataFrame(columns=['Input', 'Output'])
for idx, row in train_df.iterrows():
    for ch in ['A', 'B', 'C', 'D']:

        if str(row[f"Misconception{ch}Id"]) != "nan":
            real_answer = row['CorrectAnswer']
            real_answer_text = row[f"Answer{real_answer}Text"]

            # our problems require context, so I included all of the information
            input = f"Subject: {row['SubjectName']} Construct: {row['ConstructName']} Question: {row['QuestionText']} Incorrect: {row[f'Answer{ch}Text']}"
            input = preprocess_text(input)
            misconception = int(row[f"Misconception{ch}Id"])
            tmp_dict = {'Input':input, 'Output':misconception}
            train_data = pd.concat([train_data, pd.DataFrame(tmp_dict, index=[0])])

train_data


Unnamed: 0,Input,Output
0,subject: bidmas construct: use the order of op...,1672
0,subject: simplifying algebraic fractions const...,2142
0,subject: simplifying algebraic fractions const...,143
0,subject: simplifying algebraic fractions const...,2142
0,subject: range and interquartile range from a ...,1287
...,...,...
0,subject: congruency in other shapes construct:...,2312
0,subject: congruency in other shapes construct:...,2312
0,subject: rotation construct: describe a 90° or...,801
0,subject: rotation construct: describe a 90° or...,801


In [5]:
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data['Input'])
X = tokenizer.texts_to_sequences(train_data['Input'])
padded = pad_sequences(X, maxlen=50, padding="post")  # Adjust the max length as needed

In [6]:
from torch.utils.data import Dataset, DataLoader, random_split
import torch
class DS(Dataset):
    def __init__(self, df, padded, mis_len):
        self.df = df
        self.mis_len = mis_len
        self.padded = padded
        
    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        #x = self.df.iloc[idx]['Input']
        x = padded[idx]
        y = np.zeros(self.mis_len)
        mis_val = self.df.iloc[idx]['Output']
        y[mis_val] = 1
        return torch.tensor(x), torch.tensor(y)


In [7]:
data = DS(train_data, padded, mis_len)

train_size = int(0.8 * len(data))  # Calculate 80% of dataset size
val_size = len(data) - train_size  # Calculate 20% of dataset size
train_ds, val_ds = random_split(data, [train_size, val_size])

In [8]:
train_dl = DataLoader(train_ds, batch_size=4, shuffle=True)
val_dl = DataLoader(val_ds, batch_size=4, shuffle=True)

In [9]:
import torch
import torch.nn as nn
import torch.optim as optim

class MisconceptionModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_size):
        super(MisconceptionModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        #self.conv1 = nn.Conv1d(embedding_dim, hidden_dim, kernel_size=3)
        #self.pool = nn.AvgPool1d(2)
        self.lstm = nn.LSTM(embedding_dim, output_size, batch_first=True)
        self.fc1 = nn.Linear(output_size, output_size)
        self.fc2 = nn.Linear(output_size, output_size)
        self.dropout = nn.Dropout(0.2)
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax()

    def forward(self, x):
        x = self.embedding(x)
        #x = x.permute(0, 2, 1)
        #x = self.conv1(x)
        #x = self.relu(x)
        #x = self.pool(x)
        #x = x.permute(0,2,1)
        lstm_out, _ = self.lstm(x)
        out = self.fc1(lstm_out[:, -1, :])  # Get the last output of LSTM
        out = self.dropout(out)
        
        #out = self.fc1(out)
        #out = self.dropout(out)
        out = self.relu(out)
        
        out = self.fc2(out)
        
        
        return self.softmax(out)

In [10]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [11]:
# Hyperparameters
input_size = len(tokenizer.word_index) + 1
embedding_dim = 300
hidden_dim = 256
output_size = mis_len


# Model, Loss, Optimizer
model = MisconceptionModel(input_size, embedding_dim, hidden_dim, output_size)
model = model.to(device)
criterion = nn.BCELoss()  # Binary Cross Entropy Loss for multi-label classification
optimizer = optim.Adam(model.parameters(), lr=0.0001)

# Training loop
num_epochs = 2
for epoch in range(num_epochs):
    running_loss = 0
    val_loss = 0
    model.train()
    for x, y in train_dl:
        x = x.to(device).long()
        y = y.to(device).float()
        optimizer.zero_grad()
        outputs = model(x)
        loss = criterion(outputs, y)
        running_loss += loss.item()*x.shape[0]
        loss.backward()
        optimizer.step()

    model.eval()
    for x, y in val_dl:
        x = x.to(device).long()
        y = y.to(device).float()
        outputs = model(x)
        loss = criterion(outputs, y)
        val_loss += loss.item()*x.shape[0]
        
    if (epoch + 1) % 1 == 0:
        print(f'Epoch [{epoch + 1}/{num_epochs}], Train Loss: {running_loss/len(train_dl.dataset):.4f}, Val Loss: {val_loss/len(val_dl.dataset):.4f}')

  return self._call_impl(*args, **kwargs)


Epoch [1/2], Train Loss: 0.0033, Val Loss: 0.0031
Epoch [2/2], Train Loss: 0.0029, Val Loss: 0.0031


In [12]:
test_data = pd.DataFrame(columns=['Input', 'QuestionId_Answer'])
for idx, row in test_df.iterrows():
    corr = row['CorrectAnswer']
    for ch in ['A', 'B', 'C', 'D']:
        if ch != corr:
            qid_ans = f"{row['QuestionId']}_{ch}"
            real_answer = row['CorrectAnswer']
            real_answer_text = row[f"Answer{real_answer}Text"]
            input = f"Subject: {row['SubjectName']} Construct: {row['ConstructName']} Question: {row['QuestionText']} Incorrect: {row[f'Answer{ch}Text']}"
            input = preprocess_text(input)
            #misconception = int(row[f"Misconception{ch}Id"])
            tmp_dict = {'Input':input, 'QuestionId_Answer':qid_ans}
            test_data = pd.concat([test_data, pd.DataFrame(tmp_dict, index=[0])])

X_test = tokenizer.texts_to_sequences(test_data['Input'])
X_test = pad_sequences(X_test, maxlen=50, padding="post")

In [13]:
from torch.utils.data import Dataset, DataLoader, random_split

class testDS(Dataset):
    def __init__(self, padded, df):
        self.padded = padded
        self.df = df
        
    def __len__(self):
        return len(self.padded)

    def __getitem__(self, idx):
        #x = self.df.iloc[idx]['Input']
        x = padded[idx]
        y = self.df.iloc[idx]['QuestionId_Answer']
        return x, y


In [14]:
test_ds = testDS(X_test, test_data)
test_dl = DataLoader(test_ds, batch_size=1, shuffle=False)

In [15]:
model.eval()
results = []
sub = pd.DataFrame(columns=['QuestionId_Answer', 'MisconceptionId'])
with torch.no_grad():
    for input, qid in test_dl:
        input = input.to(device).long()
        output = model(input)
        #print(output)
        topk = output.argsort(dim=1, descending=True)[:, :25]
        #print(topk)
        results.append(topk.cpu().numpy()[0])
        string_val = ' '.join(str(num) for num in topk.cpu().numpy()[0])
        tmp_dict = {'QuestionId_Answer':qid[0], 'MisconceptionId':string_val}
        sub = pd.concat([sub, pd.DataFrame(tmp_dict, index=[0])])

display(sub)

  return self._call_impl(*args, **kwargs)


Unnamed: 0,QuestionId_Answer,MisconceptionId
0,1869_B,1379 2392 1507 2316 2359 1214 1597 1988 2481 1...
0,1869_C,1379 2392 1507 2316 2359 1214 1597 1988 2481 1...
0,1869_D,1379 2392 1507 2316 2359 1214 1597 1988 2481 1...
0,1870_A,1379 2392 1507 2316 2359 1214 1597 1988 2481 1...
0,1870_B,1198 1990 31 1839 1214 2312 340 1690 1554 220 ...
0,1870_C,340 585 1198 2312 220 483 1990 1287 2386 31 12...
0,1871_A,31 557 1198 1214 1164 1318 1073 1839 2306 1338...
0,1871_C,2336 110 1631 1214 625 2252 421 1513 1990 718 ...
0,1871_D,110 1631 1214 2332 1990 421 82 625 77 11 2336 ...


In [16]:
sub.to_csv("submission.csv", index=False)