In [1]:
import pandas as pd
import os

In [2]:
# Load data
dir = "common-sense"

train_file = os.path.join(dir, "train_data.csv")
train_labels_file = os.path.join(dir, "train_answers.csv")

df_sentences = pd.read_csv(train_file)
df_labels = pd.read_csv(train_labels_file)

test_file = os.path.join(dir, "test_data.csv")

In [3]:
def label(x):
    """Convert options to (integer) labels"""
    if x == 'A':
        return 0
    elif x == 'B':
        return 1
    else:
        return 2

df_labels['label'] = df_labels['answer'].apply(label)
df_sentences = df_sentences.rename(columns={"FalseSent": "sent"})

In [4]:
# Combine sentences and labels dataframes
df = pd.merge(df_sentences, df_labels, on="id").drop(["id", "answer"], axis=1)

df.head()

Unnamed: 0,sent,OptionA,OptionB,OptionC,label
0,I sting a mosquito,A human is a mammal,A human is omnivorous,A human has not stings,2
1,A giraffe is a person.,Giraffes can drink water from a lake.,A giraffe is not a human being.,.Giraffes usually eat leaves.,1
2,A normal closet is larger than a walk-in closet.,Walk-in closets are normal closets.,A person can sleep in a walk-in closet.,A person cannot walk into a normal closet beca...,2
3,I like to ride my chocolate,Chocolate is delicious and bikes are not,"Chocolate is a food, not a transportation unit",My bike can't ride a chocolate,1
4,A GIRL WON THE RACE WITH HORSE,GIRL HAVE BEAUTIFUL HAIR BUT THE HORSE DOESN'T...,THE GIRL WEAR DRESS BUT THE HORSE DOESN'T HAVE .,HORSE RAN FASTER THAN HER,2


In [5]:
# Exploratory
n_0 = len(df.loc[df['label'] == 0])
n_1 = len(df.loc[df['label'] == 1])
n_2 = len(df.loc[df['label'] == 2])

pd.DataFrame([[n_0, n_1, n_2]], columns=['Answer 0/A','Answer 1/B','Answer 2/C'], index=["n_answers"])

Unnamed: 0,Answer 0/A,Answer 1/B,Answer 2/C
n_answers,2597,2665,2738


In [6]:
import re
import nltk
from nltk.corpus import stopwords
# from transformers import BertTokenizer # Alternative

# tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')

def preprocess(text: str) -> list:
    """Remove punctuation, remove stopwords, tokenize words"""
    text = re.sub("[^A-Za-z]+", " ", text)
    tokens = nltk.word_tokenize(text)   

    tokens = [x.lower() for x in tokens if not x in set(stopwords.words('english'))] 

    return tokens

For now, the rest of the notebook is just simply classifying sent (X) to label (Y). The options are not taken into account yet.
Based on tutorial from: https://medium.com/mlearning-ai/convert-texts-into-tensors-for-deep-learning-74b0cf48d416 and https://coderzcolumn.com/tutorials/artificial-intelligence/pytorch-simple-guide-to-text-classification

In [7]:
def build_vocab(data: list[str]) -> dict:
    """Map term to index"""
    vocab = {'pad': 0, '</e>': 1, '<unk>': 2} 

    for row in data: 
        processed_text = preprocess(row)
        for word in processed_text:
            if word not in vocab:
                # Create entry in the vocab equal to the term, and its value is the length of the vocab 
                vocab[word] = len(vocab)
    return vocab

vocab = build_vocab(df["sent"].values) # Should maybe be build with options not sent???

In [8]:
len(vocab)

6277

In [10]:
dataset = list(zip(df['sent'].values, df['label'].values)) 
dataset[0]

('I sting a mosquito', 2)

In [11]:
import torch

# Split the dataset
lengths = [int(len(dataset) * 0.8), int(len(dataset) * 0.2)]
train_set, val_set = torch.utils.data.random_split(dataset, lengths)

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
from torch.utils.data import DataLoader

vectorizer = CountVectorizer(vocabulary=vocab)

def vectorize_batch(batch):
    X, y = list(zip(*batch))
    X = vectorizer.transform(X).todense()
    return torch.tensor(X, dtype=torch.float32), torch.tensor(y)

train_loader = DataLoader(train_set, batch_size=256, collate_fn=vectorize_batch)
test_loader = DataLoader(val_set, batch_size=256, collate_fn=vectorize_batch)

In [13]:
for X, y in train_loader:
    print(X.shape, y.shape)
    break

torch.Size([256, 6277]) torch.Size([256])


In [14]:
from torch import nn
from torch.nn import functional as F

class TextClassifier(nn.Module):
    def __init__(self):
        super(TextClassifier, self).__init__()
        self.seq = nn.Sequential(
            nn.Linear(len(vocab), 128),
            nn.ReLU(),

            nn.Linear(128, 64),
            nn.ReLU(),

            nn.Linear(64, 3)
        )

    def forward(self, X_batch):
        return self.seq(X_batch)

In [15]:
model = TextClassifier()

for X, y in train_loader:
    y_preds = model(X)
    print(y_preds.shape)
    break

torch.Size([256, 3])


In [16]:
from tqdm import tqdm
from sklearn.metrics import accuracy_score
import gc

def CalcValLossAndAccuracy(model, loss_fn, val_loader):
    with torch.no_grad():
        y_shuffled, y_preds, losses = [],[],[]
        for X, y in val_loader:
            preds = model(X)
            loss = loss_fn(preds, y)
            losses.append(loss.item())

            y_shuffled.append(y)
            y_preds.append(preds.argmax(dim=-1))

        y_shuffled = torch.cat(y_shuffled)
        y_preds = torch.cat(y_preds)

        print("Valid Loss: {:.3f}".format(torch.tensor(losses).mean()))
        print("Valid Accuracy: {:.3f}".format(accuracy_score(y_shuffled.detach().numpy(), y_preds.detach().numpy())))


def TrainModel(model, loss_fn, optimizer, train_loader, val_loader, epochs=10):
    for i in range(1, epochs+1):
        losses = []
        for X, y in tqdm(train_loader):
            y_preds = model(X)

            loss = loss_fn(y_preds, y)
            losses.append(loss.item())

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        print("Train Loss : {:.3f}".format(torch.tensor(losses).mean()))
        CalcValLossAndAccuracy(model, loss_fn, val_loader)

In [17]:
from torch.optim import Adam

epochs = 8
learning_rate = 1e-4

loss_fn = nn.CrossEntropyLoss()
text_classifier = TextClassifier()
optimizer = Adam(text_classifier.parameters(), lr=learning_rate)

TrainModel(text_classifier, loss_fn, optimizer, train_loader, test_loader, epochs)

100%|██████████| 25/25 [00:00<00:00, 33.07it/s]


Train Loss : 1.099
Valid Loss: 1.099
Valid Accuracy: 0.323


100%|██████████| 25/25 [00:00<00:00, 28.25it/s]


Train Loss : 1.098
Valid Loss: 1.099
Valid Accuracy: 0.323


100%|██████████| 25/25 [00:00<00:00, 48.09it/s]


Train Loss : 1.097
Valid Loss: 1.099
Valid Accuracy: 0.333


100%|██████████| 25/25 [00:00<00:00, 49.43it/s]


Train Loss : 1.096
Valid Loss: 1.099
Valid Accuracy: 0.348


100%|██████████| 25/25 [00:00<00:00, 26.67it/s]


Train Loss : 1.093
Valid Loss: 1.099
Valid Accuracy: 0.349


100%|██████████| 25/25 [00:00<00:00, 41.94it/s]


Train Loss : 1.090
Valid Loss: 1.099
Valid Accuracy: 0.343


100%|██████████| 25/25 [00:00<00:00, 49.82it/s]


Train Loss : 1.085
Valid Loss: 1.099
Valid Accuracy: 0.346


100%|██████████| 25/25 [00:00<00:00, 51.56it/s]


Train Loss : 1.078
Valid Loss: 1.099
Valid Accuracy: 0.346


In [18]:
def MakePredictions(model, loader):
    y_shuffled, y_preds = [], []
    for X, y in loader:
        preds = model(X)
        y_preds.append(preds)
        y_shuffled.append(y)
    gc.collect()
    y_preds, y_shuffled = torch.cat(y_preds), torch.cat(y_shuffled)

    return y_shuffled.detach().numpy(), F.softmax(y_preds, dim=-1).argmax(dim=-1).detach().numpy()

y_actual, y_preds = MakePredictions(text_classifier, test_loader)

In [19]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

print("Test Accuracy : {}".format(accuracy_score(y_actual, y_preds)))
print("\nClassification Report : ")
print(classification_report(y_actual, y_preds, target_names=['OptionA', 'OptionB', 'OptionC']))
print("\nConfusion Matrix : ")
print(confusion_matrix(y_actual, y_preds))

Test Accuracy : 0.345625

Classification Report : 
              precision    recall  f1-score   support

     OptionA       0.34      0.42      0.37       516
     OptionB       0.33      0.08      0.13       538
     OptionC       0.35      0.53      0.43       546

    accuracy                           0.35      1600
   macro avg       0.34      0.35      0.31      1600
weighted avg       0.34      0.35      0.31      1600


Confusion Matrix : 
[[217  47 252]
 [216  45 277]
 [211  44 291]]


In [None]:
# Stuff I tried for the options.

In [None]:
# all_options = []
# for option in list(zip(df['OptionA'], df['OptionB'], df['OptionB'])):
#     a = text_to_tensor(option[0], vocab)
#     b = text_to_tensor(option[1], vocab)
#     c = text_to_tensor(option[2], vocab)

#     all_options.append((a, b, c))

# all_options[1] # Example

In [None]:
# import torch
# from torch.utils.data import TensorDataset

# ids = torch.tensor(df.index)
# options = torch.tensor(all_options)
# labels = torch.tensor(df.label)

# train = TensorDataset(ids, options, labels)

In [None]:
# TODO: Answers: A, B, C -> Features
# TODO: Sentences: OptionA, OptionB, OptionC -> Features

# from transformers import BertTokenizer
# import itertools
# import torch

# tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')

# features = {}

# # Convert to tokens !!(Now set on 500)
# for sent_idx, sentence in df[:500].iterrows():
#     sent_tokens = tokenizer.tokenize(sentence['sent'])

#     option_a = tokenizer.tokenize(sentence[0])
#     option_b = tokenizer.tokenize(sentence[1])
#     option_c = tokenizer.tokenize(sentence[2])

#     options = list(itertools.chain(option_a, option_b, option_c))

#     features[sent_idx] = {
#         "Sentence": sent_tokens,
#         "Options": options,
#         "Label": sentence.label
#     }

# print(features[5]) # Just for testing