In [1]:
import pandas as pd

# Read the sentences and their answers from the two CSV files.
df_sentences = pd.read_csv("common-sense/train_data.csv")
df_labels = pd.read_csv("common-sense/train_answers.csv")

# Join both tables on the shared `id` column and expose `answer` as `label`.
df = df_sentences.merge(df_labels, on="id").rename(columns={"answer": "label"})
df.head()

Unnamed: 0,id,FalseSent,OptionA,OptionB,OptionC,label
0,sentence_1,I sting a mosquito,A human is a mammal,A human is omnivorous,A human has not stings,C
1,sentence_2,A giraffe is a person.,Giraffes can drink water from a lake.,A giraffe is not a human being.,.Giraffes usually eat leaves.,B
2,sentence_3,A normal closet is larger than a walk-in closet.,Walk-in closets are normal closets.,A person can sleep in a walk-in closet.,A person cannot walk into a normal closet beca...,C
3,sentence_4,I like to ride my chocolate,Chocolate is delicious and bikes are not,"Chocolate is a food, not a transportation unit",My bike can't ride a chocolate,B
4,sentence_5,A GIRL WON THE RACE WITH HORSE,GIRL HAVE BEAUTIFUL HAIR BUT THE HORSE DOESN'T...,THE GIRL WEAR DRESS BUT THE HORSE DOESN'T HAVE .,HORSE RAN FASTER THAN HER,C


In [2]:
# Number of rows in the merged frame.
print(df.shape[0])

8000


In [3]:
# Count the rows whose answer is A, B and C respectively.
n_0 = int(df['label'].eq('A').sum())
n_1 = int(df['label'].eq('B').sum())
n_2 = int(df['label'].eq('C').sum())

# One-row summary table, one column per option.
pd.DataFrame(
    {'OptionA': [n_0], 'OptionB': [n_1], 'OptionC': [n_2]},
    index=["n_answers"],
)

Unnamed: 0,OptionA,OptionB,OptionC
n_answers,2597,2665,2738


In [4]:
from datasets import Dataset

# Convert the frame to a Dataset, split it 70/30 with a fixed seed, then
# turn the string `label` column into a class-label column on every split.
full_dataset = Dataset.from_pandas(df)
split_dataset = full_dataset.train_test_split(test_size=0.3, seed=42)
data = split_dataset.class_encode_column("label")

Flattening the indices:   0%|          | 0/6 [00:00<?, ?ba/s]

Casting to class labels:   0%|          | 0/6 [00:00<?, ?ba/s]

Flattening the indices:   0%|          | 0/3 [00:00<?, ?ba/s]

Casting to class labels:   0%|          | 0/3 [00:00<?, ?ba/s]

In [5]:
# Tally, for every split, how many examples carry each encoded label.
# Encoded labels 0 and 1 map to 'A' and 'B'; every other value is counted as 'C'.
letter_for_label = {0: 'A', 1: 'B'}

count = {}
for split_name in data:
    tally = dict.fromkeys('ABC', 0)
    for row in data[split_name]:
        tally[letter_for_label.get(row['label'], 'C')] += 1
    count[split_name] = tally

In [6]:
# Label distribution per split, one row per split.
pd.DataFrame(
    [list(count[split_name].values()) for split_name in ("train", "test")],
    columns=['OptionA', 'OptionB', 'OptionC'],
    index=["train", "test"],
)

Unnamed: 0,OptionA,OptionB,OptionC
train,1789,1872,1939
test,808,793,799


In [7]:
# Print the total and the per-option counts of every split.
for split_name, tally in count.items():
    print(split_name)
    print(f"Total: {sum(tally.values())}")
    for letter, n_examples in tally.items():
        print(f"Option{letter}\t {n_examples}")
    print()

train
Total: 5600
OptionA	 1789
OptionB	 1872
OptionC	 1939

test
Total: 2400
OptionA	 808
OptionB	 793
OptionC	 799



In [8]:
# Maps an encoded label to the name of the column holding that option.
labels = {index: 'Option' + letter for index, letter in enumerate('ABC')}

def show(example):
    """
        Print one dataset example: the sentence, its three options,
        the encoded label and the option column that label refers to.
    """
    print(f"Sentence: {example['FalseSent']}\n")

    option_lines = "\n".join(
        f"{letter}) {example['Option' + letter]}" for letter in 'ABC'
    )
    print(f"Options:\n{option_lines}")

    gold_label = example['label']
    print(f"Correct label: {gold_label}\n")
    print(f"Ground truth: {labels[gold_label]}")


In [9]:
# Display the training example at index 3.
train_example = data["train"][3]
show(train_example)

Sentence: Banana is the day before friday.

Options:
A) Bananas can be eaten anyday.
B) Banana is a food, not a day.
C) Bananas are not in the calendar.
Correct label: 1

Ground truth: OptionB


In [10]:
from transformers import AutoTokenizer

MODEL_NAME = "bert-base-uncased"

# The tokenizer is loaded from a local checkpoint directory, so that
# directory must already exist on disk before this cell runs.
checkpoint_dir = f"./models/results/{MODEL_NAME}/checkpoint-1000"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)

def preprocess_function(examples):
    """ Tokenize a batch of examples as (false sentence, option) pairs.

        Every false sentence is paired with each of its options, all pairs are
        tokenized in a single tokenizer call, and the flat tokenizer output is
        regrouped so that each example owns one entry per option.

        # Arguments
            examples: Batch of examples indexed by column name. Must contain
                "FalseSent" and every option column named in `labels`, each
                holding one value per example.
        # Output
            Dict mapping every key of the tokenizer output to a list with one
            item per example; each item is a list with one entry per option.
    """
    option_columns = list(labels.values())
    num_options = len(option_columns)

    # Repeat each sentence once per option so it lines up with the flat option list.
    first = [sentence for sentence in examples["FalseSent"] for _ in range(num_options)]

    # Walk the option columns row by row: all options of example 0, then example 1, ...
    # Each value is formatted to text exactly as before.
    per_example_options = zip(*(examples[column] for column in option_columns))
    second = [f"{option}" for options in per_example_options for option in options]

    # Truncation ensures no pair is longer than the model's maximum input length
    tokenized_examples = tokenizer(first, second, truncation=True)

    # Regroup the flat outputs into chunks of `num_options` entries per example.
    return {
        key: [values[start : start + num_options] for start in range(0, len(values), num_options)]
        for key, values in tokenized_examples.items()
    }

# Tokenize every split, handing the examples to preprocess_function in batches
tokenized = data.map(function=preprocess_function, batched=True)

  0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

In [11]:
# Decode the tokenized options of one training example to compare them with the raw text
idx = 3
example_input_ids = tokenized["train"][idx]["input_ids"]
[tokenizer.decode(example_input_ids[i]) for i in range(3)]

['[CLS] banana is the day before friday. [SEP] bananas can be eaten anyday. [SEP]',
 '[CLS] banana is the day before friday. [SEP] banana is a food, not a day. [SEP]',
 '[CLS] banana is the day before friday. [SEP] bananas are not in the calendar. [SEP]']

In [12]:
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union
import torch

@dataclass
class DataCollatorForMultipleChoice:
    """
    Data collator that will dynamically pad the inputs for multiple choice received.
    Flattens all model inputs, applies padding, then unflattens the results.

    The feature dicts passed in are left unmodified, so the same list of
    features can be collated more than once.
    """

    # Tokenizer whose `pad` method is used to pad the flattened features.
    tokenizer: PreTrainedTokenizerBase
    # Forwarded unchanged to `tokenizer.pad`.
    padding: Union[bool, str, PaddingStrategy] = True
    # Forwarded unchanged to `tokenizer.pad`.
    max_length: Optional[int] = None
    # Forwarded unchanged to `tokenizer.pad`.
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        """
        Collate a list of multiple-choice features into one padded batch.

        # Arguments
            features: Non-empty list of dicts. Each dict holds a "label" (or
                "labels") entry plus model inputs such as "input_ids", where
                every model input is a list with one sequence per choice.
        # Output
            Dict of tensors. Every model input is viewed as
            (batch_size, num_choices, -1) and "labels" is a long tensor with
            one entry per feature.
        """
        label_name = "label" if "label" in features[0].keys() else "labels"
        # Read the labels instead of popping them: popping would strip the key
        # from the caller's dicts and break a second call on the same features.
        batch_labels = [feature[label_name] for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])

        # One flat dict per (feature, choice) pair, without the label entry.
        flattened_features = [
            {k: v[i] for k, v in feature.items() if k != label_name}
            for feature in features
            for i in range(num_choices)
        ]

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        # Restore the (batch, choice) structure that was flattened for padding.
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        batch["labels"] = torch.tensor(batch_labels, dtype=torch.long)
        return batch

In [13]:
# Build a small batch from the first 10 training examples, keeping only the
# columns listed in `accepted_keys`, and run it through the collator.
accepted_keys = ["input_ids", "attention_mask", "label"]

features = []
for row in range(10):
    example = tokenized["train"][row]
    features.append({name: value for name, value in example.items() if name in accepted_keys})

collator = DataCollatorForMultipleChoice(tokenizer)
batch = collator(features)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [14]:
# Decode the padded options of batch item `idx` to inspect the padding.
padded_choices = batch["input_ids"][idx]
[tokenizer.decode(padded_choices[i].tolist()) for i in range(3)]

['[CLS] banana is the day before friday. [SEP] bananas can be eaten anyday. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]',
 '[CLS] banana is the day before friday. [SEP] banana is a food, not a day. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]',
 '[CLS] banana is the day before friday. [SEP] bananas are not in the calendar. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]']

In [18]:
# Character length of each decoded, padded option string.
[
    len(decoded_text)
    for decoded_text in (
        tokenizer.decode(batch["input_ids"][idx][i].tolist()) for i in range(3)
    )
]

[205, 193, 209]

In [22]:
# Keep the decoded, padded options of batch item `idx` for further inspection.
padded_option_ids = batch["input_ids"][idx]
x = [tokenizer.decode(padded_option_ids[i].tolist()) for i in range(3)]

In [25]:
# Display the decoded option strings stored in `x`.
x

['[CLS] banana is the day before friday. [SEP] bananas can be eaten anyday. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]',
 '[CLS] banana is the day before friday. [SEP] banana is a food, not a day. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]',
 '[CLS] banana is the day before friday. [SEP] bananas are not in the calendar. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]']

In [24]:
# Number of whitespace-separated pieces in each decoded string.
for decoded_text in x:
    pieces = decoded_text.split()
    print(len(pieces))

35
35
36
