In [1]:
# ! pip install accelerate -U

In [2]:
from argparse import ArgumentParser
from tqdm import tqdm
import csv
import re
import random
import transformers

import torch
from torch import nn, optim
# To import the Transformer Models
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, logging
from sklearn.metrics import precision_recall_fscore_support as score
import numpy as np
import pandas as pd

# Run configuration shared by the cells below.
# Row caps used when slicing the dataset splits (train[:TRAIN_SPLIT], test[:TEST_SPLIT]).
# NOTE(review): the recorded tokenization run shows only 1048 test rows, so the
# test split appears to be shorter than TEST_SPLIT — confirm.
TRAIN_SPLIT = 10000
TEST_SPLIT = 2000
# Number of training epochs passed to TrainingArguments.
EPOCHS = 10
# Per-device batch size, used for both training and evaluation.
BATCH_SIZE = 16

In [3]:
try:
  from datasets import load_dataset
except:
  !pip install datasets
  from datasets import load_dataset

train = load_dataset("CLUTRR/v1", name= "gen_train234_test2to10", split=f"train[:{TRAIN_SPLIT}]")
test = load_dataset("CLUTRR/v1", name= "gen_train234_test2to10", split=f"test[:{TEST_SPLIT}]")

In [4]:
# Turn each split into a DataFrame and add the model input column: the story,
# the query and the gender annotations joined by single spaces.
# (An alternative left by the author was to use "clean_story" alone as input.)
train_dataset = pd.DataFrame(train).assign(
    input_text=lambda frame: frame["clean_story"] + " " + frame["query"] + " " + frame["genders"]
)
test_dataset = pd.DataFrame(test).assign(
    input_text=lambda frame: frame["clean_story"] + " " + frame["query"] + " " + frame["genders"]
)
def preprocess_text(text):
    """Normalize one input string for tokenization.

    The text is lowercased, every square bracket, parenthesis and colon is
    replaced by a single space, and every single quote is deleted.

    Args:
        text: The raw string to normalize.

    Returns:
        The normalized string.
    """
    # Brackets, parentheses and colons become spaces (not deleted), so the
    # tokens they separated stay separated.
    spaced = re.sub(r'[\[\]():]', ' ', text.lower())
    # Single quotes are dropped outright.
    return spaced.replace("'", "")
# Normalize the input text of both splits with preprocess_text.
train_dataset["input_text"] = train_dataset["input_text"].apply(preprocess_text)
test_dataset["input_text"] = test_dataset["input_text"].apply(preprocess_text)
# Keep only the model input and the target, exposing the target as "labels".
train_dataset = train_dataset.loc[:, ['input_text', 'target']].rename(columns={'target': 'labels'})
test_dataset = test_dataset.loc[:, ['input_text', 'target']].rename(columns={'target': 'labels'})


In [5]:
# Preview the first five rows of each prepared split: train first, then test.
print(train_dataset.head(5), test_dataset.head(5), sep="\n")

                                          input_text  labels
0   ashley s daughter,  lillian , asked her mom t...      15
1   nancy  likes to cut the hair of her daughter ...      16
2   dale  and his sister  nancy  are decorating f...      17
3   lillian  and her sister  nancy  are the only ...      14
4   ashley  liked to go to the park with her gran...       6
                                          input_text  labels
0   clarence s granddaughter,  emily , was busy h...      10
1   emily  and her granddaughter  ashley  went to...      10
2   clarence  has 3 children, and one grandson. t...      11
3   glen  is  emily s brand new baby brother.  cl...      10
4   clarence  bought a train set for his grandson...      11


In [6]:
from datasets import Dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, logging
# Checkpoint id used for both the tokenizer here and the classification model
# loaded later in the notebook.
# NOTE(review): this is the ELECTRA *generator* checkpoint — confirm it was
# chosen deliberately over the discriminator for this classification task.
pretrained_model = "google/electra-base-generator"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model)

def preprocess_function(datatset):
    """Tokenize the "input_text" field of a batch with truncation enabled.

    Args:
        datatset: A mapping-like batch exposing an "input_text" entry.

    Returns:
        Whatever the module-level ``tokenizer`` returns for that text.
    """
    texts = datatset["input_text"]
    return tokenizer(texts, truncation=True)


def pipeline(dataframe):
    """Convert a DataFrame into a tokenized Dataset.

    The frame is wrapped in a Dataset (without its index), tokenized in
    batches by ``preprocess_function``, and the raw 'input_text' column is
    dropped afterwards.

    Args:
        dataframe: Frame holding an 'input_text' column.

    Returns:
        The tokenized Dataset without the 'input_text' column.
    """
    return (
        Dataset.from_pandas(dataframe, preserve_index=False)
        .map(preprocess_function, batched=True)
        .remove_columns('input_text')
    )

In [7]:
# Tokenize both prepared splits. Note that `tokenized_val` is built from the
# test DataFrame; it is the dataset handed to the trainer as eval_dataset.
tokenized_train = pipeline(train_dataset)
tokenized_val = pipeline(test_dataset)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1048 [00:00<?, ? examples/s]

In [8]:
# Show the tokenized training Dataset (feature names and row count).
tokenized_train

Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 10000
})

In [9]:
# Set values for model and train
# Collator built from the tokenizer; it is passed to the trainer as data_collator.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Sequence-classification model loaded from the same checkpoint as the tokenizer.
# NOTE(review): num_labels=20 is hard-coded — confirm it covers every label id
# in the dataset (the recorded evaluation reports 18 distinct classes).
model = AutoModelForSequenceClassification.from_pretrained(pretrained_model, num_labels=20)

class CustomTrainer(Trainer):
    """Trainer whose loss is an unweighted cross-entropy over the model logits."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the cross-entropy loss for one batch.

        Args:
            model: The model being trained; called as ``model(**inputs)``.
            inputs: Batch mapping that must contain a "labels" entry.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                just the loss.
            num_items_in_batch: Accepted for compatibility with newer
                ``transformers`` releases, which pass this argument to
                ``compute_loss``; without it the override raises TypeError
                there. It is not used by this loss.

        Returns:
            The loss tensor, or ``(loss, outputs)`` if ``return_outputs`` is true.
        """
        outputs = model(**inputs)
        logits = outputs.logits
        labels = inputs.get("labels")
        loss_fn = torch.nn.CrossEntropyLoss()
        loss = loss_fn(logits, labels)
        return (loss, outputs) if return_outputs else loss

# Fine-tuning configuration: a checkpoint is saved under ./results after every
# epoch, and reporting to external trackers is turned off.
training_args = TrainingArguments(
    output_dir="./results",
    report_to="none",
    save_strategy="epoch",
    # Schedule and batch sizes come from the notebook-level constants.
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    # Optimizer settings.
    optim="adamw_torch",
    learning_rate=2e-5,
    weight_decay=0.01,
)

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

%time trainer.train()


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-generator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/6250 [00:00<?, ?it/s]

{'loss': 2.8314, 'grad_norm': 26.08255958557129, 'learning_rate': 1.8400000000000003e-05, 'epoch': 0.8}
{'loss': 2.3486, 'grad_norm': 27.70707893371582, 'learning_rate': 1.6800000000000002e-05, 'epoch': 1.6}
{'loss': 2.1193, 'grad_norm': 34.18219757080078, 'learning_rate': 1.5200000000000002e-05, 'epoch': 2.4}
{'loss': 1.9791, 'grad_norm': 47.67961120605469, 'learning_rate': 1.3600000000000002e-05, 'epoch': 3.2}
{'loss': 1.7765, 'grad_norm': 62.932594299316406, 'learning_rate': 1.2e-05, 'epoch': 4.0}
{'loss': 1.6029, 'grad_norm': 166.15390014648438, 'learning_rate': 1.04e-05, 'epoch': 4.8}
{'loss': 1.4354, 'grad_norm': 114.10753631591797, 'learning_rate': 8.8e-06, 'epoch': 5.6}
{'loss': 1.2649, 'grad_norm': 74.26197814941406, 'learning_rate': 7.2000000000000005e-06, 'epoch': 6.4}
{'loss': 1.1405, 'grad_norm': 87.5284194946289, 'learning_rate': 5.600000000000001e-06, 'epoch': 7.2}
{'loss': 1.0115, 'grad_norm': 187.13925170898438, 'learning_rate': 4.000000000000001e-06, 'epoch': 8.0}
{'l

TrainOutput(global_step=6250, training_loss=1.5797543994140626, metrics={'train_runtime': 1523.77, 'train_samples_per_second': 65.627, 'train_steps_per_second': 4.102, 'train_loss': 1.5797543994140626, 'epoch': 10.0})

In [12]:
# Evaluate the fine-tuned model on the held-out split.
# Predictions are made on a copy without the 'labels' column; the labels are
# read once from `tokenized_val` and reused for every metric below.
tokenized_test = tokenized_val.remove_columns('labels')
true_labels = list(tokenized_val['labels'])

preds = trainer.predict(tokenized_test)
# Predicted class = index of the highest logit for each example.
preds_flat = np.argmax(preds.predictions, axis=-1).tolist()

# Per-class metrics. The arrays are ordered by the sorted set of labels that
# occur in either the true or the predicted labels, so position i is not
# necessarily label id i. zero_division=0 reports 0.0 for classes with no
# predicted (or no true) samples instead of emitting an undefined-metric warning.
precision, recall, fscore, support = score(true_labels, preds_flat, zero_division=0)

print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))
print('support: {}'.format(support))

# Calculate accuracy
correct_predictions = int(np.sum(np.asarray(preds_flat) == np.asarray(true_labels)))
total_predictions = len(preds_flat)
# Guard against an empty evaluation set rather than dividing by zero.
accuracy = correct_predictions / total_predictions if total_predictions else float("nan")
print(accuracy)

precision: [0.27777778 0.         0.36       0.16929134 0.21189591 0.5952381
 0.51515152 0.55555556 0.21052632 0.         0.7804878  0.61702128
 0.02857143 0.2        0.34615385 0.6        0.125      0.31578947]
recall: [0.04545455 0.         0.28125    0.61428571 0.73076923 0.49019608
 0.39534884 0.16129032 0.12       0.         0.49230769 0.70731707
 0.33333333 0.41176471 0.05625    0.6        0.19047619 0.08759124]
fscore: [0.078125   0.         0.31578947 0.2654321  0.32853026 0.53763441
 0.44736842 0.25       0.15286624 0.         0.60377358 0.65909091
 0.05263158 0.26923077 0.09677419 0.6        0.1509434  0.13714286]
support: [110   3  32  70  78  51  43  31 100   5  65  82   3  17 160  40  21 137]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


0.3053435114503817
