## Imports


In [1]:
import gc
import torch

gc.collect()

torch.cuda.empty_cache()

In [2]:
from datasets import Dataset
import pandas as pd
import numpy as np

from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
import evaluate

In [3]:
data_dir = "E:\Projects\listops-1000\\"

## Importing Data

In [4]:
train_df = pd.read_csv(data_dir + "basic_train.tsv", sep="\t", nrows=100)
train_df = pd.DataFrame(train_df)

val_df = pd.read_csv(data_dir + "basic_val.tsv", sep="\t")
val_df = pd.DataFrame(val_df)

test_df = pd.read_csv(data_dir + "basic_test.tsv", sep="\t")
test_df = pd.DataFrame(test_df)


train_dataset = Dataset.from_pandas(train_df, split="train")
val_dataset = Dataset.from_pandas(val_df, split="val")
test_dataset = Dataset.from_pandas(test_df, split="test")



In [5]:
from transformers import AutoTokenizer

model_name = "distilbert-base-uncased"
batch_size = 64
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [6]:
def preprocess_function(examples, func_tokenizer):
    return func_tokenizer(examples["Source"], truncation=True, max_length=512)

tokenized_train_dataset = train_dataset.map(
        preprocess_function,
        batched=True,
        # num_proc=20,
        fn_kwargs={'func_tokenizer': tokenizer}
)

tokenized_val_dataset = val_dataset.map(
        preprocess_function,
        batched=True,
        # num_proc=20,
        fn_kwargs={'func_tokenizer': tokenizer}
)

tokenized_test_dataset = test_dataset.map(
        preprocess_function,
        batched=True,
        # num_proc=20,
        fn_kwargs={'func_tokenizer': tokenizer}
)

tokenized_train_dataset = tokenized_train_dataset.rename_column("Source", "text")
tokenized_train_dataset = tokenized_train_dataset.rename_column("Target", "label")

tokenized_val_dataset = tokenized_val_dataset.rename_column("Source", "text")
tokenized_val_dataset = tokenized_val_dataset.rename_column("Target", "label")

tokenized_test_dataset = tokenized_test_dataset.rename_column("Source", "text")
tokenized_test_dataset = tokenized_test_dataset.rename_column("Target", "label")

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [7]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=10,
)

training_args = TrainingArguments(
        output_dir="./results",
        learning_rate=2e-5,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        num_train_epochs=10,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        report_to="none"              
)

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    acc = accuracy.compute(predictions=predictions, references=labels)
    return {"acc": acc}

trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train_dataset,
        eval_dataset=tokenized_val_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

trainer.train()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

{'eval_loss': 2.284865617752075, 'eval_acc': {'accuracy': 0.1715}, 'eval_runtime': 102.9391, 'eval_samples_per_second': 19.429, 'eval_steps_per_second': 0.311, 'epoch': 1.0}


  0%|          | 0/32 [00:00<?, ?it/s]

{'eval_loss': 2.2777538299560547, 'eval_acc': {'accuracy': 0.1715}, 'eval_runtime': 97.0986, 'eval_samples_per_second': 20.598, 'eval_steps_per_second': 0.33, 'epoch': 2.0}


  0%|          | 0/32 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
trainer.evaluate(tokenized_test_dataset)


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 2.251964807510376, 'eval_acc': {'accuracy': 0.178}, 'eval_runtime': 858.3778, 'eval_samples_per_second': 2.33, 'eval_steps_per_second': 0.291, 'epoch': 0.18}


{'eval_loss': 2.251964807510376,
 'eval_acc': {'accuracy': 0.178},
 'eval_runtime': 858.3778,
 'eval_samples_per_second': 2.33,
 'eval_steps_per_second': 0.291,
 'epoch': 0.176}