In [9]:
import numpy as np
import pandas as pd
import tensorflow as tf
import torch
from datasets import load_dataset, load_metric, DatasetDict, Dataset
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer, pipeline
import evaluate
import os
# Ask PyTorch whether a CUDA device is usable; as the last expression of the
# cell, the boolean result is shown as the cell output.
torch.cuda.is_available()


False

In [5]:
# Random seed used for shuffling, and number of examples kept per split
SEED = 12
SIZE = 500

# Load the DistilBERT tokenizer (adds input ids and attention masks) and the IMDB dataset
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
imdb = load_dataset("imdb")

# Shuffle each split with the fixed seed and keep the first SIZE rows.
# `select` accepts a range directly, so no intermediate index list is built.
train_small = imdb['train'].shuffle(seed=SEED).select(range(SIZE))
test_small = imdb['test'].shuffle(seed=SEED).select(range(SIZE))

# tokenizer function
def preprocess_function(examples):
    """Tokenize the ``"text"`` entries of a batch with truncation enabled.

    Args:
        examples: A mapping with a ``"text"`` key, as passed by
            ``Dataset.map`` for each batch.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True)

# Run the tokenizer over both small subsets in batched mode (train first, then test)
train_tokenized, test_tokenized = (
    subset.map(preprocess_function, batched=True)
    for subset in (train_small, test_small)
)


Found cached dataset imdb (C:/Users/hashi/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)
100%|██████████| 3/3 [00:00<00:00, 109.50it/s]
Loading cached shuffled indices for dataset at C:\Users\hashi\.cache\huggingface\datasets\imdb\plain_text\1.0.0\d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0\cache-23a9476c8f68f7b3.arrow
Loading cached shuffled indices for dataset at C:\Users\hashi\.cache\huggingface\datasets\imdb\plain_text\1.0.0\d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0\cache-fe2bb30f2aa00111.arrow
Loading cached processed dataset at C:\Users\hashi\.cache\huggingface\datasets\imdb\plain_text\1.0.0\d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0\cache-5fedd185abbc3477.arrow
                                                               

In [6]:
# Show the features and row count of both tokenized splits
print(f'train_keys: {train_tokenized}\ntest_keys: {test_tokenized}')
# looking at an example from the dataset
ex = 212
# Fetch the single row once rather than materialising three entire columns
# just to index one element out of each.
example = train_tokenized[ex]
print(f"x: {example['text']}\ny: {example['label']}\nattention: {example['attention_mask']}")

train_keys: Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 500
})
test_keys: Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 500
})
x: This kind of film has become old hat by now, hasn't it? The whole thing is syrupy nostalgia turned in upon itself in some kind of feedback loop.<br /><br />It sure sounds like a good idea: a great ensemble cast, some good gags, and some human drama about what could have/might have been. Unfortunately, there is no central event that binds them all together, like there was in "The Big Chill", one of those seminal movies that spawned copycat films like this one. You end up wanting to see more of one or two particular people instead of getting short takes on everyone. The superficiality this creates is not just annoying, it's maddening. The below-average script doesn't help.
y: 0
attention: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [7]:
# Collator that pads the examples of a batch with the tokenizer's padding rules
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Accuracy metric loaded through the `evaluate` library
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Turn raw model scores into class ids and score them with ``accuracy``.

    Args:
        eval_pred: A pair ``(predictions, labels)``; the class id is taken
            as the argmax of ``predictions`` along axis 1.

    Returns:
        The result of ``accuracy.compute`` for the predicted ids against
        the reference labels.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

# set labels
# IMDB encodes 0 as a negative review and 1 as a positive one, so the
# id -> name map must follow that order or every prediction is reported
# with the opposite sentiment.
id2label = {0: "neg", 1: "pos"}
# Inverse map (name -> id) derived from id2label so the two cannot drift apart
label2id = {name: idx for idx, name in id2label.items()}

In [8]:
# Seed PyTorch before the model is built: the classification head is newly
# initialised (not loaded from the checkpoint), so without a seed its
# starting weights differ on every run.
torch.manual_seed(SEED)

# DistilBERT with a 2-class sequence-classification head and readable label names
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
)

# Evaluate and checkpoint once per epoch so the best checkpoint can be
# restored when training finishes.
# NOTE(review): newer transformers releases rename `evaluation_strategy`
# to `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir="bert_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
    seed=SEED,  # reuse the notebook-wide seed instead of the Trainer default
)

# NOTE(review): the test subset is used as the evaluation set that drives
# best-checkpoint selection — confirm this is intended, since it means the
# reported test accuracy is not from held-out data.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=test_tokenized,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'pre_classi

KeyboardInterrupt: 