# Preprocessing the IMDB Dataset for Sentiment Analysis with DistilBERT
We will use DistilBERT, a smaller and faster version of BERT.

In [None]:
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification ,TrainingArguments, Trainer
from datasets import load_dataset
import evaluate
import numpy as np

# Load the IMDB dataset; its "train" and "test" splits are used below.
dataset = load_dataset(path="imdb")

In [None]:
# Pull out the two splits in one step: training data and held-out test data.
df_train, df_test = dataset["train"], dataset["test"]

In [None]:
# Show a summary of the training split (rendered as the cell's output).
df_train

In [None]:
# Show a summary of the test split (rendered as the cell's output).
df_test

Load the DistilBERT tokenizer from the Hugging Face Transformers library, and create a data collator that pads the examples in each batch.

In [None]:
# Tokenizer matching the "distilbert-base-uncased" checkpoint used for the model.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Collator built around that tokenizer; it is handed to the Trainer to assemble batches.
data_collator = DataCollatorWithPadding(tokenizer)

Define a preprocessing function that tokenizes the text data and prepares it for input into the DistilBERT model.

In [None]:
def preprocess_function(examples):
    """Tokenize the ``"text"`` field of ``examples`` with truncation enabled.

    Uses the module-level ``tokenizer`` and returns its output unchanged.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

Apply the preprocessing function to the train and test splits using the `map` method.

> `batched=True` processes multiple samples at once for efficiency.

In [None]:
# Tokenize the training split, handing examples to the function in batches.
tokenized_train = df_train.map(function=preprocess_function, batched=True)

In [None]:
# Tokenize the test split the same way as the training split.
tokenized_test = df_test.map(function=preprocess_function, batched=True)

In [None]:
# Print both tokenized splits, one per line, to check the columns added by tokenization.
print(tokenized_train, tokenized_test, sep="\n")

Define metrics for evaluation using accuracy and f1 from the `evaluate` library.

In [None]:
# Load both metrics in order (accuracy first, then F1); compute_metrics uses them.
accuracy_metric, f1_metric = (
    evaluate.load(metric_name) for metric_name in ("accuracy", "f1")
)

In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy and F1 for one evaluation pass.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``. The predicted
            class for each example is the index of its largest logit along the
            last axis.

    Returns:
        A dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    # No debug print of ``eval_pred`` here: it would dump the complete logits
    # and label arrays to the output on every evaluation.
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)["accuracy"]
    f1 = f1_metric.compute(predictions=predictions, references=labels)["f1"]
    return {"accuracy": accuracy, "f1": f1}

# Load model 

In [None]:
# Load DistilBERT for 2-label sequence classification.
# The label names go into the config at load time so that every checkpoint the
# Trainer writes carries them, not only a model saved by hand after training.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
    id2label={0: "NEGATIVE", 1: "POSITIVE"},
    label2id={"NEGATIVE": 0, "POSITIVE": 1},
)

# Define training arguments

In [None]:
# Settings for the fine-tuning run.
training_args = TrainingArguments(
    # Output location, with saving and evaluation both on an "epoch" schedule.
    output_dir="../models/imdb-distilbert",
    save_strategy="epoch",
    eval_strategy="epoch",
    push_to_hub=False,
    # Optimisation settings.
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Same per-device batch size for training and evaluation.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
)

In [None]:
# Bundle the model, run settings, tokenized data, and metric callback into a Trainer.
# NOTE(review): recent transformers releases appear to favour `processing_class=`
# over `tokenizer=` — confirm against the installed version before changing it.
trainer = Trainer(
    model=model,
    args=training_args,
    # Text handling: tokenizer plus the collator built from it.
    tokenizer=tokenizer,
    data_collator=data_collator,
    # Data: train on the tokenized train split, evaluate on the tokenized test split.
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    # Reports accuracy and F1 at each evaluation.
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning; the return value is shown as the cell's output.
trainer.train()

In [None]:
# Store human-readable label names in the model config before saving.
model.config.id2label = {0: "NEGATIVE", 1: "POSITIVE"}
model.config.label2id = {"NEGATIVE": 0, "POSITIVE": 1}

# Save into the Trainer's final checkpoint folder, which is named after the last
# global step. Deriving the name from the trainer state (instead of hard-coding
# "checkpoint-3126") keeps this correct if the batch size, epoch count, dataset
# size, or number of devices changes.
final_checkpoint_dir = f"{training_args.output_dir}/checkpoint-{trainer.state.global_step}"
model.save_pretrained(final_checkpoint_dir)
tokenizer.save_pretrained(final_checkpoint_dir)