# Exercise: Fine tune GPT2 for IMDB reviews classification

In [1]:
from datasets import load_dataset
dataset = load_dataset("stanfordnlp/imdb", split="train")
dataset = dataset.train_test_split(test_size=0.2)

In [None]:
## Get the GPT2 model + add classification head
from transformers import AutoModelForSequenceClassification, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('gpt2')   # Exercise: Repeat this for a different model. Find a suitable model.
model = AutoModelForSequenceClassification.from_pretrained('gpt2', num_labels=2)

tokenizer.pad_token = tokenizer.eos_token 
model.config.pad_token_id = tokenizer.pad_token_id

In [None]:
def tokenize_function(example):
    return tokenizer(example["text"], padding=True, truncation=True)
tokenized_dataset = dataset.map(tokenize_function)

In [12]:
# Lets check accuracy with the trained model
checkpoint = "./test-trainer/checkpoint-7500"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)  
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

# tokenizer.pad_token = tokenizer.eos_token 
# model.config.pad_token_id = tokenizer.pad_token_id

In [23]:
import evaluate
import numpy as np

def compute_metrics(eval_preds):
    metric = evaluate.load("accuracy")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [24]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [25]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    "gpt2-imdb", # output dir 
    evaluation_strategy="epoch")

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [26]:
# trainer.train()

In [27]:
predictions = trainer.predict(tokenized_dataset['test'])

In [28]:
predictions.metrics

{'test_loss': 0.11568310856819153,
 'test_model_preparation_time': 0.0018,
 'test_accuracy': 0.9766,
 'test_runtime': 110.2739,
 'test_samples_per_second': 45.342,
 'test_steps_per_second': 5.668}

In [29]:
# make an inference pipeline

from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer

# Replace with your model checkpoint directory
checkpoint = "./test-trainer/checkpoint-7500"

# Load the model and tokenizer
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Create a pipeline
nlp_pipeline = pipeline("text-classification", model=model, tokenizer=tokenizer)

Device set to use cuda:0


In [33]:
# Test the pipeline
print(nlp_pipeline("I think I will take my girlfriend to this movie!"))

[{'label': 'LABEL_1', 'score': 0.99901282787323}]


In [31]:
# Test the pipeline
print(nlp_pipeline("It actually pains me to say it, but this movie was horrible on every level. The blame does not lie entirely with Van Damme as you can see he tried his best, but let's face it, he's almost fifty, how much more can you ask of him? I find it so hard to believe that the same people who put together Undisputed 2; arguably the best (western) martial arts movie in years, created this. Everything from the plot, to the dialog, to the editing, to the overall acting was just horribly put together and in many cases outright boring and nonsensical. Scott Adkins who's fight scenes seemed more like a demo reel, was also terribly underused and not even the main villain which is such a shame because 1) He is more than capable of playing that role and 2) The actual main villain was not only not intimidating at all but also quite annoying. Again, not blaming Van Damme. I will always be a fan, but avoid this one."))

[{'label': 'LABEL_0', 'score': 0.9999393224716187}]
