## Read First
This is the CPU version of the project notebook's solution. Use it only if you can't access Google Colab (`colab.research.google.com`). This notebook finetunes DistilGPT2 on the CPU, or on MPS (Apple's Metal GPU backend) if you're using a Mac with an Apple silicon chip such as the M1 (we'll walk you through it).

## Setup

In [None]:
!pip install -q -U transformers
!pip install -q -U accelerate
!pip install -q -U datasets
!pip install -q -U pandas 
!pip install -q -U torch 
!pip install -q -U sklearn


In [None]:
import random
import torch
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
def set_seed(seed=42):
    """Seed Python's `random` module and PyTorch with the same value.

    Args:
        seed: Integer passed to both `random.seed` and `torch.manual_seed`.
    """
    for seed_fn in (random.seed, torch.manual_seed):
        seed_fn(seed)


# Seed both generators once, with the default value, before anything else runs.
set_seed()

In [None]:
# STEP 1. Check and make sure you're using the right model & notebook.
# `gpt2` holds the Hub ID of the DistilGPT2 checkpoint; `model_name` is the
# name the model- and tokenizer-loading cells read.
gpt2 = "distilbert/distilgpt2"
model_name = gpt2

## EDA

In [None]:
# Read the chunk CSV (path is relative to the working directory) and preview
# the first rows.
df = pd.read_csv("frankenstein_chunks.csv")
df.head()

In [None]:
# Structural overview. `DataFrame.info()` prints its report itself and returns
# None, so it is called bare; wrapping it in print() adds a stray "None" line.
print("Dataframe Info:")
df.info()
print("\n")
print("Dataframe Description:")
print(df.describe())
print("\n")
print("Number of unique values in each column:")
print(df.nunique())

# Show one randomly chosen chunk. Only a cell's final expression is displayed
# automatically, so a sample taken mid-cell has to be printed explicitly.
random_index = random.randint(0, len(df) - 1)
print("\n")
print("Random sample chunk:")
print(df.loc[random_index, 'text'])

# Keep only the first half of the rows for the rest of the notebook
# (presumably to keep CPU training time manageable).
# NOTE: this rebinds `df`, so re-running this cell halves the data again;
# re-run the cell that loads the CSV first if you need to repeat it.
df = df[:len(df)//2]

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# now we'll quickly convert this to a train/test split
from sklearn.model_selection import train_test_split

# Pin the shuffle with an explicit random_state. scikit-learn draws from
# NumPy's random state, which set_seed() does not touch (it only seeds
# `random` and torch), so without this the split changes on every run and the
# perplexity numbers are not comparable between runs.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# STEP 2. Convert the train_df and test_df from Pandas into Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)


## Model Import

In [None]:
# STEP 3. Load in the model. Check and make sure it's on the CPU.
# On Apple-silicon Macs, PyTorch reaches the GPU through the "mps" backend
# (Metal Performance Shaders) instead of CUDA. The check below plays the same
# role for MPS that torch.cuda.is_available() plays for CUDA.
if torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

model = AutoModelForCausalLM.from_pretrained(model_name)
model.to(device)
# Display where the weights ended up.
model.device


## Tokenizing the data

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token for padding (GPT-2 style tokenizers do not
# define a dedicated pad token).
tokenizer.pad_token = tokenizer.eos_token


def tokenize_batch(examples):
  """Tokenize the "text" column of one batch handed over by `Dataset.map`.

  No padding is applied here: `DataCollatorForLanguageModeling` pads every
  training batch to that batch's own longest sequence, so padding the whole
  map batch to its longest row would only add pad tokens for the model to
  process.

  Args:
    examples: Mapping with a "text" entry holding a list of strings.

  Returns:
    The tokenizer output for those strings, truncated to the tokenizer's
    maximum length.
  """
  return tokenizer(examples["text"], truncation=True)


# STEP 4. Tokenize the train and test sets.
tokenized_train_dataset = train_dataset.map(tokenize_batch, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_batch, batched=True)



## Base Model Evaluation

In [None]:
def generate_text(prompt, max_new_tokens=100):
  """Generate a continuation of `prompt` with the notebook's global `model`.

  Args:
    prompt: Text to continue.
    max_new_tokens: Upper bound on the number of generated tokens
      (defaults to 100, the value this notebook has always used).

  Returns:
    The decoded sequence (prompt plus continuation) with special tokens
    stripped.
  """
  # Trainer.train() leaves the model in training mode. Switch to eval mode so
  # dropout does not perturb the finetuned model's generation.
  model.eval()
  inputs = tokenizer(prompt, return_tensors="pt").to(device)
  outputs = model.generate(
      **inputs,
      max_new_tokens=max_new_tokens,
      # Name the pad token explicitly so generate() does not have to fall back
      # to the EOS token with a warning on every call.
      pad_token_id=tokenizer.eos_token_id,
  )
  return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
# STEP 5. Generate a completion with the base model for informal evaluation.
# The result is kept so it can be printed next to the finetuned output later.
base_prompt = "I'm afraid I've created a "
base_generation = generate_text(base_prompt)
base_generation

In [None]:
def calc_perplexity(model):
  """Return the mean per-row perplexity of `model` over `test_dataset`.

  Every row's "text" is tokenized and scored on its own, using the input ids
  as labels. The row's perplexity is exp(loss), and the per-row values are
  averaged.

  Args:
    model: Causal language model living on `device`. It is called as
      `model(**inputs, labels=input_ids)` and must return an object with a
      `.loss` attribute.

  Returns:
    The average perplexity, as the tensor produced by the accumulation.
  """
  # Score in eval mode. After Trainer.train() the model is still in training
  # mode, and active dropout would inflate the finetuned model's loss and make
  # the comparison against the base model unfair.
  was_training = model.training
  model.eval()
  try:
    total_perplexity = 0
    for row in test_dataset:
      # truncation=True mirrors the training tokenization and keeps an
      # over-long row from exceeding the model's context window.
      inputs = tokenizer(row['text'], return_tensors="pt", truncation=True).to(device)
      input_ids = inputs["input_ids"]
      with torch.no_grad():
        outputs = model(**inputs, labels=input_ids)
      loss = outputs.loss
      # STEP 6. Complete the equation for perplexity.
      perplexity = torch.exp(loss)
      total_perplexity += perplexity
  finally:
    # Hand the model back in whatever mode the caller had it in.
    model.train(was_training)

  num_test_rows = len(test_dataset)
  avg_perplexity = total_perplexity / num_test_rows
  return avg_perplexity

# Baseline perplexity, measured before any finetuning.
base_ppl = calc_perplexity(model)
base_ppl

## Training

In [None]:
import transformers

# Padding reuses the end-of-sequence token.
tokenizer.pad_token = tokenizer.eos_token
# The key/value cache only benefits generation, so it is switched off for
# training and restored once training is done.
model.config.use_cache = False

trainer = transformers.Trainer(
    train_dataset=tokenized_train_dataset,
    model=model,
    args=transformers.TrainingArguments(
        warmup_steps=200,
        logging_steps=1,
        save_steps=200,
        output_dir="outputs",
      # STEP 7. Configure the training arguments.
        per_device_train_batch_size=2,
        num_train_epochs=2,
        learning_rate=2e-5,
        # "adamw_hf" selected transformers' own AdamW, which is deprecated and
        # no longer available in recent releases. "adamw_torch" uses
        # torch.optim.AdamW and works across versions.
        optim="adamw_torch",
    ),
    # mlm=False makes the collator build causal-LM batches.
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
# STEP 8. Finetune the model.
train_result = trainer.train()

# Put the model back into inference shape for the evaluation cells:
# re-enable the generation cache and leave training mode (dropout off).
model.config.use_cache = True
model.eval()

train_result

## Evaluating the finetuned model

In [None]:

# STEP 9. Generate a completion with the finetuned model and compare it to the base generation.
ft_generation = generate_text("I'm afraid I've created a ")

# Print both completions one after the other, base model first.
for label, completion in (
    ("Base model generation: ", base_generation),
    ("Finetuned generation: ", ft_generation),
):
    print(label + completion)

In [None]:
# STEP 10. Calculate the finetuned model's perplexity and compare it to the base model's.
ft_ppl = calc_perplexity(model)
# `!s` forces str() on each value, so the printed text matches plain string
# concatenation of str(value).
print(f"Base model perplexity: {base_ppl!s}")
print(f"Finetuned model perplexity: {ft_ppl!s}")