<a href="https://colab.research.google.com/github/jtghchau/TextClassification/blob/main/Finetuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
!pip install datasets
!pip install transformers
!pip install evaluate
import evaluate

from google.colab import runtime

from datasets import load_dataset, Dataset
# Load the Tiny Shakespeare corpus and take the text stored in the first
# record of its "train" split.
dataset = load_dataset("tiny_shakespeare")

train_text = dataset["train"]["text"][0]
print(train_text[:500])

# Turn the corpus into one example per line, skipping blank/whitespace-only lines.
lines = [line for line in train_text.split("\n") if line.strip()]
line_dataset = Dataset.from_dict({"text": lines})

# Hold out 10% of the lines for validation. The seed is fixed (same value the
# later shuffle uses) so that a Restart & Run All reproduces the same split
# instead of drawing a new random one on every run.
data_split = line_dataset.train_test_split(test_size=0.1, seed=42)

train_data = data_split["train"]
val_data = data_split["test"]

print(f"Train size: {len(train_data)}")
print(f"Validation size: {len(val_data)}")

from transformers import AutoTokenizer

# Name of the pretrained checkpoint; the tokenizer is loaded from it here.
model_name = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=True,
)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    """Tokenize a batch of examples for `Dataset.map(..., batched=True)`.

    Args:
        examples: Mapping with a "text" entry holding the batch's strings.

    Returns:
        Whatever the module-level `tokenizer` returns for that batch, with
        each sequence truncated to at most 64 tokens.
    """
    texts = examples["text"]
    # 64-token cap (already lowered from 128 while chasing memory crashes).
    encode_options = {"truncation": True, "max_length": 64}
    return tokenizer(texts, **encode_options)

# Tokenize both splits in batches (train first, then validation); the raw
# "text" column is dropped so only the tokenizer's outputs remain.
train_dataset, val_dataset = (
    split.map(tokenize_function, batched=True, remove_columns=["text"])
    for split in (train_data, val_data)
)

# Sanity check: show one tokenized training example.
print(train_dataset[0])

# Keep a fixed, seeded random subset of 2000 training examples.
shuffled_train = train_dataset.shuffle(seed=42)
train_dataset = shuffled_train.select(range(2000))
del shuffled_train

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor
Train size: 26317
Validation size: 2925


Map:   0%|          | 0/26317 [00:00<?, ? examples/s]

Map:   0%|          | 0/2925 [00:00<?, ? examples/s]

{'input_ids': [1870, 1309, 467, 416, 262, 8674, 13], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}


In [None]:
#Part 2 of Mini-Project
import math
import matplotlib.pyplot as plt
from transformers import TrainingArguments, Trainer, AutoModelForCausalLM, DataCollatorForLanguageModeling, TrainerCallback

# Clear PyTorch's cached GPU memory before training; earlier runs of this
# notebook ran out of GPU memory quickly.
# NOTE(review): this presumably only releases cached blocks that are no longer
# in use and does not lower what training/evaluation itself needs — confirm
# against the PyTorch docs.
import torch
torch.cuda.empty_cache()

def compute_metrics(eval_pred):
    """Compute perplexity from the logits and labels collected during evaluation.

    The previous version called ``float(eval_pred.predictions[0])``, but
    ``predictions`` holds the model's logits rather than a loss, so that call
    either raised or produced a meaningless number. This version computes the
    mean next-token cross-entropy itself and exponentiates it.

    Args:
        eval_pred: Object exposing ``predictions`` (logits shaped
            ``(batch, seq_len, vocab)``, optionally wrapped in a tuple/list
            whose first item is the logits) and ``label_ids`` (token ids
            shaped ``(batch, seq_len)``, with ``-100`` marking positions to
            ignore).

    Returns:
        ``{"perplexity": value}``. The value is ``nan`` when no label
        position is usable and ``inf`` if the exponent overflows.

    Note:
        Supplying a ``compute_metrics`` function makes the Trainer keep the
        full-vocabulary logits of every evaluation example in memory. When
        memory is tight, ``math.exp(eval_loss)`` gives perplexity without that
        cost.
    """
    import numpy as np

    logits = eval_pred.predictions
    if isinstance(logits, (tuple, list)):
        # Some models return extra outputs next to the logits; logits come first.
        logits = logits[0]
    logits = np.asarray(logits)
    labels = np.asarray(eval_pred.label_ids)

    total_nll = 0.0
    token_count = 0
    # Work one sequence at a time so the float64 copies stay small.
    for row_logits, row_labels in zip(logits, labels):
        # Causal LM: the logits at position t predict the token at t + 1.
        targets = row_labels[1:]
        keep = targets != -100
        if not keep.any():
            continue
        kept_logits = row_logits[:-1][keep].astype(np.float64)
        kept_targets = targets[keep]
        # Numerically stable log-sum-exp for the softmax normaliser.
        row_max = kept_logits.max(axis=-1, keepdims=True)
        log_norm = row_max[:, 0] + np.log(np.exp(kept_logits - row_max).sum(axis=-1))
        target_logits = kept_logits[np.arange(len(kept_targets)), kept_targets]
        total_nll += float((log_norm - target_logits).sum())
        token_count += int(keep.sum())

    if token_count == 0:
        return {"perplexity": float("nan")}

    try:
        perplexity = math.exp(total_nll / token_count)
    except OverflowError:
        perplexity = float("inf")
    return {"perplexity": perplexity}

class TextGenerationCallback(TrainerCallback):
    """Print a short sample continuation of a fixed prompt after evaluation.

    Gives a qualitative view of how the generated text changes as training
    progresses, alongside the numeric losses.
    """

    def on_evaluate(self, args, state, control, model=None, tokenizer=None,
                    processing_class=None, **kwargs):
        """Generate and print up to 50 tokens continuing "Wherefore art thou".

        Args:
            args: Training arguments passed by the Trainer (unused).
            state: Trainer state; ``state.global_step`` is read.
            control: Trainer control object (unused).
            model: The model being trained.
            tokenizer: Tokenizer, when the Trainer passes it under this name.
            processing_class: Tokenizer under the name that newer transformers
                releases appear to use instead of ``tokenizer`` — presumably
                only one of the two is supplied; verify against the installed
                version.
            **kwargs: Any other keyword arguments the Trainer forwards.
        """
        text_tokenizer = tokenizer if tokenizer is not None else processing_class
        if model is None or text_tokenizer is None:
            # Nothing to sample with; skip instead of breaking the training run.
            return

        if state.global_step % 100 == 0:
            prompt = "Wherefore art thou"
            # Inputs must live on the same device as the model; leaving them on
            # the CPU fails with a device-mismatch error once the model is on GPU.
            encoded = text_tokenizer(prompt, return_tensors="pt").to(model.device)
            output = model.generate(
                input_ids=encoded["input_ids"],
                attention_mask=encoded["attention_mask"],
                max_length=50,
                num_return_sequences=1,
            )
            print(f"Step {state.global_step}: {text_tokenizer.decode(output[0], skip_special_tokens=True)}")

# Training configuration, grouped by concern. Values are unchanged.
trainingArguments = TrainingArguments(
    # Where checkpoints are written, and how often.
    output_dir="./distilgpt2-finetuned-shakespeare",
    save_steps=500,
    # Training length (epochs were lowered from 5).
    num_train_epochs=3,
    # Small per-device batches with gradient accumulation and fp16, both added
    # while trying to reduce memory use.
    per_device_train_batch_size=2,
    gradient_accumulation_steps=24,
    fp16=True,
    # Step-based evaluation every 100 steps.
    eval_strategy="steps",
    eval_steps=100,
    per_device_eval_batch_size=2,
    eval_accumulation_steps=10,
    # Training-loss logging interval.
    logging_steps=50,
)

import inspect

model = AutoModelForCausalLM.from_pretrained(model_name)

# mlm=False selects causal (next-token) language modelling.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# The cell output showed a warning pointing at the Trainer(...) call, which
# looks like the deprecation of the `tokenizer=` keyword in favour of
# `processing_class=`. Use whichever keyword the installed transformers
# version actually accepts so this works on both old and new releases.
tokenizer_kwarg = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

# Memory note (answers the out-of-memory problem this cell kept hitting):
# `compute_metrics` is deliberately NOT passed to the Trainer. When it is
# supplied, the Trainer has to keep the full-vocabulary logits of every
# validation example so it can hand them to the metric function, which can
# amount to many gigabytes for this validation set. Without it, evaluation
# only tracks the loss, and perplexity is simply exp(eval_loss).
trainer = Trainer(
    model=model,
    args=trainingArguments,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    callbacks=[TextGenerationCallback()],
    **{tokenizer_kwarg: tokenizer},
)

trainer.train()

# Report the final validation perplexity from the loss-only evaluation.
final_metrics = trainer.evaluate()
print(f"Validation perplexity: {math.exp(final_metrics['eval_loss']):.2f}")

  trainer = Trainer(


Step,Training Loss,Validation Loss


In [1]:
# The Trainer's log history mixes two kinds of entries: training logs carry
# "loss", evaluation logs carry "eval_loss". They are written at different
# intervals (logging_steps vs eval_steps), so each curve needs its own x-axis.
# Plotting validation losses against the training steps fails as soon as the
# two lists differ in length.
history = trainer.state.log_history
train_logs = [log for log in history if "loss" in log]
val_logs = [log for log in history if "eval_loss" in log]

steps = [log["step"] for log in train_logs]
train_losses = [log["loss"] for log in train_logs]
val_steps = [log["step"] for log in val_logs]
val_losses = [log["eval_loss"] for log in val_logs]

plt.figure(figsize=(10,5))
plt.plot(steps, train_losses, label='Training Loss')
# Markers keep the validation curve visible even when there are only one or
# two evaluation points.
plt.plot(val_steps, val_losses, marker='o', label='Validation Loss')
plt.xlabel('Steps')
plt.ylabel('Loss')
plt.legend()
plt.show()

# Put the model in evaluation mode and generate a continuation of a fixed prompt.
model.eval()
prompt = "Thus speaks"

# Encode the prompt and place the token ids on the model's device.
inputs = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)

# Generate one sequence of at most 50 tokens (prompt included) and print it
# with special tokens stripped.
output = model.generate(
    inputs,
    max_length=50,
    num_return_sequences=1,
)
print(tokenizer.decode(output[0], skip_special_tokens=True))

NameError: name 'trainer' is not defined



---

# **Deliverables**

---

