In [None]:
! pip install keras-nlp datasets wandb

In [None]:
import keras_nlp
import tensorflow as tf
import datasets
from datasets import load_dataset
import numpy as np
import pandas as pd
import wandb
import matplotlib.pyplot as plt



In [None]:
# Show the GPUs TensorFlow can see; an empty list means no GPU is visible to TensorFlow.
tf.config.list_physical_devices('GPU')

### Load and preprocess data

In [None]:
# Download the Alpaca instruction dataset and keep only its pre-formatted
# `text` column (prompt + response already merged into one string).
# NOTE: the split name must be "train"; a misspelled split makes load_dataset fail.
dataset = datasets.load_dataset("tatsu-lab/alpaca", split="train")
df = pd.DataFrame(dataset)
df = df[['text']]
df.head()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/8.20k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/13.1M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# `dataset` was loaded with an explicit split, so it is a single Dataset
# (not a DatasetDict); index the first row directly.
print(dataset[0])

{'instruction': 'When did Virgin Australia start operating?', 'context': "Virgin Australia, the trading name of Virgin Australia Airlines Pty Ltd, is an Australian-based airline. It is the largest airline by fleet size to use the Virgin brand. It commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route. It suddenly found itself as a major airline in Australia's domestic market after the collapse of Ansett Australia in September 2001. The airline has since grown to directly serve 32 cities in Australia, from hubs in Brisbane, Melbourne and Sydney.", 'response': 'Virgin Australia commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route.', 'category': 'closed_qa'}


In [None]:
# Hold out the last 10% of rows for validation (no shuffling at this stage).
n = int(len(df) * 0.9)
train_examples, val_examples = df.iloc[:n], df.iloc[n:]

In [None]:
# Preview the first rows of the training split.
train_examples.head()

In [None]:
# Preview the first rows of the validation split.
val_examples.head()

In [None]:
# Convert each split into a tf.data.Dataset of *scalar* strings.
# Slicing the one-column DataFrame directly would yield elements of shape (1,)
# and therefore batches of shape (batch, 1); the causal-LM preprocessor expects
# a rank-1 batch of strings, so pass the `text` column itself.
train_examples = tf.data.Dataset.from_tensor_slices(train_examples["text"].to_numpy())

val_examples = tf.data.Dataset.from_tensor_slices(val_examples["text"].to_numpy())

In [None]:
# Shuffle-buffer size (in examples) and number of examples per batch.
BUFFER_SIZE, BATCH_SIZE = 20_000, 32

In [None]:
def make_batches(ds):
    """Shuffle, batch and prefetch a dataset.

    Args:
        ds: a `tf.data.Dataset` to turn into an input pipeline.

    Returns:
        The dataset shuffled with a buffer of `BUFFER_SIZE`, grouped into
        batches of `BATCH_SIZE`, with prefetching tuned automatically.
    """
    shuffled = ds.shuffle(BUFFER_SIZE)
    batched = shuffled.batch(BATCH_SIZE)
    return batched.prefetch(tf.data.AUTOTUNE)

In [None]:
# Build the batched input pipelines for both splits (train first, then validation).
train_batches, val_batches = (
    make_batches(split) for split in (train_examples, val_examples)
)

### Train model

In [None]:
# Number of full passes over the training data; also used to size the LR decay schedule.
num_epochs = 5

In [None]:
# Linearly decay the learning rate from 5e-5 to 0 over the whole training run.
# The number of decay steps is (batches per epoch) * (epochs); the batch count
# comes from `train_batches` and is converted to a plain Python int.
total_train_steps = int(train_batches.cardinality()) * num_epochs

learning_rate = tf.keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=5e-5,
    decay_steps=total_train_steps,
    end_learning_rate=0.0,
)

optimizer = tf.keras.optimizers.Adam(learning_rate)

In [None]:
# Start a Weights & Biases run and record the key hyperparameters.
# `learning_rate` is a schedule object, so log its initial value (a plain float)
# plus the schedule's class name instead of the object itself, keeping the run
# config readable and JSON-serializable.
wandb.init(
    project="gpt2-instruct-tune",
    config={
        "learning_rate": float(learning_rate.initial_learning_rate),
        "lr_schedule": type(learning_rate).__name__,
        "architecture": "gpt2",
        "dataset": "tatsu-lab/alpaca",
        "epochs": num_epochs,
    },
)

In [None]:
# Cross-entropy over integer token ids; from_logits=True means the model output
# is treated as unnormalized logits (assumes the generator emits raw logits — confirm).
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

In [None]:
# Tokenizer + packer for GPT-2. `sequence_length` already caps every example at
# 300 tokens (longer texts are truncated, shorter ones padded), so no separate
# truncation flag is needed -- the preprocessor does not define one, and an
# unknown keyword would be forwarded to the base layer constructor.
preprocessor = keras_nlp.models.GPT2CausalLMPreprocessor.from_preset(
    "gpt2_base_en",
    sequence_length=300,
)

# Pretrained GPT-2 base causal LM that consumes raw strings via the preprocessor above.
generator = keras_nlp.models.GPT2CausalLM.from_preset(
    "gpt2_base_en",
    preprocessor=preprocessor,
)

In [None]:
# Configure training. Accuracy is passed as a *weighted* metric so it is
# computed with the same sample weights as the loss.
# (The keyword is `weighted_metrics`; a misspelled keyword is not recognized by compile.)
generator.compile(
    optimizer=optimizer,
    loss=loss,
    weighted_metrics=["accuracy"],
)

In [None]:
# Fine-tune the model, evaluating on the validation batches after every epoch.
history = generator.fit(
    train_batches,
    epochs=num_epochs,
    validation_data=val_batches,
)

In [None]:
# One row per epoch, one column per tracked metric.
metrics_df = pd.DataFrame.from_dict(history.history)
metrics_df.head(5)

In [None]:
# Training vs. validation curves: first the loss, then the accuracy.
metrics_df.loc[:, ["loss", "val_loss"]].plot()
metrics_df.loc[:, ["accuracy", "val_accuracy"]].plot()

In [None]:
# Sanity check: free-form continuation of a plain (non-instruction) prompt.
output = generator.generate(
    "Formula 1 is a ",
    max_length=100,
)
print(output)

In [None]:
# Query the model with an instruction wrapped in the
# "### Instruction: / ### Response:" template.
prompt = "Imagine you're a detective solving a mystery in a futuristic city. Describe your first clue."

output = generator.generate(
    "### Instruction:\n" + prompt + "\n### Response:\n",
    max_length=100,
)

print(output)

### Save model