## GReaT Example with California Housing Dataset

In [None]:
# Switch the working directory to the parent directory and extend the module
# search path — presumably so the project-local `utils` and `be_great`
# packages imported further down can be found (confirm for your checkout).
import os
import sys

# Re-running this cell used to append another ".." and climb one more
# directory level each time. The flag lives in the notebook namespace, so the
# move happens only once per kernel session and the cell is safe to re-run.
if not globals().get("CWD_MOVED_TO_PARENT", False):
    sys.path.append("..")
    os.chdir("..")
    CWD_MOVED_TO_PARENT = True

In [None]:
# Enable IPython's autoreload extension in mode 2, so that edited modules are
# picked up without restarting the kernel (behaviour as described in the
# IPython autoreload documentation).
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import pandas as pd
import logging
from sklearn import datasets

In [None]:
from utils import set_logging_level
from be_great import GReaT

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Set the logging level to INFO through the project's `utils` helper and keep
# whatever it returns as `logger`.
logger = set_logging_level(logging.INFO)

### Load Data

In [None]:
# Fetch the California housing dataset via scikit-learn; `as_frame=True`
# requests pandas output and `.frame` is used as the working table.
data = datasets.fetch_california_housing(as_frame=True).frame
# Show the first rows as the cell output.
data.head()

### Create GReaT Model

The model is trained for only one epoch here, for demonstration purposes.

In [None]:
# Build the GReaT model wrapper. The two commented-out keyword arguments are
# optional overrides that can be enabled by removing the leading "#".
great = GReaT("distilgpt2",                         # Name of the large language model used (see HuggingFace for more options)
              epochs=1,                             # Number of epochs to train (only one epoch for demonstration)
              save_steps=2000,                      # Save model weights every x steps
              logging_steps=50,                     # Log the loss and learning rate every x steps
              experiment_dir="trainer_california",  # Name of the directory where all intermediate steps are saved
              #lr_scheduler_type="constant",        # Specify the learning rate scheduler
              #learning_rate=5e-5                   # Set the initial learning rate
             )

### Start Training

In [None]:
# Fit the model on the housing table. The returned object is kept because its
# `state.log_history` is read in the following cells to plot the loss.
trainer = great.fit(data)

In [None]:
# Keep only the log records that actually carry a "loss" value.
# The previous version copied the history and blindly dropped its last entry
# with pop() (presumably the end-of-training summary, which has no "loss"
# key). Filtering on the key gives the same list in that case, and also copes
# with histories that contain other loss-free records or no summary at all.
# The list comprehension creates a new list, so the trainer's own
# `log_history` is left untouched.
loss_hist = [entry for entry in trainer.state.log_history if "loss" in entry]

In [None]:
# Split the logged records into two parallel lists (x- and y-values for the
# loss curve plotted in the next cell).
epochs = [record["epoch"] for record in loss_hist]
loss = [record["loss"] for record in loss_hist]

In [None]:
# Training-loss curve: logged loss (y-axis) against the logged epoch (x-axis).
plt.plot(epochs, loss)

### Save Model

In [None]:
# Save the model under the name/path "california" (relative to the current
# working directory).
great.save("california")

### Load Model

In [None]:
# great = GReaT.load_from_dir("california")

In [None]:
# Load fine-tuned weights from a checkpoint file into the existing `great`
# object.
# NOTE(review): this path points outside the repository ("../great_private/...")
# and will probably not exist on other machines — replace it with a checkpoint
# you have, or skip this cell to keep using the model trained above.
great.load_finetuned_model("../great_private/models/california/california_distilgpt2_100.pt")

### Generate Samples

In [None]:
# Number of synthetic rows requested from `great.sample` in the next cell.
n_samples = 1000

In [None]:
# Local import: torch is only needed here, to find out which devices exist.
import torch

# The device used to be hard-coded to "cuda:1", which fails on machines with
# fewer than two GPUs. Keep the second GPU when it is present, otherwise fall
# back to the default GPU, and finally to the CPU.
if torch.cuda.device_count() > 1:
    device = "cuda:1"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Generate `n_samples` synthetic rows on the selected device.
samples = great.sample(n_samples, k=50, device=device)

In [None]:
# Show the first generated rows as the cell output.
samples.head()

In [None]:
# Show the dimensions of the generated table as the cell output.
samples.shape

In [None]:
# Write the generated rows to disk. The row index is left out so that reading
# the file back with `pd.read_csv("california_samples.csv")` yields the same
# columns as `samples`, without an extra unnamed index column.
samples.to_csv("california_samples.csv", index=False)

## Plot Data

Original Data

In [None]:
# Draw a random subset of 1000 rows from the original data for plotting.
true_samples = data.sample(n=1000)

In [None]:
# Scatter plot of the original subset: longitude on the x-axis, latitude on
# the y-axis, points coloured by the "MedHouseVal" column.
plt.scatter(true_samples["Longitude"], true_samples["Latitude"], c=true_samples["MedHouseVal"])

Generated samples

In [None]:
#samples = pd.read_csv("california_samples.csv")

In [None]:
# Same scatter plot for the generated rows: longitude on the x-axis, latitude
# on the y-axis, points coloured by the "MedHouseVal" column.
plt.scatter(samples["Longitude"], samples["Latitude"], c=samples["MedHouseVal"])