In [27]:
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader, RandomSampler
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
import torch
import random
import numpy as np
from tqdm.auto import tqdm
import time
from utils import set_device, format_time, save_model
from dream_dataset import DreamsDataset

!nvidia-smi

Thu Jun  8 09:51:13 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.43.02              Driver Version: 535.98       CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 3070 ...    On  | 00000000:01:00.0 Off |                  N/A |
| N/A   45C    P8              11W /  85W |     45MiB /  8192MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [28]:
# One-off conversion of the old TSV dump to JSON (kept commented out for reference)

# old_data = pd.read_csv('./data/dreams_reddit_old.tsv', sep='\t', header=None)
# old_data.to_json('./data/dreams_reddit_old.json', orient='values', force_ascii=False)

In [29]:
def build_dream_data(data_dir: str = './data') -> pd.DataFrame:
    """Load the old and new dream dumps and stack them into one frame.

    Args:
        data_dir: Directory containing ``dreams_reddit_old.json`` and
            ``dreams_reddit_new.json``. Defaults to ``'./data'``.

    Returns:
        A DataFrame with the rows of the old dump followed by the rows of the
        new dump, re-indexed 0..n-1.
    """
    from pathlib import Path

    data_path = Path(data_dir)
    old_data = pd.read_json(data_path / 'dreams_reddit_old.json')
    new_data = pd.read_json(data_path / 'dreams_reddit_new.json')

    # The old dump has a single unnamed column; name it so it lines up with the
    # new dump (presumably the new file already uses a 'body' column -- verify).
    old_data.columns = ['body']

    # ignore_index avoids duplicate row labels: without it both frames keep
    # their own 0..k index and label-based lookups become ambiguous.
    return pd.concat((old_data, new_data), ignore_index=True)

# Load the combined frame and convert it straight to a plain Python list of
# rows (each row is itself a list of that row's column values).
dreams = build_dream_data().values.tolist()
len(dreams)

21673

In [30]:
# Fix the split seed explicitly: the global RNG seeds are only set further down
# the notebook, so without random_state the train/test membership would change
# on every run. Same value as the seed_val used for the other RNGs.
SPLIT_SEED = 662023

train_dreams, test_dreams = train_test_split(
    dreams,
    test_size=0.2,
    random_state=SPLIT_SEED,
)
train_size = len(train_dreams)
test_size = len(test_dreams)
train_size, test_size

(17338, 4335)

In [31]:
# Pick the compute device via the project helper.
device = set_device()

# GPT-2 medium tokenizer with explicit start / end / padding markers.
tokenizer = GPT2Tokenizer.from_pretrained(
    'gpt2-medium',
    bos_token='<|startoftext|>',
    eos_token='<|endoftext|>',
    pad_token='<|pad|>',
)

# Load the pretrained weights and move the model onto the selected device.
model = GPT2LMHeadModel.from_pretrained('gpt2-medium').to(device)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [41]:
# Wrap both splits in the project dataset class, sharing the same tokenizer.
train_dataset, test_dataset = (
    DreamsDataset(split, tokenizer) for split in (train_dreams, test_dreams)
)

In [43]:
# Training batches are reshuffled every epoch.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=1)

# Evaluation only averages the loss over the whole set, so order is irrelevant;
# keep it fixed so validation is deterministic and does not consume RNG state.
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=1)

In [44]:
seed_val = 662023

# Seed every RNG in use *before* touching the model, so that anything randomly
# initialised below is reproducible across runs.
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# The tokenizer was extended with extra special tokens, so the embedding matrix
# must grow to match the new vocabulary size. The added rows are freshly
# initialised, which is why this has to happen after seeding.
model.resize_token_embeddings(len(tokenizer));

In [45]:
# Training hyperparameters.
epochs = 10
learning_rate = 5e-4
# A step count, so keep it an integer (was the float literal 1e2).
warmup_steps = 100
epsilon = 1e-8

# this produces sample output every 100 steps
sample_every = 100

In [46]:
optimizer = AdamW(
    model.parameters(),
    lr=learning_rate,
    eps=epsilon,
)

# The scheduler is stepped once per batch, so the schedule length is the number
# of batches per epoch times the number of epochs. Using len(train_loader)
# rather than the raw sample count keeps this correct if the batch size or the
# dataset length ever differs from train_size.
total_steps = len(train_loader) * epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(warmup_steps),  # step counts are integers
    num_training_steps=total_steps,
)



In [50]:
# Fine-tuning loop: for every epoch, train on all batches (printing a generated
# sample every `sample_every` steps), then run ONE validation pass and record
# the epoch's statistics in `training_stats`.
total_t0 = time.time()
training_stats = []

# Number of optimizer steps per epoch; used for progress output and averaging.
steps_per_epoch = len(train_loader)

for epoch in tqdm(range(epochs)):
    print(f'Epoch {epoch}')

    # ---------------- training ----------------
    t0 = time.time()
    total_train_loss = 0
    model.train()

    for step, batch in enumerate(train_loader):
        # batch[0] serves as both inputs and labels; batch[1] is the attention mask.
        # NOTE(review): if the dataset pads sequences, padded label positions
        # still contribute to the loss here -- confirm this is intended.
        b_input_ids = batch[0].to(device)
        b_labels = b_input_ids
        b_masks = batch[1].to(device)

        model.zero_grad()
        output = model(b_input_ids, labels=b_labels, attention_mask=b_masks, token_type_ids=None)

        loss = output[0]
        batch_loss = loss.item()
        total_train_loss += batch_loss

        # Update the weights before sampling so the autograd graph of this
        # step is no longer needed while text is being generated.
        loss.backward()
        optimizer.step()
        scheduler.step()

        if (step % sample_every == 0) and (step != 0):
            elapsed = format_time(time.time() - t0)
            # Report the step index (the original interpolated the whole batch tensor).
            print(f'Batch {step} of {steps_per_epoch} -- Loss: {batch_loss} -- Elapsed: {elapsed}.')

            model.eval()
            # No prompt is given; generation starts from a randomly chosen token id.
            sample_outputs = model.generate(
                bos_token_id=random.randint(1, 30000),
                do_sample=True,
                top_k=50,
                max_length=200,
                top_p=0.95,
                num_return_sequences=1,
            )

            for i, sample_output in enumerate(sample_outputs):
                print(f"{i}: {tokenizer.decode(sample_output, skip_special_tokens=True)}")
            model.train()

    # ---------------- end-of-epoch training summary ----------------
    avg_train_loss = total_train_loss / steps_per_epoch

    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print(f"Average training loss: {avg_train_loss}")
    print(f"Training epoch took: {training_time}")

    # ---------------- validation (once per epoch) ----------------
    print(f'Validating epoch {epoch}')
    t0 = time.time()
    model.eval()

    total_eval_loss = 0

    with torch.no_grad():
        # A separate loop variable keeps the training `batch` from being shadowed.
        for val_batch in test_loader:
            v_input_ids = val_batch[0].to(device)
            v_masks = val_batch[1].to(device)

            outputs = model(v_input_ids, labels=v_input_ids, attention_mask=v_masks, token_type_ids=None)
            total_eval_loss += outputs[0].item()

    avg_val_loss = total_eval_loss / len(test_loader)
    validation_time = format_time(time.time() - t0)

    print(f"Validation Loss: {avg_val_loss}")
    print(f"Validation took: {validation_time}")

    training_stats.append(
        {
            'epoch': epoch + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("Training complete!")
print(f"Total training took {format_time(time.time()-total_t0)} (h:mm:ss)")

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 0


: 

: 

In [None]:
# One row per epoch, built from the dicts collected during training.
train_stats_df = pd.DataFrame(training_stats)

# set_index returns a new frame rather than modifying in place, so calling it
# as a bare statement had no effect. Show the epoch-indexed view as the cell
# output and leave 'epoch' as a regular column on train_stats_df for plotting.
train_stats_df.set_index('epoch')

In [11]:
import seaborn as sns
from matplotlib import pyplot as plt

# lineplot's `y` takes a single column, not a tuple of column names. Passing a
# wide-form frame (index = epoch, one column per curve) draws one line per
# column instead.
loss_curves = train_stats_df.set_index('epoch')[['Training Loss', 'Valid. Loss']]

fig, ax = plt.subplots()
sns.lineplot(data=loss_curves, ax=ax)
ax.set(title='Training vs. validation loss', xlabel='Epoch', ylabel='Loss');

NameError: name 'train_stats_df' is not defined

In [None]:
# Write the fine-tuned model and its tokenizer out via the project helper.
save_model(
    model=model,
    tokenizer=tokenizer,
    output_dir='./models',
)