## FINE TUNE ON GOTHIC TEXT ##

In [None]:
import os
import time
import datetime
import numpy as np
import random
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config, get_linear_schedule_with_warmup
import matplotlib.pyplot as plt
from torch.optim import AdamW  # Use PyTorch's AdamW optimizer
from torch.amp import GradScaler, autocast  # Updated imports for mixed precision training

# Attach Google Drive to the Colab VM, then make the project folder the
# working directory so the relative paths used later in this cell (the corpus
# file and the checkpoint folders) resolve inside it.
from google.colab import drive

drive.mount('/content/drive')
project_dir = '/content/drive/MyDrive/Colab Notebooks/gothic'
print(os.getcwd())  # working directory before the switch
os.chdir(project_dir)
print(os.getcwd())  # working directory after the switch

def format_time(elapsed):
    """Render a duration given in seconds as an ``h:mm:ss`` string.

    The value is rounded to the nearest whole second and wrapped in a
    ``datetime.timedelta``, whose string form is returned.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

# ========================================
#               Parameters
# ========================================

# Training parameters
batch_size = 2  # Mini-batch size (sequences per forward/backward pass)
max_length = 1024  # Maximum sequence length
epochs = 5
learning_rate = 5e-4
warmup_steps = 1e2
epsilon = 1e-8
checkpoint_interval = 1  # Save checkpoint every N epochs
step_interval = 10  # Print a progress line every N mini-batches

# Gradient accumulation parameters
desired_tokens_per_batch = 524288  # Desired effective batch size in tokens
tokens_per_mini_batch = batch_size * max_length
# Number of mini-batches whose gradients are summed before one optimizer
# update. The floor division yields 0 as soon as a single mini-batch already
# holds more tokens than the target, and a zero here would make the
# `% gradient_accumulation_steps` check in the training loop raise
# ZeroDivisionError, so clamp to at least one.
gradient_accumulation_steps = max(1, desired_tokens_per_batch // tokens_per_mini_batch)

print(f"Gradient accumulation steps: {gradient_accumulation_steps}")

# Device configuration
# Use the GPU when CUDA reports one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ========================================
#        Model and Tokenizer Setup
# ========================================

# Pretrained checkpoint name handed to both from_pretrained() calls below
model_name = "gpt2-xl"
tokenizer = GPT2Tokenizer.from_pretrained(model_name, padding_side='left')
tokenizer.pad_token = tokenizer.eos_token  # Set padding token to eos token
model = GPT2LMHeadModel.from_pretrained(model_name)
# NOTE(review): no tokens are added to the tokenizer in this cell, so this
# resize is presumably a no-op kept as a safeguard — confirm.
model.resize_token_embeddings(len(tokenizer))  # Adjust model embeddings if needed
model = model.to(device)

# ========================================
#               Dataset
# ========================================

class GothicDataset(Dataset):
    """Fixed-length token windows cut from a single plain-text corpus.

    The whole corpus is tokenized in one pass and sliced into consecutive,
    non-overlapping windows of exactly ``max_length`` tokens. A trailing
    remainder shorter than ``max_length`` is dropped, so no padding is ever
    needed and every attention mask consists solely of ones.

    Args:
        tokenizer: Object exposing ``encode(text)`` that returns a list of
            token ids.
        max_length: Number of tokens per window.
        file_path: Location of the corpus. Defaults to the combined gothic
            novels file in the current working directory.
    """

    def __init__(self, tokenizer, max_length=1024,
                 file_path='./gothic_novels_combined.txt'):
        self.tokenizer = tokenizer
        self.max_length = max_length

        # Decode explicitly as UTF-8 so the result does not depend on the
        # platform's default text encoding.
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()
        print(f"Loaded {len(text)} characters")

        # Tokenize the text into a sequence of input tokens
        tokenized_text = tokenizer.encode(text)

        # The range stops at len - max_length, so every slice is a full
        # window; the incomplete tail of the corpus is discarded.
        window_starts = range(0, len(tokenized_text) - max_length + 1, max_length)
        self.input_ids = [
            tokenized_text[start:start + max_length] for start in window_starts
        ]
        # Full windows contain no padding, hence an all-ones mask per window.
        self.attn_masks = [[1] * max_length for _ in self.input_ids]
        print(f"Number of input sequences: {len(self.input_ids)}")

    def __len__(self):
        """Return the number of token windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask)`` tensors for window ``idx``."""
        return torch.tensor(self.input_ids[idx]), torch.tensor(self.attn_masks[idx])



# Initialize dataset and dataloaders
dataset = GothicDataset(tokenizer, max_length=max_length)
train_size = int(0.9 * len(dataset))  # 90% of the windows go to training
val_size = len(dataset) - train_size  # the remainder is held out for validation
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

print(f'{train_size:,} training samples')
print(f'{val_size:,} validation samples')

# Training batches are drawn in random order, validation batches in fixed order
train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)
validation_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=batch_size)

# ========================================
#          Optimizer and Scheduler
# ========================================

optimizer = AdamW(model.parameters(), lr=learning_rate, eps=epsilon)

# Calculate total steps.
# The scheduler is advanced once per optimizer update, i.e. once every
# `gradient_accumulation_steps` mini-batches, so the schedule length has to be
# counted in optimizer updates rather than in mini-batches; otherwise the
# linear decay never gets anywhere near its end. Ceiling division keeps the
# schedule at least as long as the number of updates actually performed.
updates_per_epoch = -(-len(train_dataloader) // gradient_accumulation_steps)
total_steps = updates_per_epoch * epochs

# Prepare learning rate scheduler
# NOTE(review): `warmup_steps` is measured in the same unit (optimizer
# updates); confirm it is smaller than `total_steps` for this corpus size.
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps
)

# Initialize mixed precision training scaler with updated API
scaler = GradScaler()

# ========================================
#          Training and Validation
# ========================================

training_stats = []  # One summary dict per completed epoch
initial_t0 = time.time()  # Measure total training time

# Training loop
for epoch_i in range(epochs):
    print(f'\n======== Epoch {epoch_i + 1} / {epochs} ========')
    print('Training...')

    t0 = time.time()  # Measure epoch training time
    total_train_loss = 0  # Sum of un-normalized mini-batch losses this epoch
    num_train_batches = len(train_dataloader)
    model.train()
    # Clear gradients once up front. Inside the loop they are cleared only
    # after an optimizer update; clearing them on every mini-batch would
    # discard everything accumulated so far and defeat gradient accumulation.
    model.zero_grad()

    for step, batch in enumerate(train_dataloader):
        b_input_ids = batch[0].to(device)
        b_labels = b_input_ids  # The input ids double as the labels
        b_masks = batch[1].to(device)

        # Forward pass and compute loss with mixed precision
        with autocast(device_type='cuda', dtype=torch.float16):
            outputs = model(b_input_ids, labels=b_labels, attention_mask=b_masks)
            loss = outputs.loss

        # Track the raw loss so the epoch average is on the same scale as the
        # validation loss reported below.
        total_train_loss += loss.item()

        # Divide by the accumulation count only for the backward pass, so the
        # summed gradients correspond to the mean over the accumulated batches.
        scaler.scale(loss / gradient_accumulation_steps).backward()

        if (step + 1) % step_interval == 0:
            print(f"step:{step+1}")

        # Update parameters every `gradient_accumulation_steps` mini-batches,
        # and also on the final mini-batch so a partial accumulation window at
        # the end of the epoch is applied instead of being thrown away.
        window_complete = (step + 1) % gradient_accumulation_steps == 0
        final_batch = (step + 1) == num_train_batches
        if window_complete or final_batch:
            # Gradients must be unscaled before clipping so the threshold
            # applies to their true magnitude.
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # Gradient clipping

            scaler.step(optimizer)
            scaler.update()
            scheduler.step()
            model.zero_grad()

    avg_train_loss = total_train_loss / len(train_dataloader)  # Calculate the average loss
    training_time = format_time(time.time() - t0)

    print(f"\n  Average training loss: {avg_train_loss:.4f}")
    print(f"  Training epoch took: {training_time}")

    # ========================================
    #               Validation
    # ========================================
    print("\nRunning Validation...")

    t0 = time.time()  # Measure validation time
    total_eval_loss = 0
    model.eval()

    # No gradients are needed while scoring the held-out data
    with torch.no_grad():
        for batch in validation_dataloader:
            b_input_ids = batch[0].to(device)
            b_masks = batch[1].to(device)

            outputs = model(b_input_ids, labels=b_input_ids, attention_mask=b_masks)
            total_eval_loss += outputs.loss.item()

    avg_val_loss = total_eval_loss / len(validation_dataloader)
    validation_time = format_time(time.time() - t0)

    print(f"  Validation Loss: {avg_val_loss:.4f}")
    print(f"  Validation took: {validation_time}")

    # Record all statistics from this epoch
    training_stats.append({
        'epoch': epoch_i + 1,
        'Training Loss': avg_train_loss,
        'Valid. Loss': avg_val_loss,
        'Training Time': training_time,
        'Validation Time': validation_time
    })

    print("\nEpoch Summary:")
    print(f"  Epoch {epoch_i + 1} / {epochs}")
    print(f"  Training Loss: {avg_train_loss:.4f}")
    print(f"  Validation Loss: {avg_val_loss:.4f}")
    print(f"  Training Time: {training_time}")
    print(f"  Validation Time: {validation_time}")

    # ========================================
    #               Sampling
    # ========================================
    print("\nGenerating Sample Output...")

    # Sample one continuation, seeded with a randomly chosen start token id,
    # as a quick qualitative check of the current weights.
    model.eval()
    sample_outputs = model.generate(
        bos_token_id=random.randint(1, 30000),
        do_sample=True,
        top_k=50,
        max_length=200,
        top_p=0.95,
        num_return_sequences=1
    )
    for i, sample_output in enumerate(sample_outputs):
        print(f"{i}: {tokenizer.decode(sample_output, skip_special_tokens=True)}")
    model.train()

    # Save model checkpoint after each epoch
    if (epoch_i + 1) % checkpoint_interval == 0:
        checkpoint_dir = f'./checkpoint-{epoch_i + 1}'
        os.makedirs(checkpoint_dir, exist_ok=True)
        model.save_pretrained(checkpoint_dir)
        tokenizer.save_pretrained(checkpoint_dir)

print("\nTraining complete!")
print(f"Total training took {format_time(time.time() - initial_t0)} (h:mm:ss)")

# ========================================
#               Plotting
# ========================================
# Collect the per-epoch series recorded in `training_stats`.
# The x-axis values get their own name: rebinding `epochs` here would replace
# the integer epoch count with a list, which breaks `range(epochs)` for any
# code that runs afterwards without re-executing the parameter section.
epoch_numbers = [x['epoch'] for x in training_stats]
training_loss = [x['Training Loss'] for x in training_stats]
validation_loss = [x['Valid. Loss'] for x in training_stats]

# Draw both loss curves on one set of axes
plt.figure(figsize=(10, 6))
plt.plot(epoch_numbers, training_loss, label='Training Loss')
plt.plot(epoch_numbers, validation_loss, label='Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training and Validation Loss per Epoch')
plt.legend()
plt.grid(True)
plt.show()


## FINE TUNE ON QUESTION AND ANSWER ##

In [None]:
import os

# Attach Google Drive to the Colab VM, then make the project folder the
# working directory so the relative paths used later in this cell (the
# `data/...` corpora and the checkpoint folders) resolve inside it.
from google.colab import drive

drive.mount('/content/drive')
project_dir = '/content/drive/MyDrive/Colab Notebooks/gothic'
print(os.getcwd())  # working directory before the switch
os.chdir(project_dir)
print(os.getcwd())  # working directory after the switch

import time
import datetime
import numpy as np
import random
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config, get_linear_schedule_with_warmup
import matplotlib.pyplot as plt
from torch.optim import AdamW  # Use PyTorch's AdamW optimizer
from torch.amp import GradScaler, autocast  # Updated imports for mixed precision training

import re
import ast
import json
from cleaner import normalize_text


def format_time(elapsed):
    """Convert a number of seconds into its ``h:mm:ss`` text form.

    ``elapsed`` is rounded to the nearest whole second; the string form of
    the resulting ``datetime.timedelta`` is returned.
    """
    rounded = int(round(elapsed))
    return str(datetime.timedelta(seconds=rounded))

# ========================================
#               Parameters
# ========================================

# Training parameters
batch_size = 2  # Mini-batch size (sequences per forward/backward pass)
max_length = 1024  # Maximum sequence length
epochs = 5
learning_rate = 5e-4
warmup_steps = 1e2
epsilon = 1e-8
checkpoint_interval = 1  # Save checkpoint every N epochs
step_interval = 100  # Print a progress line every N mini-batches

# Echo the configuration so it is recorded alongside the cell output
print(f"batch_size: {batch_size}")
print(f"max_length: {max_length}")
print(f"epochs: {epochs}")
print(f"learning_rate: {learning_rate}")
print(f"warmup_steps: {warmup_steps}")
print(f"epsilon: {epsilon}")

# Gradient accumulation parameters
desired_tokens_per_batch = 524288  # Desired effective batch size in tokens
tokens_per_mini_batch = batch_size * max_length
# Number of mini-batches whose gradients are summed before one optimizer
# update. The floor division yields 0 as soon as a single mini-batch already
# holds more tokens than the target, and a zero here would make the
# `% gradient_accumulation_steps` check in the training loop raise
# ZeroDivisionError, so clamp to at least one.
gradient_accumulation_steps = max(1, desired_tokens_per_batch // tokens_per_mini_batch)

print(f"Gradient accumulation steps: {gradient_accumulation_steps}")




# Enable CUDA Launch Blocking for debugging
# NOTE(review): this presumably forces synchronous kernel launches, which
# helps localize CUDA errors but is likely to slow training — consider
# removing it for real runs.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

# Load the model and tokenizer
# Alternative kept for reference: resume from previously saved checkpoints
# instead of starting from the base model.
# model = GPT2LMHeadModel.from_pretrained("./checkpoint_chat-2")
# tokenizer = GPT2Tokenizer.from_pretrained("./checkpoint-5")
# Pretrained checkpoint name handed to both from_pretrained() calls below
model_name = "gpt2-xl"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
# NOTE(review): this assignment appears to be superseded by the 'pad_token'
# entry registered through add_special_tokens() a few lines below — confirm
# and drop if so.
tokenizer.pad_token = tokenizer.eos_token  # Set padding token to eos token
model = GPT2LMHeadModel.from_pretrained(model_name)
# Special tokens: conversation delimiters, a dedicated padding token and the
# two speaker markers that ChatDataset prepends to every utterance
special_tokens = {
    'pad_token': '<|pad|>',
    'bos_token': '<|startoftext|>',
    'eos_token': '<|endoftext|>',
    'additional_special_tokens': ['<|user|>', '<|assistant|>']
}
tokenizer.add_special_tokens(special_tokens)
# Must run after the tokens are added so the embedding matrix covers them
model.resize_token_embeddings(len(tokenizer))  # Adjust model embeddings if needed

# Device configuration
# Set device_type based on the availability of CUDA
device_type = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_type)
model.to(device)
print(f"device_type: {device_type}")

# Notebook shell magic: installs the Hugging Face `datasets` package
!pip install datasets
from datasets import load_dataset

# Load the DailyDialog dataset
# `dataset` is read by ChatDataset.get_conversations_HF(); the same name is
# rebound to the ChatDataset instance further down in this cell.
dataset = load_dataset("daily_dialog")

# Check the available splits
print(dataset)

class ChatDataset(Dataset):
    """Fixed-length token windows built from several dialogue corpora.

    Conversations are gathered from four sources (DailyDialog via the
    module-level ``dataset``, Cornell Movie Dialogs, MultiWOZ 2.2 and
    Taskmaster). Every utterance is prefixed with ``<|user|>`` or
    ``<|assistant|>``, and each conversation is wrapped in
    ``<|startoftext|>`` / ``<|endoftext|>`` lines. The conversations are
    shuffled, concatenated, tokenized in one pass and sliced into
    consecutive, non-overlapping windows of exactly ``max_length`` tokens.
    A trailing remainder shorter than ``max_length`` is dropped, so no padding
    is needed and every attention mask consists solely of ones.

    Args:
        tokenizer: Object exposing ``encode(text)`` that returns a list of
            token ids.
        max_length: Number of tokens per window.
    """

    def __init__(self, tokenizer, max_length=1024):
        conversations = []

        # First get Hugging Face data
        print(f"loading Hugging Face data")
        conversations.extend(self.get_conversations_HF())
        print(f"{len(conversations)=}")

        # Now get Cornell data
        print(f"loading Cornell data")
        data_path = "data/cornell_movie_dialogs_corpus"
        conversations.extend(self.get_conversations_cornell(data_path))
        print(f"{len(conversations)=}")

        # Now get woz data
        print(f"loading woz data")
        data_path = 'data/MultiWOZ_2.2'
        conversations.extend(self.get_conversations_woz(data_path))
        print(f"{len(conversations)=}")

        # Now get taskmaster data
        print(f"loading taskmaster data")
        data_path = 'data/Taskmaster'
        conversations.extend(self.get_conversations_taskmaster(data_path))
        print(f"{len(conversations)=}")

        # Serialize each conversation as: start marker, one utterance per
        # line, end marker.
        conversations_list = [
            "<|startoftext|>\n"
            + ''.join(line + '\n' for line in dialogue)
            + '<|endoftext|>\n'
            for dialogue in conversations
        ]
        # Shuffle whole conversations so the corpora are interleaved
        random.shuffle(conversations_list)
        text = ''.join(conversations_list)
        print(f"{len(text)=}")

        # Tokenize the text into a sequence of input tokens
        tokenized_text = tokenizer.encode(text)
        print(f"{len(tokenized_text)=}")

        # The range stops at len - max_length, so every slice is a full
        # window; the incomplete tail is discarded.
        window_starts = range(0, len(tokenized_text) - max_length + 1, max_length)
        self.input_ids = [
            tokenized_text[start:start + max_length] for start in window_starts
        ]
        # Full windows contain no padding, hence an all-ones mask per window.
        self.attn_masks = [[1] * max_length for _ in self.input_ids]
        print(f"Number of input sequences: {len(self.input_ids)}")

    def get_conversations_HF(self):
        """Return DailyDialog conversations as tuples of tagged utterances.

        Reads the ``train``, ``validation`` and ``test`` splits of the
        module-level ``dataset`` in that order. Utterances alternate between
        ``<|user|>`` and ``<|assistant|>``, starting with the user, and the
        spacing around punctuation is tightened.
        """
        turn = ["<|user|>", "<|assistant|>"]
        # Whitespace directly in front of a punctuation mark
        space_before_punct = re.compile(r'\s([,.!?;:])')

        conversations = []
        for t in ['train', 'validation', 'test']:
            print(f"loading: {t=}")
            ds = dataset[t]
            print(f"{len(ds)=}")

            for record in ds:
                conv = []
                for turn_idx, s in enumerate(record['dialog']):
                    s = normalize_text(s.strip())
                    s = s.replace(" ' ", "'")
                    s = s.replace('$ ', '$')
                    s = s.replace('( ', '(')
                    # Remove the space *before* a closing parenthesis; the
                    # space after it separates the next word and must stay.
                    s = s.replace(' )', ')')
                    s = space_before_punct.sub(r'\1', s)
                    conv.append(turn[turn_idx % 2] + s)
                conversations.append(tuple(conv))

        return conversations

    def get_conversations_cornell(self, data_path):
        """Return (user, assistant) pairs from the Cornell movie corpus.

        ``movie_lines.txt`` and ``movie_conversations.txt`` under
        ``data_path`` are parsed as ``' +++$+++ '``-separated records. Every
        pair of adjacent lines in a conversation becomes one two-utterance
        conversation; pairs that reference an unknown line id are skipped.
        """
        separator = ' +++$+++ '
        conversations = []

        # Load movie lines: first field is the line id, fifth is the text
        id2line = {}
        with open(os.path.join(data_path, 'movie_lines.txt'), 'r', encoding='iso-8859-1') as f:
            for line in f:
                parts = line.strip().split(separator)
                if len(parts) == 5:
                    id2line[parts[0]] = parts[4]

        # Load conversations: the fourth field is a list literal of line ids
        with open(os.path.join(data_path, 'movie_conversations.txt'), 'r', encoding='iso-8859-1') as f:
            for line in f:
                parts = line.strip().split(separator)
                if len(parts) != 4:
                    continue
                conv_line_ids = ast.literal_eval(parts[3])  # safer than eval()
                # Create pairs of conversations (input, response)
                for first_id, second_id in zip(conv_line_ids, conv_line_ids[1:]):
                    # Ensure both line IDs are in id2line
                    if first_id in id2line and second_id in id2line:
                        input_line = "<|user|>" + normalize_text(id2line[first_id])
                        response_line = "<|assistant|>" + normalize_text(id2line[second_id])
                        conversations.append((input_line, response_line))

        print(f"{len(conversations)=}")

        return conversations

    def get_conversations_woz(self, data_path):
        """Return MultiWOZ conversations as tuples of tagged utterances.

        Loads every ``.json`` file in the ``train``, ``dev`` and ``test``
        sub-directories of ``data_path``. Turns alternate between
        ``<|user|>`` and ``<|assistant|>`` by position, starting with the
        user.
        """
        turn = ['<|user|>', '<|assistant|>']
        types = ['train', 'dev', 'test']
        the_datasets = {t: [] for t in types}
        for t in types:
            print(f"processing: {t=}")
            split_dir = os.path.join(data_path, t)
            # Filter the listing to include only JSON files
            json_files = [f for f in os.listdir(split_dir) if f.endswith('.json')]
            dialogues = []
            for json_file in json_files:
                file_path = os.path.join(split_dir, json_file)
                with open(file_path, 'r', encoding='utf-8') as file:
                    dialogues.extend(json.load(file))
            for dialogue in dialogues:
                the_datasets[t].append(tuple(
                    turn[idx % 2] + normalize_text(line['utterance'])
                    for idx, line in enumerate(dialogue['turns'])
                ))
            print(f"{len(the_datasets[t])=}")

        return the_datasets['train'] + the_datasets['dev'] + the_datasets['test']

    def get_conversations_taskmaster(self, data_path):
        """Return Taskmaster conversations as tuples of tagged utterances.

        Loads every ``.json`` file from the four Taskmaster release folders
        under ``data_path``. The speaker tag comes from each utterance's
        ``speaker`` field (``user`` maps to ``<|user|>``, anything else to
        ``<|assistant|>``); consecutive utterances by the same speaker are
        merged into one line separated by a space.
        """
        dirs = ['TM-1-2019', 'TM-2-2020/data', 'TM-3-2020/data', 'TM-4-2024/data']
        conversations = []
        for subdir in dirs:
            print(f"dir={subdir!r}")
            path = os.path.join(data_path, subdir)
            # Filter the listing to include only JSON files
            json_files = [f for f in os.listdir(path) if f.endswith('.json')]
            dialogues = []
            print("loading JSON files")
            for json_file in json_files:
                file_path = os.path.join(path, json_file)
                with open(file_path, 'r', encoding='utf-8') as file:
                    print(f"{file_path=}")
                    dialogues.extend(json.load(file))
            print("processing dialogues")
            for dialogue in dialogues:
                conversation_list = []
                previous_prompt = ""
                for line in dialogue['utterances']:
                    is_user = line['speaker'].lower() == 'user'
                    prompt = "<|user|>" if is_user else "<|assistant|>"
                    utterance = normalize_text(line['text'])
                    if previous_prompt == prompt:
                        # Same speaker again: extend the previous line
                        conversation_list[-1] += " " + utterance
                    else:
                        conversation_list.append(prompt + utterance)
                        previous_prompt = prompt
                conversations.append(tuple(conversation_list))
        print(f"{len(conversations)=}")

        return conversations

    def __len__(self):
        """Return the number of token windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask)`` tensors for window ``idx``."""
        return torch.tensor(self.input_ids[idx]), torch.tensor(self.attn_masks[idx])


# Initialize dataset and dataloaders
dataset = ChatDataset(tokenizer, max_length=max_length)
train_size = int(0.9 * len(dataset))  # 90% of the windows go to training
val_size = len(dataset) - train_size  # the remainder is held out for validation
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

print(f'{train_size:,} training samples')
print(f'{val_size:,} validation samples')

# Training batches are drawn in random order, validation batches in fixed order
train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)
validation_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=batch_size)

# ========================================
#          Optimizer and Scheduler
# ========================================

optimizer = AdamW(model.parameters(), lr=learning_rate, eps=epsilon)

# Calculate total steps.
# The scheduler is advanced once per optimizer update, i.e. once every
# `gradient_accumulation_steps` mini-batches, so the schedule length has to be
# counted in optimizer updates rather than in mini-batches; otherwise the
# linear decay never gets anywhere near its end. Ceiling division keeps the
# schedule at least as long as the number of updates actually performed.
updates_per_epoch = -(-len(train_dataloader) // gradient_accumulation_steps)
total_steps = updates_per_epoch * epochs

# Prepare learning rate scheduler
# NOTE(review): `warmup_steps` is measured in the same unit (optimizer
# updates); confirm it is smaller than `total_steps` for this corpus size.
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps
)

# Initialize mixed precision training scaler with updated API
scaler = GradScaler()

# ========================================
#          Training and Validation
# ========================================

training_stats = []  # One summary dict per completed epoch
initial_t0 = time.time()  # Measure total training time

# Training loop
for epoch_i in range(epochs):
    print(f'\n======== Epoch {epoch_i + 1} / {epochs} ========')
    print('Training...')

    t0 = time.time()  # Measure epoch training time
    total_train_loss = 0  # Sum of un-normalized mini-batch losses this epoch
    num_train_batches = len(train_dataloader)
    model.train()
    # Clear gradients once up front. Inside the loop they are cleared only
    # after an optimizer update; clearing them on every mini-batch would
    # discard everything accumulated so far and defeat gradient accumulation.
    model.zero_grad()

    for step, batch in enumerate(train_dataloader):
        b_input_ids = batch[0].to(device)
        b_labels = b_input_ids  # The input ids double as the labels
        b_masks = batch[1].to(device)

        # Forward pass and compute loss with mixed precision
        with autocast(device_type='cuda', dtype=torch.float16):
            outputs = model(b_input_ids, labels=b_labels, attention_mask=b_masks)
            loss = outputs.loss

        # Track the raw loss so the epoch average is on the same scale as the
        # validation loss reported below.
        total_train_loss += loss.item()

        # Divide by the accumulation count only for the backward pass, so the
        # summed gradients correspond to the mean over the accumulated batches.
        scaler.scale(loss / gradient_accumulation_steps).backward()

        if (step + 1) % step_interval == 0:
            print(f"step:{step+1}")

        # Update parameters every `gradient_accumulation_steps` mini-batches,
        # and also on the final mini-batch so a partial accumulation window at
        # the end of the epoch is applied instead of being thrown away.
        window_complete = (step + 1) % gradient_accumulation_steps == 0
        final_batch = (step + 1) == num_train_batches
        if window_complete or final_batch:
            # Gradients must be unscaled before clipping so the threshold
            # applies to their true magnitude.
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # Gradient clipping

            scaler.step(optimizer)
            scaler.update()
            scheduler.step()
            model.zero_grad()

    avg_train_loss = total_train_loss / len(train_dataloader)  # Calculate the average loss
    training_time = format_time(time.time() - t0)

    print(f"\n  Average training loss: {avg_train_loss:.4f}")
    print(f"  Training epoch took: {training_time}")

    # ========================================
    #               Validation
    # ========================================
    print("\nRunning Validation...")

    t0 = time.time()  # Measure validation time
    total_eval_loss = 0
    model.eval()

    # No gradients are needed while scoring the held-out data
    with torch.no_grad():
        for batch in validation_dataloader:
            b_input_ids = batch[0].to(device)
            b_masks = batch[1].to(device)

            outputs = model(b_input_ids, labels=b_input_ids, attention_mask=b_masks)
            total_eval_loss += outputs.loss.item()

    avg_val_loss = total_eval_loss / len(validation_dataloader)
    validation_time = format_time(time.time() - t0)

    print(f"  Validation Loss: {avg_val_loss:.4f}")
    print(f"  Validation took: {validation_time}")

    # Record all statistics from this epoch
    training_stats.append({
        'epoch': epoch_i + 1,
        'Training Loss': avg_train_loss,
        'Valid. Loss': avg_val_loss,
        'Training Time': training_time,
        'Validation Time': validation_time
    })

    print("\nEpoch Summary:")
    print(f"  Epoch {epoch_i + 1} / {epochs}")
    print(f"  Training Loss: {avg_train_loss:.4f}")
    print(f"  Validation Loss: {avg_val_loss:.4f}")
    print(f"  Training Time: {training_time}")
    print(f"  Validation Time: {validation_time}")

    # ========================================
    #               Sampling
    # ========================================
    print("\nGenerating Sample Output...")

    # Sample one continuation, seeded with a randomly chosen start token id,
    # as a quick qualitative check of the current weights.
    model.eval()
    sample_outputs = model.generate(
        bos_token_id=random.randint(1, 30000),
        do_sample=True,
        top_k=50,
        max_length=200,
        top_p=0.95,
        num_return_sequences=1
    )
    for i, sample_output in enumerate(sample_outputs):
        print(f"{i}: {tokenizer.decode(sample_output, skip_special_tokens=True)}")
    model.train()

    # Save model checkpoint after each epoch
    if (epoch_i + 1) % checkpoint_interval == 0:
        checkpoint_dir = f'./checkpoint_chat_2-{epoch_i + 1}'
        os.makedirs(checkpoint_dir, exist_ok=True)
        model.save_pretrained(checkpoint_dir)
        tokenizer.save_pretrained(checkpoint_dir)

print("\nTraining complete!")
print(f"Total training took {format_time(time.time() - initial_t0)} (h:mm:ss)")

# ========================================
#               Plotting
# ========================================
# Collect the per-epoch series recorded in `training_stats`.
# The x-axis values get their own name: rebinding `epochs` here would replace
# the integer epoch count with a list, which breaks `range(epochs)` for any
# code that runs afterwards without re-executing the parameter section.
epoch_numbers = [x['epoch'] for x in training_stats]
training_loss = [x['Training Loss'] for x in training_stats]
validation_loss = [x['Valid. Loss'] for x in training_stats]

# Draw both loss curves on one set of axes
plt.figure(figsize=(10, 6))
plt.plot(epoch_numbers, training_loss, label='Training Loss')
plt.plot(epoch_numbers, validation_loss, label='Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training and Validation Loss per Epoch')
plt.legend()
plt.grid(True)
plt.show()


## SIMPLE EVAL ##

In [None]:
# Put the model in evaluation mode while sampling
model.eval()

# Encode the prompt as a one-row batch on the model's device
prompt = "<|startoftext|><|user|>To whome am I speaking?\n<|assistant|>"
prompt_ids = tokenizer.encode(prompt)
generated = torch.tensor([prompt_ids]).to(device)

print(generated)

# Sample a single continuation of the prompt
sampling_options = dict(
    do_sample=True,
    top_k=50,
    max_length=300,
    top_p=0.95,
    num_return_sequences=1,
)
sample_outputs = model.generate(generated, **sampling_options)

# Show each sampled sequence with special tokens removed
for i, sample_output in enumerate(sample_outputs):
    decoded = tokenizer.decode(sample_output, skip_special_tokens=True)
    print(f"{i}: {decoded}\n\n")

# Restore training mode
model.train()

## Loading Fine-Tuned Model

In [None]:
import os
import time
import datetime
import numpy as np
import random
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config, get_linear_schedule_with_warmup
import matplotlib.pyplot as plt
from torch.optim import AdamW  # Use PyTorch's AdamW optimizer
from torch.cuda.amp import GradScaler, autocast  # Updated imports for mixed precision training

# Attach Google Drive and switch into the project folder, where the
# checkpoint directory loaded below is looked up by relative path
from google.colab import drive

drive.mount('/content/drive')
project_dir = '/content/drive/MyDrive/Colab Notebooks/gothic'
print(os.getcwd())
os.chdir(project_dir)
print(os.getcwd())

# Enable CUDA Launch Blocking for debugging
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

# Restore the fine-tuned weights and the matching tokenizer from one folder
checkpoint_path = "./checkpoint_chat_2-4"
model = GPT2LMHeadModel.from_pretrained(checkpoint_path)
tokenizer = GPT2Tokenizer.from_pretrained(checkpoint_path)
model.resize_token_embeddings(len(tokenizer))  # Adjust model embeddings if needed

# Use the EOS token for padding, in the tokenizer and in the model config,
# to avoid warnings
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id

# Run on the GPU when one is available, otherwise on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

## Generate Text REPL

In [None]:
def test():
    """Run an interactive chat loop against the loaded model.

    Each line typed at the ``> `` prompt is appended to the running
    transcript as a ``<|user|>`` turn followed by an open ``<|assistant|>``
    turn. The model then samples a continuation, which is cut at the first
    speaker/boundary marker, printed, and appended to the transcript.
    Typing ``exit`` (or closing stdin) ends the session; ``exit`` also prints
    the full transcript.

    Uses the module-level ``model``, ``tokenizer`` and ``device``.
    """
    model.eval()  # Set model to evaluation mode

    stop = ['<|user|>', '<|assistant|>', '<|endoftext|>', '<|startoftext|>']
    # Work with token ids: decoding with skip_special_tokens=True removes the
    # markers from the text, so searching the decoded string for them can
    # never succeed.
    stop_ids = set(tokenizer.convert_tokens_to_ids(stop))

    max_context = 1024   # upper bound on prompt + reply, in tokens
    reply_budget = 256   # tokens kept free for the reply
    prompt_budget = max_context - reply_budget

    generated = '<|startoftext|>'

    while True:
        try:
            input_text = input("> ")
        except EOFError:
            # stdin is closed: leave instead of looping on the same error
            break

        if input_text == 'exit':
            print("-------------------------------------------")
            print(generated)
            print("-------------------------------------------")
            break

        try:
            generated += "\n<|user|>" + input_text + "\n<|assistant|>"

            # Keep only the most recent tokens so the prompt always leaves
            # room for a reply; the transcript string itself stays complete.
            prompt_ids = tokenizer.encode(generated)[-prompt_budget:]
            gg = torch.tensor(prompt_ids).unsqueeze(0).to(device)

            # Single unpadded sequence: every position is attended to
            attention_mask = torch.ones(gg.shape, dtype=torch.long, device=device)

            # Generate output; sampling halts at the first marker token
            # rather than always running to the length limit.
            output = model.generate(
                gg,
                attention_mask=attention_mask,  # Pass the attention mask
                do_sample=True,
                top_k=50,
                max_length=max_context,
                top_p=0.95,
                num_return_sequences=1,
                pad_token_id=tokenizer.pad_token_id,  # Explicitly set pad_token_id
                eos_token_id=sorted(stop_ids),
            )

            # Look only at the newly produced tokens and cut at the first
            # marker, so the reply is a single assistant turn.
            new_tokens = output[0][gg.shape[1]:].tolist()
            cut = next(
                (pos for pos, tok in enumerate(new_tokens) if tok in stop_ids),
                len(new_tokens),
            )
            reply = tokenizer.decode(new_tokens[:cut], skip_special_tokens=True).strip()

            generated += reply
            print(reply)

        except Exception as e:
            # Best effort: report the problem and keep the session alive
            print(e)

test()