In [1]:
import numpy as np
import pandas as pd
import json
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import time
import math
import os
from tempfile import TemporaryDirectory
from typing import Tuple
import torch
from torch import nn, Tensor
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.utils.data import dataset

In [2]:
file_path = "../data/alpaca_data_cleaned_subset.json"
context_length = 10

In [3]:
def read_json(file_path):
    with open(file_path, 'r') as f:
        return json.load(f)

def preprocess_data(data):
    sequences = []
    for item in data:
        sequence = ['[SOS]'] + list(item['instruction']) + ['\n'] + list(item['input']) + ['\n'] + list(item['output']) + ['[EOS]']
        sequences.append(sequence)
    return sequences

valid_chars = list('abcdefghijklmnopqrstuvwxyz ?!.,\n')
def preprocess_data_simple(data):
    sequences = []
    for item in data:
        instruction_alpha = ''.join([char for char in item['instruction'].lower() if char in valid_chars])
        input_alpha = ''.join([char for char in item['input'].lower() if char in valid_chars])
        output_alpha = ''.join([char for char in item['output'].lower() if char in valid_chars])

        sequence = ['[SOS]'] + list(instruction_alpha) + ['\n'] + list(input_alpha) + ['\n'] + list(output_alpha) + ['[EOS]']
        sequences.append(sequence)
    return sequences

# Add another character for the beginning of the sequence (or maybe just continue with [SOS]?) When there are no characters at the beginning of the sequence.

def create_vocab(sequences):
    chars = [char for seq in sequences for char in seq]
    return sorted(set(chars))

def one_hot_encode(sequence, char_to_idx):
    one_hot = np.zeros((len(sequence), len(char_to_idx)), dtype=np.int32)
    for i, char in enumerate(sequence):
        one_hot[i, char_to_idx[char]] = 1
    return one_hot

def numerical_encode(sequence, char_to_idx):
    numerical_encoding = np.zeros(len(sequence), dtype=np.int32)
    for i, char in enumerate(sequence):
        numerical_encoding[i] = char_to_idx[char]
    return numerical_encoding

def create_training_examples(sequences, char_to_idx, input_length=10):
    x = []
    y = []

    for seq in sequences:
        encoded_seq = one_hot_encode(seq, char_to_idx)
        numerical_encode_seq = numerical_encode(seq, char_to_idx)
        total_chars = len(seq)

        for i in range(total_chars - input_length):
            x.append(encoded_seq[i:i+input_length])
            y.append(encoded_seq[i+input_length])

    return np.array(x), np.array(y)

# Main script
data = read_json(file_path)
#sequences = preprocess_data(data)
sequences = preprocess_data_simple(data)

vocab = create_vocab(sequences)
char_to_idx = {char: idx for idx, char in enumerate(vocab)}
idx_to_char = {idx: char for char, idx in char_to_idx.items()}

X, Y = create_training_examples(sequences, char_to_idx, input_length=context_length)
X = X.reshape(X.shape[0], -1)

print(f"Number of training examples: {X.shape[0]}")
print(f"Example input shape: {X[0].shape}")
print(f"output shape: {Y.shape}")

Number of training examples: 19261
Example input shape: (330,)
output shape: (19261, 33)


In [4]:
X.shape, Y.shape

((19261, 330), (19261, 33))

In [5]:
X = X[0:5_000_000]
Y = Y[0:5_000_000]

In [6]:
X[0]

array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [7]:
Y[0]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

In [8]:
X.shape, Y.shape

((19261, 330), (19261, 33))

In [9]:
# Split the data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

In [10]:
X_train.shape, Y_train.shape

((15408, 330), (15408, 33))

In [11]:
input_size = 340  # Assuming 340 characters in the vocabulary
output_size = 34  # Output is the index of the next character
embed_size = 32
num_heads = 1 # 2
num_layers = 1 # 2
batch_size = 8
learning_rate = 0.01 # 0.01
weight_decay = 1e-5  # Adjust the weight decay value as needed
num_epochs = 2 # 10

In [12]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

# Define the Transformer model for sequence-to-sequence prediction
class CharTransformer(nn.Module):
    def __init__(self, input_size, output_size, embed_size, num_heads, num_layers):
        super(CharTransformer, self).__init__()
        self.embedding = nn.Embedding(input_size, embed_size)
        self.transformer = nn.Transformer(
            d_model=embed_size,
            nhead=num_heads,
            num_encoder_layers=num_layers,
            num_decoder_layers=num_layers,
        )
        self.fc = nn.Linear(embed_size, output_size)

    def forward(self, x):
        x = self.embedding(x)
        x = self.transformer(x, x)

        # Take the representation of the last token from each sequence
        x_last_token = x[:, -1, :]

        # Apply the linear layer to each sequence
        output = self.fc(x_last_token)

        return output

# Define a custom dataset
class CharDataset(Dataset):
    def __init__(self, X, Y):
        self.X = X
        self.Y = Y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        input_vector = self.X[idx]
        target_index = self.Y[idx]
        return input_vector, target_index

# Hyperparameters
input_size = 340  # Assuming 340 characters in the vocabulary
output_size = 34  # Output is the index of the next character
embed_size = 32
num_heads = 2 # 2
num_layers = 2 # 2
batch_size = 8
learning_rate = 0.01 # 0.01
weight_decay = 1e-5  # Adjust the weight decay value as needed
num_epochs = 10 # 10



# Create model, loss, and optimizer
model = CharTransformer(input_size, output_size, embed_size, num_heads, num_layers)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

# Create the dataset and DataLoader
dataset = CharDataset(X_train, Y_train)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Training loop
for epoch in range(num_epochs):
    total_loss = 0
    total_correct = 0
    for batch in tqdm(dataloader, desc=f'Epoch {epoch + 1}/{num_epochs}'):
        inputs, targets = batch

        # Convert inputs to PyTorch tensor
        input_tensor = torch.tensor(inputs).long()
        # print(inputs.shape, input_tensor.shape)
        target_tensor = torch.tensor(targets).long()
        # print(targets.shape, target_tensor.shape)

        # Add batch and sequence dimensions
        optimizer.zero_grad()
        outputs = model(input_tensor)

        # Ensure the output has shape (batch_size, output_size)
        # print("output shape:", outputs.shape)

        # CrossEntropyLoss expects target_tensor to have shape (batch_size,)
        target_tensor = target_tensor[:, -1]  # Take the last character as the target
        target_tensor = target_tensor.view(-1)

        # Check if batch sizes match before calculating the loss
        assert outputs.size(0) == target_tensor.size(0), "Batch sizes do not match"

        # Calculate the loss
        loss = criterion(outputs, target_tensor)
        loss.backward()
        optimizer.step()

        # Calculate accuracy on this batch
        # Get predicted characters (argmax along the second dimension)
        predicted_chars = torch.argmax(outputs, dim=1)  # Use dim=1 for the second dimension
        true_chars = torch.nonzero(targets, as_tuple=False)[:, 1]

        # Count correct predictions
        correct_predictions = (predicted_chars == true_chars)
        total_correct += correct_predictions.sum().item()

        total_loss += loss.item()

    average_loss = total_loss / len(dataloader)
    accuracy = total_correct / len(dataloader)
    print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {average_loss}, Accuracy: {accuracy * 100:.2f}%')

# Save the trained model
torch.save(model.state_dict(), 'char_transformer_model.pth')


  input_tensor = torch.tensor(inputs).long()
  target_tensor = torch.tensor(targets).long()
Epoch 1/10: 100%|██████████| 1926/1926 [06:11<00:00,  5.19it/s]


Epoch 1/10, Loss: 0.009958996237089486, Accuracy: 6.49%


Epoch 2/10: 100%|██████████| 1926/1926 [05:58<00:00,  5.38it/s]


Epoch 2/10, Loss: 0.005765530230630024, Accuracy: 6.49%


Epoch 3/10: 100%|██████████| 1926/1926 [06:05<00:00,  5.27it/s]


Epoch 3/10, Loss: 0.005493828691024898, Accuracy: 6.49%


Epoch 4/10: 100%|██████████| 1926/1926 [05:55<00:00,  5.41it/s]


Epoch 4/10, Loss: 0.004959368609379717, Accuracy: 6.49%


Epoch 5/10: 100%|██████████| 1926/1926 [06:01<00:00,  5.33it/s]


Epoch 5/10, Loss: 0.00551295438841598, Accuracy: 6.49%


Epoch 6/10: 100%|██████████| 1926/1926 [06:02<00:00,  5.31it/s]


Epoch 6/10, Loss: 0.005464767994692194, Accuracy: 6.49%


Epoch 7/10: 100%|██████████| 1926/1926 [05:58<00:00,  5.37it/s]


Epoch 7/10, Loss: 0.004958967814159313, Accuracy: 6.49%


Epoch 8/10: 100%|██████████| 1926/1926 [05:58<00:00,  5.37it/s]


Epoch 8/10, Loss: 0.005621875166058545, Accuracy: 6.49%


Epoch 9/10: 100%|██████████| 1926/1926 [05:58<00:00,  5.37it/s]


Epoch 9/10, Loss: 0.005259834920566641, Accuracy: 6.49%


Epoch 10/10: 100%|██████████| 1926/1926 [05:58<00:00,  5.37it/s]

Epoch 10/10, Loss: 0.004956257450301324, Accuracy: 6.49%





In [13]:
# Load the trained model
model = CharTransformer(input_size, output_size, embed_size, num_heads, num_layers)
model.load_state_dict(torch.load('char_transformer_model.pth'))
model.eval()

# Create the test dataset and DataLoader
test_dataset = CharDataset(X_test, Y_test)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Evaluation loop
total_correct = 0
total_samples = 0

with torch.no_grad():
    for batch in test_dataloader:
        inputs, targets = batch

        # Convert inputs to PyTorch tensor
        input_tensor = torch.tensor(inputs).long()
        target_tensor = torch.tensor(targets).long()

        # Forward pass
        outputs = model(input_tensor)

        # Get predicted characters (argmax along the second dimension)
        predicted_chars = torch.argmax(outputs, dim=1)  # Use dim=1 for the second dimension
        true_chars = torch.nonzero(targets, as_tuple=False)[:, 1]

        # Count correct predictions
        correct_predictions = (predicted_chars == true_chars)
        total_correct += correct_predictions.sum().item()
        total_samples += correct_predictions.size(0)

# Calculate accuracy
accuracy = total_correct / total_samples
print(f'Test Accuracy: {accuracy * 100:.2f}%')


  input_tensor = torch.tensor(inputs).long()
  target_tensor = torch.tensor(targets).long()


Test Accuracy: 0.83%
