In [27]:
print("Training LifeGPT Multi Grid")

import os
os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'
import random
import numpy as np
import torch
import torch.optim as optim
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import time
from x_transformers import TransformerWrapper, Decoder
from x_transformers.autoregressive_wrapper import AutoregressiveWrapper
import datetime
from IPython.display import display, HTML
from tqdm import tqdm
import typing
import matplotlib.pyplot as plt
import csv

# =============================================================================
#                           Helper Functions / Setup
# =============================================================================

def empty_cuda_cache():
    """Ask PyTorch to release its cached CUDA memory (delegates to torch.cuda.empty_cache)."""
    cuda_backend = torch.cuda
    cuda_backend.empty_cache()

# Select the GPU when PyTorch reports one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)
print("Torch version:", torch.__version__)
# Start from a clean CUDA cache before any data or model is allocated.
empty_cuda_cache()

# =============================================================================
#                            Data Loading
# =============================================================================

# Build paths with os.path.join so the script is not tied to the Windows
# separator. The trailing separator is kept on data_path so that any code
# still doing `data_path + name` keeps working.
data_path = os.path.join("datasets", "")
train_file = os.path.join(data_path, "conway_multi_grid_train_20250107_180038.csv")
val_file   = os.path.join(data_path, "conway_multi_grid_val_20250107_181530.csv")
test_file  = os.path.join(data_path, "conway_multi_grid_test_20250107_181732.csv")

# Each CSV is expected to provide "State N" columns (see generate_data).
df_train = pd.read_csv(train_file)
df_val   = pd.read_csv(val_file)
df_test  = pd.read_csv(test_file)

# Number of training samples the training loop assumes when it derives the
# number of batches per epoch.
TRAIN_SIZE = 10000
# Not referenced elsewhere in the visible code.
TEST_SIZE  = 1000

# Single-character markers wrapped around every training sequence.
start_char = "@"
end_char   = "$"
mask_char  = ["_"]

def generate_data(df, future_steps):
    """Generate text-based sequences from DataFrame columns.

    Each row becomes
    ``<start_char>PredictNextState<{State 1}> [{State <future_steps>}]<end_char>``.

    Args:
        df: DataFrame with a "State 1" column and, optionally, a
            "State <future_steps>" column.
        future_steps: Index of the state column used as the prediction target.

    Returns:
        A list with one string per row, in row order. The list is empty when
        the target column is not present in ``df``.
    """
    future_state_col = f"State {future_steps}"
    # The column check does not depend on the row, so do it once up front.
    if future_state_col not in df.columns:
        return []
    # Iterate the column values positionally: label-based ``df[col][i]`` breaks
    # (or silently picks the wrong row) when the index is not 0..n-1.
    return [
        f"{start_char}PredictNextState<{initial}> [{future}]{end_char}"
        for initial, future in zip(df['State 1'], df[future_state_col])
    ]

# Prediction horizons to build datasets for; each entry N pairs "State 1"
# with "State N".
future_steps_list = [2]
X_data_train = {steps: generate_data(df_train, steps) for steps in future_steps_list}
X_data_val   = {steps: generate_data(df_val,   steps) for steps in future_steps_list}
X_data_test  = {steps: generate_data(df_test,  steps) for steps in future_steps_list}

print("Sample from X_data_test[2]:", X_data_test[2][0])
print("First character of sample:", X_data_test[2][0][0])

for steps in future_steps_list:
    print(f"Train set for {steps} steps: {len(X_data_train[steps])} sequences")
    print(f"Val   set for {steps} steps: {len(X_data_val[steps])} sequences")

# The tokenizer pads every sequence to max_length and rejects longer ones, and
# the validation split is tokenized with the same padding. Take the maximum
# over all splits so a longer validation/test sequence cannot overflow it.
max_length = max(
    len(seq)
    for split in (X_data_train, X_data_val, X_data_test)
    for step_key in future_steps_list
    for seq in split[step_key]
)
print("Max sequence length:", max_length)

# =============================================================================
#                            Tokenizer
# =============================================================================

num_words = 256  # vocabulary size: one token id per possible byte value


class Tokenizer:
    """Byte-level tokenizer: text <-> tensors of byte values.

    Attributes are plain values set in ``__init__``:
      n_pad    -- length every padded sequence is extended to.
      device   -- torch device the produced tensors are placed on.
      pad_byte -- token id used for padding (also the decode terminator).
    """

    def __init__(self, n_pad: int, device: torch.device, pad_byte: int = 0):
        self.n_pad = n_pad
        self.device = device
        self.pad_byte = pad_byte

    def tokenize_str(self, sentence: str, encoding="utf8", do_padding=True):
        """Encode one string as a 1-D long tensor of its bytes.

        Args:
            sentence: Text to encode.
            encoding: Codec used to turn the text into bytes.
            do_padding: When true, right-pad with ``pad_byte`` up to ``n_pad``.

        Returns:
            A ``torch.long`` tensor on ``self.device``.

        Raises:
            ValueError: If padding is requested and the encoded text is longer
                than ``n_pad``.
        """
        base = list(bytes(sentence, encoding))
        if do_padding:
            # Explicit raise instead of ``assert`` so the check survives -O.
            if len(base) > self.n_pad:
                raise ValueError(f"n_pad is too small. Need at least {len(base)}.")
            base.extend([self.pad_byte] * (self.n_pad - len(base)))
        # Build the integer tensor directly rather than via a float tensor.
        return torch.tensor(base, dtype=torch.long, device=self.device)

    def texts_to_sequences(self, texts: typing.List[str], encoding="utf8", do_padding=True):
        """Encode several strings and stack them along a new first dimension.

        Without padding, all encoded strings must have the same length for the
        concatenation to succeed.
        """
        # Forward ``encoding``; it used to be accepted but silently ignored.
        sentences = [
            self.tokenize_str(sentence, encoding=encoding, do_padding=do_padding).unsqueeze(0)
            for sentence in texts
        ]
        return torch.cat(sentences, dim=0).to(self.device)

    @staticmethod
    def prepare_texts(document: str) -> typing.List[str]:
        """Split a document into its non-empty lines."""
        return [line for line in document.split("\n") if line]

    def sequences_to_texts(self, texts: torch.Tensor, encoding="utf8"):
        """Decode a batch of token sequences back into strings.

        Each sequence is read up to (not including) the first ``pad_byte``.
        Bytes that are not valid in ``encoding`` are decoded with replacement
        characters so the output stays aligned with the input rows. A sequence
        containing a value outside 0..255 cannot be turned into bytes and is
        skipped.
        """
        out = []
        for seq in texts:
            chars = []
            for token in seq:
                value = int(token)
                if value == self.pad_byte:
                    break
                chars.append(value)
            try:
                raw = bytes(chars)
            except ValueError:
                # Token outside the byte range: nothing sensible to decode.
                continue
            out.append(raw.decode(encoding, errors="replace"))
        return out

# Shared tokenizer: pads every sequence to max_length and places tensors on `device`.
tokenizer_X = Tokenizer(max_length, device)

def tokenize_data(data, tokenizer):
    """Tokenize ``data`` with ``tokenizer.texts_to_sequences`` and echo the first result.

    Returns whatever the tokenizer produced; the first entry is printed as a
    quick visual check.
    """
    sequences = tokenizer.texts_to_sequences(data)
    first_sequence = sequences[0]
    print("Example tokenized data:", first_sequence)
    return sequences

# Tokenize the train and validation splits once per prediction horizon.
# (The test split is not pre-tokenized here.)
X_data_train_tokenized = {steps: tokenize_data(X_data_train[steps], tokenizer_X) for steps in future_steps_list}
X_data_val_tokenized   = {steps: tokenize_data(X_data_val[steps],   tokenizer_X) for steps in future_steps_list}

def tokenize_special_char(char, tokenizer):
    """Look up and print the token id of ``char``.

    Tokenizes ``[char]`` and returns the first token of the first sequence,
    moved to the CPU and converted to a NumPy value.
    """
    encoded = tokenizer.texts_to_sequences([char])
    first_token = encoded[0][0]
    token_value = first_token.cpu().numpy()
    print(f"'{char}' token:", token_value)
    return token_value

# Token ids of the special characters (each lookup also prints its result).
start_char_token = tokenize_special_char(start_char, tokenizer_X)
end_char_token   = tokenize_special_char(end_char,   tokenizer_X)
mask_token       = tokenize_special_char(mask_char[0], tokenizer_X)
# NOTE: the mask token was already printed inside tokenize_special_char.
print("Mask token:", mask_token)

def remove_start_end_token(string_input, start='@', end='$'):
    """Return ``string_input`` with every occurrence of ``start`` and then ``end`` removed."""
    cleaned = string_input
    for marker in (start, end):
        cleaned = cleaned.replace(marker, "")
    return cleaned

def remove_start_end_token_first(string_input, start='@', end='$'):
    """Return the text between the first ``start`` and the next ``end``.

    Args:
        string_input: Text to slice.
        start: Opening marker; when absent the result begins at the start of
            the string.
        end: Closing marker, searched for after the opening marker; when
            absent the result runs to the end of the string.

    Returns:
        The enclosed substring, without the markers.
    """
    start_pos = string_input.find(start)
    begin = 0 if start_pos == -1 else start_pos + len(start)
    # Search after the opening marker, and do not let a missing end marker
    # (find() == -1) turn into a slice that silently drops the last character.
    end_pos = string_input.find(end, begin)
    if end_pos == -1:
        return string_input[begin:]
    return string_input[begin:end_pos]

def extract_task(string_input, end_task_token=')'):
    """Return the prefix of ``string_input`` up to and including the first ``end_task_token``.

    An empty string is returned when the token does not occur.
    """
    try:
        cut = string_input.index(end_task_token)
    except ValueError:
        return ""
    return string_input[:cut + 1]

def extract_start_and_end(string_input, start_token='[', end_token=']'):
    """Return the text between the first ``start_token`` and the next ``end_token``.

    Args:
        string_input: Text to slice.
        start_token: Opening delimiter; when absent the result begins at the
            start of the string.
        end_token: Closing delimiter, searched for after the opening
            delimiter; when absent the result runs to the end of the string.

    Returns:
        The enclosed substring, without the delimiters.
    """
    start_pos = string_input.find(start_token)
    begin = 0 if start_pos == -1 else start_pos + len(start_token)
    # Look for the closing delimiter only after the opening one. A missing
    # delimiter (find() == -1) must not be used as a slice bound, otherwise
    # the last character is dropped without warning.
    end_pos = string_input.find(end_token, begin)
    if end_pos == -1:
        return string_input[begin:]
    return string_input[begin:end_pos]

def reverse_tokenize(tokenizer_X, X_data, X_norm_factor=1):
    """Decode token tensors back to text.

    ``X_data`` is multiplied by ``X_norm_factor``, cast to int and handed to
    ``tokenizer_X.sequences_to_texts``; the decoded strings are returned as a
    new list.
    """
    rescaled = X_data * X_norm_factor
    decoded = tokenizer_X.sequences_to_texts(rescaled.int())
    return list(decoded)

# Quick test of tokenization: decoding a tokenized training row must give
# back the original string.
aa = reverse_tokenize(tokenizer_X, X_data_train_tokenized[2][1:2])
bb = X_data_train[2][1:2]
if aa == bb:
    print("Tokenization and reverse tokenization are consistent.")
else:
    print("Inconsistent behavior detected.")

# Show what each string helper extracts from one training sequence.
print("extract_task example:", extract_task(remove_start_end_token_first(X_data_train[2][1]), end_task_token='>'))
print("Original text:", X_data_train[2][1])
print("extract_start_and_end(<,>):", extract_start_and_end(X_data_train[2][1], start_token='<', end_token='>'))
print("extract_start_and_end([,]):", extract_start_and_end(X_data_train[2][1], start_token='[', end_token=']'))
# Print the decoded and the original text of the SAME validation row (row 0);
# previously the original shown was row 1, so the two lines were not comparable.
print("Reverse token val data sample:", reverse_tokenize(tokenizer_X, X_data_val_tokenized[2][:1]))
print("Original val data sample:", X_data_val[2][0])

# =============================================================================
#                            Model Definition
# =============================================================================

# Re-select the device; this repeats the selection done near the top of the file.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device re-checked:", device)

# NOTE: redefines empty_cuda_cache with a body identical to the earlier definition.
def empty_cuda_cache():
    """Clears the CUDA cache."""
    torch.cuda.empty_cache()

def get_model(max_length, num_words, model_name,
              dim=256, depth=12, heads=8, attn_dim_head=64,
              rot_pos=True, attn_flash=True,
              masking=True, mask_prob=0.15):
    """Build the decoder-only transformer and record its configuration on disk.

    Args:
        max_length: Maximum sequence length passed to TransformerWrapper.
        num_words: Vocabulary size (number of tokens).
        model_name: Name used for the output directory and the info file.
        dim, depth, heads, attn_dim_head: Decoder size settings.
        rot_pos: Passed to the Decoder as ``rotary_pos_emb``.
        attn_flash: Passed to the Decoder as ``attn_flash``.
        masking: When true, ``mask_prob`` is forwarded to AutoregressiveWrapper.
        mask_prob: Masking probability used when ``masking`` is true.

    Returns:
        ``(model, model_dir)``: the wrapped model, moved to the module-level
        ``device``, and the directory created for its artifacts
        (``<cwd>/model_parameters/<model_name>_<timestamp>``).
    """
    empty_cuda_cache()

    model = TransformerWrapper(
        num_tokens=num_words,
        max_seq_len=max_length,
        attn_layers=Decoder(
            dim=dim,
            depth=depth,
            heads=heads,
            attn_dim_head=attn_dim_head,
            rotary_pos_emb=rot_pos,
            attn_flash=attn_flash
        )
    )

    # Only forward mask_prob when masking is requested; otherwise rely on the
    # wrapper's own default.
    wrapper_kwargs = {"mask_prob": mask_prob} if masking else {}
    model = AutoregressiveWrapper(model, **wrapper_kwargs)
    print(f"Model created: rot pos={rot_pos}, attn_flash={attn_flash}, masking={masking}")

    # Move to the device selected at module level instead of calling .cuda()
    # unconditionally, which fails on machines without a GPU.
    model.to(device)

    # NOTE(review): datetime.now() is the machine's local time; the
    # "Eastern Time" label below is hard-coded -- confirm it matches the host.
    model_creation_time = datetime.datetime.now()
    model_creation_time_str = model_creation_time.strftime("%Y-%m-%d_%H-%M-%S")
    print(f'Model "{model_name}" Created @ {model_creation_time_str} Eastern Time')

    num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"Model has {num_params} trainable parameters")

    # One directory per model instance, keyed by name and creation time.
    model_dir = os.path.join(os.getcwd(), "model_parameters", f"{model_name}_{model_creation_time_str}")
    print("DEBUG: Attempting to create model_dir:", model_dir)
    os.makedirs(model_dir, exist_ok=True)
    print("DEBUG: Directory exists after creation?", os.path.exists(model_dir))

    model_info_file = os.path.join(model_dir, f"{model_name}_info.txt")
    print("DEBUG: model_info_file:", model_info_file)

    # Explicit encoding so the file does not depend on the platform default.
    with open(model_info_file, "w", encoding="utf-8") as f:
        f.write(f"Model Name: {model_name}\n")
        f.write(f"Model Created @ {model_creation_time_str} Eastern Time\n")
        f.write(f"Number of trainable parameters: {num_params}\n")
        f.write(f"Model Architecture:\n{model}\n")
        f.write("Model Parameters:\n")
        f.write(f"num_tokens: {num_words}\n")
        f.write(f"max_seq_len: {max_length}\n")
        f.write(f"dim: {dim}\n")
        f.write(f"depth: {depth}\n")
        f.write(f"heads: {heads}\n")
        f.write(f"attn_dim_head: {attn_dim_head}\n")
        f.write(f"rotary_pos_emb: {rot_pos}\n")
        f.write(f"attn_flash: {attn_flash}\n\n")
        f.write("Note: Jan-08-This model is looking at the effects of varying grid sizes on generalization.")

    return model, model_dir

# Instantiate the model with the default size settings and token masking enabled.
model_name = "01_07_2025_Conway_Multi_Grid"
rot_pos = True
model, model_dir = get_model(max_length, num_words, model_name, rot_pos=rot_pos, masking=True, mask_prob=0.15)

empty_cuda_cache()
print("CUDA cache cleared.")

LEARNING_RATE = 1e-4
# NOTE(review): this rebinds the name `optim`, which the imports at the top of
# the file bind to the torch.optim module. The training loop uses `optim` as
# the optimizer instance, so a rename has to be done in both places together.
optim = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

# =============================================================================
#                          DataLoader / Datasets
# =============================================================================

def cycle(loader):
    """Endlessly re-iterate ``loader``, yielding its items pass after pass."""
    while True:
        yield from loader

class RegressionDataset(Dataset):
    """Thin Dataset adapter over an indexable collection of samples."""

    def __init__(self, X_data):
        # Keep a reference to the samples; nothing is copied or transformed.
        self.X_data = X_data

    def __len__(self):
        """Number of samples, as reported by the wrapped collection."""
        samples = self.X_data
        return len(samples)

    def __getitem__(self, index):
        """Return the sample stored at ``index`` in the wrapped collection."""
        samples = self.X_data
        return samples[index]

# Training-schedule settings.
NUM_EPOCHS                = 50
SAVE_EPOCH                = 2     # save a checkpoint every this many epochs
VALIDATE_EVERY            = 5     # print a progress line every this many batches
GENERATE_EVERY            = 10    # not referenced in the visible code
GRADIENT_ACCUMULATE_EVERY = 5     # optimizer step every this many batches
MEASURE_ACC_EVERY         = 1000  # run the generation accuracy probe every this many batches
BATCH_SIZE                = 5

# Train on the first configured prediction horizon.
steps = future_steps_list[0]
# Tokens to generate = padded length minus the prompt length of the FIRST
# training sequence.
# NOTE(review): if prompts differ in length between samples, this budget is
# only exact for sequences shaped like that first one -- confirm.
GENERATE_LENGTH = max_length - len(extract_task(X_data_train[steps][0], end_task_token='>'))
print("GENERATE_LENGTH =", GENERATE_LENGTH)
print(f"Predicting jump from State 1 to State {steps}")

train_dataset = RegressionDataset(X_data_train_tokenized[steps])
val_dataset   = RegressionDataset(X_data_val_tokenized[steps])
# Both loaders are wrapped in cycle() so next() never raises StopIteration.
train_loader  = cycle(DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=True))
val_loader    = cycle(DataLoader(val_dataset,   batch_size=BATCH_SIZE, shuffle=True, drop_last=True))

# Pull one batch from each loader to report the batch shapes.
train_sample = next(train_loader)
val_sample   = next(val_loader)
print("Train batch shape:", train_sample.shape)
print("Validation batch shape:", val_sample.shape)

torch.cuda.empty_cache()

# Notebook display tweak: injects CSS that raises max-height for .output_result.
display(HTML("<style>.output_result { max-height:10000px; }</style>"))

# =============================================================================
#                           Generation and Accuracy
# =============================================================================

def generate_sample():
    """Generate a continuation for a random validation prompt.

    Picks a random validation sequence for the active prediction horizon
    (module-level ``steps``), keeps its task prefix (everything up to and
    including ``>``) and lets the model generate ``GENERATE_LENGTH`` tokens.

    Returns:
        A list of decoded strings, or ``["non utf token found"]`` when the
        generated tokens could not be decoded.
    """
    model.eval()
    # X_data_val is a dict keyed by prediction horizon; random.choice must be
    # given the list of sequences, not the dict itself.
    inp_str = extract_task(random.choice(X_data_val[steps]), end_task_token=">")
    # texts_to_sequences iterates its argument, so a bare string is tokenized
    # one character at a time and stacked along dim 0; the transpose swaps the
    # two dimensions so the characters lie along the sequence axis. The result
    # is already an integer tensor on the target device.
    inp = tokenizer_X.texts_to_sequences(inp_str, do_padding=False).transpose(0, 1)
    sample = model.generate(prompts=inp, seq_len=GENERATE_LENGTH, cache_kv=True)
    try:
        output_str = reverse_tokenize(tokenizer_X, sample[:1])
    except Exception:
        output_str = []
    # An empty result means decoding failed; report it with the fallback
    # message instead of returning an empty list.
    return output_str or ["non utf token found"]

def count_mismatches(ground_truth, pred):
    """Compare two sequences position by position.

    Positions present in only one of the two sequences count as mismatches,
    so a prediction that is too short or too long is penalised instead of
    being silently truncated to the shorter length.

    Args:
        ground_truth: Expected sequence (e.g. a string of cell states).
        pred: Predicted sequence.

    Returns:
        ``(mismatches, accuracy)`` where ``accuracy`` is
        ``1 - mismatches / max(len(ground_truth), len(pred))``; two empty
        sequences give ``(0, 1.0)``. For equal-length inputs this matches the
        plain per-position accuracy.
    """
    positional = sum(1 for gt, p in zip(ground_truth, pred) if gt != p)
    mismatches = positional + abs(len(ground_truth) - len(pred))
    longest = max(len(ground_truth), len(pred))
    if longest == 0:
        # Nothing to compare: avoid dividing by zero.
        return 0, 1.0
    accuracy = 1 - mismatches / longest
    return mismatches, accuracy

# =============================================================================
#                         CSV Logging for Loss / Accuracy
# =============================================================================

# Create (or truncate) the per-batch log inside the model directory and write
# the header row; the training loop appends one row per batch.
csv_file_path = os.path.join(model_dir, "loss_data.csv")
print("DEBUG: Writing CSV to:", csv_file_path)
with open(csv_file_path, mode="w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(["epoch", "batch_within_epoch", "batch_overall", "train_loss", "val_loss", "accuracy", "elapsed_time"])

# In-memory history filled by the training loop.
train_losses         = []
val_losses           = []
batches              = []
accuracies           = []
accuracies_batches   = []
epoch_list           = []
batch_overall        = []

# NOTE: total_batches is computed here but not read by the visible training loop.
total_batches = NUM_EPOCHS * int(TRAIN_SIZE / BATCH_SIZE)
overall_batch_counter = 0
# Last measured accuracy; logged on every row until the next measurement.
accuracy = 0
start_time = time.time()

# =============================================================================
#                              Training Loop
# =============================================================================

def _measure_test_accuracy(num_samples=10):
    """Return the mean per-character accuracy over the first test sequences.

    For each of the first ``num_samples`` test sequences of the active
    prediction horizon, the task prefix is fed to the model and the generated
    bracketed state is compared with the ground truth. A sample whose output
    cannot be scored is reported and skipped, so one bad generation no longer
    aborts the whole measurement. Returns 0 when no sample could be scored.
    """
    accuracy_list = []
    # Slicing copes with test sets smaller than num_samples.
    for inp_seq in X_data_test[steps][:num_samples]:
        inp_str = extract_task(inp_seq, end_task_token=">")
        # A bare string is tokenized character by character and stacked along
        # dim 0; the transpose puts the characters along the sequence axis.
        # The tensor is already integer-typed and on the target device.
        inp = tokenizer_X.texts_to_sequences(inp_str, do_padding=False).transpose(0, 1)
        with torch.no_grad():
            sample = model.generate(prompts=inp, seq_len=GENERATE_LENGTH, cache_kv=True)
        try:
            output_str = reverse_tokenize(tokenizer_X, sample[:1])
            pred         = extract_start_and_end(output_str[0], start_token="[", end_token="]")
            ground_truth = extract_start_and_end(inp_seq,       start_token="[", end_token="]")
            _, acc = count_mismatches(ground_truth=ground_truth, pred=pred)
        except Exception as e:
            print("Error decoding output:", e)
            continue
        accuracy_list.append(acc)

    return np.mean(accuracy_list) if accuracy_list else 0


# Batches per epoch is fixed by the configuration, so compute it once.
num_batches = TRAIN_SIZE // BATCH_SIZE

for epoch in range(NUM_EPOCHS):
    print(f"Epoch {epoch+1}/{NUM_EPOCHS}")
    epoch_start_time = time.time()

    for i in range(num_batches):
        model.train()
        loss = model(next(train_loader))
        # NOTE(review): the loss is not divided by GRADIENT_ACCUMULATE_EVERY,
        # so the accumulated gradients are summed (not averaged) before
        # clipping -- confirm this is intended.
        loss.backward()
        # Read the scalar once instead of on every use.
        train_loss = loss.item()

        if (i + 1) % GRADIENT_ACCUMULATE_EVERY == 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
            optim.step()
            optim.zero_grad()

        # One validation batch is evaluated after every training batch.
        model.eval()
        with torch.no_grad():
            val_loss = model(next(val_loader)).item()

        train_losses.append(train_loss)
        val_losses.append(val_loss)
        batches.append(overall_batch_counter)
        batch_overall.append(overall_batch_counter)

        if (i + 1) % MEASURE_ACC_EVERY == 0:
            accuracy = _measure_test_accuracy()
            accuracies.append(accuracy)
            accuracies_batches.append(overall_batch_counter)

        # Append one row per batch; reopening the file each time keeps the
        # log on disk up to date if the run is interrupted.
        elapsed_time = time.time() - start_time
        with open(csv_file_path, mode="a", newline="") as file:
            writer = csv.writer(file)
            writer.writerow([epoch, i, overall_batch_counter, train_loss, val_loss, accuracy, elapsed_time])

        if (i + 1) % VALIDATE_EVERY == 0:
            print(f"Batch {i+1}/{num_batches}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Accuracy: {accuracy:.4f}")

        overall_batch_counter += 1

    if (epoch + 1) % SAVE_EPOCH == 0:
        model_save_path = os.path.join(model_dir, f"LifeGPT_multi_grid_epoch_{epoch+1}.pt")
        torch.save(model.state_dict(), model_save_path)
        print(f"Model saved at epoch {epoch+1}, file: {model_save_path}")

    epoch_end_time = time.time()
    print(f"Epoch {epoch+1} time: {epoch_end_time - epoch_start_time:.2f} seconds")


Training LifeGPT Multi Grid
Device: cuda
Torch version: 2.2.2
Sample from X_data_test[2]: @PredictNextState<0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000> [0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000]$
First character of sample: @
Train set for 2 steps: 10000 sequences
Val   set for 2 steps: 1000 sequences
Max sequence length: 535
Example tokenized data: tensor([ 64,  80, 114, 101, 100, 105,  99, 116,  78, 101, 120, 116,  83, 116,
         97, 116, 101,  60,  48,  48,  48,  48,  48,  48,  48,  48,  48,  48,
         48,  48,  48,  48,  48,  48,  48,  48,  48,  48,  48, 

DEBUG: Writing CSV to: c:\Users\jaime\ML_Playground_1\LifeGPT\Multi_Grid\model_parameters\01_07_2025_Conway_Multi_Grid_2025-01-08_13-09-21\loss_data.csv
Epoch 1/50
Batch 5/2000, Train Loss: 5.5755, Val Loss: 2.8812, Accuracy: 0.0000
Batch 10/2000, Train Loss: 2.8896, Val Loss: 1.2987, Accuracy: 0.0000
Batch 15/2000, Train Loss: 2.0007, Val Loss: 1.2733, Accuracy: 0.0000
Batch 20/2000, Train Loss: 1.3060, Val Loss: 0.9320, Accuracy: 0.0000
Batch 25/2000, Train Loss: 0.8225, Val Loss: 0.9456, Accuracy: 0.0000
Batch 30/2000, Train Loss: 0.6265, Val Loss: 0.6486, Accuracy: 0.0000
Batch 35/2000, Train Loss: 0.8255, Val Loss: 0.6667, Accuracy: 0.0000
Batch 40/2000, Train Loss: 0.6604, Val Loss: 0.8544, Accuracy: 0.0000
Batch 45/2000, Train Loss: 0.6476, Val Loss: 0.7168, Accuracy: 0.0000
Batch 50/2000, Train Loss: 0.7845, Val Loss: 0.7942, Accuracy: 0.0000
Batch 55/2000, Train Loss: 0.6473, Val Loss: 0.4893, Accuracy: 0.0000
Batch 60/2000, Train Loss: 0.6957, Val Loss: 0.4666, Accuracy: 0.00

In [None]:
# Local imports so this cell can be run on its own in a fresh kernel.
import os
import pandas as pd

# Build paths with os.path.join so the cell is not tied to the Windows
# separator; the trailing separator keeps `data_path + name` usable too.
data_path = os.path.join("datasets", "")
train_file = os.path.join(data_path, "conway_multi_grid_train_20250107_180038.csv")
val_file   = os.path.join(data_path, "conway_multi_grid_val_20250107_181530.csv")
test_file  = os.path.join(data_path, "conway_multi_grid_test_20250107_181732.csv")

df_train = pd.read_csv(train_file)
df_val   = pd.read_csv(val_file)
df_test  = pd.read_csv(test_file)

In [1]:
def check_duplicates(test_df=None, train_df=None):
    """Report test rows whose state columns all match some training row.

    Two rows are duplicates when every column of the test frame whose name
    starts with ``State`` holds an equal value in the training row.

    Args:
        test_df: Test DataFrame; defaults to the module-level ``df_test``.
        train_df: Train DataFrame; defaults to the module-level ``df_train``.

    Prints one line per duplicate pair (test rows in order, matching train
    rows in order), or a message saying none were found. Returns nothing.
    """
    test_df = df_test if test_df is None else test_df
    train_df = df_train if train_df is None else train_df

    state_cols = [col for col in test_df.columns if col.startswith('State')]

    # Index the training rows once by their full tuple of state values, so
    # each test row is a single dictionary lookup instead of a scan over the
    # whole training frame.
    train_rows = {}
    train_values = train_df[state_cols].itertuples(index=False, name=None)
    for train_idx, values in zip(train_df.index, train_values):
        train_rows.setdefault(values, []).append(train_idx)

    duplicates = []
    test_values = test_df[state_cols].itertuples(index=False, name=None)
    for test_idx, state_1, values in zip(test_df.index, test_df['State 1'], test_values):
        for train_idx in train_rows.get(values, ()):
            duplicates.append({
                'test_row': test_idx,
                'train_row': train_idx,
                'state_1': state_1
            })

    if duplicates:
        print(f"Found {len(duplicates)} duplicate samples:")
        for dup in duplicates:
            print(f"Test row {dup['test_row']} matches Train row {dup['train_row']}")
    else:
        print("No duplicates found between test and train datasets")

# Run the check. check_duplicates reads the module-level df_test / df_train,
# so the data-loading cell must have been executed first; otherwise the call
# fails with a NameError.
check_duplicates()

NameError: name 'df_test' is not defined