In [None]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2ForSequenceClassification
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, TensorDataset
import matplotlib.pyplot as plt

In [None]:
# Select the torch compute device: CUDA when a GPU is usable, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

In [None]:
# tokenizer = GPT2Tokenizer.from_pretrained('gpt2-large')
# model = GPT2LMHeadModel.from_pretrained('gpt2-large')
# model = model.to(device)

In [None]:
def choose_from_top(probs, n=5):
    """Sample one index from the ``n`` largest entries of ``probs``.

    The ``n`` largest values are renormalised so they sum to one, and a single
    index is drawn at random from that truncated distribution.

    Args:
        probs: 1-D array-like of non-negative scores/probabilities.
        n: Number of top candidates to sample from. It is clamped to the
            number of entries in ``probs`` so that a distribution shorter
            than ``n`` no longer makes ``np.argpartition`` raise.

    Returns:
        int: Index into ``probs`` of the sampled entry.

    Raises:
        ValueError: If ``probs`` is empty or ``n`` is smaller than 1.
    """
    # Accept lists / tuples as well as ndarrays (fancy indexing below needs an array).
    probs = np.asarray(probs)
    if n < 1 or probs.size == 0:
        raise ValueError("choose_from_top needs a non-empty `probs` and n >= 1")

    # Never ask for more candidates than there are entries.
    k = min(n, probs.size)

    # argpartition puts the k largest values (in arbitrary order) in the last k slots.
    ind = np.argpartition(probs, -k)[-k:]
    top_prob = probs[ind]
    top_prob = top_prob / np.sum(top_prob)  # Normalize

    # Draw a position within the top-k, then map it back to the original index.
    choice = np.random.choice(k, 1, p=top_prob)
    token_id = ind[choice][0]
    return int(token_id)

### Dataset Preparation

In [None]:
traces_file = 'traces.csv'
# Set to an int to load only the first `nrows` traces (debug purposes);
# None loads the whole file.
nrows = None
batch_size = 128
# Fixed seed so the train/test split is the same on every Restart & Run All.
split_seed = 42

# Parse the CSV once and reuse the frame; it used to be read a second time
# only to print its length.
traces = pd.read_csv(traces_file, nrows=nrows)
print('number of traces: ', len(traces))

# Split into train and test
train_data, test_data = train_test_split(traces, test_size=0.25, random_state=split_seed)

# get number of unique values in each column
# .iloc[-1] picks the last column's count positionally; `Series[-1]` on a
# label index is deprecated in pandas.
print('number of unique values to predict: ', traces.nunique().iloc[-1])

In [None]:
def fit_trace_encoders(data):
    """Fit one ``LabelEncoder`` per trace column of ``data``.

    Args:
        data: DataFrame with the columns 'pc', 'delta_in' and 'delta_out'.

    Returns:
        dict: Maps each of 'pc', 'delta_in', 'delta_out' to a LabelEncoder
        fitted on that column.
    """
    return {
        column: LabelEncoder().fit(data[column])
        for column in ('pc', 'delta_in', 'delta_out')
    }


def _encode_column(encoder, values):
    """Map ``values`` to the encoder's class indices.

    Values the encoder has never seen get the extra index
    ``len(encoder.classes_)`` instead of raising, so a held-out split can be
    encoded with encoders fitted on the training split.
    """
    n_known = len(encoder.classes_)
    # get_indexer returns -1 for values that are not among the known classes.
    codes = pd.Index(encoder.classes_).get_indexer(values)
    return np.where(codes < 0, n_known, codes)


def load_data(data, batch_size, encoders=None):
    """Encode the trace columns and wrap them in a ``DataLoader``.

    Unlike the previous version, the caller's DataFrame is not modified (no
    ``*_encoded`` columns are written into it).

    Args:
        data: DataFrame with the columns 'pc', 'delta_in' and 'delta_out'.
        batch_size: Batch size of the returned DataLoader.
        encoders: Optional dict as returned by ``fit_trace_encoders``. When
            omitted, fresh encoders are fitted on ``data`` itself (the
            original behaviour). Pass the encoders fitted on the training
            split when loading the test split so that both splits share one
            id space.

    Returns:
        tuple: ``(data_loader, num_pc, num_delta_in, num_delta_out,
        target_keys)`` where the three counts are the number of classes known
        to the respective encoder and ``target_keys`` is the set of encoded
        'delta_out' values present in ``data``. Batches of ``data_loader``
        are ``(pc, delta_in, targets)`` int64 tensors in the row order of
        ``data`` (``shuffle=False``).
    """
    if encoders is None:
        encoders = fit_trace_encoders(data)

    pc_codes = _encode_column(encoders['pc'], data['pc'])
    delta_in_codes = _encode_column(encoders['delta_in'], data['delta_in'])
    delta_out_codes = _encode_column(encoders['delta_out'], data['delta_out'])

    # Convert the encoded columns to tensors, explicitly as torch.int64
    pc = torch.tensor(pc_codes, dtype=torch.int64)
    delta_in = torch.tensor(delta_in_codes, dtype=torch.int64)
    targets = torch.tensor(delta_out_codes, dtype=torch.int64)

    dataset = TensorDataset(pc, delta_in, targets)
    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    # Unique encoded target values present in this split
    target_keys = set(np.unique(delta_out_codes))

    return (
        data_loader,
        len(encoders['pc'].classes_),
        len(encoders['delta_in'].classes_),
        len(encoders['delta_out'].classes_),
        target_keys,
    )

# Fit the encoders on the training split only and reuse them for the test
# split. Fitting a second set of encoders on the test split (as before) gives
# the same raw value different ids in the two splits.
trace_encoders = fit_trace_encoders(train_data)
train_iter, num_pc, num_delta_in, num_output_next, target_keys = load_data(
    train_data, batch_size=batch_size, encoders=trace_encoders)
test_iter, _, _, _, _ = load_data(
    test_data, batch_size=batch_size, encoders=trace_encoders)
# add 1 for values that are not in the training set: this is the reserved
# index that unseen test values are mapped to
num_pc += 1
num_delta_in += 1
num_output_next += 1

print('number of unique pc: ', num_pc)
print('number of unique input delta: ', num_delta_in)
print('number of unique output delta: ', num_output_next)


In [None]:
# # Example of encoding a single data point
# def encode_data_point(pc, delta_in, delta_out):
#     # Convert to strings and concatenate
#     text = f"PC: {pc} Delta In: {delta_in} Delta Out: {delta_out}"
#     # Encode using GPT-2 tokenizer
#     return tokenizer(text, return_tensors='pt')

# # Example of converting a dataset row into GPT-2 compatible format
# encoded_inputs = [encode_data_point(row.pc, row.delta_in, row.delta_out) for index, row in traces.iterrows()]


### Model Definition

In [None]:
hparams = {
    'learning_rate': 0.0001,
    'epochs': 3,
    # Print the running loss every `log_every` batches instead of on every batch.
    'log_every': 100,
}
learning_rate = hparams['learning_rate']
epochs = hparams['epochs']
log_every = hparams['log_every']

# Initialize tokenizer and model exactly once. The previous version built
# both twice, moving a first model to the device only to discard it.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# Reuse EOS as the pad token so batches of different lengths can be padded
# (it only needs to be set once, outside of the loop).
tokenizer.pad_token = tokenizer.eos_token

model = GPT2ForSequenceClassification.from_pretrained('gpt2', num_labels=num_output_next).to(device)
# Update the model's configuration to recognize the same pad token
model.config.pad_token_id = tokenizer.pad_token_id

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# Training loop
model.train()
for epoch in range(epochs):
    epoch_loss = 0.0
    num_batches = 0
    for step, batch in enumerate(train_iter, start=1):
        # The DataLoader yields batches as (pc, delta_in, targets)
        pc, delta_in, targets = batch
        labels = targets.to(device)  # Ensure labels are on the correct device

        # Build one prompt per sample from the encoded pc and delta_in values.
        # tolist() converts the whole batch at once instead of one .item() per value.
        input_strings = [
            'PC: {} Delta In: {}'.format(pc_val, delta_in_val)
            for pc_val, delta_in_val in zip(pc.tolist(), delta_in.tolist())
        ]

        # Tokenize the input strings
        encoding = tokenizer(input_strings, return_tensors='pt', padding=True, truncation=True, max_length=512)
        input_ids = encoding['input_ids'].to(device)
        attention_mask = encoding['attention_mask'].to(device)

        # Perform the forward pass; passing labels makes the model return the loss
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Backward and optimization step
        optimizer.zero_grad()  # Clear existing gradients before the backward pass
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        num_batches += 1
        if step % log_every == 0:
            print(f"Epoch {epoch + 1}, Step {step}, Loss: {loss.item():.4f}")

    # Guard against an empty loader so the summary never divides by zero.
    if num_batches:
        print(f"Epoch {epoch + 1}, Mean Loss: {epoch_loss / num_batches:.4f}")


In [None]:
# TEST THE MODEL
# Accumulate a sample-weighted loss and the number of correct predictions so
# one overall figure is reported instead of one noisy loss line per batch.
model.eval()
total_loss = 0.0
total_correct = 0
total_samples = 0
with torch.no_grad():
    for batch in test_iter:
        pc, delta_in, targets = batch
        labels = targets.to(device)  # Ensure labels are on the correct device

        # Build one prompt per sample from the encoded pc and delta_in values
        input_strings = [
            'PC: {} Delta In: {}'.format(pc_val, delta_in_val)
            for pc_val, delta_in_val in zip(pc.tolist(), delta_in.tolist())
        ]

        # Tokenize the input strings
        encoding = tokenizer(input_strings, return_tensors='pt', padding=True, truncation=True, max_length=512)
        input_ids = encoding['input_ids'].to(device)
        attention_mask = encoding['attention_mask'].to(device)

        # Perform the forward pass
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Weight the batch loss by the batch size so a smaller final batch
        # does not skew the overall mean.
        batch_len = labels.size(0)
        total_loss += loss.item() * batch_len
        total_correct += (outputs.logits.argmax(dim=-1) == labels).sum().item()
        total_samples += batch_len

if total_samples:
    test_loss = total_loss / total_samples
    test_accuracy = total_correct / total_samples
    print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")
else:
    print("Test set is empty; nothing to evaluate.")
        
        

In [None]:
# # Example of using the model to generate a sequence
# pc = 140269433753112
# delta_in = 1

# labels = targets.to(device)  # Ensure labels are on the correct device

# # Generate input strings for GPT-2 from pc and delta_in values
# # This needs to be done within the loop to access the current batch's pc and delta_in
# # input_strings = ['PC: {} Delta In: {}'.format(pc_val.item(), delta_in_val.item()) for pc_val, delta_in_val in zip(pc, delta_in)]
# input_strings = ['PC: {} Delta In: {}'.format(pc, delta_in)]

# # Tokenize the input strings
# encoding = tokenizer(input_strings, return_tensors='pt', padding=True, truncation=True, max_length=512)
# input_ids = encoding['input_ids'].to(device)
# attention_mask = encoding['attention_mask'].to(device)

# # Perform the forward pass
# outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
