In [1]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2ForSequenceClassification
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, TensorDataset
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm
Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# setup device for torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [3]:
# tokenizer = GPT2Tokenizer.from_pretrained('gpt2-large')
# model = GPT2LMHeadModel.from_pretrained('gpt2-large')
# model = model.to(device)

In [4]:
def choose_from_top(probs, n=5):
    ind = np.argpartition(probs, -n)[-n:]
    top_prob = probs[ind]
    top_prob = top_prob / np.sum(top_prob) # Normalize
    choice = np.random.choice(n, 1, p = top_prob)
    token_id = ind[choice][0]
    return int(token_id)

### Dataset Preparation

In [5]:
traces_file = 'traces.csv'
# nrows = 500000000 # number of traces to use (debug purposes)
# nrows = 64000 # number of traces to use (debug purposes)
batch_size = 16
print('number of traces: ', len(pd.read_csv(traces_file)))
# print('number of traces used: ', nrows)

# traces = pd.read_csv(traces_file, nrows=nrows)
traces = pd.read_csv(traces_file)
# Split into train and test
train_data, test_data = train_test_split(traces, test_size=0.25)

# get number of unique values in each column
print('number of unique values to predict: ', traces.nunique()[-1])

number of traces:  84006
number of unique values to predict:  332


  print('number of unique values to predict: ', traces.nunique()[-1])


In [6]:
def load_data(data, batch_size):
    label_encoder_pc = LabelEncoder()
    label_encoder_delta_in = LabelEncoder()
    label_encoder_delta_out = LabelEncoder()

    # Fit label encoder and transform labels into encoded values
    data['pc_encoded'] = label_encoder_pc.fit_transform(data['pc'])
    data['delta_in_encoded'] = label_encoder_delta_in.fit_transform(data['delta_in'])
    data['delta_out_encoded'] = label_encoder_delta_out.fit_transform(data['delta_out'])

    # Convert dataframes to tensors, explicitly specifying the dtype as torch.int64
    pc = torch.tensor(data['pc_encoded'].values, dtype=torch.int64)
    delta_in = torch.tensor(data['delta_in_encoded'].values, dtype=torch.int64)
    targets = torch.tensor(data['delta_out_encoded'].values, dtype=torch.int64)


    # Create a custom Dataset instance
    dataset = TensorDataset(pc, delta_in, targets)

    # Create a DataLoader instance
    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    # Get unique target keys
    target_keys = set(data['delta_out_encoded'].unique())

    return data_loader, len(label_encoder_pc.classes_), len(label_encoder_delta_in.classes_), len(label_encoder_delta_out.classes_), target_keys

train_iter, num_pc, num_delta_in, num_output_next, target_keys = load_data(train_data, batch_size=batch_size)
test_iter, _, _, _, _ = load_data(test_data, batch_size=batch_size)
# add 1 for the next delta which is not in the training set
num_pc += 1
num_delta_in += 1
num_output_next += 1

print('number of unique pc: ', num_pc)
print('number of unique input delta: ', num_delta_in)
print('number of unique output delta: ', num_output_next)


number of unique pc:  2522
number of unique input delta:  333
number of unique output delta:  333


In [7]:
# # Example of encoding a single data point
# def encode_data_point(pc, delta_in, delta_out):
#     # Convert to strings and concatenate
#     text = f"PC: {pc} Delta In: {delta_in} Delta Out: {delta_out}"
#     # Encode using GPT-2 tokenizer
#     return tokenizer(text, return_tensors='pt')

# # Example of converting a dataset row into GPT-2 compatible format
# encoded_inputs = [encode_data_point(row.pc, row.delta_in, row.delta_out) for index, row in traces.iterrows()]


### Model Definition

In [8]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2ForSequenceClassification.from_pretrained('gpt2', num_labels=num_output_next)
model = model.to(device)

hparams = {
    'learning_rate': 5e-5,
    'epochs': 3
}
# Assuming 'hparams' is your hyperparameters dictionary
learning_rate = hparams['learning_rate']
epochs = hparams['epochs']

# Initialize tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2ForSequenceClassification.from_pretrained('gpt2', num_labels=num_output_next).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
# Set the tokenizer's pad_token outside of the loop (it only needs to be done once)
tokenizer.pad_token = tokenizer.eos_token

# Update the model's configuration to recognize the same pad token
model.config.pad_token_id = tokenizer.pad_token_id

# Training loop
model.train()
for epoch in range(epochs):
    for batch in train_iter:
        # Assuming your DataLoader correctly prepares batches as (pc, delta_in, targets)
        pc, delta_in, targets = batch
        labels = targets.to(device)  # Ensure labels are on the correct device

        # Generate input strings for GPT-2 from pc and delta_in values
        # This needs to be done within the loop to access the current batch's pc and delta_in
        input_strings = ['PC: {} Delta In: {}'.format(pc_val.item(), delta_in_val.item()) for pc_val, delta_in_val in zip(pc, delta_in)]

        # Tokenize the input strings
        encoding = tokenizer(input_strings, return_tensors='pt', padding=True, truncation=True, max_length=512)
        input_ids = encoding['input_ids'].to(device)
        attention_mask = encoding['attention_mask'].to(device)

        # Perform the forward pass
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Backward and optimization step
        optimizer.zero_grad()  # Clear existing gradients before the backward pass
        loss.backward()
        optimizer.step()

        print(f"Epoch {epoch + 1}, Loss: {loss.item():.4f}")


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Loss: 12.7908
Epoch 1, Loss: 7.8899
Epoch 1, Loss: 6.3798
Epoch 1, Loss: 8.8537
Epoch 1, Loss: 6.4636
Epoch 1, Loss: 6.4163
Epoch 1, Loss: 5.9689
Epoch 1, Loss: 6.4807
Epoch 1, Loss: 6.4674
Epoch 1, Loss: 6.1032
Epoch 1, Loss: 6.3680
Epoch 1, Loss: 6.3356
Epoch 1, Loss: 5.9212
Epoch 1, Loss: 5.7903
Epoch 1, Loss: 5.8433
Epoch 1, Loss: 5.7020
Epoch 1, Loss: 5.8031
Epoch 1, Loss: 5.8103
Epoch 1, Loss: 5.7136
Epoch 1, Loss: 5.8417
Epoch 1, Loss: 5.7753
Epoch 1, Loss: 5.7251
Epoch 1, Loss: 5.7619
Epoch 1, Loss: 5.8904
Epoch 1, Loss: 5.7283
Epoch 1, Loss: 5.8922
Epoch 1, Loss: 5.8421
Epoch 1, Loss: 5.7931
Epoch 1, Loss: 5.7715
Epoch 1, Loss: 5.6719
Epoch 1, Loss: 5.5314
Epoch 1, Loss: 5.6237
Epoch 1, Loss: 5.5278
Epoch 1, Loss: 5.5892
Epoch 1, Loss: 5.5028
Epoch 1, Loss: 5.2933
Epoch 1, Loss: 5.5264
Epoch 1, Loss: 5.5338
Epoch 1, Loss: 5.6536
Epoch 1, Loss: 5.5827
Epoch 1, Loss: 5.3911
Epoch 1, Loss: 5.6104
Epoch 1, Loss: 5.5849
Epoch 1, Loss: 5.3281
Epoch 1, Loss: 5.3484
Epoch 1, 

In [10]:
# TEST THE MODEL 
model.eval()
with torch.no_grad():
    for batch in test_iter:
        pc, delta_in, targets = batch
        labels = targets.to(device)  # Ensure labels are on the correct device

        # Generate input strings for GPT-2 from pc and delta_in values
        # This needs to be done within the loop to access the current batch's pc and delta_in
        input_strings = ['PC: {} Delta In: {}'.format(pc_val.item(), delta_in_val.item()) for pc_val, delta_in_val in zip(pc, delta_in)]

        # Tokenize the input strings
        encoding = tokenizer(input_strings, return_tensors='pt', padding=True, truncation=True, max_length=512)
        input_ids = encoding['input_ids'].to(device)
        attention_mask = encoding['attention_mask'].to(device)

        # Perform the forward pass
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        print(f"Test Loss: {loss.item():.4f}")
        
        

Test Loss: 4.9306
Test Loss: 3.3971
Test Loss: 4.0117
Test Loss: 4.4569
Test Loss: 3.6482
Test Loss: 2.3442
Test Loss: 4.3625
Test Loss: 3.0843
Test Loss: 3.8996
Test Loss: 3.3795
Test Loss: 2.9709
Test Loss: 4.4205
Test Loss: 4.2160
Test Loss: 5.7027
Test Loss: 3.8210
Test Loss: 3.9698
Test Loss: 4.6323
Test Loss: 3.1061
Test Loss: 4.0040
Test Loss: 6.0756
Test Loss: 4.7151
Test Loss: 4.8114
Test Loss: 4.4800
Test Loss: 2.4101
Test Loss: 3.2037
Test Loss: 3.9259
Test Loss: 4.3476
Test Loss: 4.0881
Test Loss: 3.7233
Test Loss: 5.1928
Test Loss: 3.3934
Test Loss: 2.1118
Test Loss: 4.3446
Test Loss: 5.0191
Test Loss: 6.3239
Test Loss: 4.5446
Test Loss: 3.8497
Test Loss: 4.4731
Test Loss: 4.5084
Test Loss: 3.4092
Test Loss: 3.9579
Test Loss: 4.0837
Test Loss: 4.4954
Test Loss: 4.6913
Test Loss: 4.1737
Test Loss: 4.2995
Test Loss: 4.0568
Test Loss: 3.7394
Test Loss: 4.2005
Test Loss: 4.0187
Test Loss: 3.2974
Test Loss: 3.4395
Test Loss: 4.1829
Test Loss: 4.5672
Test Loss: 4.4242
Test Loss:

In [19]:
# Example of using the model to generate a sequence
pc = 140269433753112
delta_in = 1

labels = targets.to(device)  # Ensure labels are on the correct device

# Generate input strings for GPT-2 from pc and delta_in values
# This needs to be done within the loop to access the current batch's pc and delta_in
# input_strings = ['PC: {} Delta In: {}'.format(pc_val.item(), delta_in_val.item()) for pc_val, delta_in_val in zip(pc, delta_in)]
input_strings = ['PC: {} Delta In: {}'.format(pc, delta_in)]

# Tokenize the input strings
encoding = tokenizer(input_strings, return_tensors='pt', padding=True, truncation=True, max_length=512)
input_ids = encoding['input_ids'].to(device)
attention_mask = encoding['attention_mask'].to(device)

# Perform the forward pass
outputs = model(input_ids, attention_mask=attention_mask, labels=labels)


ValueError: Expected input batch_size (1) to match target batch_size (10).