In [None]:
import pandas as pd
import numpy as np
import os
import random
import torch
import torch.nn as nn
import csv
from itertools import islice
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW
from torch.utils.data import DataLoader, Dataset
import torch

In [None]:
# Read the page-access trace and group consecutive rows into fixed-size chunks.
# Each chunk is a list of dicts with the string fields "pc", "page_in" and
# "page_out" copied straight from the CSV columns of the same name.
data = []
buffer = []
numrows = 1000000
# numrows = 10000
chunk_size = 10       # number of consecutive trace rows grouped into one sample
train_fraction = 0.2  # share of chunks (taken from the start) used for training

# newline='' is what the csv module documents for files handed to a reader, so
# that newlines embedded in quoted fields are parsed correctly.
with open('../page.csv', 'r', newline='') as f:
    reader = csv.DictReader(f)
    # islice caps the number of rows read without loading the whole file.
    for row in islice(reader, numrows):
        buffer.append({
            "pc": row["pc"],
            "page_in": row["page_in"],
            "page_out": row["page_out"]
        })
        if len(buffer) == chunk_size:
            data.append(buffer)
            buffer = []
if buffer:  # keep the trailing partial chunk (fewer than chunk_size rows)
    data.append(buffer)

# Chronological split: the first `train_fraction` of chunks train the model,
# the remainder is held out.
# NOTE(review): with 0.2 only 20% of the chunks are used for training and 80%
# for validation -- confirm this is intended and not an inverted 80/20 split.
split_index = int(len(data) * train_fraction)
train_data = data[:split_index]
validation_data = data[split_index:]

In [None]:
class MemoryAccessDataset(Dataset):
    """Tokenized memory-access chunks for language-model fine-tuning.

    Every chunk becomes one sample: the "PC: <pc> Page: <page_in>" strings of
    all traces in the chunk are joined with spaces and tokenized as the input,
    and the chunk's `page_out` values are joined with spaces and tokenized as
    the labels. Both are padded/truncated to 512 tokens.
    """

    def __init__(self, tokenizer, data):
        """
        Initializes the MemoryAccessDataset.

        Args:
        - tokenizer: The tokenizer used to encode the sequences. It is called as
                     tokenizer(text, max_length=..., padding=..., truncation=...,
                     return_tensors="pt") and must return an object exposing
                     `input_ids` and `attention_mask` tensors.
        - data: The data to be processed, expected to be a list of chunks, where each chunk
                contains multiple traces of memory access data (dicts with the keys
                'pc', 'page_in' and 'page_out').
        """
        self.input_ids = []
        self.attention_masks = []
        self.labels = []
        for chunk in data:
            # For each trace in the chunk, create the input text and its label
            sequences = [f"PC: {trace['pc']} Page: {trace['page_in']}" for trace in chunk]
            labels = [trace['page_out'] for trace in chunk]

            # Concatenate all sequences in the chunk
            full_sequence = " ".join(sequences)
            encoding = tokenizer(full_sequence, max_length=512, padding='max_length', truncation=True, return_tensors="pt")
            label_encoding = tokenizer(" ".join(labels), max_length=512, padding='max_length', truncation=True, return_tensors="pt")

            # Padding positions in the labels are set to -100, the index the
            # language-model loss ignores; otherwise most of the loss would come
            # from predicting pad tokens. The label attention mask (rather than
            # the pad token id) is used so real tokens are never masked.
            label_ids = label_encoding.input_ids.squeeze(0).clone()
            label_ids[label_encoding.attention_mask.squeeze(0) == 0] = -100

            # Drop only the leading batch axis added by return_tensors="pt"
            self.input_ids.append(encoding.input_ids.squeeze(0))
            self.attention_masks.append(encoding.attention_mask.squeeze(0))
            self.labels.append(label_ids)

    def __len__(self):
        """Returns the number of chunks (samples) in the dataset."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """
        Retrieves the input_ids, attention_mask, and labels for a given index.

        Returns:
        A tuple containing:
        - input_ids: The tokenized input sequence.
        - attention_mask: The attention mask for the input sequence.
        - labels: The tokenized labels, with padding positions set to -100.
        """
        return self.input_ids[idx], self.attention_masks[idx], self.labels[idx]


In [None]:
# Seed every RNG in play so that shuffling, weight initialisation of the resized
# embedding rows and dropout are repeatable across "Restart & Run All".
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# GPT-2 ships without a padding token; add one and grow the embedding matrix
# so the new token id has a row.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.resize_token_embeddings(len(tokenizer))

# Prepare datasets and dataloaders
train_dataset = MemoryAccessDataset(tokenizer, train_data)
validation_dataset = MemoryAccessDataset(tokenizer, validation_data)

train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
validation_loader = DataLoader(validation_dataset, batch_size=2)

# Move the model to the GPU when one is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# transformers.AdamW is deprecated in favour of torch.optim.AdamW. eps and
# weight_decay are pinned to the values the transformers implementation used
# by default (1e-6 and 0.0) so the optimisation settings stay the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

In [None]:
epochs = 5
loss_values = []

# Fine-tune for a fixed number of epochs, recording the mean batch loss of each.
model.train()
for epoch in range(epochs):
    total_loss = 0
    for batch in train_loader:
        # Each batch is (input_ids, attention_mask, labels); move all to the device.
        inputs, masks, labels = (tensor.to(device) for tensor in batch)

        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        # Accumulate the scalar loss of this batch for the epoch average.
        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_loader)
    loss_values.append(avg_train_loss)
    print(f"Epoch {epoch + 1}, Average training loss: {avg_train_loss:.2f}")


In [None]:
def predict_next_page_address(model, tokenizer, sequence):
    """
    Predicts the next page address given a sequence of program counters and page addresses.

    Args:
    - model: A causal language model exposing `eval()`, `device` and `generate(...)`.
    - tokenizer: The tokenizer matching the model; must provide `encode`, `decode`
                 and `eos_token_id`.
    - sequence: The prompt text, e.g. "PC: <pc> Page: <page> PC: <pc> Page: <page> ...".

    Returns:
    The last whitespace-separated word of the decoded text (prompt plus up to 20
    generated tokens), or an empty string when the decoded text contains no words.
    """
    model.eval()
    # Place the prompt on the model's own device instead of relying on a
    # notebook-level `device` global being defined and in sync with the model.
    inputs = tokenizer.encode(sequence, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            inputs,
            # A single unpadded prompt: every position is a real token. Passing
            # the mask explicitly avoids generate() having to guess it.
            attention_mask=torch.ones_like(inputs),
            # Same budget as max_length = prompt length + 20, stated directly.
            max_new_tokens=20,
            num_return_sequences=1,
            pad_token_id=tokenizer.eos_token_id,
        )
    predicted_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Assume the last word is the next page; guard against empty output so an
    # all-whitespace decode does not raise IndexError.
    words = predicted_text.split()
    return words[-1] if words else ""

In [None]:
def test_accuracy(model, tokenizer, data):
    correct_predictions = 0
    total_predictions = 0
    # Iterate through each chunk in the validation data
    for chunk in data:
        # Create the input sequence by concatenating program counters and page addresses, except the last one
        sequence = ' '.join([f"PC: {trace['pc']} Page: {trace['page_in']}" for trace in chunk[:-1]])
        actual_next_page = chunk[-1]['page_out']
        predicted_page_address = predict_next_page_address(model, tokenizer, sequence)
        
        if predicted_page_address.strip() == actual_next_page.strip():
            correct_predictions += 1
        total_predictions += 1
        print(f"Predicted: {predicted_page_address}, Actual: {actual_next_page}")
    
    accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
    return accuracy

# Score the fine-tuned model on the held-out chunks and report it as a percentage.
accuracy = test_accuracy(model=model, tokenizer=tokenizer, data=validation_data)
print(f"Accuracy: {accuracy * 100:.2f}%")


In [None]:
# Re-display the `accuracy` value already held in the kernel as a percentage,
# without re-running the evaluation.
print(f"Accuracy: {accuracy * 100:.2f}%")

In [None]:
# # Save the model
# model_path = "/home/yisheng/data/gpt2_trained_model"
# model.save_pretrained(model_path)

# # Save the tokenizer
# tokenizer_path = "/home/yisheng/data/gpt2_trained_tokenizer"
# tokenizer.save_pretrained(tokenizer_path)

In [None]:
# from transformers import GPT2Tokenizer, GPT2LMHeadModel

# # Load the tokenizer
# tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_path)

# # Load the model
# model = GPT2LMHeadModel.from_pretrained(model_path)

# # Ensure the model is in evaluation mode
# model.eval()

# # Example usage
# pc = "140203333008656"  # Example PC value
# page_address = "-77547"  # Example current page address
# sequence = f"PC: {pc} Page: {page_address}"  # same prompt format used for training
# predicted_page_address = predict_next_page_address(model, tokenizer, sequence)
# print(f"Predicted Next Page Address: {predicted_page_address}")