# Generative model to create paths

In [15]:
# Generative model to create paths
import random; random.seed(23)
from pathlib import Path
from torch.utils.data import Dataset, DataLoader
from linecache import getline

def blocks(files, size=65536):
    while True:
        b = files.read(size)
        if not b: break
        yield b

def count_lines(file_path):
    with open(file_path, "r",encoding="utf-8",errors='ignore') as f:
        return sum(bl.count("\n") for bl in blocks(f))


class PathDataset(Dataset):
    def __init__(self, file_path: Path):
        self.path_str = str(file_path)
        self.num_lines = count_lines(file_path)
        self.lines = [i for i in range(self.num_lines)]

    def __len__(self) -> int:
        return self.num_lines

    def __getitem__(self, key) -> list[str]:
        if isinstance(key, slice):  # List slicing
            indices = range(*key.indices(self.num_lines))
            for i in indices:
                yield self[i]
        else:  # Single index
            idx = self.lines[key]
            line = getline(self.path_str, idx+1)  # getline indexes at 1
            if len(line) > 0:
                start_pos = line.find(':') + 1
                line = line[start_pos:].strip()
                line = line.split(' ')
            return line
            
    def shuffle(self):
        random.shuffle(self.lines)
    
    def items(self):
        for i in range(self.num_lines):
            yield self[i]

dataset = PathDataset(Path('F:/data/prov_dp/tc3-theia/9155821.txt'))
print(len(dataset))
print(dataset[0])

446400



In [3]:
lengths = []
tokens = set()
for path in dataset.items():
    lengths.append(len(path))
    tokens.update(path)

In [4]:
itos = list(tokens)
itos[:0] = ['.']
stoi = {
    token: i for i, token in enumerate(itos)
}

In [11]:
import torch.nn.functional as F

block_size = 8 # Should always be odd. How can we encode the fact that nodes and edges must alternate?

def encode(path):
    padded_path = path[:block_size] + ['.'] * (D - len(block_size))
    return [stoi[t] for t in padded_path]

encoding = encode(dataset[1])
tensor = torch.tensor(encoding)

tensor.shape


torch.Size([17])

In [52]:
def build_dataset(path_dataset):
    X, Y = [], []

    for path in path_dataset:
        context = [0] * block_size
        for x in path + '.':
            i_x = stoi[x]
            X.append(context)
            context = context[1:] * [i_x] # Pop, add to back
    X = torch.tensor(X)
    Y = torch.tensor(Y)
    print(X.shape, Y.shape)
    return X, Y

n1 = int(0.8 * len(dataset))
n2 = int(0.9 * len(dataset))
X_train, Y_train = build_dataset()

['0', '0']

['PROCESS_LET', 'FILE_/home/admin/.mozilla/firefox/Crash_Reports/LastCrash', 'FILE_/home/admin/.pulse/263ceeca39f67879e8eda45300000001-default-sink', 'PROCESS_LET_/usr/bin/pulseaudio', 'PROCESS_LET_/home/admin/Downloads/firefox/crashreporter', 'PROCESS_LET_fluxbox', 'READ_Transfer_IP_Data', 'READ_Access_File', 'FILE', 'PROCESS_LET_/usr/bin/firefox', 'PROCESS_LET_/home/admin/Downloads/firefox/firefox', 'WRITE_Access_File', 'FILE_/home/admin/.pulse/263ceeca39f67879e8eda45300000001-default-source', 'FILE_EXEC_Access_File', 'PROC_CREATE_Start_Processlet', 'IP_CHANNEL']


['.',
 'PROCESS_LET',
 'FILE_/home/admin/.mozilla/firefox/Crash_Reports/LastCrash',
 'FILE_/home/admin/.pulse/263ceeca39f67879e8eda45300000001-default-sink',
 'PROCESS_LET_/usr/bin/pulseaudio',
 'PROCESS_LET_/home/admin/Downloads/firefox/crashreporter',
 'PROCESS_LET_fluxbox',
 'READ_Transfer_IP_Data',
 'READ_Access_File',
 'FILE',
 'PROCESS_LET_/usr/bin/firefox',
 'PROCESS_LET_/home/admin/Downloads/firefox/firefox',
 'WRITE_Access_File',
 'FILE_/home/admin/.pulse/263ceeca39f67879e8eda45300000001-default-source',
 'FILE_EXEC_Access_File',
 'PROC_CREATE_Start_Processlet',
 'IP_CHANNEL']