In [1]:
import pickle as pkl
import numpy as np
import matplotlib.pyplot as plt
import torch

In [2]:
# Load the three earthquake dataset splits from their pickle files.
# Each file is opened in a `with` block so the handle is closed as soon as the
# payload has been read, instead of being left open for the life of the kernel.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
with open('data/earthquake/dev.pkl', 'rb') as pkl_file:
    dev_dict = pkl.load(pkl_file)
with open('data/earthquake/test.pkl', 'rb') as pkl_file:
    test_dict = pkl.load(pkl_file)
with open('data/earthquake/train.pkl', 'rb') as pkl_file:
    train_dict = pkl.load(pkl_file)

In [3]:
# Inspect the raw dev split. This prints the whole dictionary (every sequence),
# so the output is very long.
print('dev_dict:', dev_dict)

dev_dict: {'dim_process': 7, 'dev': [[{'time_since_last_event': 0.0, 'time_since_start': 0.0, 'type_event': 3}, {'time_since_last_event': 0.5486190000083297, 'time_since_start': 0.5486190000083297, 'type_event': 1}, {'time_since_last_event': 0.6663428999891039, 'time_since_start': 1.2149618999974336, 'type_event': 0}, {'time_since_last_event': 2.7089128000079654, 'time_since_start': 3.923874700005399, 'type_event': 3}, {'time_since_last_event': 0.8426292999938596, 'time_since_start': 4.766503999999259, 'type_event': 2}, {'time_since_last_event': 0.018509999994421378, 'time_since_start': 4.78501399999368, 'type_event': 2}, {'time_since_last_event': 0.7265199999965262, 'time_since_start': 5.511533999990206, 'type_event': 0}, {'time_since_last_event': 0.18911279999883845, 'time_since_start': 5.700646799989045, 'type_event': 1}, {'time_since_last_event': 0.05716920000850223, 'time_since_start': 5.757815999997547, 'type_event': 1}, {'time_since_last_event': 0.4891032000014093, 'time_since_s

In [4]:
import random

from easy_tpp.preprocess.event_tokenizer import EventTokenizer
from easy_tpp.config_factory import DataSpecConfig

def make_raw_data():
    """Build three small synthetic event sequences for the padding demo.

    Every sequence starts with a single event at time 0 and is then extended
    with 2, 5 and 3 randomly generated events respectively, giving sequences
    of length 3, 6 and 4.

    Returns:
        list[list[dict]]: one list per sequence; each event is a dict with the
        keys "time_since_last_event", "time_since_start" and "type_event".
    """
    # Valid event-type ids are 0..9. The tokenizer config used with this data
    # sets num_event_types=10 and pad_token_id=10, so 10 must never be drawn
    # for a real event.
    num_event_types = 10

    data = [
        [{"time_since_last_event": 0, "time_since_start": 0, "type_event": 0}],
        [{"time_since_last_event": 0, "time_since_start": 0, "type_event": 1}],
        [{"time_since_last_event": 0, "time_since_start": 0, "type_event": 1}],
    ]
    for seq, num_extra in zip(data, (2, 5, 3)):
        elapsed = 0
        for _ in range(num_extra):
            delta_t = random.random()
            elapsed += delta_t
            seq.append({"time_since_last_event": delta_t,
                        "time_since_start": elapsed,
                        # randrange excludes its upper bound, unlike randint
                        "type_event": random.randrange(num_event_types)
                        })

    return data


def main():
    """Pad the synthetic sequences with EventTokenizer and print the result."""
    sequences = make_raw_data()

    # Tokenizer input key -> per-event field it is collected from.
    field_by_key = {
        'time_seqs': "time_since_start",
        'type_seqs': "type_event",
        'time_delta_seqs': "time_since_last_event",
    }
    input_data = {
        key: [[event[field] for event in seq] for seq in sequences]
        for key, field in field_by_key.items()
    }

    spec = {'num_event_types': 10,  'pad_token_id': 10}
    tokenizer = EventTokenizer(DataSpecConfig.parse_from_yaml_config(spec))

    # Ask for PyTorch tensors back and show the padded batch.
    print(tokenizer.pad(input_data, return_tensors='pt'))


# Run the padding demo when this code is executed directly.
if __name__ == '__main__':
    main()

{'time_seqs': tensor([[ 0.0000,  0.3452,  1.1554, 10.0000, 10.0000, 10.0000],
        [ 0.0000,  0.4125,  1.0235,  1.8393,  2.4094,  2.6727],
        [ 0.0000,  0.8694,  0.8897,  1.4690, 10.0000, 10.0000]]), 'time_delta_seqs': tensor([[ 0.0000,  0.3452,  0.8102, 10.0000, 10.0000, 10.0000],
        [ 0.0000,  0.4125,  0.6109,  0.8159,  0.5700,  0.2633],
        [ 0.0000,  0.8694,  0.0203,  0.5794, 10.0000, 10.0000]]), 'type_seqs': tensor([[ 0,  5,  8, 10, 10, 10],
        [ 1,  5,  5,  2,  0,  3],
        [ 1,  1,  4,  1, 10, 10]]), 'seq_non_pad_mask': tensor([[ True,  True,  True, False, False, False],
        [ True,  True,  True,  True,  True,  True],
        [ True,  True,  True,  True, False, False]]), 'attention_mask': tensor([[[False,  True,  True,  True,  True,  True],
         [False, False,  True,  True,  True,  True],
         [False, False, False,  True,  True,  True],
         [False, False, False,  True,  True,  True],
         [False, False, False,  True,  True,  True],
 

In [5]:
# Display the raw training split to inspect its structure
# (top-level keys: 'dim_process' and 'train').
train_dict

{'dim_process': 7,
 'train': [[{'time_since_last_event': 0.0,
    'time_since_start': 0.0,
    'type_event': 0},
   {'time_since_last_event': 1.2051030000002356,
    'time_since_start': 1.2051030000002356,
    'type_event': 0},
   {'time_since_last_event': 0.033874000000650994,
    'time_since_start': 1.2389770000008866,
    'type_event': 1},
   {'time_since_last_event': 0.33377300000574905,
    'time_since_start': 1.5727500000066357,
    'type_event': 2},
   {'time_since_last_event': 0.10636799999338109,
    'time_since_start': 1.6791180000000168,
    'type_event': 0},
   {'time_since_last_event': 0.4333000000042375,
    'time_since_start': 2.1124180000042543,
    'type_event': 0},
   {'time_since_last_event': 1.4924579999933485,
    'time_since_start': 3.6048759999976028,
    'type_event': 2},
   {'time_since_last_event': 0.2127420000033453,
    'time_since_start': 3.817618000000948,
    'type_event': 0},
   {'time_since_last_event': 0.017842999994172715,
    'time_since_start': 3.83

In [None]:
# Number of event types, as stored in the dataset
dim_process = train_dict['dim_process']

# The training split: a list of sequences, each a list of event dicts
train_data = train_dict['train']

# Pull every per-event field out into its own list of lists, which is the
# layout handed to the tokenizer below
time_seqs, type_seqs, time_delta_seqs = (
    [[event[field] for event in seq] for seq in train_data]
    for field in ("time_since_start", "type_event", "time_since_last_event")
)

input_data = dict(time_seqs=time_seqs,
                  type_seqs=type_seqs,
                  time_delta_seqs=time_delta_seqs)

# The pad id is set to dim_process (presumably one past the largest real
# event-type id -- confirm that ids run 0..dim_process-1)
config = DataSpecConfig.parse_from_yaml_config({'num_event_types': dim_process,  'pad_token_id': dim_process})

tokenizer = EventTokenizer(config)

# return_tensors='pt' asks for the padded batch as PyTorch tensors
output = tokenizer.pad(input_data, return_tensors='pt')

print(output)

{'time_seqs': tensor([[0.0000e+00, 1.2051e+00, 1.2390e+00,  ..., 1.9818e+01, 1.9842e+01,
         7.0000e+00],
        [0.0000e+00, 5.6208e+00, 5.7464e+00,  ..., 2.2073e+01, 2.3913e+01,
         7.0000e+00],
        [0.0000e+00, 1.3862e-01, 9.6840e+00,  ..., 5.2158e+01, 5.2163e+01,
         7.0000e+00],
        ...,
        [0.0000e+00, 8.2689e-01, 2.0075e+00,  ..., 1.4963e+01, 1.5105e+01,
         7.0000e+00],
        [0.0000e+00, 4.0136e-02, 2.0609e-01,  ..., 7.1222e-01, 7.8175e-01,
         9.1837e-01],
        [0.0000e+00, 1.0268e-01, 5.8591e-01,  ..., 7.0000e+00, 7.0000e+00,
         7.0000e+00]]), 'time_delta_seqs': tensor([[0.0000e+00, 1.2051e+00, 3.3874e-02,  ..., 3.1534e-01, 2.4082e-02,
         7.0000e+00],
        [0.0000e+00, 5.6208e+00, 1.2562e-01,  ..., 6.4807e-01, 1.8403e+00,
         7.0000e+00],
        [0.0000e+00, 1.3862e-01, 9.5454e+00,  ..., 1.6073e+00, 5.4100e-03,
         7.0000e+00],
        ...,
        [0.0000e+00, 8.2689e-01, 1.1806e+00,  ..., 2.6363e-01, 1.4

In [None]:
from models.encoders.gru import GRUTPPEncoder
from models.decoders.rmtpp import RMTPPDecoder, RMTPPLoss

# Data Processing Pipeline
def prepare_data(raw_data, config):
    """Convert raw event sequences into a right-padded batch of PyTorch tensors.

    Args:
        raw_data: list of sequences, where each sequence is a list of event
            dicts with the keys "time_since_start", "type_event" and
            "time_since_last_event". It is iterated more than once, so it
            must be a re-iterable container (not a one-shot generator).
        config: DataSpecConfig object used to build the EventTokenizer.

    Returns:
        The batch returned by ``EventTokenizer.pad`` (requested as PyTorch
        tensors, without attention masks) with one extra entry,
        "sequence_length": an int64 tensor holding the unpadded length of
        each sequence.
    """
    # Initialize tokenizer with right padding
    tokenizer = EventTokenizer(config)
    tokenizer.padding_side = 'right'  # Ensure right padding

    # Split the per-event dicts into one list of lists per field
    input_data = {
        'time_seqs': [[x["time_since_start"] for x in seq] for seq in raw_data],
        'type_seqs': [[x["type_event"] for x in seq] for seq in raw_data],
        'time_delta_seqs': [[x["time_since_last_event"] for x in seq] for seq in raw_data]
    }

    # Record sequence lengths before padding. The dtype is pinned because
    # torch.tensor([]) would otherwise infer float32 for an empty batch.
    sequence_length = torch.tensor(
        [len(seq) for seq in input_data['type_seqs']], dtype=torch.long
    )

    # Process with tokenizer, return as PyTorch tensors, do not return attention masks
    batch = tokenizer.pad(input_data, return_tensors='pt', return_attention_mask=False)
    batch['sequence_length'] = sequence_length

    return batch


# Width of the encoder's hidden state
HIDDEN_DIM = 128

train_data = train_dict['train']
dim_process = train_dict['dim_process']

# The pad id is set to the same value as the number of event types
config = DataSpecConfig.parse_from_yaml_config(
    dict(num_event_types=dim_process, pad_token_id=dim_process)
)

# Tokenize and pad the whole training split once
processed_data = prepare_data(train_data, config)

# Build the GRU encoder and run it over the padded batch
encoder = GRUTPPEncoder(config, hidden_dim=HIDDEN_DIM)
output = encoder(processed_data)

In [8]:
# Unpadded length of each training sequence, as recorded by prepare_data.
processed_data["sequence_length"]

tensor([17, 17, 17,  ..., 17, 18, 15])

In [9]:
MLP_DIM = 64
# Use the GPU when one is available. This single value is shared by the
# decoder and the loss so the two can never disagree about the device.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Decoder and loss are built on the same device (the decoder was previously
# hard-coded to 'cpu' while the loss followed `device`).
decoder = RMTPPDecoder(hidden_dim=HIDDEN_DIM, num_event_types=dim_process, mlp_dim=MLP_DIM, device=device)
criterion = RMTPPLoss(device=device, ignore_index=config.pad_token_id)

# Forward pass. The hidden states are computed here rather than reused from an
# earlier cell, so this cell does not depend on a leftover `output` variable.
# NOTE(review): encoder and processed_data were created without an explicit
# device -- confirm whether they need moving to `device` when CUDA is available.
hidden_states = encoder(processed_data)
decoder_output = decoder(hidden_states)

# Compute loss
# NOTE(review): if RMTPPLoss is an nn.Module, prefer calling `criterion(...)`
# over `criterion.forward(...)` so module hooks run -- confirm.
time_loss, mark_loss, total_loss = criterion.forward(
    decoder_output,
    processed_data["time_delta_seqs"],
    processed_data["type_seqs"],
    processed_data["sequence_length"]
)