In [1]:
import csv
import datetime
import pickle
import time
from collections import Counter

import numpy as np
import pandas as pd

from preprocess.Dataset import get_dataloader

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load every row of the 911 CSV into memory as a list of string lists.
# NOTE(review): quotechar='|' means double quotes are not treated as quoting;
# confirm that is intended for this file.
with open('./../structuredData/911.csv', newline='') as csvfile:
    row_reader = csv.reader(csvfile, delimiter=',', quotechar='|')
    data = list(row_reader)

In [3]:
# Number of rows read from the CSV (no row is skipped by the loader).
len(data)

663523

In [4]:
# Evaluated but not displayed, since it is not the cell's last expression.
data[0]
# Positional index of each field within a parsed CSV row.
LAT, LNG, DESC, ZIP, TITLE, TIME, TWP, ADDR = range(8)

In [20]:
# event type (K): integer codes for the three call categories
EMS, FIRE, TRAFFIC = range(3)

In [6]:
# Keep only ZIP fields that are exactly five characters long; this drops
# empty ZIPs and any other value of a different length.
zips = [d[ZIP] for d in data if len(d[ZIP]) == 5]

In [7]:
# Count how many rows carry each five-character ZIP value.
zip_counts = Counter(zips)

In [8]:
# The (at most) 75 most frequent ZIP values, most common first.
freq_zips = [zip_code for zip_code, _count in zip_counts.most_common(75)]

In [21]:
# Map each frequent ZIP to its rank in freq_zips; the rank is the vertex id.
vertex_mapping = {zip_code: idx for idx, zip_code in enumerate(freq_zips)}

In [10]:
def get_event_type(title):
    """Map a call title to its event-type code based on its prefix.

    Returns EMS, FIRE or TRAFFIC when the title starts with "EMS", "Fire" or
    "Traffic" respectively. Any other title is printed and None is returned.
    """
    if title.startswith("EMS"):
        return EMS
    if title.startswith("Fire"):
        return FIRE
    if title.startswith("Traffic"):
        return TRAFFIC
    # Unrecognised category: surface it in the cell output.
    print(title)
    return None

In [22]:
# Build [datetime, epoch seconds, event type, vertex] records for rows whose
# ZIP is one of the frequent ZIPs, then order them chronologically.
# NOTE(review): the parsed datetimes are naive, so .timestamp() interprets
# them in the machine's local timezone.
filtered_data = []
for d in data:
    if d[ZIP] not in vertex_mapping:
        continue
    element = datetime.datetime.strptime(d[TIME], "%Y-%m-%d %H:%M:%S")
    timestamp = element.timestamp()
    filtered_data.append(
        [element, timestamp, get_event_type(d[TITLE]), vertex_mapping[d[ZIP]]]
    )
filtered_data.sort(key=lambda record: record[1])

In [23]:
# Split the time-ordered events into one stream per calendar hour. Every event
# stores its offset from the first event of its stream and the gap to the
# previous event, both divided by `timescale`. Streams with fewer than two
# events are discarded.
unstructured_data = []
cur_stream = []
cur_time = None
start_time = None
timescale = 1e3
for t, ts, k, v in filtered_data:
    # A stream is keyed by (year, month, day, hour); drop the hour from both
    # tuples to bucket by day instead.
    same_stream = cur_time is not None and (
        (cur_time.year, cur_time.month, cur_time.day, cur_time.hour)
        == (t.year, t.month, t.day, t.hour)
    )
    if not same_stream:
        # Close the previous stream: rescale its times and keep it if it has
        # at least two events. This is a no-op before the first event.
        for event in cur_stream:
            event['time_since_start'] /= timescale
            event['time_since_last_event'] /= timescale
        if len(cur_stream) > 1:
            unstructured_data.append(cur_stream)
        cur_stream = []
        cur_time = t
        # Anchor the stream at its first event. This branch now also runs for
        # the very first event; before, start_time stayed None for the whole
        # first stream and a second event in that hour raised TypeError.
        start_time = datetime.datetime.timestamp(t)
    time_since_start = ts - start_time
    cur_stream.append({
        'time_since_start': time_since_start,
        # Offsets in cur_stream are still unscaled here, so the difference is
        # the raw gap to the previous event.
        'time_since_last_event': time_since_start - cur_stream[-1]['time_since_start'] if cur_stream else 0,
        'type_event': k
    })
# Flush the final stream, which the loop never closes.
for event in cur_stream:
    event['time_since_start'] /= timescale
    event['time_since_last_event'] /= timescale
if len(cur_stream) > 1:
    unstructured_data.append(cur_stream)

In [24]:
# Same hourly stream split as for the unstructured data, but every event also
# carries the vertex id of its ZIP. Times are offsets from the first event of
# the stream and gaps to the previous event, both divided by `timescale`.
# Streams with fewer than two events are discarded.
structured_data = []
cur_stream = []
cur_time = None
start_time = None
timescale = 1e3
for t, ts, k, v in filtered_data:
    # A stream is keyed by (year, month, day, hour); drop the hour from both
    # tuples to bucket by day instead.
    same_stream = cur_time is not None and (
        (cur_time.year, cur_time.month, cur_time.day, cur_time.hour)
        == (t.year, t.month, t.day, t.hour)
    )
    if not same_stream:
        # Close the previous stream: rescale its times and keep it if it has
        # at least two events. This is a no-op before the first event.
        for event in cur_stream:
            event['time_since_start'] /= timescale
            event['time_since_last_event'] /= timescale
        if len(cur_stream) > 1:
            structured_data.append(cur_stream)
        cur_stream = []
        cur_time = t
        # Anchor the stream at its first event. This branch now also runs for
        # the very first event; before, start_time stayed None for the whole
        # first stream and a second event in that hour raised TypeError.
        start_time = datetime.datetime.timestamp(t)
    time_since_start = ts - start_time
    cur_stream.append({
        'time_since_start': time_since_start,
        # Offsets in cur_stream are still unscaled here, so the difference is
        # the raw gap to the previous event.
        'time_since_last_event': time_since_start - cur_stream[-1]['time_since_start'] if cur_stream else 0,
        'type_event': k,
        'vertex': v,
    })
# Flush the final stream, which the loop never closes.
for event in cur_stream:
    event['time_since_start'] /= timescale
    event['time_since_last_event'] /= timescale
if len(cur_stream) > 1:
    structured_data.append(cur_stream)

In [25]:
# Number of hourly streams kept (only streams with at least two events are stored).
len(unstructured_data)

40167

In [26]:
# Spot-check one stream: times are already divided by `timescale`.
unstructured_data[10]

[{'time_since_start': 0.0, 'time_since_last_event': 0.0, 'type_event': 0},
 {'time_since_start': 0.589, 'time_since_last_event': 0.589, 'type_event': 0},
 {'time_since_start': 1.553, 'time_since_last_event': 0.964, 'type_event': 1},
 {'time_since_start': 2.058, 'time_since_last_event': 0.505, 'type_event': 2}]

In [27]:
# Spot-check the same stream index in the variant that also records 'vertex'.
structured_data[10]

[{'time_since_start': 0.0,
  'time_since_last_event': 0.0,
  'type_event': 0,
  'vertex': 0},
 {'time_since_start': 0.589,
  'time_since_last_event': 0.589,
  'type_event': 0,
  'vertex': 10},
 {'time_since_start': 1.553,
  'time_since_last_event': 0.964,
  'type_event': 1,
  'vertex': 4},
 {'time_since_start': 2.058,
  'time_since_last_event': 0.505,
  'type_event': 2,
  'vertex': 24}]

In [18]:
# One row per stream, one column per event position.
# NOTE(review): streams have different lengths, so shorter rows are presumably
# padded with missing values in the trailing columns — confirm.
df = pd.DataFrame(unstructured_data)

In [22]:
# 60/20/20 split
train, test, dev = np.split(df.sample(frac=1), [int(.6*len(df)), int(.8*len(df))])

In [36]:
# Convert each split from a DataFrame back to a plain list of rows.
train_list, test_list, dev_list = (
    frame.values.tolist() for frame in (train, test, dev)
)

In [37]:
# Remove the None entries from every row so each stream contains only its
# actual events again.
trimmed_train = [
    [event for event in stream if event is not None] for stream in train_list
]
trimmed_test = [
    [event for event in stream if event is not None] for stream in test_list
]
trimmed_dev = [
    [event for event in stream if event is not None] for stream in dev_list
]

In [15]:
# Persist the full list of unstructured streams with the newest pickle protocol.
with open('./../structuredData/911.pkl', 'wb') as handle:
    pickle.dump(unstructured_data, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [142]:
# Read the pickled streams back as a sanity check.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('./../structuredData/911.pkl', 'rb') as handle:
    b = pickle.load(handle)

In [143]:
# Number of top-level items in the object just loaded into `b`.
len(b)

40519

In [15]:
# Build a loader over the first 100 streams with shuffling enabled.
# NOTE(review): the positional 4 is presumably the batch size — confirm
# against get_dataloader's signature.
trainloader = get_dataloader(unstructured_data[:100], 4, shuffle=True)

In [33]:
! locale

LANG="en_US.UTF-8"
LC_COLLATE="en_US.UTF-8"
LC_CTYPE="en_US.UTF-8"
LC_MESSAGES="en_US.UTF-8"
LC_MONETARY="en_US.UTF-8"
LC_NUMERIC="en_US.UTF-8"
LC_TIME="en_US.UTF-8"
LC_ALL=


In [34]:
# Show only the first item yielded by the loader, preceded by its index.
for i, e in enumerate(trainloader):
    print(i, e, sep='\n')
    break

NameError: name 'trainloader' is not defined

In [28]:
def _split_payload(**splits):
    """Return the shared payload dict with the given split lists filled in.

    Every payload has the same keys in the same order; only the split named
    in `splits` ("train", "test" or "dev") is replaced, the others stay empty.
    """
    payload = {
        "dim_process": 3,
        "num_vertices": 75,
        "devtest": [],
        "args": None,
        "dev": [],
        "train": [],
        "test": [],
    }
    payload.update(splits)
    return payload


# Small fixed slices of the structured streams: 100 train, 50 test, 50 dev.
train = _split_payload(train=structured_data[:100])
test = _split_payload(test=structured_data[100:150])
dev = _split_payload(dev=structured_data[150:200])

In [29]:
# Write each split payload to its own pickle file, in train/test/dev order.
for target_path, payload in (
    ('./../structuredData/train.pkl', train),
    ('./../structuredData/test.pkl', test),
    ('./../structuredData/dev.pkl', dev),
):
    with open(target_path, 'wb') as handle:
        pickle.dump(payload, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [103]:
# Load a train.pkl from the NeuralHawkesData directory for comparison; this
# overwrites `b`. encoding='latin-1' is passed to pickle when decoding.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('./../NeuralHawkesData/data_mimic/fold1/train.pkl', 'rb') as handle:
    b = pickle.load(handle, encoding='latin-1')

In [3]:
# Load ./data/train.pkl into `b`, replacing whatever `b` held before.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('./data/train.pkl', 'rb') as handle:
    b = pickle.load(handle, encoding='latin-1')

In [21]:
import numpy as np
from bisect import bisect_left  

In [17]:
# Number of equal-width time bins, and the small margin added past the latest
# event time when building the bin edges.
NUM_BINS, epsilon = 1000, 1e-4

In [24]:
# Bin every training event by time and vertex into two (NUM_BINS, num_types)
# grids: S_cumulative counts events per cell, S_onehot marks cells that have
# at least one event.
flattened = [event for stream in b['train'] for event in stream]
# Running maxima seeded with 0, so an empty input still yields one type.
latest_t = max([0] + [entry['time_since_start'] for entry in flattened])
num_types = max([0] + [entry['vertex'] for entry in flattened]) + 1

# NUM_BINS + 1 edges; epsilon keeps the latest event strictly below the last edge.
bins = np.linspace(0, latest_t+epsilon, NUM_BINS+1)
S_cumulative = np.zeros((NUM_BINS, num_types))
S_onehot = np.zeros((NUM_BINS, num_types))

for entry in flattened:
    # bisect_left returns 0 for a time equal to bins[0] (every stream-start
    # event has time 0), which used to become index -1 and land in the LAST
    # bin. Clamp so those events fall in the first bin instead.
    i = max(bisect_left(bins, entry['time_since_start']) - 1, 0)
    S_cumulative[i, entry['vertex']] += 1
    S_onehot[i, entry['vertex']] = 1

In [28]:
# Write both binned grids to .npy files in the current working directory.
np.save('S_cumulative.npy', S_cumulative)
np.save('S_onehot.npy', S_onehot)

In [32]:
# Load two arrays produced outside this notebook.
# NOTE(review): what A and W contain, and their shapes, is not shown here — document.
A = np.load('A.npy')
W = np.load('W.npy')

In [36]:
# Matrix product of A and W (requires the two arrays to have compatible shapes).
weighted_A = A @ W

In [37]:
# Save the product to weighted_A.npy in the current working directory.
np.save('weighted_A.npy', weighted_A)

In [115]:
# Per-stream summary of the loaded training split: number of events, and the
# gap recorded on each stream's final event.
stream_lengths = list(map(len, b['train']))
last_times = [stream[-1]["time_since_last_event"] for stream in b['train']]

In [117]:
# Shortest training stream, in number of events.
min(stream_lengths)

2

In [56]:
# Inspect three [datetime, epoch seconds, event type, vertex] records by position.
filtered_data[100], filtered_data[431], filtered_data[432]

([datetime.datetime(2015, 12, 11, 0, 1, 29), 1449810089.0, 1, 30],
 [datetime.datetime(2015, 12, 11, 23, 58, 50), 1449896330.0, 1, 9],
 [datetime.datetime(2015, 12, 12, 0, 15, 12), 1449897312.0, 1, 39])

In [70]:
# Difference between the two datetimes; negative when record 100 is the earlier one.
filtered_data[100][0] - filtered_data[431][0]

datetime.timedelta(days=-1, seconds=159)

In [26]:
# Inspect one raw CSV row (all fields are strings).
data[2]

['40.2580614',
 '-75.2646799',
 'BRIAR PATH & WHITEMARSH LN;  HATFIELD TOWNSHIP; Station 345; 2015-12-10 @ 17:29:21;',
 '19446',
 'EMS: DIABETIC EMERGENCY',
 '2015-12-10 17:29:21',
 'HATFIELD TOWNSHIP',
 'BRIAR PATH & WHITEMARSH LN',
 '1']

In [17]:
# Parse one timestamp string and print it as epoch seconds. The datetime is
# naive, so the printed value depends on the machine's local timezone.
string = "2015-12-10 17:10:52"
element = datetime.datetime.strptime(string, "%Y-%m-%d %H:%M:%S")
timestamp = element.timestamp()
print(timestamp)

1449785452.0


In [27]:
# Parse a second timestamp string and print it as epoch seconds. The datetime
# is naive, so the printed value depends on the machine's local timezone.
string = "2015-12-10 17:29:21"
element = datetime.datetime.strptime(string, "%Y-%m-%d %H:%M:%S")
timestamp = element.timestamp()
print(timestamp)

1449786561.0


In [28]:
# Gap in seconds between the two epoch values printed by the preceding cells.
1449786561.0 - 1449785452.0

1109.0

In [12]:
# Inspect the row at index 663522 (the last row when len(data) is 663523).
data[663522]

['40.0150463',
 '-75.2996738',
 'HAVERFORD STATION RD & W MONTGOMERY AVE; LOWER MERION; 2020-07-29 @ 15:52:46;',
 '19041',
 'Traffic: VEHICLE ACCIDENT -',
 '2020-07-29 15:52:46',
 'LOWER MERION',
 'HAVERFORD STATION RD & W MONTGOMERY AVE',
 '1']

In [None]:
"""
Example input data format

[
{
'time_since_start': 0.0,
'time_since_last_event': 0.0,
'type_event': 1
},
{
'time_since_start': 0.23076923,
'time_since_last_event': 0.23076923,
'type_event': 1
}, {
'time_since_start': 0.28846154,
'time_since_last_event': 0.057692304,
'type_event': 1
}, {
'time_since_start': 0.34615386,
'time_since_last_event': 0.05769232,
'type_event': 1
}
]
"""

In [54]:
! bash run.sh

Fatal Python error: config_get_locale_encoding: failed to get the locale encoding: nl_langinfo(CODESET) failed
Python runtime state: preinitialized

