- Batch the dataframe by a few thousand and write tensors to a directory
- Use that directory to train the model, Pytorch dataset can data file names as input

In [110]:
import pandas as pd
import ast
import h3
import pickle
import numpy as np
import torch
import math
import random
import time
import multiprocessing as mp
import itertools

In [None]:
window_size = 5
negative_samples = 3

In [111]:
data = pd.read_csv("../data/original_data.csv")

In [112]:
#data = data.sample(300000, random_state=123).reset_index(drop=True)
data = data.reset_index(drop=True)

In [113]:
data = data.drop(columns=data.columns[1:-1])
data.head()

Unnamed: 0,TRIP_ID,POLYLINE
0,1372636858620000589,"[[-8.618643,41.141412],[-8.618499,41.141376],[..."
1,1372637303620000596,"[[-8.639847,41.159826],[-8.640351,41.159871],[..."
2,1372636951620000320,"[[-8.612964,41.140359],[-8.613378,41.14035],[-..."
3,1372636854620000520,"[[-8.574678,41.151951],[-8.574705,41.151942],[..."
4,1372637091620000337,"[[-8.645994,41.18049],[-8.645949,41.180517],[-..."


https://www.kaggle.com/crailtap/taxi-trajectory

## Functions

In [114]:
def remove_repeats(vals):
    if len(vals) == 0:
        return None
    result = []
    curr = vals[0]
    for val in vals[1:]:
        if val != curr:
            result.append(curr)
        curr = val
    result.append(curr)
    return result

In [115]:
def latlon_to_h3(latlons, res=7):
    latlons = ast.literal_eval(latlons)
    result = []
    for latlon in latlons:
        h3_id = h3.geo_to_h3(latlon[0], latlon[1], res)
        result.append(h3_id)
    result = remove_repeats(result)
    return result

## Feature Engineering

### Convert to H3 Ids

In [116]:
data["H3_POLYLINE"] = data["POLYLINE"].apply(latlon_to_h3)

In [117]:
data = data.dropna(axis=0, subset=["H3_POLYLINE"])

In [118]:
data["len_h3"] = data["H3_POLYLINE"].apply(len)
data = data[data["len_h3"] > 1]
print(data["len_h3"].sum())
data = data.drop(columns=["len_h3"])

6722453


In [119]:
data.head()

Unnamed: 0,TRIP_ID,POLYLINE,H3_POLYLINE
0,1372636858620000589,"[[-8.618643,41.141412],[-8.618499,41.141376],[...","[877b63adbffffff, 877b63ad9ffffff, 877b63adbff..."
1,1372637303620000596,"[[-8.639847,41.159826],[-8.640351,41.159871],[...","[877b63ad8ffffff, 877b63addffffff, 877b63adcff..."
2,1372636951620000320,"[[-8.612964,41.140359],[-8.613378,41.14035],[-...","[877b63adbffffff, 877b63ad9ffffff, 877b63adbff..."
3,1372636854620000520,"[[-8.574678,41.151951],[-8.574705,41.151942],[...","[877b63370ffffff, 877b63adbffffff, 877b63370ff..."
4,1372637091620000337,"[[-8.645994,41.18049],[-8.645949,41.180517],[-...","[877b63adeffffff, 877b63adcffffff, 877b63ac3ff..."


In [120]:
data.to_csv("../data/data_with_h3.csv", index=False)

### Tokenizing H3 IDs

In [121]:
unq_h3_ids = data.explode("H3_POLYLINE").H3_POLYLINE.unique()

In [122]:
h3_to_token = {}
token_to_h3 = {}
for i in range(len(unq_h3_ids)):
    h3_to_token[unq_h3_ids[i]] = i
    token_to_h3[i] = unq_h3_ids[i]

In [123]:
with open("../models/tokenizers/encode_h3_to_token.pickle", "wb") as f:
    pickle.dump(h3_to_token, f)
with open("../models/tokenizers/decode_token_to_h3.pickle", "wb") as f:
    pickle.dump(token_to_h3, f)

In [124]:
def tokenize(vals, val_to_token_dict):
    result = []
    for val in vals:
        result.append(val_to_token_dict[val])
    return result

In [125]:
def decode(tokens, token_to_val_dict):
    result = []
    for token in tokens:
        result.append(token_to_val_dict[token])
    return result

In [126]:
data["h3_tokens"] = data["H3_POLYLINE"].apply(lambda x: tokenize(x, h3_to_token))

In [127]:
token_vocab = data["h3_tokens"].explode().unique().tolist()

In [128]:
data.head()

Unnamed: 0,TRIP_ID,POLYLINE,H3_POLYLINE,h3_tokens
0,1372636858620000589,"[[-8.618643,41.141412],[-8.618499,41.141376],[...","[877b63adbffffff, 877b63ad9ffffff, 877b63adbff...","[0, 1, 0, 2, 0]"
1,1372637303620000596,"[[-8.639847,41.159826],[-8.640351,41.159871],[...","[877b63ad8ffffff, 877b63addffffff, 877b63adcff...","[2, 3, 4, 5, 4]"
2,1372636951620000320,"[[-8.612964,41.140359],[-8.613378,41.14035],[-...","[877b63adbffffff, 877b63ad9ffffff, 877b63adbff...","[0, 1, 0, 2, 3, 6, 3, 1, 0]"
3,1372636854620000520,"[[-8.574678,41.151951],[-8.574705,41.151942],[...","[877b63370ffffff, 877b63adbffffff, 877b63370ff...","[7, 0, 7, 6]"
4,1372637091620000337,"[[-8.645994,41.18049],[-8.645949,41.180517],[-...","[877b63adeffffff, 877b63adcffffff, 877b63ac3ff...","[8, 4, 5, 9, 10]"


### Skipgram features

In [129]:
test_inp = list(range(1, 6))
window_size = 2

#### Positive pairs

In [130]:
def get_positive_pairs(seq, window_size=3):
    pairs = []
    for i in range(len(seq)):
        for j in reversed(range(1, window_size+1)):
            new_idx = i - j
            if new_idx >= 0:
                pairs.append([seq[i], seq[new_idx], 1])
        for k in range(1, window_size+1):
            new_idx = i + k
            if new_idx < len(seq):
                pairs.append([seq[i], seq[new_idx], 1])
    return pairs

In [131]:
start_time = time.time()
temp_pos_pairs = get_positive_pairs(test_inp)
print(time.time() - start_time)
temp_pos_pairs[0]

7.343292236328125e-05


[1, 2, 1]

#### Get negative pairs

In [132]:
def get_training_sample(positive_pair, vocab, num_pairs=2):
    try:
        target = positive_pair[0]
    except:
        print(positive_pair)
    contexts = [positive_pair[1]]
    labels = [1]
    while True:
        neg_context = np.random.choice(vocab, size=1)[0]
        if neg_context not in positive_pair:
            contexts.append(neg_context)
            labels.append(0)
        if len(contexts) == num_pairs + 1:
            break
            
    c_l = list(zip(contexts, labels))
    random.shuffle(c_l)
    contexts, labels = zip(*c_l)

    return target, *contexts, *labels

In [133]:
temp_pos = temp_pos_pairs[0]

In [134]:
start_time = time.time()
temp_neg_pairs = get_training_sample(temp_pos, token_vocab, 2)
print(time.time() - start_time)
temp_neg_pairs

0.000843048095703125


(1, 463, 828, 2, 0, 0, 1)

### Creating training set

In [135]:
def tokens_to_skipgram_data_mp(token_pairs, token_vocab, num_neg_sample):
    pool = mp.Pool(processes=6)
    result = pool.starmap_async(get_training_sample, zip(token_pairs, itertools.repeat(token_vocab), itertools.repeat(num_neg_sample)))
    result = np.array(result.get())
    targets = np.expand_dims(result[:, 0], 1)
    contexts = result[:, 1:num_neg_sample+2]
    labels = result[:, num_neg_sample+2:]
    return targets, contexts, labels

In [136]:
def get_batch_indices(data_len, batch_sz):
    batches = []
    num_batches = math.ceil(data_len / batch_sz)
    for i in range(num_batches):
        batches.append((i*batch_sz, (i+1)*batch_sz))
    return batches

In [137]:
batch_idx = get_batch_indices(len(data), 150000)
runtimes = []

In [138]:
part_num=1
for idx in batch_idx:
    start = time.time()
    data_batch = data.loc[idx[0]:idx[1], "h3_tokens"].copy()
    
    data_batch = data_batch.apply(lambda x: get_positive_pairs(x, 2))
    data_batch = data_batch.explode()
    
    train_targets, train_contexts, train_labels = tokens_to_skipgram_data_mp(data_batch, token_vocab, 2)
    
    train_targets_tensor = torch.tensor(train_targets)
    train_contexts_tensor = torch.tensor(train_contexts)
    train_labels_tensor = torch.tensor(train_labels)
    
    torch.save(train_targets_tensor, f"../data/loc2vec_train_pt_tensors/targets/train_targets_part{part_num}.pt")
    torch.save(train_contexts_tensor, f"../data/loc2vec_train_pt_tensors/contexts/train_contexts_part{part_num}.pt")
    torch.save(train_labels_tensor, f"../data/loc2vec_train_pt_tensors/labels/train_labels_part{part_num}.pt")
    
    part_num+=1
    rt = time.time() - start
    runtimes.append(rt)

In [139]:
rt_df = pd.DataFrame({"runtimes":runtimes})
rt_df.to_csv("data_gen_runtimes.csv", index=False)