In [1]:
import pickle
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

from itertools import groupby
from joblib import Parallel, delayed
from tqdm import tqdm, tqdm_notebook

In [None]:
# pq_table = pq.read_table('data/web_seq.parquet')
# df = pq_table.to_pandas()

In [2]:
# Read the sample CSV into `df`, the working DataFrame that the following cells cast and transform.
df = pd.read_csv("data/sample.csv")

## Data preparation

In [3]:
# Cast each column to an explicit dtype.
# NOTE: astype(str) also stringifies missing values (NaN becomes the text "nan").
_STRING_COLUMNS = [
    "clientId",
    "visitId",
    "level_1",
    "level_2",
    "level_3",
    "map_location_3",
    "map_location_2",
    "map_location_1",
    "global_id",
    "legacy_id",
    "location_type",
    "display_name",
]
_INT_COLUMNS = ["visitNumber", "hitNumber"]

df["visitMYDateTime"] = pd.to_datetime(df["visitMYDateTime"], format='%Y-%m-%dT%H:%M:%S.%f')
for column in _STRING_COLUMNS:
    df[column] = df[column].astype(str)
for column in _INT_COLUMNS:
    df[column] = df[column].astype(int)

# Distinct client ids, in order of first appearance.
list_clientId = df["clientId"].unique().tolist()
df.head(2)

Unnamed: 0,clientId,visitMYDateTime,visitId,visitNumber,hitNumber,level_1,level_2,level_3,map_location_3,map_location_2,map_location_1,global_id,legacy_id,location_type,display_name
0,1500036865.1604922,2020-11-09 19:45:27,1604922327,1,1,kuala lumpur,kl city,,,KL City,Kuala Lumpur,mycty_51978,51,CITY,kl city
1,929830187.1604893,2020-11-09 11:43:19,1604893399,2,1,perak,gopeng,,,Gopeng,Perak,mycty_51778,466,CITY,gopeng


In [4]:
# Maps each known location_type value to the column holding its display name.
_DISPLAY_NAME_COLUMN_BY_TYPE = {
    "BUILDING_NAME": "bld_display_name",
    "STREET_NAME": "str_display_name",
    "COUNTRY": "country_display_name",
    "REGION": "region_display_name",
    "STATE": "state_display_name",
    "DISTRICT": "district_display_name",
    "DIVISION": "div_display_name",
    "CITY": "city_display_name",
    "POST_CODE": "postcode_display_name",
}


def display_name_mapping(x):
    """Pick the display name that corresponds to a row's location type.

    Args:
        x: Mapping-like row (for example a DataFrame row handed over by
            ``apply(..., axis=1)``) with a ``"location_type"`` entry and the
            ``*_display_name`` entry that belongs to that type.

    Returns:
        The value stored under the display-name column selected by
        ``x["location_type"]``, or ``None`` when the location type is not
        one of the known types.

    Raises:
        KeyError: If ``"location_type"`` or the selected display-name column
            is missing from ``x``.
    """
    source_column = _DISPLAY_NAME_COLUMN_BY_TYPE.get(x["location_type"])
    if source_column is None:
        return None
    return x[source_column]

In [5]:
# Location reference data: read the parquet file and derive the helper columns.
location_table = pq.read_table("data/my_locations_db.parquet")
location_db_df = location_table.to_pandas()

# One display name per row, chosen according to the row's location_type.
location_db_df["display_name"] = location_db_df.apply(display_name_mapping, axis=1)

# map_location_1/2/3 are plain copies of the state, city and building display names.
for target_column, source_column in {
    "map_location_1": "state_display_name",
    "map_location_2": "city_display_name",
    "map_location_3": "bld_display_name",
}.items():
    location_db_df[target_column] = location_db_df[source_column]

location_db_df.head(2)

Unnamed: 0,location_id,global_id,legacy_id,location_type,location_name,geo_coordinate,country_display_name,region_display_name,state_display_name,city_display_name,postcode_display_name,district_display_name,div_display_name,str_display_name,bld_display_name,display_name,map_location_1,map_location_2,map_location_3
0,5eb32ff0-46cc-48a2-a3f3-4032c04f4d32,bld_67258,7067.0,BUILDING_NAME,the amber residence,POINT(101.519839 2.971741),Malaysia,Peninsular Malaysia,Selangor,Kota Kemuning,42500.0,,,Persiaran Rimbayu,The Amber Residence,The Amber Residence,Selangor,Kota Kemuning,The Amber Residence
1,a443af86-f3f4-4e33-a1b1-e2c3c778e6e4,str_66918,,STREET_NAME,jalan hang tuah,,Malaysia,Peninsular Malaysia,Kuala Lumpur,KL City,,,,Jalan Hang Tuah,,Jalan Hang Tuah,Kuala Lumpur,KL City,


In [6]:
# Lookup table keyed by global_id: keep the first row seen for each id and
# expose the remaining selected columns as a nested dict {global_id: {column: value}}.
_LOC_LOOKUP_COLUMNS = [
    "global_id",
    "legacy_id",
    "location_type",
    "display_name",
    "map_location_1",
    "map_location_2",
    "map_location_3",
]
tmploc_df = (
    location_db_df[_LOC_LOOKUP_COLUMNS]
    .drop_duplicates(subset=["global_id"], keep="first")
    .set_index("global_id", drop=True)
)
dict_loc = tmploc_df.to_dict(orient="index")

In [7]:
# Report how many distinct clients appear in the data.
n_unique_clients = df["clientId"].nunique()
print(f"Number of unique clientId: {n_unique_clients}")

Number of unique clientId: 120689


In [8]:
# Build one "||"-joined string of global_ids per client (hits ordered by time,
# visit and hit number), then keep each distinct sequence string once.
tmp_df = df.copy(deep=True)

hits_in_order = tmp_df[
    ["clientId", "visitMYDateTime", "visitId", "visitNumber", "hitNumber", "global_id"]
].sort_values(["visitMYDateTime", "visitId", "visitNumber", "hitNumber"], ascending=True)

# transform() broadcasts the joined string back to every row of the client;
# the assignment realigns it to tmp_df by index.
tmp_df["seq"] = hits_in_order.groupby("clientId")["global_id"].transform(
    lambda client_ids: '||'.join(client_ids)
)

tmp_df = tmp_df[["seq"]].drop_duplicates(keep="first")
tmp_df.shape

(46176, 1)

In [9]:
# Split every sequence string back into ids and collapse runs of consecutive
# identical ids into a single entry (groupby yields one key per run).
list_seq = [
    [location_id for location_id, _run in groupby(seq_string.split("||"))]
    for seq_string in tqdm(tmp_df.seq.values, total=len(tmp_df.seq.values), position=0, leave=True)
]

100%|██████████| 46176/46176 [00:00<00:00, 386082.92it/s]


## Creating vocabs

In [10]:
import pickle

In [11]:
# Vocabulary: every global_id key of dict_loc gets a consecutive integer index.
idx2loc = dict(enumerate(dict_loc))
loc2idx = {location: index for index, location in idx2loc.items()}
vocab_size = len(idx2loc)

In [12]:
window_size = 5
idx_pairs = []

for sequence in tqdm(list_seq, total=len(list_seq), position=0, leave=True):
    indices = [loc2idx[location] for location in sequence]
    sequence_length = len(indices)

    # Treat each location in the sequence as the center location in turn.
    for center_loc_pos, center_loc_idx in enumerate(indices):
        # Clamp the context window so it never leaves the sequence.
        first_context_pos = max(0, center_loc_pos - window_size)
        last_context_pos = min(sequence_length - 1, center_loc_pos + window_size)
        for context_loc_pos in range(first_context_pos, last_context_pos + 1):
            # The center location is not paired with itself.
            if context_loc_pos != center_loc_pos:
                idx_pairs.append((center_loc_idx, indices[context_loc_pos]))

# Keep each distinct (center, context) index pair once.
idx_pairs = np.array(idx_pairs)
pairs_df = pd.DataFrame(idx_pairs, columns=["loc_1", "loc_2"]).drop_duplicates(keep="first")

100%|██████████| 46176/46176 [00:00<00:00, 96604.36it/s]


## Test GPU

In [13]:
import torch
from torch.autograd import Variable
import numpy as np
import torch.functional as F
import torch.nn.functional as F
from torch import nn
from torch import optim
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

In [17]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
cuda_available = torch.cuda.is_available()
device = torch.device('cuda' if cuda_available else 'cpu')

if cuda_available:
    torch.cuda.set_device(0)
    bytes_per_gb = 1024**3

    print(torch.cuda.device_count())
    print(cuda_available)
    print(torch.cuda.get_device_name(0))

    # Memory figures for device 0, converted from bytes to GB.
    print('Memory Usage:')
    memory_stats = (
        ('Allocated:', torch.cuda.memory_allocated(0)),
        ('Cached:   ', torch.cuda.memory_reserved(0)),
    )
    for label, n_bytes in memory_stats:
        print(label, round(n_bytes/bytes_per_gb, 1), 'GB')

In [18]:
# Smoke test: create one tensor as-is and one moved to `device`, then print both.
t1 = torch.randn(1, 2)
t2 = torch.randn(1, 2).to(device)
for sample_tensor in (t1, t2):
    print(sample_tensor)

tensor([[ 0.2891, -0.4940]])
tensor([[-0.7659, -1.5587]])


### Tensorboard Setup

In [19]:
from torch.utils.tensorboard import SummaryWriter

# TensorBoard writer for this run. Per the torch docs, `comment` is appended as a
# suffix to the default log directory name, which labels the run in TensorBoard.
writer = SummaryWriter(comment="EMB.SKIPGRAM-META.v1")

## Modelling

In [21]:
import gzip
import pickle
import datetime
import itertools

from typing import Any
from collections import Counter
from typing import Dict, List, Tuple

# from src.config import MODEL_PATH
# from src.utils.logger import logger
# from src.ml.skipgram import SkipGram
# from src.utils.io_utils import save_model
# from src.ml.data_loader import Sequences, SequencesDataset

%reload_ext autoreload
%autoreload 2

In [23]:
# Relative directory for the skip-gram model files.
# NOTE(review): presumably where trained artefacts are saved — confirm against the cells that use it.
MODEL_PATH = "models/skipgram-meta/"

In [22]:
# Training configuration. The cells that consume these values are not shown here,
# so the notes below are assumptions to confirm against them.
shuffle = True          # presumably: shuffle the training data each epoch
embedding_dims = 128    # presumably: size of each embedding vector
epochs = 25             # presumably: number of passes over the training data
initial_lr = 0.025      # presumably: starting learning rate for the optimiser
batch_size = 16         # presumably: samples per training batch
n_workers = 16          # presumably: number of data-loading worker processes

In [24]:
def round_up(num, divisor=5):
    """Round ``num`` up to a multiple of ``divisor``.

    Args:
        num: Value to round.
        divisor: Step size to round to. Defaults to 5.

    Returns:
        For an integer ``num`` and a positive integer ``divisor``: ``num``
        itself when it is already a multiple of ``divisor``, otherwise the
        next larger multiple.

    Raises:
        ZeroDivisionError: If ``divisor`` is 0.
    """
    # Adding (divisor - 1) before floor-dividing turns the floor into a ceiling.
    bumped = num + divisor - 1
    whole_steps = bumped // divisor
    return whole_steps * divisor