In [1]:
import pickle
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

from itertools import groupby
from joblib import Parallel, delayed
from tqdm import tqdm, tqdm_notebook 

In [None]:
# from distributed import Client
# n_jobs = 8
# client = Client(n_workers=n_jobs)
# import modin.pandas as pd

In [None]:
# Read the parquet file into an Arrow table, then materialise it as a pandas DataFrame.
# `df` is the frame every cell in the data-preparation section works on.
pq_table = pq.read_table('data/web_seq.parquet')
df = pq_table.to_pandas()

## Data preparations

In [None]:
# df = pd.read_csv("data/sample.csv")

In [None]:
# Identifier, hierarchy and location columns are all handled as plain strings.
string_columns = [
    "clientId", "visitId",
    "level_1", "level_2", "level_3",
    "map_location_3", "map_location_2", "map_location_1",
    "global_id", "legacy_id", "location_type", "display_name",
]
for column in string_columns:
    df[column] = df[column].astype(str)

# Visit and hit counters are integers.
for column in ("visitNumber", "hitNumber"):
    df[column] = df[column].astype(int)

# Timestamps are parsed with an explicit format (fractional seconds included).
df["visitMYDateTime"] = pd.to_datetime(df["visitMYDateTime"], format='%Y-%m-%dT%H:%M:%S.%f')

list_clientId = df["clientId"].unique().tolist()
df.head(2)

In [None]:
# (location_type value, column holding the display name for that type), checked in order.
_DISPLAY_NAME_COLUMNS = (
    ("BUILDING_NAME", "bld_display_name"),
    ("STREET_NAME", "str_display_name"),
    ("COUNTRY", "country_display_name"),
    ("REGION", "region_display_name"),
    ("STATE", "state_display_name"),
    ("DISTRICT", "district_display_name"),
    ("DIVISION", "div_display_name"),
    ("CITY", "city_display_name"),
    ("POST_CODE", "postcode_display_name"),
)


def display_name_mapping(x):
    """Return the display name that matches the row's ``location_type``.

    Args:
        x: a mapping-like row (e.g. a DataFrame row passed by ``apply(axis=1)``)
            with a ``location_type`` entry and the ``*_display_name`` entries.

    Returns:
        The value of the type-specific display-name column, or ``None`` when
        ``location_type`` is not one of the known types.
    """
    for location_type, column in _DISPLAY_NAME_COLUMNS:
        if x["location_type"] == location_type:
            return x[column]
    return None

In [None]:
# location data
location_table = pq.read_table("data/my_locations_db.parquet")
location_db_df = location_table.to_pandas()

# One display name per row, picked according to the row's location_type.
location_db_df["display_name"] = location_db_df.apply(display_name_mapping, axis=1)

# map_location_1/2/3 are copies of the state / city / building display names.
map_location_sources = {
    "map_location_1": "state_display_name",
    "map_location_2": "city_display_name",
    "map_location_3": "bld_display_name",
}
for target_column, source_column in map_location_sources.items():
    location_db_df[target_column] = location_db_df[source_column]

location_db_df.head(2)

In [None]:
# Keep one row per global_id (the first seen) and index the lookup by it.
loc_lookup_columns = [
    "global_id", "legacy_id", "location_type", "display_name",
    "map_location_1", "map_location_2", "map_location_3",
]
tmploc_df = (
    location_db_df.copy(deep=True)[loc_lookup_columns]
    .drop_duplicates(subset=["global_id"], keep="first")
    .set_index("global_id", drop=True)
)
# {global_id: {column: value, ...}}
dict_loc = tmploc_df.to_dict(orient="index")

In [None]:
# with open('models/loc2name.p', 'wb') as f:
#     pickle.dump(dict_loc, f, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
print(f"Number of unique clientId: {df.clientId.nunique()}")

In [None]:
# Order hits chronologically, then give every row of a client that client's full
# '||'-joined global_id sequence. The assignment aligns on the row index.
hit_order = ["visitMYDateTime", "visitId", "visitNumber", "hitNumber"]
tmp_df = df.copy(deep=True)
ordered_hits = tmp_df[["clientId", *hit_order, "global_id"]].sort_values(hit_order, ascending=True)
tmp_df["seq"] = (
    ordered_hits
    .groupby("clientId")["global_id"]
    .transform(lambda ids: '||'.join(ids))
)
# Keep each distinct sequence string once.
tmp_df = tmp_df[["seq"]].drop_duplicates(keep="first")
tmp_df.shape

In [None]:
# Split every joined sequence back into ids and collapse runs of consecutive repeats.
list_seq = [
    [loc_id for loc_id, _run in groupby(joined_seq.split("||"))]
    for joined_seq in tqdm(tmp_df.seq.values, total=len(tmp_df.seq.values), position=0, leave=True)
]

## Creating vocabs

In [None]:
import pickle

In [None]:
# Positions follow the iteration order of dict_loc's keys.
idx2loc = dict(enumerate(dict_loc))
loc2idx = {loc: position for position, loc in idx2loc.items()}
vocab_size = len(idx2loc)

In [None]:
# with open('models/loc2idx.p', 'wb') as f:
#     pickle.dump(loc2idx, f, protocol=pickle.HIGHEST_PROTOCOL)

# with open('models/idx2loc.p', 'wb') as fp:
#     pickle.dump(idx2loc, fp, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
vocab_size

In [None]:
window_size = 5
idx_pairs = []

for sequence in tqdm(list_seq, total=len(list_seq), position=0, leave=True):
    indices = [loc2idx[location] for location in sequence]
    seq_len = len(indices)

    # Treat each location in turn as the centre of the window.
    for center_loc_pos, center_loc_idx in enumerate(indices):
        # Clamp the window so it never leaves the sequence.
        first_pos = max(0, center_loc_pos - window_size)
        stop_pos = min(seq_len, center_loc_pos + window_size + 1)
        for context_loc_pos in range(first_pos, stop_pos):
            # The centre is never paired with itself.
            if context_loc_pos != center_loc_pos:
                idx_pairs.append((center_loc_idx, indices[context_loc_pos]))

idx_pairs = np.array(idx_pairs)
pairs_df = pd.DataFrame(idx_pairs, columns=["loc_1", "loc_2"]).drop_duplicates(keep="first")

In [2]:
import torch
from torch.autograd import Variable
import numpy as np
import torch.functional as F
import torch.nn.functional as F
from torch import nn
from torch import optim
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

### Test GPU

In [None]:
# Pick the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(torch.cuda.device_count())
print(torch.cuda.is_available())
# The CUDA-only calls are guarded so this cell also runs on a machine without
# a GPU instead of raising right after the CPU fallback above.
if device.type == 'cuda':
    torch.cuda.set_device(0)
    print(torch.cuda.get_device_name(0))
else:
    print('CUDA is not available - running on CPU')

In [None]:
# Report memory of GPU 0, in GiB rounded to one decimal (only when running on CUDA).
if device.type == 'cuda':
    bytes_per_gib = 1024**3
    allocated_gb = round(torch.cuda.memory_allocated(0)/bytes_per_gib, 1)
    reserved_gb = round(torch.cuda.memory_reserved(0)/bytes_per_gib, 1)
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', allocated_gb, 'GB')
    print('Cached:   ', reserved_gb, 'GB')

In [None]:
# Smoke test: t1 is created on the default device, t2 is moved to `device`;
# the printed repr of t2 shows its device when it is not the CPU.
t1 = torch.randn(1,2)
t2 = torch.randn(1,2).to(device)
print(t1)
print(t2)

### Tensorboard Setup

In [None]:
from torch.utils.tensorboard import SummaryWriter

# default `log_dir` is "runs" - we'll be more specific here
writer = SummaryWriter('runs/skipgram2')

### Modelling

In [None]:
def get_input_layer(word_idx, num_classes=None):
    """Build a one-hot float32 vector with a 1.0 at position ``word_idx``.

    Args:
        word_idx: position to set to 1.0.
        num_classes: length of the vector. Defaults to the notebook-level
            ``vocab_size`` so existing one-argument calls behave as before.

    Returns:
        A 1-D ``torch.float32`` tensor of length ``num_classes``.
    """
    if num_classes is None:
        # Resolved at call time, so a vocab_size defined in a later-run cell is picked up.
        num_classes = vocab_size
    x = torch.zeros(num_classes, dtype=torch.float32)
    x[word_idx] = 1.0
    return x

### SkipGram #1

In [None]:
embedding_dims = 32

In [None]:
# Plain-tensor skip-gram with a full softmax over the vocabulary:
# W1 projects a one-hot input to the embedding space, W2 maps it back to vocabulary logits.
# Tensors are created with requires_grad=True directly (no deprecated Variable wrapper).
W1 = torch.randn(embedding_dims, vocab_size, dtype=torch.float32, requires_grad=True)
W2 = torch.randn(vocab_size, embedding_dims, dtype=torch.float32, requires_grad=True)
num_epochs = 51
learning_rate = 0.001

for epo in range(num_epochs):
    loss_val = 0
    for data, target in idx_pairs:
        x = get_input_layer(data)
        y_true = torch.tensor([target], dtype=torch.long)

        z1 = torch.matmul(W1, x)
        z2 = torch.matmul(W2, z1)

        log_softmax = F.log_softmax(z2, dim=0)

        loss = F.nll_loss(log_softmax.view(1, -1), y_true)
        loss_val += loss.item()
        loss.backward()

        # Manual SGD step. The update and the gradient reset happen under no_grad
        # so they are not recorded by autograd (replaces the old `.data` mutation).
        with torch.no_grad():
            W1 -= learning_rate * W1.grad
            W2 -= learning_rate * W2.grad
            W1.grad.zero_()
            W2.grad.zero_()
    if epo % 10 == 0:
        # max(..., 1) avoids a ZeroDivisionError when there are no training pairs.
        print(f'Loss at epoch {epo}: {loss_val/max(len(idx_pairs), 1)}')

### SkipGram #2

<img src="src/images/SkipGram-Negative-Sampling.png" alt="drawing" width="1080px"/>

In [3]:
import gzip
import pickle
import datetime
import itertools

from typing import Any
from collections import Counter
from typing import Dict, List, Tuple

from src.config import MODEL_PATH
from src.utils.logger import logger
from src.ml.skipgram import SkipGram
from src.utils.io_utils import save_model
from src.ml.data_loader import Sequences, SequencesDataset

%reload_ext autoreload
%autoreload 2

In [4]:
# Load the pickled sequence list; this rebinds `list_seq`, replacing any value
# built earlier in the session.
# NOTE(review): pickle.load can execute arbitrary code - only load files produced by this project.
with open("models/list_seq.p", 'rb') as f:
    list_seq = pickle.load(f)

In [None]:
# Training configuration for SkipGram #2.
shuffle = True          # passed to the DataLoader
embedding_dims = 128    # embedding width given to SkipGram
epochs = 25             # number of passes over the dataloader
initial_lr = 0.025      # learning rate given to the optimizer
batch_size = 16         # passed to the DataLoader
n_workers = 16          # only referenced by the commented-out num_workers argument of the DataLoader

In [None]:
# Load dataloader
# Sequences / SequencesDataset come from src.ml.data_loader; batching is delegated
# to the dataset's own `collate` method.
# NOTE(review): `dict_loc` is built in the data-preparation section - it must exist in the kernel.
sequences = Sequences(seq_list=list_seq, vocab_dict=dict_loc)
dataset = SequencesDataset(sequences)
dataloader = DataLoader(dataset, 
                        batch_size=batch_size, 
                        shuffle=shuffle, 
                        # num_workers=n_workers,  (disabled: batches are loaded in the main process)
                        collate_fn=dataset.collate)

In [None]:
# Initialize model
# NOTE(review): `vocab_size` must match the vocabulary used by `sequences` - confirm.
skipgram = SkipGram(vocab_size, embedding_dims).to(device)

# Train loop
# SparseAdam is the Adam variant for sparse gradients; the model repr printed in the
# test section shows both embedding tables with sparse=True.
optimizer = optim.SparseAdam(list(skipgram.parameters()), lr=initial_lr)

In [None]:
results = []
start_time = datetime.datetime.now()
for epoch in tqdm(range(epochs), total=epochs, position=0, leave=True):
    # Cosine annealing with a restart at every epoch. The learning rate is reset
    # explicitly before the scheduler is rebuilt: depending on the PyTorch version,
    # a fresh CosineAnnealingLR may start from the optimizer's *current* lr, which
    # the previous epoch annealed down to ~0.
    for param_group in optimizer.param_groups:
        param_group['lr'] = initial_lr
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, len(dataloader))
    running_loss = 0
    i = -1  # index of the last processed batch; stays -1 if the dataloader is empty

    # Training loop
    for i, batches in enumerate(dataloader):
        centers = batches[0].to(device)
        contexts = batches[1].to(device)
        neg_contexts = batches[2].to(device)

        optimizer.zero_grad()
        # Call the module itself, not .forward(), so registered hooks run.
        loss = skipgram(centers, contexts, neg_contexts)
        loss.backward()
        optimizer.step()

        scheduler.step()
        # Exponential moving average of the batch loss (starts from 0 each epoch).
        running_loss = running_loss * 0.9 + loss.item() * 0.1

    logger.info("Epoch: {}, Loss: {:.4f}, Lr: {:.6f}".format(epoch, running_loss, optimizer.param_groups[0]['lr']))
    results.append([epoch, i, running_loss])

    # save model: one checkpoint per epoch, timestamped to the minute
    current_datetime = datetime.datetime.now().strftime('%Y-%m-%d-%H%M')
    state_dict_path = '{}/skipgram_epoch_{}_{}.pt'.format(MODEL_PATH, epoch, current_datetime)
    torch.save(skipgram.state_dict(), state_dict_path)
    logger.info('Model state dict saved to {}'.format(state_dict_path))

end_time = datetime.datetime.now()
time_diff = round((end_time - start_time).total_seconds() / 60, 2)
logger.info('Total time taken: {:,} minutes'.format(time_diff))

In [None]:
# Save results
# One row per epoch: epoch number, index of the last batch, smoothed loss at epoch end.
results_df = pd.DataFrame(results, columns=['epoch', 'batches', 'loss'])
results_df.to_csv('{}/model_metrics_w2v.csv'.format(MODEL_PATH), index=False)

In [None]:
results_df.head()

In [None]:
# Initialize validation set
val_samp = pd.read_csv('data/valid_model.csv')

# Copy the two location columns under the *_id names used below
val_samp['loc1_id'] = val_samp['loc_1'].values
val_samp['loc2_id'] = val_samp['loc_2'].values
# presumably -1 marks a location that is missing from the vocabulary - TODO confirm
val_samp = val_samp[(val_samp['loc1_id'] > -1) & (val_samp['loc2_id'] > -1)]  # Keep those with valid ID
logger.info('No. of validation samples: {}'.format(val_samp.shape[0]))

loc1_id = val_samp['loc1_id'].values
loc2_id = val_samp['loc2_id'].values

### Test the model

In [1]:
import torch
from torch.autograd import Variable
import numpy as np
import torch.functional as F
import torch.nn.functional as F
from torch import nn
from torch import optim
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

In [2]:
import gzip
import pickle
import datetime
import itertools

from typing import Any
from collections import Counter
from typing import Dict, List, Tuple

from src.config import MODEL_PATH
from src.utils.logger import logger
from src.ml.skipgram import SkipGram
from src.utils.io_utils import save_model
from src.ml.data_loader import Sequences, SequencesDataset

%reload_ext autoreload
%autoreload 2

In [3]:
from scipy.spatial import distance

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
# Hard-coded model dimensions for the saved checkpoint.
# NOTE(review): load_state_dict will fail if these do not match the checkpoint's shapes.
vocab_size = 14699 # 6273
embedding_dims = 128

skipgram = SkipGram(vocab_size, embedding_dims).to(device)
# map_location=device lets the checkpoint load on `device` regardless of where it was saved.
skipgram.load_state_dict(torch.load("models/skipgram_epoch_24_2020-11-16-1439.pt", map_location=device))
# Switch the module to evaluation mode.
skipgram.eval()

SkipGram(
  (center_embeddings): Embedding(14699, 128, sparse=True)
  (context_embeddings): Embedding(14699, 128, sparse=True)
)

In [6]:
# skipgram.save_embeddings("models/embeddings")

In [7]:
emb_vec = np.load("models/embeddings.npy")

In [8]:
# Load the three lookup tables saved by the data-preparation section:
#   idx2loc  : vocabulary index -> location id
#   loc2idx  : location id -> vocabulary index
#   loc2name : location id -> dict of descriptive fields
# NOTE(review): pickle.load can execute arbitrary code - only load files produced by this project.
with open("models/idx2loc.p", 'rb') as f:
    dict_idx2loc = pickle.load(f)
with open("models/loc2idx.p", 'rb') as fp:
    dict_loc2idx = pickle.load(fp)
with open("models/loc2name.p", 'rb') as p:
    dict_loc2name = pickle.load(p)

In [9]:
# Vocabulary index -> descriptive fields, composed from the two loaded lookups.
dict_idx2name = {
    loc_idx: dict_loc2name[loc_id]
    for loc_idx, loc_id in dict_idx2loc.items()
}

In [10]:
import pandas as pd

# One row per vocabulary index: the location id plus the index itself.
metadata_df = pd.DataFrame.from_dict(dict_idx2loc, orient='index')
metadata_df['loc_idx'] = metadata_df.index
# A flat list gives ordinary column labels; a nested list ([[...]]) would build a
# one-level MultiIndex, so metadata_df['loc_id'] would no longer be a plain column.
metadata_df.columns = ['loc_id', 'loc_idx']
# metadata_df = metadata_df[["loc_id"]].replace(to_replace='None', value=np.nan).dropna()
# metadata_df.set_index(metadata_df[['loc_id']].values.reshape(-1,).tolist(), drop=False, verify_integrity=True)
metadata_df.head()

Unnamed: 0,loc_id,loc_idx
0,bld_67258,0
1,str_66918,1
2,str_66979,2
3,bld_66962,3
4,bld_66917,4


In [11]:
# One row per location id, columns taken from the per-location dicts in dict_loc2name.
name_meta_df = pd.DataFrame.from_dict(dict_loc2name, orient='index')
name_meta_df['loc_id'] = name_meta_df.index
# full_address joins map_location_3, map_location_2 and map_location_1 with ", ";
# missing parts are rendered as the text "None" (see the output below).
name_meta_df['full_address'] = name_meta_df.apply(lambda x: f"{x['map_location_3'] }, { x['map_location_2'] }, { x['map_location_1'] }", axis=1)
name_meta_df[["loc_id", "full_address"]].head()

# .to_csv('models/metadata.tsv', sep='\t', index=False)

Unnamed: 0,loc_id,full_address
bld_67258,bld_67258,"The Amber Residence, Kota Kemuning, Selangor"
str_66918,str_66918,"None, KL City, Kuala Lumpur"
str_66979,str_66979,"None, Batu Caves, Kuala Lumpur"
bld_66962,bld_66962,"Desa Bakti, Batu Caves, Kuala Lumpur"
bld_66917,bld_66917,"Flat DBKL Jalan Hang Tuah, KL City, Kuala Lumpur"


In [12]:
def closest_node(loc_type, index, nodes, top_n=5, filter_type=False, idx2loc=None, loc2name=None):
    """Print the ``top_n`` nearest neighbours of ``nodes[index]`` by cosine distance.

    Args:
        loc_type: location type to keep when ``filter_type`` is True
            (compared against each neighbour's ``"location_type"``).
        index: row of ``nodes`` used as the query.
        nodes: 2-D array of embedding vectors, one row per vocabulary index.
        top_n: maximum number of neighbours to print.
        filter_type: when True, only neighbours whose type equals ``loc_type``
            are printed and counted.
        idx2loc: mapping index -> location id. Defaults to the notebook-level
            ``dict_idx2loc``.
        loc2name: mapping location id -> dict with ``map_location_1/2/3`` and
            ``location_type``. Defaults to the notebook-level ``dict_loc2name``.

    Returns:
        None. Results are printed, numbered from 1 in both modes. Fewer than
        ``top_n`` rows are printed when not enough neighbours qualify.
    """
    if idx2loc is None:
        idx2loc = dict_idx2loc
    if loc2name is None:
        loc2name = dict_loc2name

    node = nodes[index]
    distances = distance.cdist([node], nodes, "cosine")[0]
    # Rank every other node. The query is excluded by index rather than by
    # assuming it sorts first, which duplicate vectors would break.
    ranked = sorted(
        ((idx, dist) for idx, dist in enumerate(distances) if idx != index),
        key=lambda pair: pair[1],
    )

    location_src = loc2name[idx2loc[index]]
    print(f"Finding location near to: { idx2loc[index] } ({location_src['map_location_3']}, {location_src['map_location_2']}, {location_src['map_location_1']}) - is a {location_src['location_type']} \n")

    shown = 0
    for idx, dist in ranked:
        # Stop after top_n hits, or when the ranking is exhausted.
        if shown >= top_n:
            break
        location = loc2name[idx2loc[idx]]
        if filter_type and location["location_type"] != loc_type:
            continue
        shown += 1
        l1 = location["map_location_1"]
        l2 = location["map_location_2"]
        l3 = location["map_location_3"]
        print(f"{shown} ==> {idx2loc[idx]} - {l3}, {l2}, {l1}, (score: {dist})")
        


In [13]:
from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

In [14]:
closest_node("COUNTRY", 5, emb_vec, top_n=10)

Finding location near to: ctr_23684 (None, None, None) - is a COUNTRY 

1 ==> bld_62664 - Taman Tasek Emas, Kampar, Perak, (score: 0.6764543054944703)
2 ==> str_57702 - None, Kepong, Kuala Lumpur, (score: 0.6936324195169968)
3 ==> mycty_51971 - None, Enggor, Perak, (score: 0.6957895923870276)
4 ==> str_57170 - None, Cheras, Selangor, (score: 0.6981588830398032)
5 ==> bld_63393 - Desa Damansara 2, Damansara Heights, Kuala Lumpur, (score: 0.7030721580953188)
6 ==> ptc_53223 - None, None, Selangor, (score: 0.7048956372812158)
7 ==> bld_62122 - Mont Kiara Damai Resort Condominium, Mont Kiara, Kuala Lumpur, (score: 0.7106526514577839)
8 ==> str_59687 - None, Jelutong, Penang, (score: 0.7129465472031167)
9 ==> ptc_52514 - None, None, Penang, (score: 0.7135933383524986)
10 ==> str_60183 - None, Salak Selatan, Kuala Lumpur, (score: 0.7171585430411767)


In [15]:
# "<display_name> - <location_type>" -> vocabulary index.
# Entries that share the same display name and type collapse to the last index seen.
dict_name2idx = {f"{ v['display_name'] } - { v['location_type'] }": k for k, v in dict_idx2name.items()}

In [16]:
import pandas as pd
# Distinct location types, in order of first appearance in dict_idx2name.
location_types = pd.Series([entry["location_type"] for entry in dict_idx2name.values()])
loc_type_list = location_types.unique().tolist()
loc_type_list

['BUILDING_NAME',
 'STREET_NAME',
 'COUNTRY',
 'REGION',
 'STATE',
 'DISTRICT',
 'DIVISION',
 'CITY',
 'POST_CODE']

In [17]:
def _names_of_type(loc_type):
    """Return the dict_name2idx keys ("<name> - <type>") whose type part is ``loc_type``."""
    suffix = f" - {loc_type}"
    return [name for name in dict_name2idx if name.endswith(suffix)]


bld_list = _names_of_type("BUILDING_NAME")
ct_list = _names_of_type("CITY")
# The state list tests each key against "STATE" rather than reusing the city
# loop's leftover variable and the "CITY" suffix.
state_list = _names_of_type("STATE")


In [18]:
@interact(x=sorted(ct_list), loc_type=loc_type_list)
def dropdown_idx(x, loc_type):
    """Widget callback: print the neighbours of the selected city, restricted to ``loc_type``.

    ``x`` is a key of ``dict_name2idx`` picked from the sorted city list;
    ``loc_type`` is one of ``loc_type_list``.
    """
    idx = dict_name2idx[x]
    closest_node(loc_type, idx, emb_vec, top_n=20, filter_type=True)

interactive(children=(Dropdown(description='x', options=('Air Tawar - CITY', 'Alai - CITY', 'Alam Impian - CIT…

In [19]:
@interact(x=sorted(ct_list), y=sorted(ct_list))
def minus_vec(x, y):
    """Widget callback: print the 5 locations closest to ``emb(x) - emb(y)``.

    ``x`` and ``y`` are keys of ``dict_name2idx`` picked from the sorted city list.
    Distances are cosine distances between the difference vector and every row
    of ``emb_vec``. The two operands themselves are left out of the ranking.
    Results are numbered from 1.

    Note: when ``x == y`` the difference is the zero vector, for which cosine
    distance is undefined, so the ranking is not meaningful.
    """
    idx_1 = dict_name2idx[x]
    idx_2 = dict_name2idx[y]
    diff_emb = emb_vec[idx_1] - emb_vec[idx_2]

    distances = distance.cdist([diff_emb], emb_vec, "cosine")[0]
    # The difference vector is not itself a row of emb_vec, so the best match is
    # a genuine result and must not be skipped; only the operands are excluded.
    ranked = sorted(
        ((idx, dist) for idx, dist in enumerate(distances) if idx not in (idx_1, idx_2)),
        key=lambda pair: pair[1],
    )

    print(f"We try ({x}) - ({y}) ...")
    print(f"Finding location near to: ({x}) - ({y}) \n")

    # enumerate replaces the manual counter (which used `cnt =+ 1`, i.e. assigned +1).
    for cnt, (idx, dist) in enumerate(ranked[:5], start=1):
        location = dict_loc2name[dict_idx2loc[idx]]
        l1 = location["map_location_1"]
        l2 = location["map_location_2"]
        l3 = location["map_location_3"]

        print(f"{cnt} ==> {dict_idx2loc[idx]} - {l3}, {l2}, {l1}, (score: {dist})")

interactive(children=(Dropdown(description='x', options=('Air Tawar - CITY', 'Alai - CITY', 'Alam Impian - CIT…

## Tensorboard

In [1]:
%load_ext tensorboard

In [42]:
from tqdm import tqdm
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter(comment="EMB.SKIPGRAM.v1")

In [43]:
# One label per vocabulary index, in dict_idx2name order, shown next to each point
# in the TensorBoard projector.
meta_list = [
    f"{entry['location_type']} - {entry['map_location_3']}, {entry['map_location_2']}, {entry['map_location_1']}"
    for entry in tqdm(dict_idx2name.values())
]

writer.add_embedding(torch.tensor(emb_vec), metadata=meta_list, tag="locations")

100%|██████████| 14699/14699 [00:00<00:00, 992100.07it/s]


In [2]:
%tensorboard --logdir ./runs

Reusing TensorBoard on port 6007 (pid 8670), started 0:15:52 ago. (Use '!kill 8670' to kill it.)