In [None]:
from torchHMM.model.discretized_HMM import DiscreteHMM, DISCRETIZATION_TECHNIQUES
from torchHMM.model.discretized_flow_HMM import FlowHMM

In [None]:
import numpy as np
import pandas as pd
import pickle as pkl

In [3]:
from hmmlearn.hmm import CategoricalHMM, GaussianHMM
from gensim.models import KeyedVectors


In [4]:
# Read the pickled clickstream object from the preprocessed-data folder.
pickle_path = 'clickstream_experiment/data/preprocessed_data/ClickStream_test_cleaned.pkl'
with open(pickle_path, 'rb') as handle:
    cs = pkl.load(handle)

# Work on the first 10000 sessions only.
cs.sessions = cs.sessions[:10000]

In [6]:
# Map every distinct item id seen in the sessions to a consecutive integer index.
unique_items = np.unique([e.item_id for s in cs.sessions for e in s.event_list])
item_ids = {item: idx for idx, item in enumerate(unique_items)}

# Per session: the encoded events without the last one, and how many of them there are.
Xs = [[item_ids[e.item_id] for e in s.event_list[:-1]] for s in cs.sessions]
l = list(map(len, Xs))

# All prefixes stacked into a single column vector.
X = np.concatenate(Xs).reshape(-1, 1)

# The held-out last event of each session, encoded with the same mapping.
target = [item_ids[s.event_list[-1].item_id] for s in cs.sessions]

In [7]:
# Display the stacked observation array (one integer item index per row).
X

array([[147834],
       [ 75421],
       [126066],
       ...,
       [ 35192],
       [ 35192],
       [ 35192]])

In [8]:
# Categorical HMM with 8 hidden states, trained for at most 10 iterations.
# random_state pins the random initialisation so a Restart & Run All
# reproduces the same fit.
chmm = CategoricalHMM(n_components=8, n_iter=10, verbose=True, random_state=0)


In [9]:
# Fit the categorical HMM on the stacked observations X, with l giving the
# length of each sequence inside X; %time reports how long the fit takes.
%time chmm.fit(X, l)

CPU times: user 8min 33s, sys: 378 ms, total: 8min 33s
Wall time: 8min 33s


In [22]:
# Log-likelihood of the session prefixes (every event but the last).
l1 = chmm.score(X, l)

# Log-likelihood of the same sessions with the last event included:
# the lengths are the prefix lengths plus one.
full_sessions = [[item_ids[e.item_id] for e in s.event_list] for s in cs.sessions]
X_full = np.concatenate(full_sessions).reshape(-1, 1)
l_full = np.asarray(l) + 1
l2 = chmm.score(X_full, l_full)

# Log-likelihood difference divided by the number of sessions.
# NOTE(review): the recorded result is -inf; presumably some final items never
# occur in the prefixes used for fitting and get zero emission probability - confirm.
(l2 - l1) / len(l)

-inf

In [29]:
# Load the saved KeyedVectors from the preprocessed-data folder.
vectors = KeyedVectors.load(
    "clickstream_experiment/data/preprocessed_data/vectors_train_20_10_5_8_cleaned.kv"
)

# One row per key, in key_to_index order, stacked into a single 2-D array.
vec_rows = [vectors.get_vector(key)[np.newaxis, :] for key in vectors.key_to_index]
vecs = np.concatenate(vec_rows)

In [30]:
# Fallback embedding: the column-wise mean of all known vectors, kept as a single row.
unknown = vecs.mean(axis=0, keepdims=True)

def get_vec(e):
    """Return the embedding of event ``e`` as a single-row array.

    Looks up ``e.item_id`` in the module-level ``vectors`` and reshapes the
    result to ``(1, -1)``. When the lookup raises ``KeyError`` (the item has no
    vector), the module-level ``unknown`` fallback is returned instead.

    Only ``KeyError`` is treated as "item not found"; any other exception
    (e.g. an object without ``item_id``, or a keyboard interrupt) propagates
    instead of being silently turned into the fallback vector.
    """
    try:
        return vectors.get_vector(e.item_id).reshape(1, -1)
    except KeyError:
        return unknown

In [31]:
# Per session: the embeddings of every event except the last one.
Xs_c = [[get_vec(e) for e in s.event_list[:-1]] for s in cs.sessions]

# Per-session prefix lengths. These must come from Xs_c (one entry per session);
# iterating the stacked array X instead would yield a 1 for every single event.
l_c = [len(y) for y in Xs_c]

# All prefix embeddings stacked row-wise into one 2-D array.
X_c = np.concatenate([a for as_ in Xs_c for a in as_], axis=0)
assert sum(l_c) == X_c.shape[0], "sequence lengths must add up to the number of rows"

# Embedding of the held-out last event of each session.
target_c = [get_vec(s.event_list[-1]) for s in cs.sessions]

In [32]:
# Sanity check: (total number of stacked prefix events, embedding dimension).
X_c.shape

(4147529, 100)

In [33]:
# Gaussian HMM over the embedding vectors: 8 hidden states, at most 10 iterations.
# random_state pins the random initialisation so a Restart & Run All
# reproduces the same fit.
w2vhmm = GaussianHMM(n_components=8, n_iter=10, verbose=True, random_state=0)

In [None]:
# Fit the Gaussian HMM on the stacked embeddings X_c, with l_c passed as the
# sequence lengths; %time reports how long the fit takes.
%time w2vhmm.fit(X_c, l_c)

In [None]:
# Log-likelihood of the embedded prefixes under the Gaussian HMM.
l1c = w2vhmm.score(X_c, l_c)

# Log-likelihood with every event of every session included;
# the lengths passed are the entries of l_c plus one.
X_c_full = np.concatenate([get_vec(e) for s in cs.sessions for e in s.event_list])
l_c_full = np.asarray(l_c) + 1
l2c = w2vhmm.score(X_c_full, l_c_full)

# Log-likelihood difference divided by the number of entries in l_c.
(l2c - l1c) / len(l_c)

In [None]:
# Discretized HMM over the embedding vectors, configured explicitly by keyword.
w2v_dhmm = DiscreteHMM(
    n_components=8,
    no_nodes=512,
    l=100,
    optimizer="Adam",
    verbose=True,
)

In [None]:
%time w2vhmm.fit(X_c, l_c)

In [None]:
# Log-likelihood of the embedded prefixes under the discretized HMM.
l1c_d = w2v_dhmm.score(X_c, l_c)

# Log-likelihood with every event of every session included;
# the lengths passed are the entries of l_c plus one.
X_c_full = np.concatenate([get_vec(e) for s in cs.sessions for e in s.event_list])
l_c_full = np.asarray(l_c) + 1
l2c_d = w2v_dhmm.score(X_c_full, l_c_full)

# Log-likelihood difference divided by the number of entries in l_c.
(l2c_d - l1c_d) / len(l_c)

In [None]:
w2v_fhmm = FlowHMM(n_components=8, no_nodes=512, l=100, optimizer="Adam", verbose=True)
%time w2v_fhmm.fit(X_c, l_c)
l1c_f = w2v_fhmm.score(X_c, l_c)
l2c_f = w2v_fhmm.score(np.concatenate([get_vec(e) for s in cs.sessions for e in s.event_list]), np.array(l_c) + 1)
(l2c_f - l1c_f) / len(l_c)