In [1]:
from torchHMM.model.discretized_HMM import DiscreteHMM, DISCRETIZATION_TECHNIQUES
from torchHMM.model.discretized_flow_HMM import FlowHMM

In [2]:
import numpy as np
import pandas as pd
import pickle as pkl

In [3]:
from hmmlearn.hmm import CategoricalHMM, GaussianHMM
from gensim.models import KeyedVectors


In [4]:
with open('clickstream_experiment/data/preprocessed_data/ClickStream_test_cleaned.pkl', 'rb') as f:
    cs = pkl.load(f)
cs.sessions = cs.sessions[:10000]

In [5]:
item_ids = np.unique([e.item_id for s in cs.sessions for e in s.event_list])
item_ids = {item_ids[i]: i for i in range(len(item_ids))}

Xs = [[item_ids[e.item_id] for e in s.event_list[:-1]] for s in cs.sessions]
l = [len(y) for y in Xs]

X = np.concatenate(Xs).reshape(-1, 1)

target = [item_ids[s.event_list[-1].item_id] for s in cs.sessions]

In [6]:
X

array([[24915],
       [12793],
       [21234],
       ...,
       [21501],
       [21501],
       [21501]])

In [7]:
chmm = CategoricalHMM(5, n_iter=10, verbose=True)


In [8]:
%time chmm.fit(X, l)

Fitting a model with 203424 free scalar parameters with only 91294 data points will result in a degenerate solution.
         1     -973512.5061             +nan
         2     -896926.5215      +76585.9845
         3     -877996.2506      +18930.2709
         4     -866943.0714      +11053.1792
         5     -861907.4807       +5035.5907
         6     -859430.2417       +2477.2390
         7     -857916.4811       +1513.7606
         8     -856769.7643       +1146.7168
         9     -855869.2502        +900.5141


CPU times: user 13.6 s, sys: 4.16 ms, total: 13.6 s
Wall time: 13.6 s


        10     -855207.5541        +661.6961


In [9]:
l1 = chmm.score(np.array(X).reshape(-1, 1), l)
l2 = chmm.score(np.concatenate([[item_ids[e.item_id] for e in s.event_list] for s in cs.sessions]).reshape(-1, 1), np.array([len(s.event_list) for s in cs.sessions]))
(l2 - l1) / len(l)

IndexError: index 40681 is out of bounds for axis 1 with size 40681

In [10]:
vectors = KeyedVectors.load(
    f"clickstream_experiment/data/preprocessed_data/vectors_train_20_10_5_8_cleaned.kv"
)
vecs = np.concatenate(
    [
        vectors.get_vector(k).reshape(1, -1)
        for k in list(vectors.key_to_index.keys())
    ]
)

In [11]:
unknown = vecs.mean(axis=0).reshape(1, -1)

def get_vec(e):
    try:
        return vectors.get_vector(e.item_id).reshape(1, -1)
    except:
        return unknown

In [12]:
Xs_c = [[get_vec(e) for e in s.event_list[:-1]] for s in cs.sessions]
l_c = [len(y) for y in X]

X_c = np.concatenate([a for as_ in Xs_c for a in as_], axis=0)

target_c = [get_vec(s.event_list[-1]) for s in cs.sessions]

In [13]:
X_c.shape

(91294, 20)

In [14]:
w2vhmm = GaussianHMM(5, n_iter=10, verbose=True)

In [15]:
%time w2vhmm.fit(X_c, l_c)

         1     1951395.3051             +nan
         2     4328131.6122    +2376736.3071
         3     7631054.5663    +3302922.9542
         4    12304989.0264    +4673934.4600
         5    12305317.3601        +328.3337
         6    12305533.6946        +216.3345
         7    12305613.5257         +79.8311
         8    12305674.5257         +61.0000
         9    12305722.5136         +47.9880
        10    12305783.1712         +60.6576
Some rows of transmat_ have zero sum because no transition from the state was ever observed.


CPU times: user 1min 53s, sys: 656 ms, total: 1min 54s
Wall time: 1min 53s


In [16]:
l1c = w2vhmm.score(X_c, l_c)
l2c = w2vhmm.score(np.concatenate([get_vec(e) for s in cs.sessions for e in s.event_list]),np.array([len(s.event_list) for s in cs.sessions]))
(l2c - l1c) / len(l_c)

ValueError: transmat_ rows must sum to 1 (got [0. 0. 0. 0. 0.])

In [17]:
w2v_dhmm = DiscreteHMM(n_components=5, no_nodes=512, l=100, optimizer="Adam", verbose=True)

In [18]:
%time w2v_dhmm.fit(X_c, l_c)



AttributeError: 'list' object has no attribute 'cumsum'

In [19]:
l1c_d = w2v_dhmm.score(X_c, l_c)
l2c_d = w2v_dhmm.score(np.concatenate([get_vec(e) for s in cs.sessions for e in s.event_list]), np.array([len(s.event_list) for s in cs.sessions]))
(l2c_d - l1c_d) / len(l_c)

-2.2289417692251137

In [20]:
w2v_fhmm = FlowHMM(n_components=5, no_nodes=512, l=100, optimizer="Adam", verbose=True)
%time w2v_fhmm.fit(X_c, l_c)
l1c_f = w2v_fhmm.score(X_c, l_c)
l2c_f = w2v_fhmm.score(np.concatenate([get_vec(e) for s in cs.sessions for e in s.event_list]), np.array([len(s.event_list) for s in cs.sessions]))
(l2c_f - l1c_f) / len(l_c)

AttributeError: 'list' object has no attribute 'cumsum'

IndexError: arrays used as indices must be of integer (or boolean) type