In [1]:
from torchHMM.model.discretized_HMM import DiscreteHMM, DISCRETIZATION_TECHNIQUES
from torchHMM.model.discretized_flow_HMM import FlowHMM

In [2]:
import numpy as np
import pandas as pd
import pickle as pkl

In [3]:
from hmmlearn.hmm import CategoricalHMM, GaussianHMM
from gensim.models import KeyedVectors


In [4]:
with open('clickstream_experiment/data/preprocessed_data/ClickStream_test_cleaned.pkl', 'rb') as f:
    cs = pkl.load(f)
cs.sessions = cs.sessions[:10000]

In [5]:
item_ids = np.unique([e.item_id for s in cs.sessions for e in s.event_list])
item_ids = {item_ids[i]: i for i in range(len(item_ids))}

Xs = [[item_ids[e.item_id] for e in s.event_list[:-1]] for s in cs.sessions]
l = [len(y) for y in Xs]

X = np.concatenate(Xs).reshape(-1, 1)

target = [item_ids[s.event_list[-1].item_id] for s in cs.sessions]

In [6]:
X

array([[24915],
       [12793],
       [21234],
       ...,
       [21501],
       [21501],
       [21501]])

In [7]:
chmm = CategoricalHMM(5, n_iter=10, verbose=True)


In [8]:
%time chmm.fit(X, l)

Fitting a model with 203424 free scalar parameters with only 91294 data points will result in a degenerate solution.
         1     -983814.7831             +nan
         2     -910837.0780      +72977.7051
         3     -896773.9569      +14063.1211
         4     -883694.0611      +13079.8958
         5     -874817.9125       +8876.1486
         6     -869870.5634       +4947.3491
         7     -867042.1743       +2828.3891
         8     -865347.4804       +1694.6939
         9     -864208.7605       +1138.7199


CPU times: user 13.3 s, sys: 7.52 ms, total: 13.3 s
Wall time: 13.3 s


        10     -863438.8693        +769.8912


In [9]:
l1 = chmm.score(np.array(X).reshape(-1, 1), l)
l2 = chmm.score(np.concatenate([[item_ids[e.item_id] for e in s.event_list] for s in cs.sessions]).reshape(-1, 1), np.array(l) + 1)
(l2 - l1) / len(l)

IndexError: index 40681 is out of bounds for axis 1 with size 40681

In [10]:
vectors = KeyedVectors.load(
    f"clickstream_experiment/data/preprocessed_data/vectors_train_20_10_5_8_cleaned.kv"
)
vecs = np.concatenate(
    [
        vectors.get_vector(k).reshape(1, -1)
        for k in list(vectors.key_to_index.keys())
    ]
)

In [11]:
unknown = vecs.mean(axis=0).reshape(1, -1)

def get_vec(e):
    try:
        return vectors.get_vector(e.item_id).reshape(1, -1)
    except:
        return unknown

In [12]:
Xs_c = [[get_vec(e) for e in s.event_list[:-1]] for s in cs.sessions]
l_c = [len(y) for y in X]

X_c = np.concatenate([a for as_ in Xs_c for a in as_], axis=0)

target_c = [get_vec(s.event_list[-1]) for s in cs.sessions]

In [13]:
X_c.shape

(91294, 20)

In [14]:
w2vhmm = GaussianHMM(5, n_iter=10, verbose=True)

In [15]:
%time w2vhmm.fit(X_c, l_c)

         1     1560961.8041             +nan
         2     4040172.9465    +2479211.1424
         3     6614705.8266    +2574532.8801
         4    12285094.7698    +5670388.9432
         5    12294925.2681       +9830.4983
         6    12300448.1653       +5522.8972
         7    12303231.9045       +2783.7392
         8    12304527.1558       +1295.2513
         9    12305071.4798        +544.3241
        10    12305344.6423        +273.1625
Some rows of transmat_ have zero sum because no transition from the state was ever observed.


CPU times: user 1min 52s, sys: 643 ms, total: 1min 53s
Wall time: 1min 52s


In [16]:
l1c = w2vhmm.score(X_c, l_c)
l2c = w2vhmm.score(np.concatenate([get_vec(e) for s in cs.sessions for e in s.event_list]), np.array(l_c) + 1)
(l2c - l1c) / len(l_c)

ValueError: transmat_ rows must sum to 1 (got [0. 0. 0. 0. 0.])

In [17]:
w2v_dhmm = DiscreteHMM(n_components=5, no_nodes=512, l=100, optimizer="Adam", verbose=True)

In [18]:
%time w2v_dhmm.fit(X_c, l_c)



AttributeError: 'list' object has no attribute 'cumsum'

In [19]:
l1c_d = w2v_dhmm.score(X_c, l_c)
l2c_d = w2v_dhmm.score(np.concatenate([get_vec(e) for s in cs.sessions for e in s.event_list]), np.array(l_c) + 1)
(l2c_d - l1c_d) / len(l_c)

ValueError: more than 101294 samples in lengths array [2 2 2 ... 2 2 2]

In [20]:
w2v_fhmm = FlowHMM(n_components=5, no_nodes=512, l=100, optimizer="Adam", verbose=True)
%time w2v_fhmm.fit(X_c, l_c)
l1c_f = w2v_fhmm.score(X_c, l_c)
l2c_f = w2v_fhmm.score(np.concatenate([get_vec(e) for s in cs.sessions for e in s.event_list]), np.array(l_c) + 1)
(l2c_f - l1c_f) / len(l_c)

AttributeError: 'list' object has no attribute 'cumsum'

IndexError: arrays used as indices must be of integer (or boolean) type