In [1]:
from torchHMM.model.discretized_HMM import DiscreteHMM, DISCRETIZATION_TECHNIQUES
from torchHMM.model.discretized_flow_HMM import FlowHMM

In [2]:
import numpy as np
import pandas as pd
import pickle as pkl

In [3]:
from hmmlearn.hmm import CategoricalHMM, GaussianHMM
from gensim.models import KeyedVectors


In [4]:
with open('clickstream_experiment/data/preprocessed_data/ClickStream_test_cleaned.pkl', 'rb') as f:
    cs = pkl.load(f)

In [5]:
session_train, sessions_test = cs.sessions[:100000], cs.sessions[-100000:]


In [6]:
item_ids = np.unique([e.item_id for s in session_train for e in s.event_list])
item_ids = {item_ids[i]: i for i in range(len(item_ids))}

Xs = [[item_ids[e.item_id] for e in s.event_list] for s in session_train]
l = [len(y) for y in Xs]

X = np.concatenate(Xs).reshape(-1, 1)

target =  [[item_ids[e.item_id] for e in s.event_list if e.item_id in item_ids.keys()] for s in sessions_test]
target = [l for l in target if len(l)]
lt = np.array([len(y) for y in target])
target = np.concatenate(target).reshape(-1, 1)

In [7]:
len(item_ids.keys())

158827

In [15]:
chmm = CategoricalHMM(5, n_iter=10, verbose=True)


In [16]:
# %time chmm.fit(X, l)

Fitting a model with 203429 free scalar parameters with only 101294 data points will result in a degenerate solution.
         1    -1083748.2408             +nan
         2     -994557.5840      +89190.6568
         3     -976106.9955      +18450.5885
         4     -965420.3883      +10686.6072
         5     -959201.3225       +6219.0657
         6     -955563.4760       +3637.8465
         7     -953236.9300       +2326.5460
         8     -951795.2712       +1441.6588
         9     -950898.4188        +896.8524


CPU times: user 19.3 s, sys: 5.61 ms, total: 19.3 s
Wall time: 19.3 s


        10     -950310.3023        +588.1165


In [17]:
print(chmm.score(target, lt))

-54006.31989162249


In [8]:
vectors = KeyedVectors.load(
    f"clickstream_experiment/data/preprocessed_data/vectors_train_20_10_5_8_cleaned.kv"
)
vecs = np.concatenate(
    [
        vectors.get_vector(k).reshape(1, -1)
        for k in list(vectors.key_to_index.keys())
    ]
)

In [9]:
unknown = vecs.mean(axis=0).reshape(1, -1)

def get_vec(e):
    try:
        return vectors.get_vector(e.item_id).reshape(1, -1)
    except:
        return unknown

In [10]:
Xs_c = [[get_vec(e) for e in s.event_list] for s in session_train]
l_c = np.array([len(y) for y in X])

X_c = np.concatenate([a for as_ in Xs_c for a in as_], axis=0)

target_c =  [[get_vec(e) for e in s.event_list if e.item_id in item_ids.keys()] for s in sessions_test]
target_c = np.concatenate([a for as_ in target_c for a in as_], axis=0)


In [11]:
w2vhmm = GaussianHMM(5, n_iter=10, verbose=True)

In [23]:
# %time w2vhmm.fit(X_c, l_c)

         1     2498821.6455             +nan
         2     4477074.3737    +1978252.7282
         3     7333227.1303    +2856152.7566
         4    13731857.0900    +6398629.9597
         5    13742197.5267      +10340.4367
         6    13748191.4425       +5993.9158
         7    13751804.5480       +3613.1055
         8    13753530.2086       +1725.6606
         9    13754305.3539        +775.1453
        10    13754635.8896        +330.5357
Some rows of transmat_ have zero sum because no transition from the state was ever observed.


CPU times: user 3min 5s, sys: 4.96 s, total: 3min 10s
Wall time: 2min 57s


In [24]:
# w2vhmm.transmat_

array([[0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.]])

In [25]:
# zeros = w2vhmm.transmat_.sum(axis=1) == 0
# w2vhmm.transmat_[zeros] = np.ones(w2vhmm.transmat_.shape)[zeros] / w2vhmm.n_components


In [26]:
# print(w2vhmm.score(target_c, lt))

624583.3227919223


In [12]:
w2v_dhmm = DiscreteHMM(n_components=5, no_nodes=512, optimizer="Adam", verbose=True, covariance_type='full')

In [15]:
w2v_dhmm.discretize(X_c, False)

Nodes have been already set. Use force=True to update them


array([0., 0., 0., ..., 0., 0., 0.])

In [13]:
%time w2v_dhmm.fit(X_c, l_c)



IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [None]:
print(w2v_dhmm.score(target_c, lt))

In [None]:
w2v_fhmm = FlowHMM(n_components=5, no_nodes=512, l=100, optimizer="Adam", verbose=True)

%time w2v_fhmm.fit(X_c, l_c)

print(w2v_fhmm.score(target_c, lt))
