In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

from hmm import HMM

In [2]:
# Load the two raw tables from the working directory:
# the anime catalogue and the per-user rating records.
anime = pd.read_csv(filepath_or_buffer='anime.csv')
rating = pd.read_csv(filepath_or_buffer='rating.csv')

In [3]:
# Order the catalogue by anime_id and keep only titles with more than
# 10000 members.
anime = (
    anime
    .sort_values('anime_id')
    .loc[lambda frame: frame['members'] > 10000]
)

In [4]:
# only consider slice of life anime
# `na=False` makes rows with a missing genre evaluate to False directly,
# instead of comparing the tri-state (True/False/NaN) result to True.
anime = anime[anime['genre'].str.contains('Slice of Life', na=False)]

In [5]:
# aggregate people's votes -> number of rating rows per user
# (used below to find the users who voted the most).
#
# The index is deliberately NOT converted to str: `user_id` must keep the
# same integer dtype as rating['user_id'], otherwise the later
# `rating['user_id'].isin(top_counts['user_id'])` compares ints against
# strings and only matches if pandas happens to coerce them.
counts = (
    rating.groupby('user_id')['anime_id']
    .count()
    .rename('counts')
    .reset_index()
)

In [6]:
# Keep only the rating rows with a score of 7 or higher.
is_high_rating = rating['rating'].ge(7)
rating = rating.loc[is_high_rating]

# Users with at least 80 rows in `counts` (author's note: ~20k users).
is_frequent_rater = counts['counts'].ge(80)
top_counts = counts.loc[is_frequent_rater]

In [7]:
# Restrict the rating table to rows written by the frequent raters selected
# in `top_counts`, then preview the first rows.
from_top_user = rating['user_id'].isin(top_counts['user_id'])
rating_ = rating.loc[from_top_user]
rating_.head()

Unnamed: 0,user_id,anime_id,rating
47,1,8074,10
81,1,11617,10
83,1,11757,10
101,1,15451,10
156,3,20,8


In [8]:
# Author's note: len(rating_['anime_id']) was 2653526 when this was written.

TIME_STEP = 2000  # anime_ids between time steps. min = 17; max = 34525.

# need to group by anime_id: anime_id is used as a proxy for time, so ids in
# [k * TIME_STEP, (k + 1) * TIME_STEP) all fall into time step k.
# Floor division is vectorised and yields the same integers as dividing and
# truncating row by row with `.apply(int)` for these non-negative ids.
rating_ = rating_.assign(time=rating_['anime_id'] // TIME_STEP)

In [9]:
# filtered rating database - include only non-filtered anime
rating_ = rating_[rating_['anime_id'].isin(anime['anime_id'])]

# randomly sample (up to) 200 users, without replacement
# NOTE(review): no seed is set in this notebook, so the sampled users (and
# the random fill-ins below) differ between runs.
user_ids = rating_['user_id'].unique()
user_ids = np.random.choice(user_ids, min(200, len(user_ids)), replace=False)

# Map every retained anime_id to a dense 0-based index.
# The loop variable must not be called `anime`: that would overwrite the
# `anime` DataFrame and break this cell (and any other) on re-run.
animes = anime['anime_id'].unique()
anime_id_to_index = {anime_id: index for index, anime_id in enumerate(animes)}

# build an "observation sequence": observation_seqs[u][t] is the list of
# anime indices that user u has in time step t.
# NOTE(review): `time` values go up to and including max_time, but
# range(max_time) stops at max_time - 1, so rows in the last time step are
# never read - confirm whether that is intended.
max_time = rating_['time'].max()
# Every cell is assigned in the loop below, so no pre-fill pass is needed.
observation_seqs = np.empty(shape=(len(user_ids), max_time), dtype=object)
for u, user_id in enumerate(tqdm(user_ids)):
    rating_per_user = rating_[rating_['user_id'] == user_id]
    for t in range(max_time):
        items = (
            rating_per_user.loc[rating_per_user['time'] == t, 'anime_id']
            .map(anime_id_to_index)
            .tolist()
        )
        if not items:
            # fill in blanks with random anime -> TODO
            items = [anime_id_to_index[np.random.choice(animes)]]
        observation_seqs[u][t] = items

100%|██████████| 200/200 [00:04<00:00, 43.95it/s]


In [10]:
# hold-out set: for every user, the item list of the final time step
time_t_anime = [user_seq[-1] for user_seq in observation_seqs]

In [11]:
# Drop the final (held-out) time step from every user's sequence.
# np.resize must not be used for this: it flattens the array and refills the
# new shape in C order, so shrinking the column count shifts items across
# user rows instead of removing the last column. Column slicing keeps each
# row aligned with its user; .copy() returns a new array like np.resize did.
observation_seqs = observation_seqs[:, :max_time - 1].copy()

In [12]:
# average anime watched per time-step:
# first the mean list length within each user's sequence,
# then the mean of those per-user values.
per_user_average = [
    sum(len(step_items) for step_items in user_seq) / len(user_seq)
    for user_seq in observation_seqs
]
total = sum(per_user_average) / len(observation_seqs)
total

1.9026666666666663

In [13]:
# Model dimensions: one item per retained anime, 15 states.
n_items = len(animes)
n_states = 15

# NOTE(review): the meaning of HMM's third positional argument
# (n_items + 5 here) is not visible from this notebook - confirm in hmm.py.
hmm = HMM(n_items, n_states, n_items + 5)

# Fit the model on the per-user observation sequences.
hmm.baum_welch(observation_seqs)

  0%|          | 0/20 [00:00<?, ?it/s]Process ForkPoolWorker-2:
Process ForkPoolWorker-1:
Process ForkPoolWorker-5:
Process ForkPoolWorker-3:
Process ForkPoolWorker-4:
Process ForkPoolWorker-7:
Process ForkPoolWorker-6:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/anaconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/anaconda3/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
Traceback (most recent call last):
Traceback (most recent call last):
  File "/anaconda3/lib/python3.6/multiprocessing/pool.py", line 119, in worker
    result = (True, func(*args, **kwds))
  File "/anaconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/anaconda3/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/anaconda3/lib/python3.6/multiprocessing/pool.py", line 44, in

  File "/Users/kenneth/Desktop/hmm-redux/hmm.py", line 121, in <listcomp>
    for k in range(n_states)
  File "/Users/kenneth/Desktop/hmm-redux/hmm.py", line 121, in <listcomp>
    for k in range(n_states)
  File "/anaconda3/lib/python3.6/site-packages/scipy/stats/_multivariate.py", line 2797, in __call__
    return multinomial_frozen(n, p, seed)
  File "/anaconda3/lib/python3.6/site-packages/scipy/stats/_multivariate.py", line 2789, in __init__
    doccer.docformat(self.__doc__, multinomial_docdict_params)
  File "/Users/kenneth/Desktop/hmm-redux/hmm.py", line 75, in emission_prob
    multinomial_term =  multinomial.pmf(item_counts)
  File "/anaconda3/lib/python3.6/site-packages/scipy/stats/_multivariate.py", line 2855, in _logpmf
    return gammaln(n+1) + np.sum(xlogy(x, p) - gammaln(x+1), axis=-1)

  File "/anaconda3/lib/python3.6/site-packages/scipy/misc/doccer.py", line 60, in docformat
    lines = dstr.expandtabs().splitlines()
  File "/Users/kenneth/Desktop/hmm-redux/hmm.py", li

KeyboardInterrupt: 

In [None]:
# Display the sum of entry 2 of the model's `A` attribute.
# NOTE(review): presumably `A` is the state-transition matrix, so this row
# should sum to ~1.0 - confirm against hmm.py.
hmm.A[2].sum()