In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

from hmm import HMM

In [2]:
anime = pd.read_csv('anime.csv')
rating = pd.read_csv('rating.csv')

In [3]:
anime = anime.sort_values(by='anime_id')
anime = anime[anime['members'] > 10000]

In [4]:
# only consider slice of life anime
anime = anime[anime['genre'].str.contains('Slice of Life') == True]

In [5]:
# aggregrate people's votes -> counts of which users voted more
counts = rating.groupby(by='user_id').agg('count')
counts = counts.rename(index=str, columns={'anime_id': 'counts'})
del counts['rating']
counts = counts.reset_index()

In [6]:
# only consider anime rated >= 7
rating = rating[rating['rating'] >= 7]

# people who rated more than n anime
top_counts = counts[(counts['counts'] >= 80)]  # ~20k users

In [7]:
# filter only ratings from people who satisfy the above criteria
rating_ = rating[rating['user_id'].isin(top_counts['user_id'])]  # filtered rating database - include top users only
rating_.head()

Unnamed: 0,user_id,anime_id,rating
47,1,8074,10
81,1,11617,10
83,1,11757,10
101,1,15451,10
156,3,20,8


In [8]:
len(rating_['anime_id'])  # 2653526

TIME_STEP = 5000  # anime_ids between time steps. min = 17; max = 34525.

# need to group by anime_id
rating_ = rating_.assign(time=rating_['anime_id']/TIME_STEP)
rating_['time'] = rating_['time'].apply(lambda x: int(x))

In [9]:
# filtered rating database - include only non-filtered anime
rating_ = rating_[rating_['anime_id'].isin(anime['anime_id'])]

# randomly sample 1000 users
user_ids = rating_['user_id'].unique()
user_ids = np.random.choice(user_ids, 1000, replace=False)
animes = anime['anime_id'].unique()
anime_id_to_index = {}
for i in range(len(animes)):
    anime = animes[i]
    anime_id_to_index[anime] = i
    
# build an "observation sequence"
max_time = rating_['time'].max()
observation_seqs = np.zeros(shape=(len(user_ids), max_time), dtype=object)
for u in range(len(user_ids)):
    for t in range(max_time):  
        observation_seqs[u][t] = []
for u in tqdm(range(len(user_ids))):
    rating_per_user = rating_[rating_['user_id'] == user_ids[u]]
    for t in range(max_time):
        observation_seqs[u][t] = \
            rating_per_user[rating_per_user['time'] == t]['anime_id'].map(anime_id_to_index).tolist()
        if len(observation_seqs[u][t]) == 0:
             # fill in blanks with random anime -> TODO according to NBD
            observation_seqs[u][t] = [anime_id_to_index[np.random.choice(animes)]] 

100%|██████████| 1000/1000 [00:09<00:00, 109.97it/s]


In [10]:
# hold-out time t anime
time_t_anime = []
for u in range(len(observation_seqs)):
    time_t_anime.append(observation_seqs[u][-1])

In [11]:
observation_seqs = np.resize(observation_seqs, (len(user_ids), max_time-1))

In [None]:
# average anime watched per time-step
total = 0
for seq in observation_seqs:
    total += sum([len(x) for x in seq])/len(seq)
total /= len(observation_seqs)
total

4.2786

In [None]:
n_states = 15
n_items = len(animes)
hmm = HMM(n_items, n_states, n_items + 5)

hmm.baum_welch(observation_seqs, 25)

 12%|█▏        | 3/25 [01:05<07:59, 21.81s/it]

In [None]:
titles = [
#     ('Naruto', 20),
#     ('Fairy Tail', 6702),
#     ('Kami nomi zo Shiru Sekai', 8525),
#     ('Kami nomi zo Shiru Sekai II', 10080),
#     ('Kami nomi zo Shiru Sekai: Megami-hen', 16706),
#     ('Shigatsu wa Kimi no Uso', 23273),
    ('Hyouka', 12189),
#     ('Shinsekai yori', 13125),
#     ('Ore monogatari', 28297),
#     ('Gintama', 918),
    ('Toradora!', 4224),
    ('Nichijou', 10165),
    ('Gochuumon wa Usagi desu ka?', 21273),
    ('Non Non Biyori', 17549),
#     ('Yahari Ore no Seishun Love Comedy wa Machigatteiru.', 14813),
    ('K-On!', 5680),
    ('K-On!!', 7791),
    ('Gochuumon wa Usagi desu ka??', 29787),
    ('K-On! Movie', 9617),
    ('Tamako Market', 16417),
    ('Tamako Love Story', 21647),
#     ('Yahari Ore no Seishun Love Comedy wa Machigatteiru. Zoku', 23847),
#     ('Hibike! Euphonium', 27989),
    ('Non Non Biyori Repeat', 23623),
    ('Kiniro Mosaic', 16732),
    ('Hello!! Kiniro Mosaic', 23269),
    ('So Ra No Wo To', 6802),
    ('Lucky*Star', 1887)
]
title_ids = np.array([[anime_id_to_index[x]] for _, x in titles])

In [None]:
alphas, _ = hmm.forward(hmm.n_states, hmm.a, hmm.b, hmm.theta, hmm.pi, hmm.A, title_ids)

In [None]:
results = hmm.item_rank(alphas[:-1])

In [None]:
results

In [None]:
anime = pd.read_csv('anime.csv')

SoL_anime = anime[(anime['genre'].str.contains('Slice of Life') == True)]
SoL_animeids = SoL_anime['anime_id'].tolist()

results_name = []
for result in results:
    anime_id = animes[result]
    if anime_id in SoL_animeids:
        results_name.append(SoL_anime[SoL_anime['anime_id'] == anime_id]['name'].to_string(index=False))

In [None]:
results_name

In [None]:
results_name.index('K-On!!')

In [None]:
results_name.index('K-On!')

In [None]:
results_name.index('Lucky☆Star')

In [None]:
results_name.index('Kiniro Mosaic')

In [None]:
results_name.index('Hello!! Kiniro Mosaic')