In [1]:
import pandas as pd
import numpy as np
from pprint import pprint
import importlib

In [2]:
anime = pd.read_csv('anime.csv')
rating = pd.read_csv('rating.csv')

In [3]:
# remove ecchi anime
anime = anime[anime['genre'].str.contains('Ecchi') == False]
anime = anime[anime['genre'].str.contains('Slice of Life') == True]  # only SoL allowed.

In [4]:
rating = rating[rating['rating'] >= 9]  # only include animes rated 9 or above

In [5]:
counts = rating.groupby(by='user_id').agg('count')
counts = counts.rename(index=str, columns={'anime_id': 'counts'})
del counts['rating']
counts = counts.reset_index()

In [6]:
counts.describe()

Unnamed: 0,counts
count,67754.0
mean,32.615211
std,46.37916
min,1.0
25%,6.0
50%,18.0
75%,41.0
max,1469.0


In [7]:
# top_counts = counts
# TODO: try including more users in the sample - casuals too!
top_counts = counts[(counts['counts'] >= 75)]
len(top_counts)

7325

In [8]:
rating_ = rating[rating['user_id'].isin(top_counts['user_id'])]  # filtered rating database - include top users only
rating_.head()

Unnamed: 0,user_id,anime_id,rating
4392,46,1,10
4398,46,164,10
4399,46,199,10
4400,46,202,10
4402,46,242,10


In [9]:
len(rating_['anime_id'])  # 2653526

TIME_STEP = 2000  # anime_ids between time steps

# need to group by anime_id
rating_ = rating_.assign(time=rating_['anime_id']/TIME_STEP)
rating_['time'] = rating_['time'].apply(lambda x: int(x))

In [10]:
anime_sort = anime.sort_values(by='anime_id')
anime_sort = anime_sort[anime_sort['members'] > 10000]

In [11]:
anime_sort.describe()

Unnamed: 0,anime_id,rating,members
count,402.0,400.0,402.0
mean,13841.330846,7.50905,73216.231343
std,10781.173876,0.635221,89512.522161
min,17.0,4.8,10069.0
25%,3186.75,7.11,19359.0
50%,12070.0,7.55,37928.5
75%,22658.0,7.9325,91666.75
max,34277.0,9.06,633817.0


In [12]:
# rating_ = rating_[rating_['user_id'] < 3000]
rating_ = rating_[rating_['anime_id'].isin(anime_sort['anime_id'])]  # filtered rating database - include only non-filtered anime

user_ids = rating_['user_id'].unique()
max_time = rating_['time'].max()

# random sampling
user_ids = np.random.choice(user_ids, 200, replace=False)

# print(len(user_ids))

animes = anime_sort['anime_id'].unique()
anime_id_to_index = {}
for i in range(len(animes)):
    anime = animes[i]
    anime_id_to_index[anime] = i
    
print(len(animes))

# observation_seqs = [[[] for t in range(time)] for u in user_id]
observation_seqs = np.zeros(shape=(len(user_ids), max_time), dtype=object)

for u in range(len(user_ids)):
    for t in range(max_time):  
        observation_seqs[u][t] = []
for u in range(len(user_ids)):
    if u%100 == 0:
        print('u: {}'.format(u))
    rating_per_user = rating_[rating_['user_id'] == user_ids[u]]
#     rating_per_user = rating_[rating_['user_id'] == user_ids[u]].sample(frac=1)  # shuffle
    for t in range(max_time):
        observation_seqs[u][t] = rating_per_user[rating_per_user['time'] == t]['anime_id'].map(anime_id_to_index).tolist()
        if len(observation_seqs[u][t]) == 0:
            observation_seqs[u][t] = [anime_id_to_index[np.random.choice(animes)]]  # fill in blanks with a random anime.

# observation_seqs

402
u: 0
u: 100


In [13]:
time_t_anime = []
for u in range(len(observation_seqs)):
    time_t_anime.append(observation_seqs[u][-1])

In [14]:
len(observation_seqs[0])

16

In [15]:
observation_seqs_2 = np.resize(observation_seqs, (len(user_ids), max_time-1))

In [16]:
observation_seqs_2[0]

array([[29, 71], [230], [110], [133], [324], [300], [202, 204], [281],
       [245], [382], [297], [301], [319], [10], [117]], dtype=object)

In [17]:
total = 0
for seq in observation_seqs_2:
    total += sum([len(x) for x in seq])/len(seq)
total /= len(observation_seqs_2)
total

1.7556666666666652

In [18]:
K = 20  # no of states
I = len(animes)  # no of items

print('K: {}, I: {}'.format(K, I))

import random

pi = [random.uniform(0, 1) for k in range(K)]  # starting probabilities
total = sum(pi)
pi = [entry/total for entry in pi]  # normalisation to sum to 1

A = []  # transition probabilities
for i in range(K):
    A_i = [random.uniform(0, 1) for k in range(K)]
    total = sum(pi)
    A.append([entry/total for entry in A_i])

# NBD parameters per state
a = [random.uniform(3, 9) for k in range(K)]
b = [random.uniform(0, 10) for k in range(K)]

# multinomial probabilities per state
theta_per_state = []
for k in range(K):
    theta_i_K = [random.uniform(0, 1) for i in range(I)]
    total = sum(theta_i_K)
    theta_per_state.append([entry/total for entry in theta_i_K])

theta = np.zeros(shape=(I, K))
for i in range(I):
    for k in range(K):
        theta[i][k] = theta_per_state[k][i]

K: 20, I: 402


In [21]:
import HMM
importlib.reload(HMM)
from HMM import hmm
a, b, theta, pi, A, alphas = hmm.baum_welch(a, b, theta, pi, A, observation_seqs, prior_weight_items=len(theta)+100,
                                            prior_weight_states=(len(theta[0])-1)*len(observation_seqs)+100)


Iteration 1
forward-backward
2017-07-30 19:34:18.218355
Expectation
gamma
2017-07-30 19:34:31.253390
xi
2017-07-30 19:34:31.392333
Maximisation
pi
2017-07-30 19:34:39.042969
A
2017-07-30 19:34:39.113005
theta
2017-07-30 19:34:39.136446
a, b
2017-07-30 19:34:40.828785
delta: 284.2861777057777

Iteration 2
forward-backward
2017-07-30 19:34:42.992885
Expectation
gamma
2017-07-30 19:34:56.488951
xi
2017-07-30 19:34:56.628168
Maximisation
pi
2017-07-30 19:35:04.608159
A
2017-07-30 19:35:04.675994
theta
2017-07-30 19:35:04.696357
a, b
2017-07-30 19:35:06.427984
delta: 38.360428730173666

Iteration 3
forward-backward
2017-07-30 19:35:08.667081
Expectation
gamma
2017-07-30 19:35:22.249898
xi
2017-07-30 19:35:22.391457
Maximisation
pi
2017-07-30 19:35:31.239904
A
2017-07-30 19:35:31.314994
theta
2017-07-30 19:35:31.338918
a, b
2017-07-30 19:35:32.857518
delta: 25.736743137605266

Iteration 4
forward-backward
2017-07-30 19:35:35.090104
Expectation
gamma
2017-07-30 19:35:49.176678
xi
2017-07-30 

In [22]:
len(time_t_anime[-1])

1

In [28]:
no = 10

recall_numerator = []
recall_denominator = []
precision_numerator = []
precision_denominator = []
corrects = []
for u in range(len(time_t_anime)):
    recommendations = hmm.item_rank(u, a, b, alphas, theta, A)[:no]
    correct = [x for x in time_t_anime[u] if x in recommendations]
    total = len(time_t_anime[u])
    
    corrects.append(correct)
    
    recall_numerator.append(len(correct))
    recall_denominator.append(total)
    
    precision_numerator.append(len(correct))
    precision_denominator.append(no)

### Recommendations for an untrained user
To get recommendations for an untrained user, run the forward algorithm over an untrained user's observation sequence, then append it to alphas.

Call hmm.item_rank with the new user index. The only time the user index (`u`) is used is for indexing `alphas`.

What we're essentially doing is we're calculating the user's probability distribution over the states at time t+1 using the HMM whose parameters we've estimated using the Baum-Welch algorithm.

In [29]:
# store params in JSON file

import json

params = {
    'a': a,
    'b': b,
    'theta': theta.tolist(),
    'pi': pi,
    'alphas': alphas.tolist(),
    'observation_seqs': observation_seqs.tolist(),
    'user_ids': user_ids.tolist(),
    'animes': animes.tolist()
}

with open('params_experimentation.json', 'w') as outfile:
#     json.dump(params, outfile, indent=4)
    json.dump(params, outfile)

In [115]:
titles = [
#     ('Naruto', 20),
#     ('Fairy Tail', 6702),
#     ('Kami nomi zo Shiru Sekai', 8525),
#     ('Kami nomi zo Shiru Sekai II', 10080),
#     ('Kami nomi zo Shiru Sekai: Megami-hen', 16706),
#     ('Shigatsu wa Kimi no Uso', 23273),
    ('Hyouka', 12189),
#     ('Shinsekai yori', 13125),
#     ('Ore monogatari', 28297),
#     ('Gintama', 918),
    ('Toradora!', 4224),
    ('Nichijou', 10165),
    ('Gochuumon wa Usagi desu ka?', 21273),
    ('Non Non Biyori', 17549),
#     ('Yahari Ore no Seishun Love Comedy wa Machigatteiru.', 14813),
    ('K-On!', 5680),
    ('K-On!!', 7791),
    ('Gochuumon wa Usagi desu ka??', 29787),
    ('K-On! Movie', 9617),
    ('Tamako Market', 16417),
    ('Tamako Love Story', 21647),
#     ('Yahari Ore no Seishun Love Comedy wa Machigatteiru. Zoku', 23847),
#     ('Hibike! Euphonium', 27989),
    ('Non Non Biyori Repeat', 23623),
    ('Kiniro Mosaic', 16732),
    ('Hello!! Kiniro Mosaic', 23269),
    ('So Ra No Wo To', 6802),
    ('Lucky*Star', 1887)
]

In [116]:
# convert to list
titles_ = sorted([x[1] for x in titles])

In [117]:
titles_ = [[anime_id_to_index[x]] for x in titles_]

In [118]:
# titles_ = list(group(titles_, 3))

In [119]:
titles_ = np.array(titles_)

In [120]:
titles_

array([[ 80],
       [110],
       [125],
       [136],
       [145],
       [163],
       [173],
       [202],
       [242],
       [243],
       [250],
       [286],
       [294],
       [310],
       [314],
       [345]])

In [121]:
alpha_k, scaling_factor_k = hmm.forward(a, b, theta, pi, A, titles_)

In [122]:
len(alphas)

200

In [123]:
len(alphas[0])

16

In [124]:
len(alpha_k)

16

In [125]:
alphas_ = alphas.tolist()
alphas_.append(alpha_k)
alphas_ = np.array(alphas_)

In [126]:
len(alphas_)

201

In [127]:
results = hmm.item_rank(len(alphas_)-1, a, b, alphas_, theta, A)

In [128]:
anime = pd.read_csv('anime.csv')

SoL_anime = anime[(anime['genre'].str.contains('Slice of Life') == True)]
SoL_animeids = SoL_anime['anime_id'].tolist()

results_name = []
for result in results:
    anime_id = animes[result]
    if anime_id in SoL_animeids:
#         results_name.append( (SoL_anime[SoL_anime['anime_id'] == anime_id]['name'].to_string(index=False), anime[anime['anime_id'] == anime_id]['rating'].to_string(index=False)) )
        results_name.append(SoL_anime[SoL_anime['anime_id'] == anime_id]['name'].to_string(index=False))

In [129]:
titles_test = sorted(SoL_anime[SoL_anime['anime_id'] == x[1]]['name'].to_string(index=False) for x in titles)

In [131]:
# Sort a list of SoL anime using our rankings
sorted(titles_test, key=lambda x: results_name.index(x))

['Gochuumon wa Usagi Desu ka??',
 'Toradora!',
 'Non Non Biyori',
 'Hyouka',
 'K-On!!',
 'Kiniro Mosaic',
 'Nichijou',
 'Non Non Biyori Repeat',
 'K-On! Movie',
 'K-On!',
 'Tamako Market',
 'Hello!! Kiniro Mosaic',
 'Lucky☆Star',
 'Tamako Love Story',
 'Gochuumon wa Usagi Desu ka?',
 'So Ra No Wo To']

In [97]:
results_name.index(('Kiniro Mosaic', '7.41'))

70

In [98]:
results_name.index(('K-On!!', '8.14'))

31

In [99]:
results_name.index(('K-On!', '7.87'))

102

In [101]:
results_name.index(('Lucky☆Star', '7.87'))

165

In [102]:
results_name.index(('Hello!! Kiniro Mosaic', '7.63'))

164

In [92]:
results_name_sorted = sorted(results_name[:1000], key=lambda x: float(x[1]), reverse=True)

In [93]:
results_name_sorted

[('Clannad: After Story', '9.06'),
 ('Mushishi Zoku Shou 2nd Season', '8.88'),
 ('Ookami Kodomo no Ame to Yuki', '8.84'),
 ('Mushishi Zoku Shou', '8.8'),
 ('Mushishi', '8.78'),
 ('Great Teacher Onizuka', '8.77'),
 ('Mushishi Zoku Shou: Suzu no Shizuku', '8.75'),
 ('Natsume Yuujinchou Shi', '8.75'),
 ('Natsume Yuujinchou San', '8.67'),
 ('Mushishi Special: Hihamukage', '8.66'),
 ('Aria The Origination', '8.64'),
 ('Zoku Natsume Yuujinchou', '8.64'),
 ('Ano Hi Mita Hana no Namae wo Bokutachi wa Mada...', '8.62'),
 ('Uchuu Kyoudai', '8.59'),
 ('Usagi Drop', '8.56'),
 ('Nana', '8.55'),
 ('Mob Psycho 100', '8.55'),
 ('Mushishi Zoku Shou: Odoro no Michi', '8.54'),
 ('Doukyuusei (Movie)', '8.53'),
 ('Chihayafuru 2', '8.52'),
 ('Nichijou', '8.52'),
 ('Barakamon', '8.5'),
 ('Romeo no Aoi Sora', '8.47'),
 ('Nodame Cantabile', '8.46'),
 ('Toradora!', '8.45'),
 ('Natsume Yuujinchou', '8.42'),
 ('Sakurasou no Pet na Kanojo', '8.4'),
 ('Gin no Saji 2nd Season', '8.36'),
 ('Little Busters!: Refrain',

In [None]:
anime = pd.read_csv('anime.csv')

# SoL_anime = anime[anime['genre'].str.contains('Slice of Life') == True]
# SoL_animeids = SoL_anime['anime_id'].tolist()

results_name = []
for result in results:
    anime_id = animes[result]
    results_name.append(anime[anime['anime_id'] == anime_id]['name'].to_string(index=False))

In [None]:
results_name[:100]

In [None]:
anime[anime['genre'].str.contains('Slice of Life') == True]

In [None]:
anime[anime['name'] == 'Kiniro Mosaic']

In [None]:
anime[anime['name'] == 'Koe no Katachi']

In [None]:
a_b = []
for i in range(len(a)):
    a_b.append((a[i], b[i]))
    
sorted(a_b)

In [132]:
P = sum(precision_numerator) / sum(precision_denominator)
R = sum(recall_numerator) / sum(recall_denominator)
F = 2/(1/P + 1/R)
print('P: {}, R: {}, F: {}'.format(P, R, F))

P: 0.0825, R: 0.5789473684210527, F: 0.14442013129102846
