In [1]:
import pandas as pd
import scipy.sparse as sp
import numpy as np
import json
import torch

In [2]:
from sklearn.metrics.pairwise import cosine_similarity

def get_similar_keyphrases(input, m_kav_mean):    
    cs = cosine_similarity(m_kav_mean[input].reshape(1, -1),m_kav_mean)[0]
    cands = cs.argsort()[::-1][1:6]
    return cands

def get_similar_keyphrases_movies(input, m_kav_mean, m_items):
    cs = cosine_similarity(m_kav_mean[input].reshape(1, -1),m_items)[0]
    cands = cs.argsort()[::-1][:5]
    return cands


In [3]:
df_tags = pd.read_csv('./data/ml10/fold0_valid/fold0/tr_tags.csv')

rows, cols = df_tags.item, df_tags.tag
values = np.ones(len(df_tags))

m_item_keyphrase= sp.csr_matrix((values, (rows, cols)), dtype='float64')

In [5]:
with open('./data/ml10/tag_id_dict.json') as f:
    tag_id_dict = json.load(f)
with open('./data/ml10/title_id_dict.json') as f:
    title_id_dict = json.load(f)

id_tag_dict = {id: tag for tag, id in tag_id_dict.items()}
id_title_dict = {id: title for title, id in title_id_dict.items()}

In [6]:
PATH = './saves/ml10/VAE_beta_multilayer.pt'
model = torch.load(PATH, map_location=torch.device('cpu'))
item_embeddings = model.decoder.weight.detach().numpy()
item_bias = model.decoder.bias.detach().numpy()

In [7]:
np.quantile(item_bias,0.99)

0.7199750208854672

In [8]:
from utils.KAVgenerator import KAVgenerator
k = KAVgenerator(m_item_keyphrase,item_embeddings, 1)
keyphrase_embeddings = k.get_all_mean_kav(20, 100)

1%|          | 1/164 [00:00<00:21,  7.50it/s]Generate Keyphrase Activation Vector
100%|██████████| 164/164 [00:20<00:00,  8.07it/s](164, 100, 150)



In [89]:
np.mean(np.linalg.norm(item_embeddings, ord=2, axis=1, keepdims=True))

0.8881551

In [90]:
np.mean(np.linalg.norm(keyphrase_embeddings, ord=2, axis=1, keepdims=True))

0.7861438428950553

In [91]:
outputs = []

In [92]:
id = tag_id_dict['sci-fi']
cands = get_similar_keyphrases(id, keyphrase_embeddings)
output = []
for kp in cands:
    output.append(id_tag_dict[kp])
print(output)
outputs.append(['sci-fi'] + output)

['space', 'future', 'action', 'robots', 'dystopia']


In [93]:
id = tag_id_dict['world war ii']
cands = get_similar_keyphrases(id, keyphrase_embeddings)
output = []
for kp in cands:
    output.append(id_tag_dict[kp])
print(output)
outputs.append(['world war ii'] + output)

['history', 'historical', 'true story', 'oscar (best actor)', 'war']


In [94]:
id = tag_id_dict['fantasy']
cands = get_similar_keyphrases(id, keyphrase_embeddings)
output = []
for kp in cands:
    output.append(id_tag_dict[kp])
print(output)
outputs.append(['fantasy'] + output)

['adventure', 'magic', 'super-hero', 'superhero', 'epic']


In [95]:
id = tag_id_dict['drama']
cands = get_similar_keyphrases(id, keyphrase_embeddings)
output = []
for kp in cands:
    output.append(id_tag_dict[kp])
print(output)

['oscar (best picture)', 'war', 'oscar (best actor)', 'interesting', 'true story']


In [96]:
id = tag_id_dict['alfred hitchcock']
cands = get_similar_keyphrases(id, keyphrase_embeddings)
output = []
for kp in cands:
    output.append(id_tag_dict[kp])
print(output)
outputs.append(['alfred hitchcock'] + output)

['hitchcock', 'classic', 'film noir', 'oscar (best picture)', 'black and white']


In [97]:
id = tag_id_dict['dark comedy']
cands = get_similar_keyphrases(id, keyphrase_embeddings)
output = []
for kp in cands:
    output.append(id_tag_dict[kp])
print(output)
outputs.append(['dark comedy'] + output)

['quirky', 'black comedy', 'surreal', 'satire', 'nonlinear']


In [98]:
id = tag_id_dict['hilarious']
cands = get_similar_keyphrases(id, keyphrase_embeddings)
output = []
for kp in cands:
    output.append(id_tag_dict[kp])
print(output)
outputs.append(['hilarious'] + output)

['parody', 'satire', 'comedy', 'spoof', 'funny']


In [99]:
id = tag_id_dict['comedy']
cands = get_similar_keyphrases(id, keyphrase_embeddings)
output = []
for kp in cands:
    output.append(id_tag_dict[kp])
print(output)
outputs.append(['comedy'] + output)

['funny', 'parody', 'hilarious', 'jim carrey', 'stupid']


In [100]:
id = tag_id_dict['romance']
cands = get_similar_keyphrases(id, keyphrase_embeddings)
output = []
for kp in cands:
    output.append(id_tag_dict[kp])
print(output)
outputs.append(['romance'] + output)

['girlie movie', 'chick flick', 'cute', 'family', 'new york city']


In [101]:
id = tag_id_dict['post-apocalyptic']
cands = get_similar_keyphrases(id, keyphrase_embeddings)
output = []
for kp in cands:
    output.append(id_tag_dict[kp])
print(output)
outputs.append(['post-apocalyptic'] + output)

['post apocalyptic', 'zombies', 'horror', 'aliens', 'vampire']


In [102]:
id = tag_id_dict['film noir']
cands = get_similar_keyphrases(id, keyphrase_embeddings)
output = []
for kp in cands:
    output.append(id_tag_dict[kp])
print(output)
outputs.append(['film noir'] + output)

['violent', 'quirky', 'overrated', 'mafia', 'martin scorsese']


In [103]:
id = tag_id_dict['politics']
cands = get_similar_keyphrases(id, keyphrase_embeddings)
output = []
for kp in cands:
    output.append(id_tag_dict[kp])
print(output)
outputs.append(['politics'] + output)

['documentary', 'based on a true story', 'africa', 'sad', 'racism']


In [104]:
df = pd.DataFrame(outputs).set_index(0)
df.index.name = None
df

Unnamed: 0,1,2,3,4,5
sci-fi,space,future,action,robots,dystopia
world war ii,history,historical,true story,oscar (best actor),war
fantasy,adventure,magic,super-hero,superhero,epic
alfred hitchcock,hitchcock,classic,film noir,oscar (best picture),black and white
dark comedy,quirky,black comedy,surreal,satire,nonlinear
hilarious,parody,satire,comedy,spoof,funny
comedy,funny,parody,hilarious,jim carrey,stupid
romance,girlie movie,chick flick,cute,family,new york city
post-apocalyptic,post apocalyptic,zombies,horror,aliens,vampire
film noir,violent,quirky,overrated,mafia,martin scorsese


In [35]:
outputs = []

In [36]:
id = tag_id_dict['superhero']
cands = get_similar_keyphrases_movies(id, keyphrase_embeddings,item_embeddings)
output = []
for kp in cands:
    output.append(id_title_dict[kp][:-7])
print(output)
outputs.append(['superhero'] + output)

['Spider-Man 2', 'X-Men', 'X2: X-Men United', 'Spider-Man', 'Left Behind: World at War']


In [37]:
id = tag_id_dict['documentary']
cands = get_similar_keyphrases_movies(id, keyphrase_embeddings,item_embeddings)
output = []
for kp in cands:
    output.append(id_title_dict[kp][:-7])
print(output)
outputs.append(['documentary'] + output)

['Home Page', 'Year of the Dog', 'Distant (Uzak)', 'Children Underground', 'Julien Donkey-Boy']


In [38]:
id = tag_id_dict['sci-fi']
cands = get_similar_keyphrases_movies(id, keyphrase_embeddings,item_embeddings)
output = []
for kp in cands:
    output.append(id_title_dict[kp][:-7])
print(output)
outputs.append(['sci-fi'] + output)

['Star Wars: Episode VI - Return of the Jedi', 'Star Wars: Episode IV - A New Hope (a.k.a. Star Wars)', 'Star Wars: Episode V - The Empire Strikes Back', 'Terminator 2: Judgment Day', 'Star Wars: Episode I - The Phantom Menace']


In [39]:
id = tag_id_dict['marvel']
cands = get_similar_keyphrases_movies(id, keyphrase_embeddings,item_embeddings)
output = []
for kp in cands:
    output.append(id_title_dict[kp][:-7])
print(output)


['Freddy Vs. Jason', 'Mummy Returns, The', 'Blade II', 'Mummy, The', 'Scream 3']


In [40]:
id = tag_id_dict['family']
cands = get_similar_keyphrases_movies(id, keyphrase_embeddings,item_embeddings)
output = []
for kp in cands:
    output.append(id_title_dict[kp][:-7])
print(output)
outputs.append(['family'] + output)

['Beauty and the Beast', 'Babe', 'Sound of Music, The', 'Sense and Sensibility', 'Little Mermaid, The']


In [41]:
id = tag_id_dict['politics']
cands = get_similar_keyphrases_movies(id, keyphrase_embeddings,item_embeddings)
output = []
for kp in cands:
    output.append(id_title_dict[kp][:-7])
print(output)
outputs.append(['politics'] + output)

['Fahrenheit 9/11', 'Inconvenient Truth, An', 'Unprecedented: The 2000 Presidential Election', 'Brokeback Mountain', 'Story of the Weeping Camel, The (Die Geschichte vom weinenden Kamel)']


In [42]:
id = tag_id_dict['fairy tale']
cands = get_similar_keyphrases_movies(id, keyphrase_embeddings,item_embeddings)
output = []
for kp in cands:
    output.append(id_title_dict[kp][:-7])
print(output)
outputs.append(['fairy tale'] + output)

['Beauty and the Beast', 'Toy Story 2', 'Little Mermaid, The', 'Toy Story', 'Aladdin']


In [43]:
id = tag_id_dict['fantasy']
cands = get_similar_keyphrases_movies(id, keyphrase_embeddings,item_embeddings)
output = []
for kp in cands:
    output.append(id_title_dict[kp][:-7])
print(output)
outputs.append(['fantasy'] + output)

['Lord of the Rings: The Return of the King, The', 'Lord of the Rings: The Fellowship of the Ring, The', 'Lord of the Rings: The Two Towers, The', 'Star Wars: Episode VI - Return of the Jedi', 'Star Wars: Episode IV - A New Hope (a.k.a. Star Wars)']


In [44]:
df = pd.DataFrame(outputs).set_index(0)
df.index.name = None
df

Unnamed: 0,1,2,3,4,5
superhero,Spider-Man 2,X-Men,X2: X-Men United,Spider-Man,Left Behind: World at War
documentary,Home Page,Year of the Dog,Distant (Uzak),Children Underground,Julien Donkey-Boy
sci-fi,Star Wars: Episode VI - Return of the Jedi,Star Wars: Episode IV - A New Hope (a.k.a. Sta...,Star Wars: Episode V - The Empire Strikes Back,Terminator 2: Judgment Day,Star Wars: Episode I - The Phantom Menace
family,Beauty and the Beast,Babe,"Sound of Music, The",Sense and Sensibility,"Little Mermaid, The"
politics,Fahrenheit 9/11,"Inconvenient Truth, An",Unprecedented: The 2000 Presidential Election,Brokeback Mountain,"Story of the Weeping Camel, The (Die Geschicht..."
fairy tale,Beauty and the Beast,Toy Story 2,"Little Mermaid, The",Toy Story,Aladdin
fantasy,"Lord of the Rings: The Return of the King, The","Lord of the Rings: The Fellowship of the Ring,...","Lord of the Rings: The Two Towers, The",Star Wars: Episode VI - Return of the Jedi,Star Wars: Episode IV - A New Hope (a.k.a. Sta...


In [45]:
outputs = []

In [46]:
id = tag_id_dict['hitchcock']
cands = get_similar_keyphrases_movies(id, keyphrase_embeddings,item_embeddings)
output = []
for kp in cands:
    output.append(id_title_dict[kp][:-7])
print(output)
outputs.append(['hitchcock'] + output)

['North by Northwest', 'M', 'Notorious', 'Shadow of a Doubt', 'Strangers on a Train']


In [47]:
id = tag_id_dict['tom hanks']
cands = get_similar_keyphrases_movies(id, keyphrase_embeddings,item_embeddings)
output = []
for kp in cands:
    output.append(id_title_dict[kp][:-7])
print(output)
outputs.append(['tom hanks'] + output)

["Mr. Holland's Opus", 'Sleepless in Seattle', 'Apollo 13', 'Forrest Gump', 'Mrs. Doubtfire']


In [48]:
id = tag_id_dict['tom cruise']
cands = get_similar_keyphrases_movies(id, keyphrase_embeddings,item_embeddings)
output = []
for kp in cands:
    output.append(id_title_dict[kp][:-7])
print(output)
outputs.append(['tom cruise'] + output)

['Forrest Gump', 'Mission: Impossible', 'Braveheart', 'Legends of the Fall', 'Shawshank Redemption, The']


In [49]:
id = tag_id_dict['brad pitt']
cands = get_similar_keyphrases_movies(id, keyphrase_embeddings,item_embeddings)
output = []
for kp in cands:
    output.append(id_title_dict[kp][:-7])
print(output)
outputs.append(['brad pitt'] + output)

['Seven (a.k.a. Se7en)', 'Fight Club', 'Gladiator', 'Forrest Gump', 'American History X']


In [50]:
id = tag_id_dict['jim carrey']
cands = get_similar_keyphrases_movies(id, keyphrase_embeddings,item_embeddings)
output = []
for kp in cands:
    output.append(id_title_dict[kp][:-7])
print(output)
outputs.append(['jim carrey'] + output)

['Ace Ventura: When Nature Calls', 'Dumb & Dumber', 'Ace Ventura: Pet Detective', 'Cable Guy, The', 'Happy Gilmore']


In [51]:
id = tag_id_dict['bruce willis']
cands = get_similar_keyphrases_movies(id, keyphrase_embeddings,item_embeddings)
output = []
for kp in cands:
    output.append(id_title_dict[kp][:-7])
print(output)
outputs.append(['bruce willis'] + output)

['Armageddon', 'Gone in 60 Seconds', 'Matrix, The', 'End of Days', 'Swordfish']


In [52]:
df = pd.DataFrame(outputs).set_index(0)
df.index.name = None
df

Unnamed: 0,1,2,3,4,5
hitchcock,North by Northwest,M,Notorious,Shadow of a Doubt,Strangers on a Train
tom hanks,Mr. Holland's Opus,Sleepless in Seattle,Apollo 13,Forrest Gump,Mrs. Doubtfire
tom cruise,Forrest Gump,Mission: Impossible,Braveheart,Legends of the Fall,"Shawshank Redemption, The"
brad pitt,Seven (a.k.a. Se7en),Fight Club,Gladiator,Forrest Gump,American History X
jim carrey,Ace Ventura: When Nature Calls,Dumb & Dumber,Ace Ventura: Pet Detective,"Cable Guy, The",Happy Gilmore
bruce willis,Armageddon,Gone in 60 Seconds,"Matrix, The",End of Days,Swordfish


In [11]:
id_tag_dict
tag_id_dict = {v:k for k, v in id_tag_dict.items()}
tag_id_dict

{'anime': 0,
 'super-hero': 1,
 'woody allen': 2,
 'comic book': 3,
 'samuel l jackson': 4,
 'brad pitt': 5,
 'bruce willis': 6,
 'time travel': 7,
 'crime': 8,
 'serial killer': 9,
 'action': 10,
 'jim carrey': 11,
 'stupid': 12,
 'dystopia': 13,
 'drama': 14,
 'aliens': 15,
 'based on a book': 16,
 'classic': 17,
 'fantasy': 18,
 'sci-fi': 19,
 'space': 20,
 'arnold schwarzenegger': 21,
 'matt damon': 22,
 'oscar (best actor)': 23,
 'oscar (best picture)': 24,
 'ghosts': 25,
 'twist ending': 26,
 'stephen king': 27,
 'tom hanks': 28,
 'interesting': 29,
 'dvd': 30,
 'nicolas cage': 31,
 'magic': 32,
 'morgan freeman': 33,
 'bond': 34,
 'music': 35,
 'gay': 36,
 'comedy': 37,
 'funny': 38,
 'alfred hitchcock': 39,
 'true story': 40,
 'family': 41,
 'film noir': 42,
 'heist': 43,
 'quirky': 44,
 'sequel': 45,
 'memory': 46,
 'psychology': 47,
 'espionage': 48,
 'satire': 49,
 'shakespeare': 50,
 'black comedy': 51,
 'adventure': 52,
 'girlie movie': 53,
 'teen': 54,
 'disney': 55,
 'br

In [12]:
from utils.Dataset import Dataset
dataset = Dataset(data_dir='./data/ml10/fold0_valid/fold0/',load_keyphrases=True)

Read data from ./data/ml10/fold0_valid/fold0/


In [13]:
def get_user_info(user_id, input_matrix):
    user_input = input_matrix[user_id]
    nonzeros = user_input.nonzero()
    items = nonzeros[1]
    ratings = user_input[nonzeros]
    ratings = np.asarray(ratings).reshape(-1)
    sorted_ratings_idx = ratings.argsort()[::-1]
    sorted_items = items[sorted_ratings_idx]
    sorted_ratings = ratings[sorted_ratings_idx]
    return sorted_items, sorted_ratings

def get_user_preds(user_id, input_matrix):
    user_input = input_matrix[user_id]
    mask_index = user_input.nonzero()[1]
    i = torch.FloatTensor(user_input.toarray()).to(torch.device('cpu'))
    with torch.no_grad():
        preds = model.forward(i).cpu().numpy()
    preds = np.asarray(preds).reshape(-1)
    preds[mask_index] = -np.inf
    sorted_pred_items = preds.argsort()[::-1]
    sorted_pred_ratings = preds[sorted_pred_items] 
    return sorted_pred_items, sorted_pred_ratings

def get_mu_cov(user_id, input_matrix, moddel):
    user_input = input_matrix[user_id]
    i = torch.FloatTensor(user_input.toarray()).to(torch.device('cpu'))
    with torch.no_grad():
        mu, logvar = model.get_mu_logvar(i)
    std = model.logvar2std(logvar)
    mu, std = mu.numpy().T, std.numpy()

    return mu, np.diagflat(std*std)

def get_user_preds_using_mu(user_mu, input_matrix, model):
    user_input = input_matrix[user_id]
    mask_index = user_input.nonzero()[1]
    _mu = torch.FloatTensor(user_mu.T)

    with torch.no_grad():
        preds = model.decoder(_mu)
    preds = np.asarray(preds).reshape(-1)
    preds[mask_index] = -np.inf
    sorted_pred_items = preds.argsort()[::-1]
    sorted_pred_ratings = preds[sorted_pred_items] 
    return sorted_pred_items, sorted_pred_ratings

def update_posterior(x, y, S_0, m_0, prec_y):
    S_0_inv = np.linalg.inv(S_0)
    #S_1 = np.linalg.inv(S_0_inv +prec_y * x @ x.T)    
    #print(np.swapaxes(x,-2,-1).shape)
    S_1 = np.linalg.inv(S_0_inv +prec_y * np.matmul(x,np.swapaxes(x,-2,-1)))
    m_1 = S_1 @ (S_0_inv @ m_0 + prec_y * y * x)
    #print(m_1.shape)
    return S_1, m_1


In [14]:
outputs = []
dataset.train_matrix.shape


(69878, 10677)

In [15]:
mask = np.array((m_item_keyphrase[:,37]>0).todense()).squeeze()
mask = np.append(mask, [0]*(dataset.train_matrix.shape[1] - len(mask))).shape

In [16]:
a = np.array([1,2,3])
a[~np.array([1,1,0]).nonzero()[0]]
(1-np.array([1,1,0])).nonzero()[0]

array([2])

In [17]:
# for all users:
    # compute rates
    # update
    # compute updated_rates
    # updated_rates - rates
    # separate and add
# take average

def eval_routine(keyphrase_id, keyphrase_embeddings, train_matrix, model, prec_y):
    x = keyphrase_embeddings[keyphrase_id][:,np.newaxis]
    mask = np.array((m_item_keyphrase[:,keyphrase_id]>0).todense()).squeeze()
    pos_item_ids = mask.nonzero()[0]
    neg_item_ids = (1-mask).nonzero()[0]
    user_id = 243
    mu, cov = get_mu_cov(user_id, train_matrix, model)
    diffs = eval_subroutine(mu, cov, model, x, prec_y)
    a = np.mean(diffs[pos_item_ids])
    b = np.mean(diffs[neg_item_ids])
    print(pos_item_ids)
    print(a - b)
    print(a,b)
    print(np.abs(a-b)/np.abs(b))

def eval_subroutine(mu, cov, model, x, prec_y):
    _mu = torch.FloatTensor(mu.T)
    with torch.no_grad():
        preds = model.decoder(_mu)
    initial_preds = np.asarray(preds).reshape(-1)
    y = np.max(mu.T @ keyphrase_embeddings.T)
    print(y)
    
    cov1, mu1 = update_posterior(x, y, cov, mu, prec_y)
    _mu = torch.FloatTensor(mu1.T)
    with torch.no_grad():
        preds = model.decoder(_mu)
    updated_preds = np.asarray(preds).reshape(-1)
    diffs = updated_preds - initial_preds
    print(mu.T@x, mu1.T@x)
    return diffs
    m_item_keyphrase

eval_routine(49, keyphrase_embeddings, dataset.test_matrix, model, 10)



1.6310777839869757
[[0.22457496]] [[0.8815902]]
[  79  120  126  174  186  221  231  233  305  400  492  618  638  889
  917 1015 1351 1853 2132 2305 2808 2940 3117 3748 5704 6610 7588]
0.22947407
0.08240308 -0.14707099
1.5602946


In [53]:
def eval_routine(keyphrase_id, keyphrase_embeddings, train_matrix, model, prec_y):
    x = keyphrase_embeddings[keyphrase_id][:,np.newaxis]
    user_id = 34
    mu, cov = get_mu_cov(user_id, train_matrix, model)
    diffs = eval_subroutine(mu, cov, model, x, prec_y)
    ranks = diffs.argsort()
    top5 = ranks[-5:][::-1]
    bot5 = ranks[:5]
    return top5, bot5


def eval_subroutine(mu, cov, model, x, prec_y):
    _mu = torch.FloatTensor(mu.T)
    with torch.no_grad():
        preds = model.decoder(_mu)
    initial_preds = np.asarray(preds).reshape(-1)
    y = np.max(mu.T @ keyphrase_embeddings.T)
    initial_preds = mu.T @ keyphrase_embeddings.T

    print(y)
    
    cov1, mu1 = update_posterior(x, y, cov, mu, prec_y)
    updated_preds = np.array(mu1.T @ keyphrase_embeddings.T).squeeze()
    #diffs = np.array((updated_preds - initial_preds)/np.abs(initial_preds)).squeeze()
    #print(diffs)
    return updated_preds


top5, bot5 = eval_routine(0, keyphrase_embeddings, dataset.test_matrix, model, 100)

for i in top5:
    print(id_tag_dict[i])
print("")
for i in bot5:
    print(id_tag_dict[i])

2.258443644457296
historical
world war ii
hitchcock
anime
japan

teen
high school
hilarious
parody
satire


In [10]:
user_id = 100
sorted_items, sorted_ratings = get_user_info(user_id, dataset.train_matrix)
top10_items = sorted_items[:10]
output = []
for item_id in top10_items:
    output.append(id_title_dict[item_id][:-7])
print(output)
outputs.append(['user history'] + output)

sorted_items, sorted_ratings = get_user_preds(user_id, dataset.train_matrix)
top10_items = sorted_items[:10]
output = []

for item_id in top10_items:
    output.append(id_title_dict[item_id][:-7])
print(output)
outputs.append(['initial recs'] + output)
mu, cov = get_mu_cov(user_id, dataset.train_matrix, model)


NameError: name 'dataset' is not defined

In [58]:
sorted_ratings[1]

5.3573

In [59]:
mu.T@keyphrase_embeddings[18][:,np.newaxis]

array([[0.22689884]])

In [60]:
np.quantile(mu.T@(keyphrase_embeddings.T),q=0.01)

-1.4240211514953711

In [61]:
np.quantile(mu.T@(item_embeddings.T),q=0.1)

1.5969892978668212

In [62]:
item_embeddings.shape

(10677, 150)

In [63]:
prec = np.linalg.norm(1/(cov+1e-6))
neg = np.min(mu.T @ keyphrase_embeddings.T)
pos = np.max(mu.T @ keyphrase_embeddings.T)
print(pos, neg)

1.887459740716844 -1.6873800293518968


In [64]:
prec

149499200.0

In [65]:
x = keyphrase_embeddings[18][:,np.newaxis]
y = [[pos]]

cov1, mu1 = update_posterior(x, y, cov, mu, np.array(100))

In [66]:
mu1.T@keyphrase_embeddings[18][:,np.newaxis]

array([[1.41716433]])

In [67]:
sorted_items, sorted_ratings = get_user_preds_using_mu(mu1, dataset.train_matrix, model)
top10_items = sorted_items[:10]
output = []
for item_id in top10_items:
    output.append(id_title_dict[item_id][:-7])
print(output)
outputs.append(['+ fantasy'] + output)

['Lord of the Rings: The Return of the King, The', 'Lord of the Rings: The Two Towers, The', 'Crouching Tiger, Hidden Dragon (Wu hu zang long)', 'Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark)', 'Freaks', 'All About Eve', 'Harry Potter and the Goblet of Fire', 'Heathers', 'Sunset Blvd. (a.k.a. Sunset Boulevard)', 'Eternal Sunshine of the Spotless Mind']


In [68]:
x = keyphrase_embeddings[18][:,np.newaxis]
y = [[neg]]

cov1, mu1 = update_posterior(x, y, cov, mu, np.array(1))

In [69]:
sorted_items, sorted_ratings = get_user_preds_using_mu(mu1, dataset.train_matrix, model)
top10_items = sorted_items[:10]
output = []
for item_id in top10_items:
    output.append(id_title_dict[item_id][:-7])
print(output)
outputs.append(['- fantasy'] + output)

['Swimmer, The', 'Sunrise: A Song of Two Humans', 'Freaks', 'Brokeback Mountain', 'Shoot the Moon', 'Lord of the Rings: The Return of the King, The', 'Night of the Hunter, The', 'Lord of the Rings: The Two Towers, The', 'Ball of Fire', 'Chaos']


In [70]:
df = pd.DataFrame(outputs).set_index(0)
df.index.name = None
df

Unnamed: 0,1,2,3,4,5,6,7,8,9,10
user history,Talk to Her (Hable con Ella),Amores Perros (Love's a Bitch),Rosencrantz and Guildenstern Are Dead,Identity,"Mariachi, El",It Happened One Night,Flirting With Disaster,Once Upon a Time in America,Drugstore Cowboy,Father of the Bride
initial recs,"Swimmer, The",Freaks,Sunrise: A Song of Two Humans,Brokeback Mountain,"Lord of the Rings: The Return of the King, The",Shoot the Moon,"Night of the Hunter, The","Lord of the Rings: The Two Towers, The",Ball of Fire,Sunset Blvd. (a.k.a. Sunset Boulevard)
+ fantasy,"Lord of the Rings: The Return of the King, The","Lord of the Rings: The Two Towers, The","Crouching Tiger, Hidden Dragon (Wu hu zang long)",Raiders of the Lost Ark (Indiana Jones and the...,Freaks,All About Eve,Harry Potter and the Goblet of Fire,Heathers,Sunset Blvd. (a.k.a. Sunset Boulevard),Eternal Sunshine of the Spotless Mind
- fantasy,"Swimmer, The",Sunrise: A Song of Two Humans,Freaks,Brokeback Mountain,Shoot the Moon,"Lord of the Rings: The Return of the King, The","Night of the Hunter, The","Lord of the Rings: The Two Towers, The",Ball of Fire,Chaos


In [71]:
outputs = []

In [72]:
user_id = 300
sorted_items, sorted_ratings = get_user_info(user_id, dataset.train_matrix)
top10_items = sorted_items[:10]
output = []
for item_id in top10_items:
    output.append(id_title_dict[item_id][:-7])

outputs.append(['user history'] + output)
print(output)

sorted_items, sorted_ratings = get_user_preds(user_id, dataset.train_matrix)
top10_items = sorted_items[:10]
output = []
for item_id in top10_items:
    output.append(id_title_dict[item_id][:-7])
outputs.append(['initial recs'] + output)
print(output)

mu, cov = get_mu_cov(user_id, dataset.train_matrix, model)
prec = np.linalg.norm(1/(cov+1e-6))

prec = np.linalg.norm(1/(cov+1e-6))
neg = np.min(mu.T @ keyphrase_embeddings.T)
pos = np.max(mu.T @ keyphrase_embeddings.T)
print(pos, neg)

['Pulp Fiction', 'Battleship Potemkin, The (Bronenosets Potyomkin)', 'Woman Under the Influence, A', 'Apocalypse Now', 'Seven Samurai (Shichinin no samurai)', 'Wings of Desire (Der Himmel über Berlin)', 'Son, The (Le Fils)', 'Stroszek', 'Rio Bravo', 'Wallace & Gromit: The Curse of the Were-Rabbit']
['Pier, The (La Jetée)', '2001: A Space Odyssey', '8 1/2', 'Sunrise: A Song of Two Humans', 'Andrei Rublev (Andrey Rublyov)', 'Gerry', 'Paranoid Park', 'Wild Bunch, The', 'Red River', 'Cyclo (Xich lo)']
2.684969023457633 -2.7016516940258937


In [79]:
x = keyphrase_embeddings[18][:,np.newaxis]
y = [[pos]]

cov1, mu1 = update_posterior(x, y, cov, mu, np.array(1))

sorted_items, sorted_ratings = get_user_preds_using_mu(mu1, dataset.train_matrix, model)
top10_items = sorted_items[:10]
output = []
for item_id in top10_items:
    output.append(id_title_dict[item_id][:-7])
outputs.append(['+ fantasy'] + output)
print(output)



['Pier, The (La Jetée)', '2001: A Space Odyssey', '8 1/2', 'Sunrise: A Song of Two Humans', 'Andrei Rublev (Andrey Rublyov)', 'Gerry', 'Paranoid Park', 'Wild Bunch, The', 'Red River', 'Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb']


In [81]:
x = keyphrase_embeddings[18][:,np.newaxis]
y = [[neg]]

cov1, mu1 = update_posterior(x, y, cov, mu, np.array(10))

sorted_items, sorted_ratings = get_user_preds_using_mu(mu1, dataset.train_matrix, model)
top10_items = sorted_items[:10]
output = []
for item_id in top10_items:
    output.append(id_title_dict[item_id][:-7])
outputs.append(['- fantasy'] + output)
print(output)



['Pier, The (La Jetée)', 'Sunrise: A Song of Two Humans', 'Sorrow and the Pity, The (Chagrin et la pitié, Le)', 'Gerry', '2001: A Space Odyssey', 'Paranoid Park', 'Andrei Rublev (Andrey Rublyov)', '8 1/2', 'Au Hasard Balthazar', 'Fat City']


In [283]:
df = pd.DataFrame(outputs).set_index(0)
df.index.name = None
df

Unnamed: 0,1,2,3,4,5,6,7,8,9,10
user history,Pulp Fiction,"Battleship Potemkin, The (Bronenosets Potyomkin)","Woman Under the Influence, A",Apocalypse Now,Seven Samurai (Shichinin no samurai),Wings of Desire (Der Himmel über Berlin),"Son, The (Le Fils)",Stroszek,Rio Bravo,Wallace & Gromit: The Curse of the Were-Rabbit
initial recs,"Pier, The (La Jetée)",2001: A Space Odyssey,8 1/2,Sunrise: A Song of Two Humans,Andrei Rublev (Andrey Rublyov),Gerry,Paranoid Park,"Wild Bunch, The",Red River,Cyclo (Xich lo)
+ fantasy,2001: A Space Odyssey,"Pier, The (La Jetée)",8 1/2,Dr. Strangelove or: How I Learned to Stop Worr...,"Wild Bunch, The",Touch of Evil,Chinatown,City Lights,"Beauty and the Beast (Belle et la bête, La)",Andrei Rublev (Andrey Rublyov)
- fantasy,"Pier, The (La Jetée)",Sunrise: A Song of Two Humans,2001: A Space Odyssey,8 1/2,Gerry,Andrei Rublev (Andrey Rublyov),Paranoid Park,"Sorrow and the Pity, The (Chagrin et la pitié,...",Fat City,Au Hasard Balthazar


In [274]:
mu.T@keyphrase_embeddings[18][:,np.newaxis]

array([[-0.59178778]])

In [124]:
x = keyphrase_embeddings[37][:,np.newaxis]
y = [[2]]

cov2, mu2 = update_posterior(x, y, cov1, mu1, np.array(np.linalg.norm(1/(cov1+1e-6))))
sorted_items, sorted_ratings = get_user_preds_using_mu(mu2, dataset.train_matrix, model)
top10_items = sorted_items[:10]
output = []
for item_id in top10_items:
    output.append(id_title_dict[item_id])
print(output)



['Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1964)', 'City Lights (1931)', 'Chinatown (1974)', '2001: A Space Odyssey (1968)', 'Godfather, The (1972)', 'Maltese Falcon, The (1941)', 'Annie Hall (1977)', 'Godfather: Part II, The (1974)', 'Double Indemnity (1944)', 'Treasure of the Sierra Madre, The (1948)']


In [405]:
user_id = 77
sorted_items, sorted_ratings = get_user_info(user_id, dataset.train_matrix)
top10_items = sorted_items[:10]
output = []
for item_id in top10_items:
    output.append(id_title_dict[item_id])
print(output)

sorted_items, sorted_ratings = get_user_preds(user_id, dataset.train_matrix)
top10_items = sorted_items[:10]
output = []
for item_id in top10_items:
    output.append(id_title_dict[item_id])
print(output)

mu, cov = get_mu_cov(user_id, dataset.train_matrix, model)


['Pulp Fiction (1994)', 'Star Trek II: The Wrath of Khan (1982)', '8 1/2 (1963)', 'Sling Blade (1996)', 'Dogma (1999)', 'Fight Club (1999)', 'Third Man, The (1949)', 'Lord of the Rings: The Fellowship of the Ring, The (2001)', 'Boat, The (Das Boot) (1981)', 'Fanny and Alexander (Fanny och Alexander) (1982)']
['Godfather, The (1972)', 'Memento (2000)', 'Usual Suspects, The (1995)', 'Face in the Crowd, A (1957)', 'Saving Private Ryan (1998)', 'Mongol (2007)', 'Bad Santa (2003)', 'Terminator 2: Judgment Day (1991)', 'Curious Case of Benjamin Button, The (2008)', 'Aliens (1986)']


In [421]:
x = keyphrase_embeddings[19][:,np.newaxis]
y = [[5.]]

cov1, mu1 = update_posterior(x, y, cov, mu, np.array(100))
cov1
sorted_items, sorted_ratings = get_user_preds_using_mu(mu1, dataset.train_matrix, model)
top10_items = sorted_items[:10]
output = []
for item_id in top10_items:
    output.append(id_title_dict[item_id])
print(output)



['Usual Suspects, The (1995)', 'Memento (2000)', 'Terminator 2: Judgment Day (1991)', 'Aliens (1986)', 'Saving Private Ryan (1998)', 'Godfather, The (1972)', 'X2: X-Men United (2003)', 'Gattaca (1997)', 'Spider-Man 2 (2004)', 'Monty Python and the Holy Grail (1975)']


In [54]:
id_tag_dict

{0: 'anime',
 1: 'super-hero',
 2: 'woody allen',
 3: 'comic book',
 4: 'samuel l jackson',
 5: 'brad pitt',
 6: 'bruce willis',
 7: 'time travel',
 8: 'crime',
 9: 'serial killer',
 10: 'action',
 11: 'jim carrey',
 12: 'stupid',
 13: 'dystopia',
 14: 'drama',
 15: 'aliens',
 16: 'based on a book',
 17: 'classic',
 18: 'fantasy',
 19: 'sci-fi',
 20: 'space',
 21: 'arnold schwarzenegger',
 22: 'matt damon',
 23: 'oscar (best actor)',
 24: 'oscar (best picture)',
 25: 'ghosts',
 26: 'twist ending',
 27: 'stephen king',
 28: 'tom hanks',
 29: 'interesting',
 30: 'dvd',
 31: 'nicolas cage',
 32: 'magic',
 33: 'morgan freeman',
 34: 'bond',
 35: 'music',
 36: 'gay',
 37: 'comedy',
 38: 'funny',
 39: 'alfred hitchcock',
 40: 'true story',
 41: 'family',
 42: 'film noir',
 43: 'heist',
 44: 'quirky',
 45: 'sequel',
 46: 'memory',
 47: 'psychology',
 48: 'espionage',
 49: 'satire',
 50: 'shakespeare',
 51: 'black comedy',
 52: 'adventure',
 53: 'girlie movie',
 54: 'teen',
 55: 'disney',
 56:

In [154]:
model.logvar2std(logvar)

tensor([[0.6065, 0.6066, 0.6065, 0.6909, 0.6065, 1.0357, 0.6321, 0.7306, 0.8367,
         0.9005, 0.6890, 0.6065, 0.6202, 0.6615, 0.6069, 0.6066, 0.7204, 0.6065,
         0.8023, 0.6065, 0.6065, 0.6065, 0.7750, 0.6070, 0.6065, 0.6065, 0.6066,
         0.6558, 1.0224, 0.6065, 0.6390, 0.7475, 0.6065, 0.6097, 0.6065, 0.6976,
         0.6065, 0.7138, 0.9611, 0.6307, 0.6065, 0.6065, 0.6065, 0.6065, 0.7756,
         0.6185, 0.6075, 0.6081, 0.6065, 0.6065]])

In [78]:
sorted_ratings = ratings.argsort()[::-1]
sorted_items = items[sorted_ratings]
sorted_items

array([1782, 4270, 3084, ..., 1858, 2873, 7813], dtype=int32)

In [76]:
arr1inds

array([1115, 2345, 1815, ..., 1158, 1709, 3787])

In [77]:
ratings[arr1inds]

array([5., 5., 5., ..., 1., 1., 1.])

In [171]:
a =np.random.normal(np.zeros(150),0.01)
a = a[:,np.newaxis]

In [176]:
np.quantile(a.T@(keyphrase_embeddings.T),q=0.99)

0.013279970142235526

In [165]:
np.random.normal(np.zeros(50),0.001)

array([ 0.00175503, -0.0018365 , -0.00092557, -0.0013183 ,  0.00070177,
       -0.00045205, -0.00063023,  0.00231926,  0.00274795, -0.00030118,
       -0.00102944,  0.00221119, -0.00020627,  0.00062257,  0.00136277,
        0.00194146, -0.00048117, -0.00046926,  0.00112981, -0.00067896,
        0.00059334,  0.00093145, -0.00020276, -0.00019905, -0.00110767,
        0.00229955,  0.00026535, -0.00072867, -0.00150996, -0.00102973,
        0.00242328,  0.00081131,  0.00183528, -0.00051625, -0.00056769,
       -0.00143513,  0.00020218, -0.00069088, -0.00053919,  0.00166264,
        0.00039374,  0.00120685, -0.0008378 ,  0.00058671, -0.00062085,
       -0.00133637,  0.00044003,  0.0004339 , -0.00023481, -0.00046481])

In [184]:
a = np.zeros((150,150), int)
np.fill_diagonal(a, 1)

np.linalg.norm(a)

12.24744871391589

In [190]:
def get_predictive_dist(x_pred, S, m, prec_y):
    print(m.T)
    pred_means = m.T @ x_pred
    pred_means = pred_means.flatten()
    pred_vars = np.sum(1/prec_y + x_pred.T @ S * x_pred.T, axis=1)

    return pred_vars, pred_means

In [194]:
x_pred = np.array([[1,2,3],[2,3,4]]).T
S = np.array([[1,1,1],[1,1,1],[1,1,1]])
m = np.array([[1],[2],[3]])
get_predictive_dist(x_pred, S, m, 1)



[[1 2 3]]


(array([39., 84.]), array([14, 20]))

In [198]:
np.array([1,2,3]) == 1

array([ True, False, False])

In [197]:
np.array(np.linalg.norm(1/(S+1e-6)))**2

8.999982000027002