In [1]:
%reload_ext autoreload
%autoreload 2

import os
import sys
# Insert the parent directory of the first sys.path entry at position 1 of the
# module search path (presumably so the `src` package imported in a later cell
# resolves when the notebook lives one level below the project root — confirm).
sys.path.insert(1, os.path.join(sys.path[0], '..'))

In [144]:
import time

import numpy as np
import pandas as pd
import scipy.linalg

from src.graph import visualize
from src.recommender import train_mf, reviews_dataset, evaluate_recommendations
from src.tags import PytorchWordEmbedding

In [3]:
reviews_train, reviews_validation, reviews_test, images = reviews_dataset()

In [4]:
import re

# Delimiters that separate individual tags inside one raw tag string: ',', '&' and '/'.
splidd_regex = re.compile(r"[,&/]")
# Characters removed from inside a tag: space, parentheses, apostrophe and hyphen.
# Written as a raw string: in a normal string literal "\(" and "\)" are invalid
# escape sequences, which Python reports with a warning at compile time.
replace_regex = re.compile(r"[ ()'\-]")


def parse_tags(tags_raw):
    """Split a raw tag string into a list of normalised tags.

    The string is split on ',', '&' and '/'. Each piece is stripped of
    surrounding whitespace, lower-cased, and has spaces, parentheses,
    apostrophes and hyphens removed. Pieces that end up empty are dropped.

    Args:
        tags_raw: The raw tag string. Any non-string value (e.g. None or NaN)
            is treated as "no tags".

    Returns:
        A list of cleaned tag strings, in their original order (possibly empty).
    """
    if not isinstance(tags_raw, str):
        return []

    cleaned = (
        replace_regex.sub("", piece.strip().lower())
        for piece in splidd_regex.split(tags_raw)
    )
    return [tag for tag in cleaned if tag]

# Normalise every raw tag entry into a list of cleaned tags (non-string entries
# become an empty list inside parse_tags) and store the result as a new column.
parsed_tag_lists = images["tags"].apply(parse_tags)
images["tags_parsed"] = parsed_tag_lists

In [25]:
# Work on a copy so the index columns added below are not written into reviews_train.
train_data = reviews_train.copy()

# Users: number the distinct user_ids 0..n-1 in order of first appearance and
# attach that number to every review as `user_index`.
user_ids = train_data["user_id"].drop_duplicates()
user_indices = (
    user_ids
    .reset_index(drop=True)
    .reset_index()
    .rename(columns={"index": "user_index"})
)
train_data = pd.merge(train_data, user_indices, on="user_id")
n = len(user_indices)

# Images: same scheme, computed on the frame as it looks after the user merge,
# attached to every review as `image_index`.
image_ids = train_data["image_id"].drop_duplicates()
image_indices = (
    image_ids
    .reset_index(drop=True)
    .reset_index()
    .rename(columns={"index": "image_index"})
)
train_data = pd.merge(train_data, image_indices, on="image_id")
m = len(image_indices)

# Tag vocabulary: every distinct parsed tag across all images.
all_tags = images["tags_parsed"].explode().dropna().unique()
# k is the vocabulary size; tag_indices maps each tag to its position in all_tags.
k = len(all_tags)
tag_indices = dict(zip(all_tags, range(k)))

def tag_vector(tags, index_of=None, size=None):
    """Encode a collection of tags as one row averaging the tags' one-hot vectors.

    Args:
        tags: Iterable of tag names. ``None``, an empty collection, or a float
            (the NaN that a left merge leaves for rows without a match) all
            mean "no tags".
        index_of: Mapping from tag name to column index. Defaults to the
            notebook-level ``tag_indices``.
        size: Number of columns of the result. Defaults to the notebook-level ``k``.

    Returns:
        A float array of shape ``(1, size)``. Each occurrence of a tag adds
        ``1 / len(tags)`` to that tag's column; the row is all zeros when
        there are no tags.

    Raises:
        KeyError: If a tag is not present in ``index_of``.
    """
    if index_of is None:
        index_of = tag_indices
    if size is None:
        size = k

    row = np.zeros((1, size))
    # `not nan` is False, so a NaN would otherwise fall through to the loop and
    # fail with "float object is not iterable"; treat it like an empty list.
    if isinstance(tags, float) or not tags:
        return row

    # Accumulate counts in place instead of allocating one length-`size`
    # one-hot row per tag and summing them.
    for tag in tags:
        row[0, index_of[tag]] += 1.0
    return row / len(tags)

# Attach each indexed image's parsed tags. The left join keeps images that
# have no row in `images` (their tags_parsed is then missing).
tags_lookup = images[["image_id", "tags_parsed"]]
image_tags_df = image_indices.merge(tags_lookup, how="left", on="image_id")

# Row order must equal image_index order so that image_tags[i] describes image i.
expected_order = np.arange(m)
np.testing.assert_array_equal(image_tags_df["image_index"], expected_order)

# Stack the per-image (1, k) rows produced by tag_vector into one 2-D array.
tag_rows = image_tags_df["tags_parsed"].apply(tag_vector)
image_tags = np.concatenate(tag_rows)

Unnamed: 0,user_id,image_id,rating,user_index,image_index
0,0FDTkc-BYZgNLac2OEtLog,VGzkEiZz7gqK_SYW86GlaA,5.0,0,0
1,vbTVyyBl88xVRlTQcBXTUA,VGzkEiZz7gqK_SYW86GlaA,4.0,105,0
2,pZqOQ4dg2sSiVe-lYy8OSA,VGzkEiZz7gqK_SYW86GlaA,4.0,219,0
3,qfHEYloVPKDRubFpuLb1eQ,VGzkEiZz7gqK_SYW86GlaA,5.0,534,0
4,LOhEraqv7zXkTzXQsPERHg,VGzkEiZz7gqK_SYW86GlaA,3.0,1608,0
...,...,...,...,...,...
5592219,VrFMvdVV3wD1T9Ca7-U-eg,nLoteZUixs40AxGNE7jJWw,1.0,1685749,150238
5592220,xm4gmNHP0Pgq_it8WTmyXg,j54PSNy6JtuNVChfFPJ5NA,4.0,1621680,150313
5592221,77IE4G4vj7R5acudAsi4Gw,8K9bSzG-Lm44LN7AwGI3LA,1.0,1700467,150335
5592222,-tH2ZT4MLUKaU6yrFfRUXg,8K9bSzG-Lm44LN7AwGI3LA,4.0,1741338,150335


In [77]:
# Bare expression: the notebook renders train_data (now carrying the
# user_index and image_index columns) as this cell's output.
train_data

Unnamed: 0,user_id,image_id,rating,user_index,image_index
0,0FDTkc-BYZgNLac2OEtLog,VGzkEiZz7gqK_SYW86GlaA,5.0,0,0
1,vbTVyyBl88xVRlTQcBXTUA,VGzkEiZz7gqK_SYW86GlaA,4.0,105,0
2,pZqOQ4dg2sSiVe-lYy8OSA,VGzkEiZz7gqK_SYW86GlaA,4.0,219,0
3,qfHEYloVPKDRubFpuLb1eQ,VGzkEiZz7gqK_SYW86GlaA,5.0,534,0
4,LOhEraqv7zXkTzXQsPERHg,VGzkEiZz7gqK_SYW86GlaA,3.0,1608,0
...,...,...,...,...,...
5592219,VrFMvdVV3wD1T9Ca7-U-eg,nLoteZUixs40AxGNE7jJWw,1.0,1685749,150238
5592220,xm4gmNHP0Pgq_it8WTmyXg,j54PSNy6JtuNVChfFPJ5NA,4.0,1621680,150313
5592221,77IE4G4vj7R5acudAsi4Gw,8K9bSzG-Lm44LN7AwGI3LA,1.0,1700467,150335
5592222,-tH2ZT4MLUKaU6yrFfRUXg,8K9bSzG-Lm44LN7AwGI3LA,4.0,1741338,150335


In [118]:
# TODO: handle duplicate reviews


def _images_and_ratings(group):
    """Return (image_index array, rating array) for one user's reviews."""
    return group["image_index"].to_numpy(), group["rating"].to_numpy()


def _users_and_ratings(group):
    """Return (user_index array, rating array) for one image's reviews."""
    return group["user_index"].to_numpy(), group["rating"].to_numpy()


# One entry per user_index / image_index, each holding a pair of aligned arrays.
reviews_by_user = train_data.groupby("user_index").apply(_images_and_ratings)
reviews_by_image = train_data.groupby("image_index").apply(_users_and_ratings)

In [131]:
# R = X Y^T I^T : R (nxm), X (nxd), Y (kxd), I (mxk)
# X holds one d-dimensional factor per user, Y one per tag; I is `image_tags`.

d = 20            # latent dimension
alpha = .1        # ridge weight on X (see the user step of the ALS loop)
beta = .1         # ridge weight on Y (see the objective in the Y step)
als_epochs = 3    # number of alternating sweeps
lr = .1           # not used by the visible code yet
sgd_epochs = 10   # not used by the visible code yet
seed = 0          # fixed seed so the random initialisation is reproducible

# The shape has to be passed as `size`: the first positional argument of
# `normal` is the mean (`loc`), so `normal((n, d))` would return just two
# numbers drawn around n and d instead of an (n, d) matrix.
rng = np.random.default_rng(seed)
X = rng.normal(size=(n, d))
Y = rng.normal(size=(k, d))

def gradient(i):
    """Return an all-zero vector of length ``d``; the argument ``i`` is ignored.

    Not called anywhere in the visible notebook code — looks like a stub for
    a later gradient-based update (TODO confirm).
    """
    zero_grad = np.zeros(d)
    return zero_grad

for epoch in range(als_epochs):
    # ---- User step --------------------------------------------------------
    # X = argmin ||(R - X Y^T I^T)_observed||_F^2 + alpha ||X||_F^2
    # For user u, let F_u stack the rows (I Y)_i of the images u rated and let
    # r_u be the matching ratings. The minimiser solves the d x d system
    #   (F_u^T F_u + alpha I_{d x d}) x_u = F_u^T r_u
    start = time.perf_counter()
    ridge = alpha * np.eye(d)  # loop-invariant regulariser
    # `rated_images` (not `image_ids`) so the notebook-level `image_ids`
    # Series is not overwritten by the loop variable.
    for u, (rated_images, ratings) in reviews_by_user.items():
        if u % 10000 == 0 and u > 0:
            elapsed = time.perf_counter() - start
            print(f"{u} users processed, elapsed: {elapsed:.2f}s, average: {elapsed * 10000 / u:.2f}s")

        # All rated images at once: (n_ratings, k) @ (k, d) -> (n_ratings, d).
        features = image_tags[rated_images] @ Y
        A = ridge + features.T @ features
        b = features.T @ ratings

        X[u] = np.linalg.solve(A, b)

    # ---- Tag step (not implemented) ----------------------------------------
    # Y = argmin ||(R - X Y^T I^T)_observed||_F^2 + beta ||Y||_F^2
    # beta Y + sum_{r_{ui}} i_i i_i^T Y x_u x_u^T = sum_{r_{ui}} r_{ui} i_i x_u^T
    #
    # The previous draft of this section was an unfinished attempt to cast the
    # condition above as a Sylvester equation A Y + Y B = Q for
    # scipy.linalg.solve_sylvester, with B = beta * pinv(X^T X). The matrices
    # built from the observed entries (A and Q) were never filled in, and the
    # draft was not valid Python. Note that the double sum only factors into
    # that form when every (u, i) pair is observed.
    raise NotImplementedError("ALS update for the tag factors Y is not implemented yet")

10000 users processed, elapsed: 6.69s, average: 6.69s
20000 users processed, elapsed: 12.01s, average: 6.00s
30000 users processed, elapsed: 15.73s, average: 5.24s
40000 users processed, elapsed: 19.03s, average: 4.76s
50000 users processed, elapsed: 21.94s, average: 4.39s
60000 users processed, elapsed: 24.68s, average: 4.11s
70000 users processed, elapsed: 27.01s, average: 3.86s
80000 users processed, elapsed: 29.37s, average: 3.67s
90000 users processed, elapsed: 31.48s, average: 3.50s
100000 users processed, elapsed: 34.23s, average: 3.42s
110000 users processed, elapsed: 36.72s, average: 3.34s
120000 users processed, elapsed: 39.09s, average: 3.26s
130000 users processed, elapsed: 41.22s, average: 3.17s
140000 users processed, elapsed: 42.93s, average: 3.07s
150000 users processed, elapsed: 44.57s, average: 2.97s
160000 users processed, elapsed: 46.16s, average: 2.88s
170000 users processed, elapsed: 48.03s, average: 2.83s
180000 users processed, elapsed: 49.82s, average: 2.77s
19

1460000 users processed, elapsed: 148.19s, average: 1.01s
1470000 users processed, elapsed: 148.87s, average: 1.01s
1480000 users processed, elapsed: 149.42s, average: 1.01s
1490000 users processed, elapsed: 149.93s, average: 1.01s
1500000 users processed, elapsed: 150.42s, average: 1.00s
1510000 users processed, elapsed: 150.91s, average: 1.00s
1520000 users processed, elapsed: 151.38s, average: 1.00s
1530000 users processed, elapsed: 151.83s, average: 0.99s
1540000 users processed, elapsed: 152.29s, average: 0.99s
1550000 users processed, elapsed: 152.73s, average: 0.99s
1560000 users processed, elapsed: 153.21s, average: 0.98s
1570000 users processed, elapsed: 153.73s, average: 0.98s
1580000 users processed, elapsed: 154.19s, average: 0.98s
1590000 users processed, elapsed: 154.68s, average: 0.97s
1600000 users processed, elapsed: 155.13s, average: 0.97s
1610000 users processed, elapsed: 155.59s, average: 0.97s
1620000 users processed, elapsed: 156.06s, average: 0.96s
1630000 users 

NotImplementedError: 

In [None]:
# NOTE(review): neither `train_knn` nor `word_embedding` is defined or imported
# in the visible cells (the import cell brings in `train_mf` and
# `PytorchWordEmbedding`) — confirm where they come from before running this
# on a fresh kernel.
recommender = train_knn(reviews_train, word_embedding)

In [None]:
# Label every vertex of the recommender's graph with its `id` and hand both
# the graph and the labels to visualize.
graph = recommender.graph
vertex_labels = {vertex: vertex.id for vertex in graph.vertices}
visualize(graph, vertex_labels)

In [None]:
# Run evaluate_recommendations on the validation reviews; as the last
# expression of the cell, its return value becomes the cell output.
evaluate_recommendations(recommender, reviews_validation)

In [None]:
# NOTE(review): looks like a leftover smoke-test cell — consider deleting it.
print("hi")