In [1]:
%reload_ext autoreload
%autoreload 2

import os
import sys
# Insert the parent of sys.path[0] at position 1 of the module search path.
# NOTE(review): presumably this is what lets the `src.*` imports in the next
# cell resolve when the notebook sits in a subdirectory — confirm the layout.
sys.path.insert(1, os.path.join(sys.path[0], '..'))

In [2]:
import numpy as np
import pandas as pd
import scipy.linalg
import time

from src.graph import visualize
from src.recommender import train_mf, reviews_dataset, evaluate_recommendations
from src.tags import PytorchWordEmbedding

In [3]:
# Load the three review splits and the image table from the project helper.
# Later cells read `user_id`, `image_id` and `rating` from `reviews_train`, and
# `image_id` and `tags` from `images`.
# NOTE(review): presumably all four objects are pandas DataFrames — confirm
# against `src.recommender.reviews_dataset`.
reviews_train, reviews_validation, reviews_test, images = reviews_dataset()

In [4]:
import re

# Both patterns are raw strings: the earlier non-raw literal contained "\(" and
# "\)", which are invalid string escapes (SyntaxWarning on Python 3.12+).
# Separators between tags: comma, ampersand, slash.
splidd_regex = re.compile(r"[,&/]")
# Characters removed from inside a tag: space, parentheses, apostrophe, hyphen.
replace_regex = re.compile(r"[ ()'\-]")

def parse_tags(tags_raw):
    """Split a raw tag string into a list of normalised tags.

    Parameters
    ----------
    tags_raw : object
        Raw tag field. Anything that is not a ``str`` (``None``, NaN, ...)
        is treated as "no tags".

    Returns
    -------
    list of str
        One entry per separator-delimited piece, stripped, lower-cased and
        with spaces, parentheses, apostrophes and hyphens removed. Pieces
        that end up empty are dropped; order and duplicates are preserved.
    """
    if not isinstance(tags_raw, str):
        return []

    cleaned = (
        replace_regex.sub("", piece.strip().lower())
        for piece in splidd_regex.split(tags_raw)
    )
    return [tag for tag in cleaned if tag]

# Add a `tags_parsed` column holding, for each image, the list that
# `parse_tags` returns for its raw `tags` value (non-string values give []).
images["tags_parsed"] = images["tags"].apply(parse_tags)

In [5]:
# Copy of the training reviews that gets a dense user index and image index
# attached below.
train_data = reviews_train.copy()

# Give every distinct user a 0-based `user_index` (in order of first appearance
# in `train_data`) and join it onto each review row.
user_ids = train_data['user_id'].drop_duplicates()
user_indices = user_ids.reset_index(drop=True).reset_index()
user_indices = user_indices.rename(columns={"index": "user_index"})
train_data = train_data.merge(user_indices, on="user_id")
n = len(user_indices)  # number of distinct users in the training data

# Same construction for images, producing a 0-based `image_index`.
image_ids = train_data["image_id"].drop_duplicates()
image_indices = image_ids.reset_index(drop=True).reset_index()
image_indices = image_indices.rename(columns={"index": "image_index"})
train_data = train_data.merge(image_indices, on="image_id")
m = len(image_indices)  # number of distinct images in the training data

# Tag vocabulary: every distinct parsed tag found in `images`, mapped to a
# 0-based position.
all_tags = images["tags_parsed"].explode().dropna().unique()
tag_indices = {tag: tag_index for tag_index, tag in enumerate(all_tags)}
k = len(all_tags)  # number of distinct tags

def tag_vector(tags):
    """Return the normalised tag-indicator row vector for one image.

    Parameters
    ----------
    tags : sequence of str, or a missing value
        Parsed tags of a single image. Every tag must be a key of the
        module-level ``tag_indices``. ``None``, an empty sequence, or a float
        (the NaN a left merge inserts for an image that has no tag row) all
        mean "no tags".

    Returns
    -------
    numpy.ndarray of shape ``(1, k)``
        Position ``tag_indices[t]`` holds ``count(t) / len(tags)``; the
        vector is all zeros when there are no tags.

    Raises
    ------
    KeyError
        If a tag is not present in ``tag_indices``.
    """
    vector = np.zeros((1, k))
    # NaN is truthy, so a plain `not tags` test would let it through and the
    # loop below would fail with "'float' object is not iterable".
    missing = tags is None or isinstance(tags, float)
    if missing or len(tags) == 0:
        return vector
    # Fill one vector in place instead of allocating a (1, k) row per tag.
    for tag in tags:
        vector[0, tag_indices[tag]] += 1.0
    return vector / len(tags)

# Attach the parsed tags to every indexed image. `how="left"` keeps images that
# have no matching row in `images`; their `tags_parsed` is then missing.
image_tags_df = image_indices.merge(images[["image_id", "tags_parsed"]], on="image_id", how="left")
# Guard: row j of the merge result must belong to image_index j, because
# `image_tags` is later addressed by image_index.
np.testing.assert_array_equal(image_tags_df["image_index"], np.arange(m))
# Stack the per-image (1, k) rows from `tag_vector` into one (m, k) array.
image_tags = np.concatenate(image_tags_df.tags_parsed.apply(tag_vector))

In [6]:
# TODO: handle duplicate reviews

# For each user_index, collect that user's rated image indices and the matching
# ratings as a pair of NumPy arrays. Later cells iterate the result with
# `.items()`, unpacking (user_index, (image_indices, ratings)).
reviews_by_user = train_data.groupby("user_index").apply(lambda group: (group["image_index"].to_numpy(), group["rating"].to_numpy()))

In [None]:
# R = X Y^T I^T : R (nxm), X (nxd), Y (kxd), I (mxk)
# R: ratings, X: user factors, Y: tag factors, I: the image-tag matrix `image_tags`.

d = 20  # latent dimension of the user and tag factors
alpha = .01  # ridge weight on X in the ALS step
beta = .01  # ridge weight on Y in the SGD step and the reported loss
als_epochs = 10  # number of outer alternating iterations
learning_rate = .1  # base SGD step size, divided by (sgd_epoch + 1) each epoch
sgd_epochs = 10  # SGD passes over all tags per ALS epoch

# Seed NumPy's global generator so that the factor initialisation below, and
# the user mini-batches drawn later with `Series.sample` (which falls back to
# that generator when no random_state is given), are identical on every
# Restart & Run All.
random_seed = 0
np.random.seed(random_seed)

X = np.random.normal(size=(n, d))
Y = np.random.normal(size=(k, d))

def gradient(i):
    """Full-batch gradient of the regularised training loss w.r.t. ``Y[i]``.

    The loss differentiated here is the one printed by the training loop:

        1/n * sum_u 1/|O_u| * ||r_u - I_u Y x_u||^2 + beta/(k*d) * ||Y||_F^2

    where ``O_u`` are the images rated by user ``u`` and ``I_u`` the matching
    rows of ``image_tags``. The previous version scaled the data term by an
    undefined name ``nnz`` and raised ``NameError`` on the first call; the
    per-user weight ``2 / (n * |O_u|)`` used now matches that loss and makes
    ``tiny_gradient`` an unbiased estimate of this function.

    Parameters
    ----------
    i : int
        Row of ``Y`` (tag index) to differentiate with respect to.

    Returns
    -------
    numpy.ndarray of shape ``(d,)``
        The gradient; ``Y`` itself is not modified.
    """
    # The multiplication allocates a new array, so `+=` below never touches Y.
    grad = 2 * beta / (k * d) * Y[i]
    start = time.time()
    for u, (rated_images, ratings) in reviews_by_user.items():
        if u % 10000 == 0 and u > 0:
            end = time.time()
            print(f"{u} users processed, elapsed: {end - start:.2f}s, average: {(end - start) * 10000 / u:.2f}s")
        if len(rated_images) == 0:
            # No observations: contributes nothing (and would divide by zero).
            continue
        io = image_tags[rated_images]
        residual = io @ (Y @ X[u]) - ratings
        grad += 2 / (n * len(rated_images)) * (io[:, i] @ residual) * X[u]
    return grad

def tiny_gradient(i, sample_size=1):
    """Stochastic estimate of the loss gradient w.r.t. ``Y[i]``.

    Draws ``sample_size`` users at random from ``reviews_by_user`` and averages
    their per-user mean-squared-error gradients, then adds the ridge term
    ``2 * beta / (k * d) * Y[i]``.

    Parameters
    ----------
    i : int
        Row of ``Y`` (tag index) to differentiate with respect to.
    sample_size : int, default 1
        Number of users drawn for the mini-batch.

    Returns
    -------
    numpy.ndarray
        Gradient estimate with the same shape as ``Y[i]``.
    """
    minibatch = reviews_by_user.sample(sample_size)
    estimate = 2 * beta / (k * d) * Y[i]
    for user, (rated, observed) in minibatch.items():
        tag_rows = image_tags[rated]
        residual = tag_rows @ (Y @ X[user]) - observed
        weight = 2 / (sample_size * len(rated))
        estimate += weight * (tag_rows[:, i] @ residual) * X[user]
    return estimate

def _regularized_loss():
    """Mean over users of the per-user MSE on observed ratings, plus the ridge penalty on Y."""
    data_term = 0.0
    for user, (rated_images, user_ratings) in reviews_by_user.items():
        if len(rated_images) == 0:
            continue
        residual = user_ratings - image_tags[rated_images] @ (Y @ X[user])
        data_term += np.sum(residual ** 2) / len(rated_images)
    return data_term / n + beta / (k * d) * np.linalg.norm(Y) ** 2


# Ridge part of every user's normal equations; it is the same for all users,
# so build it once instead of once per user.
ridge = alpha / (n * d) * np.eye(d)

# Loop variables are deliberately not called `image_ids`: at module level that
# would overwrite the `image_ids` Series created when the indices were built.
for als_epoch in range(als_epochs):
    # X = argmin ||(R - X Y^T I^T)_observed||_F^2 + alpha ||X||_F^2
    # Y^t I^t diag(O_u)^t diag(O_u) r_u = (Y^t I^t diag(O_u)^t diag(O_u) I Y + alpha I_{d x d}) x_u
    start = time.time()
    for user, (rated_images, user_ratings) in reviews_by_user.items():
        if user % 10000 == 0 and user > 0:
            end = time.time()
            print(f"{user} users processed, elapsed: {end - start:.2f}s, average: {(end - start) * 10000 / user:.2f}s")

        # Rows are the d-dimensional features (tag vector @ Y) of each rated
        # image; F^T F and F^T r are the sums of the per-rating outer products
        # and rating-weighted features, computed in one matrix product each.
        features = image_tags[rated_images] @ Y
        X[user] = np.linalg.solve(ridge + features.T @ features, features.T @ user_ratings)

    # Y = argmin ||(R - X Y^T I^T)_observed||_F^2 + beta ||Y||_F^2
    # beta Y + sum_{r_{ui}} i_i i_i^T Y x_u x_u^T = sum_{r_{ui}} r_{ui} i_i x_u^T
    # Solved approximately with mini-batch SGD, one tag row at a time.
    start = time.time()
    for sgd_epoch in range(sgd_epochs):
        print(f"{sgd_epoch=}")
        step_size = learning_rate / (sgd_epoch + 1)  # 1/t decay within the ALS epoch
        for tag_index in range(k):
            if tag_index % 50 == 0:
                print(tag_index)
            Y[tag_index] -= step_size * tiny_gradient(tag_index, sample_size=10)

        loss = _regularized_loss()
        print(f"{loss=}")

        end = time.time()
        print(f"{end-start:.2f}s elapsed, {(end-start)/(sgd_epoch+1):.2f}s average")

10000 users processed, elapsed: 8.58s, average: 8.58s
20000 users processed, elapsed: 14.47s, average: 7.23s
30000 users processed, elapsed: 19.01s, average: 6.34s
40000 users processed, elapsed: 22.72s, average: 5.68s
50000 users processed, elapsed: 25.85s, average: 5.17s
60000 users processed, elapsed: 29.56s, average: 4.93s
70000 users processed, elapsed: 32.89s, average: 4.70s
80000 users processed, elapsed: 35.89s, average: 4.49s
90000 users processed, elapsed: 38.51s, average: 4.28s
100000 users processed, elapsed: 41.09s, average: 4.11s
110000 users processed, elapsed: 43.56s, average: 3.96s
120000 users processed, elapsed: 45.83s, average: 3.82s
130000 users processed, elapsed: 48.10s, average: 3.70s
140000 users processed, elapsed: 50.23s, average: 3.59s
150000 users processed, elapsed: 52.28s, average: 3.49s
160000 users processed, elapsed: 54.38s, average: 3.40s
170000 users processed, elapsed: 56.68s, average: 3.33s
180000 users processed, elapsed: 58.62s, average: 3.26s
19

In [None]:
# NOTE(review): neither `train_knn` nor `word_embedding` is imported or defined
# in the cells shown (the imports bring in `train_mf` and `PytorchWordEmbedding`
# only), so this cell looks like it raises NameError on a fresh kernel —
# confirm the intended trainer and embedding.
recommender = train_knn(reviews_train, word_embedding)

In [None]:
# Pass the recommender's graph to `visualize` together with a mapping from each
# vertex to its `id` (presumably used as the vertex labels — TODO confirm).
visualize(recommender.graph, {vertex: vertex.id for vertex in recommender.graph.vertices})

In [None]:
# Evaluate the recommender against the validation reviews; as the last
# expression in the cell, the return value is what the notebook displays.
evaluate_recommendations(recommender, reviews_validation)

In [None]:
# NOTE(review): looks like a leftover smoke-test cell; it only prints a fixed
# string and nothing else depends on it.
print("hi")