In [481]:
import os
import torch
import numpy as np
import pandas as pd
import tqdm.notebook as tqdm
import pickle
from openai import OpenAI
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split

from surprise import accuracy, Dataset, SVD, Prediction, Trainset
from surprise.model_selection import train_test_split as s_train_test_split

# Read the OpenAI API key from a local secret file (kept out of the notebook)
# and publish it through the environment, so that `OpenAI()` can be created
# later without passing the key explicitly.
with open("openai.secret") as secret_file:
    os.environ["OPENAI_API_KEY"] = secret_file.read().strip()

## Baseline

In [470]:
# Load the built-in "ml-100k" ratings and hold out 20% of them for testing.
# The fixed random_state keeps the split identical across kernel restarts.
ratings = Dataset.load_builtin("ml-100k")
# Split the dataset loaded just above; the previous `data` name was never
# defined in this notebook and only worked through leftover kernel state.
train_ratings, test_ratings = s_train_test_split(ratings, test_size=0.2, random_state=0)


def to_inner_user_id(raw_user_id: str) -> int | None:
    try:
        return train_ratings.to_inner_uid(raw_user_id)
    except ValueError:
        return None
    
    
def to_inner_movie_id(raw_movie_id: str) -> int | None:
    try:
        return train_ratings.to_inner_iid(raw_movie_id)
    except ValueError:
        return None

In [471]:
# Fit the Surprise SVD baseline on the training split.
recsys = SVD()
recsys.fit(train_ratings)

# Evaluate the model fitted above (the previous `algo` name was never defined
# in this notebook). The held-out ratings are scored once and the resulting
# predictions are reused for both metrics.
baseline_predictions = recsys.test(test_ratings)
accuracy.rmse(baseline_predictions)
accuracy.mae(baseline_predictions)

RMSE: 0.9453
MAE:  0.7451


0.7451295316128084

## Baseline Random

In [505]:
# Data-free baseline: every held-out rating is "predicted" with an integer
# drawn from 1..5 (np.random.randint excludes the upper bound of 6).
# NOTE: the generator is not seeded, so the reported metrics vary between runs.
random_predictions = [
    Prediction(
        uid=uid,
        iid=iid,
        r_ui=true_rating,
        est=np.random.randint(low=1, high=6),
        details={},
    )
    for uid, iid, true_rating in test_ratings
]

accuracy.rmse(random_predictions)
accuracy.mae(random_predictions)

RMSE: 1.8964
MAE:  1.5219


1.52195

## LLM + SVD + RAG

In [472]:
def get_user_item(ratings: list[tuple[int | str, int | str, float]]) -> pd.DataFrame:
    return (
        pd
        .DataFrame(ratings, columns=["user_id", "movie_id", "rating"])
        .groupby(["user_id", "movie_id"])
        .agg(rating=("rating", "sum"))
        .reset_index()
        .pivot(index="user_id", columns="movie_id", values="rating")
        .fillna(0)
        .sort_index()
    )


def get_user_histories(user_item: pd.DataFrame) -> dict[int, dict[int, float]]:
    """Turn a user x movie matrix into per-user rating histories.

    Args:
        user_item: matrix with one row per user and one column per movie;
            cells that are not greater than 0 are treated as "not rated".

    Returns:
        A dict keyed by the row index (user id) whose values are dicts mapping
        column label (movie id) to the rating, restricted to cells > 0.
    """

    def rated_movies(row: pd.Series) -> dict[int, float]:
        # Keep only the movies this user actually rated.
        return dict(row[row > 0].items())

    return user_item.apply(rated_movies, axis=1).to_dict()

In [473]:
# From here on, users and movies are identified by the training set's inner ids.
train_user_item = get_user_item(train_ratings.all_ratings())
n_dims = 100
# Only the leading n_dims left-singular vectors are used as user embeddings,
# so the reduced decomposition is enough: full_matrices=True would also build
# the full square right-singular matrix, which is not used below.
u, s, v = np.linalg.svd(train_user_item, full_matrices=False)
train_user_embeddings = u[:, :n_dims]
train_user_histories = get_user_histories(train_user_item)

In [474]:
# Index the user embeddings for cosine-distance neighbour lookups; every query
# returns 20 rows (the queried user is filtered out by the caller).
train_knn = NearestNeighbors(metric="cosine", n_neighbors=20)
train_knn.fit(X=train_user_embeddings)

In [475]:
def get_nearest_user_histories(
    user_id: int,
    knn: NearestNeighbors,
    user_embeddings: np.ndarray,
    user_histories: dict[int, dict[int, float]],
) -> dict[int, dict[int, float]]:
    """Collect the rating histories of the users closest to ``user_id``.

    Args:
        user_id: row position of the user in ``user_embeddings``.
        knn: fitted neighbour index queried through ``kneighbors``.
        user_embeddings: array with one embedding row per user.
        user_histories: rating history per user id ({movie id: rating}).

    Returns:
        A dict mapping each neighbour's id to that neighbour's history. The
        queried user is excluded when it appears among its own neighbours.
    """
    # Index with np.newaxis so the query stays two-dimensional (one row).
    query = user_embeddings[user_id, np.newaxis]
    neighbour_ids = knn.kneighbors(query, return_distance=False)[0].tolist()
    return {
        neighbour_id: user_histories[neighbour_id]
        for neighbour_id in neighbour_ids
        if neighbour_id != user_id
    }

In [476]:
def get_movies_to_predict(
    user_id: int,
    test_ratings: list[tuple[str, str, float]],
) -> list[int]:
    """List the inner movie ids whose rating must be predicted for a user.

    Scans ``test_ratings`` (raw user id, raw movie id, rating) and keeps the
    rows whose raw user id maps to ``user_id``. Movies for which
    ``to_inner_movie_id`` returns ``None`` are skipped.
    """
    movie_ids = []
    for raw_user_id, raw_movie_id, _ in test_ratings:
        if to_inner_user_id(raw_user_id) != user_id:
            continue
        movie_id = to_inner_movie_id(raw_movie_id)
        if movie_id is not None:
            movie_ids.append(movie_id)
    return movie_ids

In [477]:
def fmt_list(x: list[int]) -> str:
    """Render a list as text with the leading and trailing brackets removed."""
    rendered = str(x)
    return rendered.strip("[]")


def fmt_user_history(user_history: dict[int, float]) -> str:
    """Render a rating history as comma-separated ``movie_id: rating`` pairs."""
    entries = []
    for movie_id, rating in user_history.items():
        entries.append(f"{movie_id}: {rating}")
    return ", ".join(entries)


def get_instruction(
    user_history: dict[int, float],
    nearest_user_histories: dict[int, dict[int, float]],
    movies_to_predict: list[int],
) -> str:
    """Build the prompt asking the model to predict a user's ratings.

    Args:
        user_history: the user's own ratings ({movie id: rating}).
        nearest_user_histories: histories of similar users; only the values
            are used and the users are numbered by position in the prompt.
        movies_to_predict: movie ids whose ratings should be predicted.

    Returns:
        The prompt text, with the template's source indentation removed.
    """
    formatted_nearest_user_histories = "\n".join(
        f"user {i}: {fmt_user_history(near_user_history)}."
        for i, near_user_history
        in enumerate(nearest_user_histories.values())
    )

    # The template is indented to match the code; removing every run of four
    # spaces strips that indentation from the text sent to the model.
    return f"""
        A user has seen and rated the following movies: {fmt_user_history(user_history)}.

        Users with similar preferences have seen and rated these movies:
        {formatted_nearest_user_histories}.

        Each comma-separated value in the lists above represents a movie id followed
        by a numerical user rating.

        Use the data above to predict the ratings the user has given to
        the movies: {fmt_list(movies_to_predict)}. Give exactly one rating
        to each of the movies.
        
        Respond with just a list of ratings and nothing else. Your response needs
        to have exactly {len(movies_to_predict)} ratings.
    """.replace("    ", "")

In [478]:
def predict(instruction: str) -> list[int]:
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "user", "content": instruction}
        ]
    )
    
    response = completion.choices[0].message.content
    try:
        return list(map(float, response.strip(".[] ").split(",")))
    except Exception:
        return response

In [485]:
client = OpenAI()

# Resume from the on-disk checkpoint when one exists, so an interrupted run
# does not pay for the same API calls twice.
# NOTE: pickle.load can execute arbitrary code; only load a predictions.pkl
# that was written by this notebook.
if os.path.exists("predictions.pkl"):
    with open("predictions.pkl", "rb") as f:
        predictions = pickle.load(f)
else:
    predictions = {}

for user_id in tqdm.tqdm(train_user_item.index.tolist()):

    # Results are stored under the inner user id itself (see the assignment
    # below), so the resume check must use the same key. Checking
    # str(user_id) never matched and re-queried every user on each run.
    if user_id in predictions:
        continue

    movies_to_predict = get_movies_to_predict(
        user_id=user_id,
        test_ratings=test_ratings,
    )

    # Nothing to predict for users without held-out ratings.
    if not movies_to_predict:
        continue

    user_history = train_user_histories[user_id]
    nearest_user_histories = get_nearest_user_histories(
        user_id=user_id,
        knn=train_knn,
        user_embeddings=train_user_embeddings,
        user_histories=train_user_histories,
    )

    instruction = get_instruction(
        user_history=user_history,
        nearest_user_histories=nearest_user_histories,
        movies_to_predict=movies_to_predict,
    )

    predicted_ratings = predict(instruction)
    predictions[user_id] = {
        "predicted_ratings": predicted_ratings,
        "movies_to_predict": movies_to_predict
    }

    # Checkpoint after every user so progress survives an interruption.
    with open("predictions.pkl", "wb") as f:
        pickle.dump(predictions, f)

  0%|          | 0/943 [00:00<?, ?it/s]

In [495]:
to_raw_user_id = train_ratings.to_raw_uid
to_raw_movie_id = train_ratings.to_raw_iid

# raw user id -> {raw movie id -> predicted rating}
predicted_by_raw_ids = {}

for user_id, user_predictions in predictions.items():
    movies_to_predict = user_predictions["movies_to_predict"]
    predicted_ratings = user_predictions["predicted_ratings"]

    # `predict` hands back the raw reply when it could not be parsed.
    if not isinstance(predicted_ratings, list):
        raise TypeError(
            f"unparsed model response stored for inner user id {user_id!r}: "
            f"{predicted_ratings!r}"
        )

    # Align the reply with the requested movies: missing ratings are padded
    # with 0 and surplus ratings are dropped. A new list is built so the
    # cached `predictions` entry is left untouched.
    n_movies = len(movies_to_predict)
    n_missing = max(n_movies - len(predicted_ratings), 0)
    aligned_ratings = (predicted_ratings + [0] * n_missing)[:n_movies]

    predicted_by_raw_ids[to_raw_user_id(user_id)] = {
        to_raw_movie_id(movie_id): predicted_rating
        for movie_id, predicted_rating in zip(movies_to_predict, aligned_ratings)
    }


# Pair every held-out rating that received a prediction with its estimate.
processed_predictions = [
    Prediction(
        uid=raw_user_id,
        iid=raw_movie_id,
        r_ui=rating,
        est=predicted_by_raw_ids[raw_user_id][raw_movie_id],
        details={},
    )
    for raw_user_id, raw_movie_id, rating in test_ratings
    if raw_user_id in predicted_by_raw_ids and raw_movie_id in predicted_by_raw_ids[raw_user_id]
]

In [497]:
# Score the LLM-based predictions with the same two metrics used for the
# baselines. `processed_predictions` only contains held-out ratings for which
# a prediction was stored, so the metrics cover that subset of the test set.
accuracy.rmse(processed_predictions)
accuracy.mae(processed_predictions)

RMSE: 1.3855
MAE:  1.0230


1.0229663394109396