In [45]:
import os
import re
import torch
import numpy as np
import pandas as pd
import tqdm.notebook as tqdm
import pickle
import requests
from io import StringIO
from openai import OpenAI
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import classification_report

from surprise import accuracy, Dataset, SVD, Prediction, Trainset
from surprise.model_selection import train_test_split as s_train_test_split

with open("openai.secret") as f:
    os.environ["OPENAI_API_KEY"] = f.read().strip()

## Data

In [46]:
Rating = tuple[int | str, int | str, float]
UserHistory = AggHistory = dict[int, float]
InContextExample = dict[str, int | list[float] | list[int] | dict[int, UserHistory]]

ratings = Dataset.load_builtin("ml-100k")
train_ratings, test_ratings = s_train_test_split(ratings, test_size=0.2, random_state=0)
train_mean_rating = np.mean([r for _, _, r in train_ratings.all_ratings()])

r = requests.get("https://files.grouplens.org/datasets/movielens/ml-100k/u.item")
movie_ids = (
    pd
    .read_csv(StringIO(r.text), sep="|", header=None, usecols=[0, 1])
    .rename(columns={0: "movie_id", 1: "title"})
)
movie_ids["title"] = movie_ids["title"].str[:-7]
movie_ids = movie_ids.set_index("movie_id")["title"].to_dict()


def to_inner_user_id(raw_user_id: str) -> int | None:
    try:
        return train_ratings.to_inner_uid(raw_user_id)
    except ValueError:
        return None
    
    
def to_inner_movie_id(raw_movie_id: str) -> int | None:
    try:
        return train_ratings.to_inner_iid(raw_movie_id)
    except ValueError:
        return None
    

def to_raw_movie_id(inner_movie_id: int) -> str | None:
    try:
        return train_ratings.to_raw_iid(inner_movie_id)
    except ValueError:
        return None


def inner_to_movie_title(inner_movie_id: int) -> str:
    raw_movie_id = int(to_raw_movie_id(inner_movie_id))
    return movie_ids[raw_movie_id]

In [47]:
def eval_classification(predictions: list[Prediction]) -> None:
    y_true = [int(p.r_ui) for p in predictions]
    y_pred = [round(p.est) for p in predictions]
    print(classification_report(y_true, y_pred))

## Baseline SVD

In [48]:
recsys = SVD()
recsys.fit(train_ratings)

baseline_predictions = recsys.test(test_ratings)

accuracy.rmse(recsys.test(test_ratings))
accuracy.mae(recsys.test(test_ratings))

RMSE: 0.9439
MAE:  0.7444


0.7443740204430647

## Baseline Random

In [49]:
random_predictions = [
    Prediction(
        uid=raw_user_id,
        iid=raw_movie_id,
        r_ui=rating,
        est=np.random.randint(1, 6),
        details={},
    )
    for raw_user_id, raw_movie_id, rating  in test_ratings
]

accuracy.rmse(random_predictions)
accuracy.mae(random_predictions)

RMSE: 1.8967
MAE:  1.5199


1.51995

## LLM + SVD + RAG

In [50]:
def get_user_item(ratings: list[Rating]) -> pd.DataFrame:
    return (
        pd
        .DataFrame(ratings, columns=["user_id", "movie_id", "rating"])
        .groupby(["user_id", "movie_id"])
        .agg(rating=("rating", "sum"))
        .reset_index()
        .pivot(index="user_id", columns="movie_id", values="rating")
        .fillna(0)
        .sort_index()
    )


def get_user_histories(user_item: pd.DataFrame) -> dict[int, UserHistory]:
    return user_item.apply(lambda r: dict(r[r > 0].items()), axis=1).to_dict()

In [51]:
# using inner ids from now on
train_user_item = get_user_item(train_ratings.all_ratings())
n_dims = 100
u, s, v = np.linalg.svd(train_user_item, full_matrices=True)
train_user_embeddings = u[:, :n_dims]
train_user_histories = get_user_histories(train_user_item)

In [52]:
train_knn = NearestNeighbors(n_neighbors=20, metric="cosine")
train_knn.fit(train_user_embeddings)

In [53]:
def get_nearest_user_histories(
    user_id: int,
    knn: NearestNeighbors,
    user_embeddings: np.array,
    user_histories: dict[int, UserHistory]
) -> dict[int, UserHistory]:
    
    vec = user_embeddings[user_id, np.newaxis]
    nearest_user_ids = knn.kneighbors(vec, return_distance=False)
    return {
        near_user_id: user_histories[near_user_id]
        for near_user_id in nearest_user_ids[0].tolist()
        if near_user_id != user_id
    }

In [54]:
def agg_user_histories(
    user_histories: dict[int, UserHistory],
) -> AggHistory:
    return pd.DataFrame(user_histories).mean(axis=1).to_dict()

In [55]:
def get_movies_to_predict(
    user_id: int,
    test_ratings: list[Rating],
) -> list[int]:

    return [
        to_inner_movie_id(raw_movie_id) for raw_user_id, raw_movie_id, _ in test_ratings
        if to_inner_user_id(raw_user_id) == user_id and to_inner_movie_id(raw_movie_id) is not None
    ]

In [56]:
def filter_agg_history(
    agg_history: AggHistory, movies_to_predict: list[int]
) -> AggHistory:
    
    return {
        movie_id: rating
        for movie_id, rating in agg_history.items()
        if movie_id in movies_to_predict
    }

In [57]:
def get_in_context_example(
    knn: NearestNeighbors,
    user_embeddings: np.array,
    user_histories: dict[int, UserHistory],
    censoring: float = 0.5
) -> InContextExample:

    user_id = np.random.randint(max(user_histories.keys()))
    user_history = user_histories[user_id]
    censored_user_history = {
        movie_id: rating
        for movie_id, rating in user_history.items()
        if np.random.rand() > 0.5
    }
    complement_user_history = {
        movie_id: rating for movie_id, rating in user_history.items()
        if movie_id not in censored_user_history
    }
    movies_to_predict = list(complement_user_history.keys())
    true_ratings = list(complement_user_history.values())
    
    nearest_user_histories=get_nearest_user_histories(
        user_id=user_id,
        knn=knn,
        user_embeddings=user_embeddings,
        user_histories=user_histories,
    )
    
    return {
        "user_id": user_id,
        "censored_user_history": censored_user_history,
        "nearest_user_histories": nearest_user_histories,
        "movies_to_predict": movies_to_predict,
        "true_ratings": true_ratings
    }

In [58]:
def fmt_list(x: list[int]) -> str:
    return str(x).strip("[]")


def fmt_movies_to_predict(movies_to_predict: list[int]) -> str:
    return "; ".join(inner_to_movie_title(movie_id) for movie_id in movies_to_predict)


def fmt_history(history: UserHistory | AggHistory) -> str:
    return "; ".join(
        f"{inner_to_movie_title(movie_id)}: {rating}"
        for movie_id, rating in history.items()
    )


def fmt_in_context_example(in_context_example: InContextExample) -> str:
    
    user_history = in_context_example["censored_user_history"]
    nearest_user_histories = in_context_example["nearest_user_histories"]
    movies_to_predict = in_context_example["movies_to_predict"]
    agg_history = agg_user_histories(nearest_user_histories)
    agg_history = filter_agg_history(agg_history, movies_to_predict)
    true_ratings = in_context_example["true_ratings"]

    return f"""
        User history: {fmt_history(user_history)}.
        Average ratings from users with similar preferences: {fmt_history(agg_history)}
        Movies for which to predict the ratings: {fmt_movies_to_predict(movies_to_predict)}
        Correct response with the predicted ratings: {fmt_list(true_ratings)}
    """

In [59]:
def get_instruction(
    user_history: UserHistory,
    nearest_user_histories: dict[int, UserHistory],
    movies_to_predict: list[int],
    in_context_examples: list[InContextExample],
) -> str:
    
    formatted_in_context_examples = "example:\n".join(
        fmt_in_context_example(in_context_example)
        for in_context_example in in_context_examples
    )
    
    agg_history = agg_user_histories(nearest_user_histories)
    agg_history = filter_agg_history(agg_history, movies_to_predict)

    return f"""
        A user has seen and rated the following movies: {fmt_history(user_history)}.

        Users with similar preferences have seen and rated these movies:
        {fmt_history(agg_history)}.

        Each semicolon-separated value in the lists above represents a movie id followed
        by a numerical user rating.

        Use the data above to predict the ratings the user has given to
        the movies: {fmt_movies_to_predict(movies_to_predict)}. Give exactly one rating
        to each of the movies.
        
        Respond with just a list of ratings and nothing else. Your response needs
        to have exactly {len(movies_to_predict)} ratings.
        
        Use the following examples to learn how to predict the ratings:
        {formatted_in_context_examples}
    """.replace("    ", "")

In [60]:
def parse_response(response: str) -> list[float]:
    
    strip = ",.[]` "
    response = response.replace("plaintext\n", "").replace(";", ",")
    if "\n" in response:
        numbers = (line.strip(strip)[-3:] for line in response.split("\n"))
        numbers = filter(lambda n: n != "", numbers)
    else:
        numbers = response.strip(strip).split(",")
        
    return list(map(float, numbers))

In [61]:
def predict(instruction: str, model: str) -> list[float]:
    completion = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "user", "content": instruction}
        ]
    )
    
    response = completion.choices[0].message.content
    try:
        return parse_response(response)
    except Exception:
        return response

In [62]:
def predict_all(
    model: str,
    output_file_path: str,
    train_user_item: pd.DataFrame,
    train_user_histories: list[UserHistory],
    train_user_embeddings: np.array,
    train_knn: NearestNeighbors,
    test_ratings: list[Rating],
) -> list[Prediction]:
    
    if not os.path.exists(output_file_path):
        predictions = {}
    else:
        with open(output_file_path, "rb") as f:
            predictions = pickle.load(f)

    for user_id in tqdm.tqdm(train_user_item.index.tolist()):

        if user_id in predictions:
            continue

        movies_to_predict = get_movies_to_predict(
            user_id=user_id,
            test_ratings=test_ratings,
        )

        if not movies_to_predict:
            continue

        user_history=train_user_histories[user_id]
        nearest_user_histories=get_nearest_user_histories(
            user_id=user_id,
            knn=train_knn,
            user_embeddings=train_user_embeddings,
            user_histories=train_user_histories,
        )
        
        in_context_examples = [
            get_in_context_example(
                knn=train_knn,
                user_embeddings=train_user_embeddings,
                user_histories=train_user_histories
            )
            for _ in range(3)
        ]

        instruction = get_instruction(
            user_history=user_history,
            nearest_user_histories=nearest_user_histories,
            movies_to_predict=movies_to_predict,
            in_context_examples=in_context_examples,
        )

        predicted_ratings = predict(instruction, model)
        predictions[user_id] = {
            "predicted_ratings": predicted_ratings,
            "movies_to_predict": movies_to_predict
        }

        with open(output_file_path, "wb") as f:
            pickle.dump(predictions, f)
            

    processed_predictions = {}
    to_raw_user_id = train_ratings.to_raw_uid
    to_raw_movie_id = train_ratings.to_raw_iid

    for user_id in predictions.keys():
        movies_to_predict = predictions[user_id]["movies_to_predict"]
        predicted_ratings = predictions[user_id]["predicted_ratings"]

        if not isinstance(predicted_ratings, list):
            raise Exception(user_id)

        len_movies = len(movies_to_predict)
        len_pred = len(predicted_ratings)

        predicted_ratings = [
            r if r > 0 else train_mean_rating for r in predicted_ratings
        ]

        if len_movies > len_pred:
            predicted_ratings += [train_mean_rating] * (len_movies - len_pred)

        if len_pred > len_movies:
            predicted_ratings = predicted_ratings[:len_movies]

        processed_predictions[to_raw_user_id(user_id)] = {
            to_raw_movie_id(movie_id): predicted_rating
            for movie_id, predicted_rating in zip(movies_to_predict, predicted_ratings)
        }


    return [
        Prediction(
            uid=raw_user_id,
            iid=raw_movie_id,
            r_ui=rating,
            est=processed_predictions[raw_user_id][raw_movie_id],
            details={},
        )
        for raw_user_id, raw_movie_id, rating  in test_ratings
        if raw_user_id in processed_predictions
        and raw_movie_id in processed_predictions[raw_user_id]
    ]

In [68]:
client = OpenAI()
gpt_4o_mini_predictions = predict_all(
    model="gpt-4o-mini",
    output_file_path="predictions_gpt-4o-mini-in-context-learning-no-ids-agg-filter.pkl",
    train_user_item=train_user_item,
    train_user_histories=train_user_histories,
    train_user_embeddings=train_user_embeddings,
    train_knn=train_knn,
    test_ratings=test_ratings,
)

  0%|          | 0/943 [00:00<?, ?it/s]

In [69]:
# gpt 4o mini
accuracy.rmse(gpt_4o_mini_predictions)
accuracy.mae(gpt_4o_mini_predictions)

RMSE: 1.2504
MAE:  0.9544


0.9543837089454019