In [5]:
import json
import os

import numpy as np
import pandas as pd
import tqdm

from sklearn.metrics.pairwise import cosine_similarity

## User-based Collaborative Filtering

#### Основная идея: 
Рекомендовать пользователю треки, которые понравились похожим на него пользователям

$$\hat r_{ui} = h^{-1} \left( \frac{\sum_{v \in N_i(u)} w_{uv} h(r_{vi})}{\sum_{v \in N_i(u)} w_{uv}} \right)$$

$N_i(u)$ - соседи пользователя $u$, которые оценили айтем $i$,
$w_{uv}, w_{ij}$ - веса соседей, 
$h$ - функция нормализации



**Нормализация**: в качестве нормализации вычитаем среднее время прослушивания пользователя: $h(r_{ui}) = r_{ui} - \bar r_u$

**Веса**: Похожих пользователей будем искать по *cosine similarity*

**Отсутствующие данные**: заполним средним временем прослушивания по пользователю

**Соседи**: в качестве соседей будем рассматривать всех пользователей. Q: Как это упростит формулу?

In [6]:
# JSON-lines interaction log to load (despite the name, this is a file path).
BOTIFY_DATA_DIR = "/Users/aleksandr/Desktop/recsys_made2023/botify_recsys/log/experiments/random/data.json"

# Read the whole log, then keep an independent copy of just the three
# columns used in this notebook.
data = (
    pd.read_json(BOTIFY_DATA_DIR, lines=True)
    .loc[:, ["user", "time", "track"]]
    .copy()
)

data.head()

Unnamed: 0,user,time,track
0,404,1.0,1084
1,404,1.0,1084
2,404,1.0,1084
3,404,1.0,1084
4,404,0.0,487


In [7]:
def _center_on_mean(listen_times):
    """Return the values shifted so that their own mean becomes zero."""
    return listen_times - listen_times.mean()


# Normalise every listening time by the mean of the same user's times;
# the result is stored as a new column on `data`.
data["normalized_time"] = data.groupby("user")["time"].transform(_center_on_mean)

data.head()

Unnamed: 0,user,time,track,normalized_time
0,404,1.0,1084,0.83
1,404,1.0,1084,0.83
2,404,1.0,1084,0.83
3,404,1.0,1084,0.83
4,404,0.0,487,-0.17


In [8]:
# User x track matrix of normalised listening times. pivot_table averages
# repeated (user, track) pairs (its default aggregation). Pairs that never
# occur come out as NaN and are replaced by 0, i.e. by the user's own mean
# on the normalised scale.
interactions = pd.pivot_table(
    data,
    values="normalized_time",
    index="user",
    columns="track",
).fillna(0)

# Share of non-zero cells. This is the matrix *density* (sparsity would be
# 1 - density); the message used to label this number as "sparsity".
density = (interactions != 0).values.sum() / interactions.size
print(f"Interactions matrix: shape={interactions.shape}, density={density}")

Interactions matrix: shape=(9443, 49397), sparsity=0.0005827132144725434


In [9]:
# Pairwise cosine similarity between the users' rows of `interactions`.
similarity_matrix = cosine_similarity(interactions)
# Zero the diagonal in place so that no user is counted as their own neighbour.
np.fill_diagonal(similarity_matrix, 0)

# Number of strictly positive similarities in each user's row.
positive_per_user = (similarity_matrix > 0).sum(axis=1)
print(f"Mean positive neighbours per user: {positive_per_user.mean()}")

Mean positive neighbours per user: 116.22429312718415


In [10]:
# Number of strictly negative similarities in each user's row.
negative_per_user = (similarity_matrix < 0).sum(axis=1)
print(f"Mean negative neighbours per user: {negative_per_user.mean()}")

Mean negative neighbours per user: 62.69215291750503


In [11]:
# TODO: Compute proper user-based scores
# TODO: expected size: observed users x observed tracks
# Current scores are the similarity-weighted sum of all other users'
# normalised times for each track. The division by the sum of weights and
# the inverse normalisation are not applied here.
scores_matrix = similarity_matrix @ interactions.to_numpy()

# Re-attach the user / track labels of the interactions matrix.
scores = pd.DataFrame(scores_matrix, index=interactions.index, columns=interactions.columns)

scores.loc[:, [1, 2, 3, 4, 5]].head()

track,1,2,3,4,5
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0


## Глянем на рекомендации

In [14]:
# Track catalogue (JSON lines), re-indexed by its "track" column so that it
# can be joined on the track ids used as column labels elsewhere.
products = pd.read_json("/Users/aleksandr/Desktop/recsys_made2023/botify_recsys/data/tracks.json", lines=True)
products = products.set_index("track")
products.head()

Unnamed: 0_level_0,artist,title
track,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Jack Johnson,The Cove
1,Billy Preston,Nothing from Nothing
2,Paco De Lucia,Entre Dos Aguas
3,Josh Rouse,Under Cold Blue Stars
4,The Dead 60s,Riot Radio (Soundtrack Version)


In [19]:
k = 10  # how many top-scored tracks to show for the chosen user
# Pick one user at random; no seed is set, so the choice differs between runs.
user = np.random.choice(scores.index)

# All logged interactions of the chosen user.
data.loc[data["user"] == user]

Unnamed: 0,user,time,track,normalized_time
191555,942,1.0,27804,0.718333
191557,942,0.02,19715,-0.261667
191560,942,0.29,21320,0.008333
191564,942,0.34,1781,0.058333
191567,942,0.03,33982,-0.251667
191569,942,0.01,5474,-0.271667


In [20]:
# The k best-scored tracks of the chosen user, as a one-column frame.
top_scores = scores.loc[user].sort_values(ascending=False).iloc[:k].to_frame("score")

# Attach the catalogue columns by track id; tracks missing from the
# catalogue are dropped by the inner join.
user_scores = top_scores.merge(products, left_index=True, right_index=True, how="inner")

user_scores

Unnamed: 0_level_0,score,artist,title
track,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1781,0.057882,The Temper Trap,Soldier On
2416,0.04854,Ratt,Lay It Down (2007 Remastered)
11382,0.034651,Slightly Stoopid,This Joint
17625,0.034651,Spoon,I Saw The Light
4675,0.034651,Pepper,Face Plant (LP Version)
47910,0.029502,Wolf Parade,Soldier's Grin
159,0.029502,Dwight Yoakam,You're The One
16119,0.029136,The Coffee Club Orchestra,Overture
30803,0.028631,Lisa Loeb,Furious Rose
8784,0.028631,The Smiths,These Things Take Time


In [21]:
# The chosen user's row of the interactions matrix (normalised times),
# highest first, stored under the column name "time".
listened = interactions.loc[user].sort_values(ascending=False).to_frame("time")

# Attach the catalogue columns by track id (inner join on the index).
user_interactions = listened.merge(products, left_index=True, right_index=True, how="inner")

# Hide the zero cells, which include every track the user never played.
user_interactions.loc[user_interactions["time"] != 0]

Unnamed: 0_level_0,time,artist,title
track,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
27804,0.718333,Kenny Rogers,Misty
1781,0.058333,The Temper Trap,Soldier On
21320,0.008333,Blake Shelton,That's What I Call Home (Album Version)
33982,-0.251667,Michael Jackson,"Childhood (Theme From ""Free Willy 2"")"
19715,-0.261667,Okkervil River,Pop Lie
5474,-0.271667,Black Eyed Peas,What It Is


## Подготавливаем рекомендации для продакшена

In [22]:
def recommend(user_id, scores, k):
    """Return the labels of the k highest-scored columns for one user.

    Args:
        user_id: Row label of the user in ``scores``.
        scores: DataFrame with one row per user and one column per track.
        k: Maximum number of tracks to return.

    Returns:
        A plain list of column labels of ``scores``, ordered by descending
        score. No tracks are filtered out, so items the user has already
        interacted with may be included.
    """
    ranked = scores.loc[user_id].sort_values(ascending=False)
    top_k = ranked.iloc[:k]
    return top_k.index.tolist()

In [23]:
users = data["user"].unique()

# BOTIFY_DATA_DIR holds the path of the log *file*, so appending a file name
# to it produced ".../data.jsonrecommendations_ub.json". Write the output
# next to the log file instead.
recommendations_path = os.path.join(
    os.path.dirname(BOTIFY_DATA_DIR), "recommendations_ub.json"
)

# One JSON object per line: {"user": <id>, "tracks": [<top-100 track ids>]}.
with open(recommendations_path, "w") as rf:
    # The loop variable is named `user_id` so that it does not overwrite the
    # `user` selected for inspection earlier in the notebook.
    for user_id in tqdm.tqdm(users):
        recommendation = {
            # int() turns the numpy integer into a JSON-serialisable int.
            "user": int(user_id),
            "tracks": recommend(user_id, scores, 100)
        }
        rf.write(json.dumps(recommendation) + "\n")

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9443/9443 [00:20<00:00, 462.08it/s]
