In [1]:
import numpy as np
import polars as pl
from tqdm import tqdm

from typing import List, Any

import scipy.sparse as sp
from sklearn.model_selection import train_test_split

import random
from collections import Counter

In [3]:
data = pl.read_parquet('train.parquet')
# датафрейм с обратными ребрами
data_rev = (
    data
    .rename({'uid': 'friend_uid', 'friend_uid': 'uid'})
    .select('uid', 'friend_uid')
)

# соединим все в один граф
data = pl.concat([data, data_rev])
data

uid,friend_uid
i64,i64
93464,114312
93464,103690
93464,108045
93464,116128
93464,94113
93464,101668
93464,118820
93464,93617
93464,97587
93464,101941


Данные состоят из двух колонок:

- `uid` – идентификатор пользователя
- `friend_uid` – идентификатор друга этого пользователя

Нашей задачей будет порекомендовать возможных друзей, для оценки вашего решения будет использоваться метрика Recall@10, равная проценту верно угаданных друзей

In [16]:
TOP_K = 10
RANDOM_STATE = 42

SUBMISSION_PATH = 'submission.parquet'


def user_intersection(y_rel: List[Any], y_rec: List[Any], k: int = 10) -> int:
    """
    :param y_rel: relevant items
    :param y_rec: recommended items
    :param k: number of top recommended items
    :return: number of items in intersection of y_rel and y_rec (truncated to top-K)
    """
    return len(set(y_rec[:k]).intersection(set(y_rel)))


def user_recall(y_rel: List[Any], y_rec: List[Any], k: int = 10) -> float:
    """
    :param y_rel: relevant items
    :param y_rec: recommended items
    :param k: number of top recommended items
    :return: percentage of found relevant items through recommendations
    """
    return user_intersection(y_rel, y_rec, k) / min(k, len(set(y_rel)))

## Валидация

Так как у нас нет временной последовательности и рекомендации друзей не так сильно зависят от временной составляющей, в качестве можно использовать случайно выбранные ребра в графе (при этом для каждого пользователя будет равная пропорция друзей в валидации, которую можно достичь с помощью stratify параметра)

In [4]:
# зафиксируем генератор случайных чисел
random.seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)

In [5]:
# отфильтруем тех пользователей, у которых только один друг :(
# для того, чтобы в тренировочной выборке и валидации было хотя бы по одному другу
friends_count = data.groupby('uid').count()
filtered_uid = set(friends_count.filter(pl.col('count') > 1)['uid'].to_list())

data_filtered = data.filter(pl.col('uid').is_in(filtered_uid))

# случайно выбираем ребра для валидационной выборки
train_df, test_df = train_test_split(
    data_filtered.filter(pl.col('uid').is_in(filtered_uid)),
    stratify=data_filtered['uid'],
    test_size=0.1,
    random_state=RANDOM_STATE
)

train_df

uid,friend_uid
i64,i64
62053,63575
31895,59356
97127,32271
89,11703
105178,47188
116127,52662
33824,15235
23690,103992
94660,45709
20872,60890


## Бейзлайн (Random)

In [6]:
grouped_df = (
    test_df
    .groupby('uid')
    .agg(pl.col('friend_uid').alias('y_rel'))
    .join(
        train_df
        .groupby('uid')
        .agg(pl.col('friend_uid').alias('user_history')),
        'uid',
        how='left'
    )
)

median_seq_len = int(grouped_df['user_history'].apply(len).median())
print(f"среднее число uid в user_history: {median_seq_len}")

среднее число uid в user_history: 36


In [7]:
n_users = train_df['uid'].max() + 1

# количество друзей у каждого пользователя
friends_count = np.zeros(n_users)
for uid, count in Counter(train_df['uid']).items():
    friends_count[uid] = count
    
friends_count /= sum(friends_count)

In [8]:
recall_list = []
recs = np.random.choice(n_users, size=(n_users, TOP_K + median_seq_len), p=friends_count)

for user_id, y_rel, user_history in tqdm(grouped_df.rows()):
    y_rec = [uid for uid in recs[user_id] if uid not in user_history]
    recall_list.append(user_recall(y_rel, y_rec))
    
print(f'Recall@{TOP_K} = {np.mean(recall_list)}')

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 92562/92562 [00:04<00:00, 20068.17it/s]

Recall@10 = 0.0003110784946203369





## Построим рекомендации

In [9]:
# посчитаем вероятности уже по всем имеющимся данным
n_users = data['uid'].max() + 1

# количество друзей у каждого пользователя
friends_count = np.zeros(n_users)
for uid, count in Counter(data['uid']).items():
    friends_count[uid] = count
    
friends_count /= sum(friends_count)

In [10]:
sample_submission = pl.read_parquet('sample_submission.parquet')

grouped_df = (
    sample_submission.select('uid')
    .join(
        train_df
        .groupby('uid')
        .agg(pl.col('friend_uid').alias('user_history')),
        'uid',
        how='left'
    )
)

submission = []
recs = np.random.choice(n_users, size=(n_users, TOP_K + median_seq_len), p=friends_count)

for user_id, user_history in tqdm(grouped_df.rows()):
    user_history = [] if user_history is None else user_history
    
    y_rec = [uid for uid in recs[user_id] if uid not in user_history]
    submission.append((user_id, y_rec))
    
submission = pl.DataFrame(submission, schema=['user_id', 'y_recs'])
submission.write_parquet('submission.parquet')
submission

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 85483/85483 [00:04<00:00, 18203.32it/s]


user_id,y_recs
i64,list[i64]
0,"[75174, 40482, … 107746]"
1,"[27663, 82181, … 37095]"
3,"[105454, 12906, … 43868]"
4,"[2627, 60169, … 108457]"
5,"[29164, 53357, … 33803]"
6,"[62015, 44506, … 3835]"
7,"[74067, 79534, … 53438]"
8,"[75913, 31803, … 32356]"
9,"[26079, 73565, … 111629]"
10,"[46650, 34491, … 116342]"


## Решение

Для установки torch-geometric пришлось ломать текущие зависимости в pyproject, устанавливал примерно так
!pip install torch==2.2.*
!pip install numpy==1.24.3
!pip install --force-reinstall --no-cache-dir pyg_lib torch_scatter torch_sparse torch_cluster torch_spline_conv -f https://data.pyg.org/whl/torch-2.2.0+cu121.html

Сначала решил попробовать простой вариант через представление юзера как мешка его друзей и затем поиска ближайших к нему по косиносному расстоянию

In [127]:
import numpy as np
import pandas as pd
import scipy.sparse as sp
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

data = pd.read_parquet('train.parquet')

friends_count = data['uid'].value_counts().reset_index()
friends_count.columns = ['uid', 'count']
filtered_uid = friends_count[friends_count['count'] > 1]['uid']
data_filtered = data[data['uid'].isin(filtered_uid)]

unique_uids = pd.concat([data_filtered['uid'], data_filtered['friend_uid']]).unique()
uid_to_index = {uid: idx for idx, uid in enumerate(unique_uids)}
index_to_uid = {idx: uid for idx, uid in enumerate(unique_uids)}

train_df, test_df = train_test_split(
    data_filtered,
    stratify=data_filtered['uid'],
    test_size=0.1,
    random_state=42
)

train_df['uid'] = train_df['uid'].map(uid_to_index)
train_df['friend_uid'] = train_df['friend_uid'].map(uid_to_index)

test_df['uid'] = test_df['uid'].map(uid_to_index)
test_df['friend_uid'] = test_df['friend_uid'].map(uid_to_index)


rows = []
cols = []
values = []

for uid, friend_uid in train_df[['uid', 'friend_uid']].values:
    rows.append(uid)
    cols.append(friend_uid)
    values.append(1)
    
num_users = len(unique_uids)
sparse_data = sp.csr_matrix((values, (rows, cols)), shape=(num_users, num_users))

In [128]:
user_similarity = cosine_similarity(sparse_data, dense_output=False)

def generate_recommendations_sparse(user_similarity, user_to_friends, top_k=10):
    recommendations = {}
    for user in tqdm(range(user_similarity.shape[0]), total=user_similarity.shape[0]):
        similarities = user_similarity[user].toarray().flatten()
        friends = user_to_friends.get(user, set())
        candidate_users = np.argpartition(-similarities, top_k)[:top_k]
        candidate_users = candidate_users[np.argsort(-similarities[candidate_users])]
        candidate_users = [uid for uid in candidate_users if uid != user and uid not in friends][:top_k]
        recommendations[user] = candidate_users
    return recommendations

user_to_friends = {uid: set(group['friend_uid']) for uid, group in train_df.groupby('uid')}
recommendations = generate_recommendations_sparse(user_similarity, user_to_friends, top_k=20)

100%|██████████| 106092/106092 [01:07<00:00, 1564.66it/s]


In [129]:
def user_recall(y_rel, y_rec, k=10):
    return len(set(y_rec[:k]).intersection(set(y_rel))) / min(k, len(set(y_rel)))

recall_scores = []
test_true_friends = {user: set(group['friend_uid']) for user, group in test_df.groupby('uid')}

for user_id in test_true_friends:
    if user_id in recommendations:
        y_true = list(test_true_friends[user_id])
        y_pred = recommendations[user_id]
        recall_scores.append(user_recall(y_true, y_pred, k=20))

mean_recall_at_20 = np.mean(recall_scores)
print(f'Recall@20 = {mean_recall_at_10:.4f}')

Recall@20 = 0.1100


затем попробовал для пользователя получить эмбеддинг с помощью node2vec и искать ближайших к нему

In [130]:
import numpy as np
import polars as pl
import pandas as pd
from tqdm import tqdm


from typing import List, Any

import scipy.sparse as sp
from sklearn.model_selection import train_test_split

import random
from collections import Counter

import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

import torch
from torch.nn import Linear
import torch.nn.functional as F

from torch_geometric.datasets import Planetoid, MovieLens
from torch_geometric.nn import Node2Vec, SAGEConv, LightGCN, to_hetero
from torch_geometric.data import Data, HeteroData
from torch_geometric.utils import degree
import torch_geometric.transforms as T

import warnings
warnings.filterwarnings('ignore')

from tqdm import tqdm

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'device: {device}')

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
random.seed(RANDOM_STATE)

data = pd.read_parquet('train.parquet')

friends_count = data['uid'].value_counts().reset_index()
friends_count.columns = ['uid', 'count']

filtered_uid = friends_count[friends_count['count'] > 1]['uid']

data_filtered = data[(data['uid'].isin(filtered_uid))]

unique_uids = pd.concat([data_filtered['uid'], data_filtered['friend_uid']]).unique()
uid_to_index = {uid: idx for idx, uid in enumerate(unique_uids)}
index_to_uid = {idx: uid for idx, uid in enumerate(unique_uids)}

train_df, test_df = train_test_split(
    data_filtered,
    stratify=data_filtered['uid'],
    test_size=0.1,
    random_state=42
)

train_df['uid'] = train_df['uid'].map(uid_to_index)
train_df['friend_uid'] = train_df['friend_uid'].map(uid_to_index)

test_df['uid'] = test_df['uid'].map(uid_to_index)
test_df['friend_uid'] = test_df['friend_uid'].map(uid_to_index)

train_edges = torch.tensor([[uid, friend_uid] for uid, friend_uid in train_df.values], dtype=torch.long).T
test_edges = torch.tensor([[uid, friend_uid] for uid, friend_uid in test_df.values], dtype=torch.long).T


graph_data = Data(edge_index=train_edges)
graph_data = T.ToUndirected()(graph_data)

device: cuda


In [None]:
device = torch.device('cuda:3' if torch.cuda.is_available() else 'cpu')

model = Node2Vec(
    graph_data.edge_index,
    embedding_dim=128,
    walk_length=30,
    context_size=10,
    walks_per_node=10,
    num_negative_samples=1,
    p=1.0,
    q=1.0,
    sparse=True,
).to(device)

# если пользовать num_workers > 1 выдает Unexpected bus error encountered in worker. This might be caused by insufficient shared memory (shm).
# хз почему
loader = model.loader(batch_size=512, shuffle=True, num_workers=1)
optimizer = torch.optim.SparseAdam(list(model.parameters()), lr=0.01)

def train():
    model.train()
    total_loss = 0
    for pos_rw, neg_rw in loader:
        optimizer.zero_grad()
        loss = model.loss(pos_rw.to(device), neg_rw.to(device))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)



for epoch in range(1, 20):
    loss = train()
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')


In [132]:
import faiss
import numpy as np
import torch
from tqdm import tqdm 

def get_recommendations(user_embs: np.array, item_embs: np.array, k: int = 10, batch_size: int = 1024):
    index = faiss.IndexFlatIP(item_embs.shape[1])
    
    faiss.normalize_L2(item_embs)
    index.add(item_embs)
    
    all_distances = []
    all_indices = []
    
    num_batches = (len(user_embs) + batch_size - 1) // batch_size
    with tqdm(total=num_batches) as pbar:
        for i in range(0, len(user_embs), batch_size):
            batch_user_embs = user_embs[i:i + batch_size]
            faiss.normalize_L2(batch_user_embs)
            distances, indices = index.search(batch_user_embs, k)
            all_distances.append(distances)
            all_indices.append(indices)
            
            pbar.update(1)
    
    all_distances = np.concatenate(all_distances, axis=0)
    all_indices = np.concatenate(all_indices, axis=0)
    
    return all_indices, all_distances

embeddings = model().cpu().detach().numpy()

recommendations, distances = get_recommendations(embeddings, embeddings, k=60, batch_size=1024)

100%|██████████| 104/104 [00:46<00:00,  2.22it/s]


In [133]:
test_true_friends = {user: set(test_df[test_df['uid'] == user]['friend_uid']) for user in test_df['uid'].unique()}

user_to_friends = {}
for uid, friend_uid in train_df.values:
    if uid not in user_to_friends:
        user_to_friends[uid] = set()
    user_to_friends[uid].add(friend_uid)
recall_scores = []

TOP_K = 10
RANDOM_STATE = 42

SUBMISSION_PATH = 'submission.parquet'


def user_intersection(y_rel: List[Any], y_rec: List[Any], k: int = 10) -> int:
    """
    :param y_rel: relevant items
    :param y_rec: recommended items
    :param k: number of top recommended items
    :return: number of items in intersection of y_rel and y_rec (truncated to top-K)
    """
    return len(set(y_rec[:k]).intersection(set(y_rel)))


def user_recall(y_rel: List[Any], y_rec: List[Any], k: int = 10) -> float:
    """
    :param y_rel: relevant items
    :param y_rec: recommended items
    :param k: number of top recommended items
    :return: percentage of found relevant items through recommendations
    """
    return user_intersection(y_rel, y_rec, k) / min(k, len(set(y_rel)))

for user_id in test_true_friends:
    if user_id in user_to_friends:
        y_true = list(test_true_friends[user_id])
        
        y_pred = [rec for rec in recommendations[user_id] if rec not in user_to_friends[user_id] and rec != user_id]
        
        recall_scores.append(user_recall(y_true, y_pred, k=20))

mean_recall_at_10 = np.mean(recall_scores)
print(f'Recall@10 = {mean_recall_at_10:.4f}')

Recall@10 = 0.0740


In [None]:
Сабмишн так делал

In [None]:
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from tqdm import tqdm
import polars as pl

train_df = pd.read_parquet('train.parquet')
unique_users = pd.concat([train_df['uid'], train_df['friend_uid']]).unique()
user_index = {user: idx for idx, user in enumerate(unique_users)}

index_user = {idx: user for user, idx in user_index.items()}

rows = train_df['uid'].map(user_index)
cols = train_df['friend_uid'].map(user_index)
data = np.ones(len(train_df), dtype=int)


n_users = len(unique_users)
sparse_matrix = csr_matrix((data, (rows, cols)), shape=(n_users, n_users))

user_similarity = cosine_similarity(sparse_matrix, dense_output=False)

from collections import defaultdict

user_to_friends = defaultdict(set)

for uid, friend_uid in tqdm(zip(train_df['uid'], train_df['friend_uid']), total=len(train_df)):
    user_to_friends[user_index[uid]].add(user_index[friend_uid])
def generate_recommendations_sparse(user_similarity, user_to_friends, top_k=10):
    recommendations = {}
    for user in tqdm(range(user_similarity.shape[0]), total=user_similarity.shape[0]):
        similarities = user_similarity[user].toarray().flatten()
        friends = user_to_friends.get(user, set())
        candidate_users = np.argpartition(-similarities, top_k)[:top_k]
        candidate_users = candidate_users[np.argsort(-similarities[candidate_users])]
        candidate_users = [uid for uid in candidate_users if uid != user and uid not in friends][:top_k]
        recommendations[user] = candidate_users
    return recommendations

recommendations = generate_recommendations_sparse(user_similarity, user_to_friends, top_k=100)
sample_submission = pl.read_parquet('sample_submission.parquet')

submission = []
for user_id in tqdm(sample_submission['uid'].to_list()):
    user_idx = user_index[user_id]
    y_rec = [index_user[rec] for rec in recommendations.get(user_idx, [])]
    submission.append((user_id, y_rec))

submission_df = pl.DataFrame(submission, schema=['user_id', 'y_recs'])

submission_df.write_parquet('submission.parquet')