In [1]:
import numpy as np

# Encoding

In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from spacy.lang.ru.stop_words import STOP_WORDS

# Load the three source tables in a fixed order: interactions, items, users.
df_inter, df_items, df_users = (
    pd.read_csv(csv_path)
    for csv_path in (
        './mod_data/interactions.csv',
        './mod_data/items.csv',
        './mod_data/users.csv',
    )
)

## Items

In [3]:
# Peek at one randomly drawn item row (no seed is passed, so the row varies between runs).
df_items.sample(n=1)

Unnamed: 0,item_id,content_type,title,genres,age_rating,keywords
8787,14665,series,Нити судьбы,"драмы, зарубежные, мелодрамы",12,"Нити, судьбы, 2016, Украина, безответная, любо..."


In [4]:
# keywords extraction
def tokenize(line):
    """Split a ', '-separated string into tokens, discarding noise entries.

    Tokens that are purely numeric, empty, or the literal string 'nan' are
    dropped; the remaining tokens are returned in their original order.
    """
    ignored = {'', 'nan'}
    kept = []
    for token in line.split(', '):
        if token.isnumeric() or token in ignored:
            continue
        kept.append(token)
    return kept

# keywords: TF-IDF over the ', '-separated keyword strings, limited to 100 features
keywords_vectorizer = TfidfVectorizer(
    tokenizer=tokenize,
    token_pattern=None,
    stop_words=list(STOP_WORDS),
    max_features=100,
)
_keywords_text = df_items['keywords'].values.astype('U')
X_keywords = keywords_vectorizer.fit_transform(_keywords_text)

# content type: a single 0/1 column, 1 where the item is a 'film'
_is_film = df_items['content_type'].eq('film')
X_content_type = _is_film.astype(int).values.reshape(-1, 1)

# age rating: one-hot over the rating rendered as a string
age_rating_encoder = OneHotEncoder()
_age_rating_column = df_items['age_rating'].values.astype('U').reshape(-1, 1)
X_age_rating = age_rating_encoder.fit_transform(_age_rating_column).toarray()

# genres: TF-IDF over the ', '-separated genre strings (no feature cap), as a dense array
genres_vectorizer = TfidfVectorizer(token_pattern=None, tokenizer=tokenize)
_genres_text = df_items['genres'].values.astype('U')
X_genres = genres_vectorizer.fit_transform(_genres_text).toarray()

In [5]:
# Sum the TF-IDF weight of every keyword feature over all items and show the ten largest.
keyword_weight_totals = np.squeeze(np.asarray(X_keywords.sum(axis=0)))
keyword_weights = pd.Series(
    data=keyword_weight_totals,
    index=keywords_vectorizer.get_feature_names_out(),
)
keyword_weights.sort_values(ascending=False).head(10)

россия                     2646.475781
соединенные штаты          1650.497265
отношения                  1344.449214
франция                    1185.161904
сша                        1085.736725
ссср                        756.435969
любовь                      633.931426
дружба                      599.568120
соединенное королевство     495.680623
женщины                     490.716083
dtype: float64

In [6]:
# Assemble the item feature matrix by stacking the feature groups column-wise:
# content-type flag, genre TF-IDF, one-hot age rating, keyword TF-IDF.
# `toarray()` gives a plain ndarray; `todense()` returns np.matrix, which np.hstack
# would carry over to X_items.
X_items = np.hstack([
    X_content_type,
    X_genres,
    X_age_rating,
    X_keywords.toarray()
])
X_items.shape

(15963, 202)

## Users

In [7]:
# Peek at one randomly drawn user row (no seed is passed, so the row varies between runs).
df_users.sample(n=1)

Unnamed: 0,user_id,age,income,sex,kids_flg
486457,914210,age_25_34,income_40_60,Ж,0


In [8]:
# age: one-hot over the age value rendered as a string
age_encoder = OneHotEncoder()
_age_column = df_users['age'].values.astype('U').reshape(-1, 1)
X_age = age_encoder.fit_transform(_age_column).toarray()

# income: one-hot over the income value rendered as a string
income_encoder = OneHotEncoder()
_income_column = df_users['income'].values.astype('U').reshape(-1, 1)
X_income = income_encoder.fit_transform(_income_column).toarray()

# sex: a single 0/1 column, 1 where the value equals 'М'
_is_male = df_users['sex'].eq('М')
X_sex = _is_male.astype(int).values.reshape(-1, 1)

In [9]:
# Join all user feature groups column-wise: age one-hot, income one-hot, sex flag, kids flag.
kids_flag_column = df_users['kids_flg'].to_numpy().reshape(-1, 1)
X_users = np.concatenate(
    (X_age, X_income, X_sex, kids_flag_column),
    axis=1,
)
X_users.shape

(840197, 14)

# Model learning

In [102]:
# pip install pyg-lib -f https://data.pyg.org/whl/nightly/torch-2.1.0+cu121.html

import torch
from torch_geometric.data import HeteroData
import torch_geometric.transforms as T
from torch_geometric import EdgeIndex
from torch_geometric.loader import LinkNeighborLoader, NeighborLoader
from torch_geometric.metrics import LinkPredMAP, LinkPredPrecision, LinkPredRecall

from tqdm import tqdm
# import os
# os.environ['PYDEVD_DISABLE_FILE_VALIDATION']='1'
# os.environ['KMP_DUPLICATE_LIB_OK']='True'

# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

## HeteroData, train/test loaders

In [11]:
# Map raw ids to row positions. The feature matrices X_users / X_items are built from
# positional arrays, so enumerate the id columns instead of relying on the DataFrames
# still carrying a default 0..n-1 index.
user_id_to_idx = {user_id: pos for pos, user_id in enumerate(df_users['user_id'])}
item_id_to_idx = {item_id: pos for pos, item_id in enumerate(df_items['item_id'])}

user_idx = df_inter['user_id'].map(user_id_to_idx)
item_idx = df_inter['item_id'].map(item_id_to_idx)

# An id absent from the lookup maps to NaN, which would silently turn the whole edge
# array into floats; fail loudly instead of building a corrupt graph.
n_unmapped = int((user_idx.isna() | item_idx.isna()).sum())
if n_unmapped:
    raise ValueError(
        f'{n_unmapped} interactions reference a user_id or item_id with no feature row'
    )

# Row 0 holds user positions, row 1 holds item positions (one column per interaction).
edges = np.vstack([
    user_idx.to_numpy(dtype=np.int64),
    item_idx.to_numpy(dtype=np.int64)
])

connections = np.ones(edges.shape[1])
# Unix timestamps in whole seconds. Converting through datetime64[s] gives seconds
# regardless of the resolution pandas chose for the parsed dates, whereas dividing the
# raw integers by 10**9 is only correct for nanosecond resolution.
time = (
    pd.to_datetime(df_inter['last_watch_dt'], format='%Y-%m-%d')
    .values.astype('datetime64[s]')
    .astype(np.int64)
)

In [12]:
# Build the heterogeneous user/movie graph.
data = HeteroData()
# Node features as float32 tensors. np.asarray strips a possible np.matrix wrapper, and
# torch.as_tensor with an explicit dtype replaces the legacy torch.Tensor(...) constructor,
# whose output dtype depends on the global default tensor type.
data['movie'].x = torch.as_tensor(np.asarray(X_items), dtype=torch.float32)
data['user'].x = torch.as_tensor(np.asarray(X_users), dtype=torch.float32)
# One ('user', 'watched', 'movie') edge per interaction, with its timestamp.
data['user', 'watched', 'movie'].edge_index = torch.tensor(edges)
data['user', 'watched', 'movie'].time = torch.tensor(time)
# Apply the ToUndirected transform (the recorded output shows the added
# ('movie', 'rev_watched', 'user') edge type).
data = T.ToUndirected()(data)
data

HeteroData(
  movie={ x=[15963, 202] },
  user={ x=[840197, 14] },
  (user, watched, movie)={
    edge_index=[2, 1288996],
    time=[1288996],
  },
  (movie, rev_watched, user)={
    edge_index=[2, 1288996],
    time=[1288996],
  }
)

## MetaPath2Vec

In [35]:
from utils.node_representation import Metapath2Vec

# NOTE(review): `train_data` is not defined in any cell visible in this notebook —
# presumably a train split of `data` produced by a removed or unexecuted cell. Confirm
# and restore it; as written this cell fails with NameError on a fresh kernel.
mp2v = Metapath2Vec(train_data.edge_index_dict, train_data.num_nodes_dict, device=device)
mp2v.train()

# Fetch the 'user' and 'movie' embeddings from the model and convert them with .numpy().
# NOTE(review): .numpy() presumably requires get_embeddings to return CPU tensors — confirm
# this still holds when `device` is CUDA.
users_emb = mp2v.get_embeddings('user').numpy()
movies_emb = mp2v.get_embeddings('movie').numpy()

## GNN

In [None]:
# Append the MetaPath2Vec embeddings to the existing node features, column-wise.
# users_emb / movies_emb are NumPy arrays (results of `.numpy()`), while torch.cat accepts
# only tensors, so convert them first and match the dtype/device of the current features.
# NOTE: this cell extends data[...].x in place — re-running it appends the embeddings again.
user_x = data['user'].x
movie_x = data['movie'].x
data['user'].x = torch.cat(
    [user_x, torch.as_tensor(users_emb, dtype=user_x.dtype, device=user_x.device)],
    dim=1,
)
data['movie'].x = torch.cat(
    [movie_x, torch.as_tensor(movies_emb, dtype=movie_x.dtype, device=movie_x.device)],
    dim=1,
)

In [None]:
from utils.model import GNN

# Model hyper-parameters gathered in one place, then passed to the GNN constructor
# together with the graph metadata.
gnn_kwargs = {
    'hidden_channels': 64,
    'decoder': 'IP',
    'dropout_encoder_p': 0.2,
    'dropout_decoder_p': 0.4,
}
gnn_model = GNN(data.metadata(), **gnn_kwargs)

# Adam over all model parameters with a learning rate of 0.01.
optimizer = torch.optim.Adam(params=gnn_model.parameters(), lr=0.01)

# train()

# test()