# Решение с помощью LightFM

In [None]:
!pip install LightFM
!pip install datatable

In [2]:
import datatable as dt
import numpy as np
import pandas as pd
import scipy.sparse as sp
from lightfm import LightFM
from sklearn.metrics import ndcg_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [3]:
# Colab-only step: mount Google Drive so the contest CSV files can be read
# from paths under /gdrive.
from google.colab import drive
# force_remount=True asks Colab to mount again even if /gdrive is already mounted.
drive.mount("/gdrive", force_remount=True)

Mounted at /gdrive


In [4]:
# Read the labelled interactions and hold out 20% of the rows as a local test
# set; the fixed random_state keeps the split reproducible between runs.
train, test = train_test_split(
    dt.fread('/gdrive/MyDrive/vk_contest/train.csv').to_pandas(),
    test_size=0.2,
    random_state=42,
)

# Auxiliary tables. Only songs_extra_info is used by the cells of this notebook
# that follow; songs and members are loaded but not referenced afterwards.
songs = dt.fread('/gdrive/MyDrive/vk_contest/songs.csv').to_pandas()
members = dt.fread('/gdrive/MyDrive/vk_contest/members.csv').to_pandas()
# fill=True: per the datatable docs, rows with fewer fields than expected are
# padded with NAs instead of failing the read.
songs_extra_info = dt.fread(
    '/gdrive/MyDrive/vk_contest/song_extra_info.csv',
    fill=True,
).to_pandas()

In [5]:
# Sanity check of the held-out split: column names, non-null counts and dtypes.
test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1475484 entries, 1919950 to 6256130
Data columns (total 6 columns):
 #   Column              Non-Null Count    Dtype 
---  ------              --------------    ----- 
 0   msno                1475484 non-null  object
 1   song_id             1475484 non-null  object
 2   source_system_tab   1475484 non-null  object
 3   source_screen_name  1475484 non-null  object
 4   source_type         1475484 non-null  object
 5   target              1475484 non-null  bool  
dtypes: bool(1), object(5)
memory usage: 68.9+ MB


## Обработка данных

In [6]:
# Store every string (object-dtype) column as a pandas categorical to reduce
# memory use. train and test are converted independently of each other, so
# their category sets are not guaranteed to be the same.
object_columns = [column for column in train.columns if train[column].dtype == object]
for column in object_columns:
    train[column] = train[column].astype('category')
    test[column] = test[column].astype('category')
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5901934 entries, 169102 to 6413414
Data columns (total 6 columns):
 #   Column              Dtype   
---  ------              -----   
 0   msno                category
 1   song_id             category
 2   source_system_tab   category
 3   source_screen_name  category
 4   source_type         category
 5   target              bool    
dtypes: bool(1), category(5)
memory usage: 113.1 MB


Объединение (left join по `song_id`) обучающей таблицы с дополнительной информацией о песнях, чтобы получить названия песен для дальнейшего удобства


In [7]:
# Left join on song_id: every interaction row is kept, and rows whose song has
# no entry in songs_extra_info get missing values in the added columns.
train = pd.merge(train, songs_extra_info, how='left', on='song_id')
train.head()

Unnamed: 0,msno,song_id,source_system_tab,source_screen_name,source_type,target,name,isrc
0,YBSM6gZ4bPXjRv2nCRVoMR2Ddyh4RIyp1JVh2bQFnUk=,c2bp6xlq7o3Pj6x95ddX67S/wGsP6qPN/8PqM1DL0a8=,my library,Local playlist more,local-library,True,This is love,TWA471602005
1,w8lMQ9t/toEY+/cKrSN+BkRasE9FUFv5QmSwP2c4XB8=,KhSthXAh2CKFiZ2Bx3X5vMUId/Z6419TRTgD6KSpVH4=,search,Album more,album,True,大太陽 (The Big Sun),TWA531651911
2,Q1Xh9pYdTxM6RYQNhRTqRAfUBuEC5qsN9n2cXNWsBs4=,EUUnpIrpI9QY9FVBsNjEL8TvXGjsCLQ0syQt/QV9j50=,my library,Local playlist more,local-playlist,True,同手同腳,TWF710700026
3,V4Ea+MmSuwt84YeOA3/OGLuwx6jQijmr5odemTCBG0w=,7EnDBkQYJpipCyRd9JBsug4iKnfAunUXc14/96cNotg=,discover,Discover Feature,song-based-playlist,True,聽見下雨的聲音 (演唱：魏如昀),TWL251305201
4,MNJ6q6jkfwi6wZQIm21c+1PR8kJKtg4scYybI00P5jg=,cy10N2j2sdY/X4BDUcMu2Iumfz7pV3tqE5iEaup2yGI=,my library,Local playlist more,local-playlist,True,派對動物 (Party Animal),TWK231680790


Преобразование таргета из булева типа в целочисленный (0/1) и отбор столбцов, нужных для обучения

In [8]:
# Keep only the columns the model needs and turn the boolean label into 0/1.
# An explicit astype is used instead of a nested-dict replace: it is a single
# vectorised cast, does not depend on replace()'s implicit dtype inference, and
# matches how the test label is converted later in the notebook.
train = train[['msno', 'name', 'target']].astype({'target': int})

Объявление энкодеров для подготовки данных к дальнейшему обучению

In [9]:
# Module-level label encoders shared by get_interactions and by the test-set
# encoding step: user_encoder is fitted on user ids (msno), song_encoder on
# song names, and both map those values to integer indices.
user_encoder= LabelEncoder()
song_encoder= LabelEncoder()

def get_interactions(users, songs, target):
    """Build a sparse user x song interaction matrix.

    Relies on the module-level ``user_encoder`` and ``song_encoder``, which must
    already be fitted.

    Args:
        users: sequence of user ids, one per interaction.
        songs: sequence of song names, aligned with ``users``.
        target: sequence of interaction values, aligned with ``users``.

    Returns:
        scipy.sparse.coo_matrix of shape (number of fitted user classes,
        number of fitted song classes) with ``target`` stored at the encoded
        (user, song) positions.
    """
    user_encodes = user_encoder.transform(users)
    song_encodes = song_encoder.transform(songs)
    # Size the matrix by the encoders' full vocabularies rather than by the
    # number of distinct codes in this call: encoded indices range over all
    # fitted classes, so a subset of the data would otherwise produce a
    # too-small shape (or an index-out-of-bounds error).
    n = len(user_encoder.classes_)
    m = len(song_encoder.classes_)
    return sp.coo_matrix((target, (user_encodes, song_encodes)), shape=(n, m))

Удаление дубликатов, которые могли быть в данных

In [10]:
# Drop rows that are exact duplicates across all remaining columns.
# NOTE(review): the same (msno, name) pair can still occur more than once if
# its target differs between rows — confirm that this is intended.
train = train.drop_duplicates()

In [11]:
# Learn the id -> index vocabularies from the training rows only, then encode
# those same rows into the sparse interaction matrix used for fitting.
user_encoder.fit(train['msno'])
song_encoder.fit(train['name'])
df = get_interactions(
    users=train['msno'],
    songs=train['name'],
    target=train['target'],
)

Преобразование тестовой части данных

In [12]:
# Same left join as for the training rows: attach the song name to every
# held-out interaction; songs without extra info get missing values.
test = pd.merge(test, songs_extra_info, how='left', on='song_id')
test.head()

Unnamed: 0,msno,song_id,source_system_tab,source_screen_name,source_type,target,name,isrc
0,RGAe8+3OzXCgYYw4LkvHHfGHPW48eT57SO9tCDqF6wU=,XcLZ7BJjc6gG5RPsg7Pb5W04mvlz6jyWErLKofUvFis=,search,Search,song,False,Bad Girl Good Girl,GBMEZ1442317
1,PQfNQQon0CwOLghOAyztQYVw6yJTVgmFpJ650q/UF1s=,9t9+DPdDvWrge1ZOUlcJvLmOfnpa4NtNEVY0rBc/91I=,discover,,top-hits-for-artist,True,可惜不是你,TWA450586606
2,cd9R76nchcG1F1oyKMVN9bXXwRssofcYVckWhIWTSrY=,LAbM25XeS2X4K+HcY802+4tu3VyXbwse7mzIEOorPJY=,radio,Radio,radio,False,In Your Dreams,GBAAA0100557
3,KFznvngc8H65XDu5Y1ZgitCu5DWeOKpgtVkNYw0qlq8=,3xhSUFUYmiaYUUK6gzDE0vkhxV4nPnpyZL4Z+nzU7/8=,my library,Local playlist more,local-playlist,False,noir 느와르,KRB003519297
4,phnWUuC1a4M2RO/6He3Wb8Ji3Ex6QV5AQ2KCGlqpIG8=,+ZXjijLJdny/t5JIuFm4xIlM+bRYKIkaHGDcfDSKfH8=,discover,Artist more,top-hits-for-artist,True,別對我說沒有未來,TWA211518806


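Кодирование пользователей и песен тестовой выборки через LabelEncoder; строки с пользователями или песнями, которых не было в обучающей выборке, отфильтровываются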

In [13]:
user_ids = test["msno"]
track_ids = test["name"]

# A test row can only be scored if both its user and its track were seen when
# the encoders were fitted. The membership test is vectorised with isin instead
# of a Python-level loop over every row.
# NOTE(review): isin treats a missing name as known when a missing value is
# among the fitted song classes — confirm this matches the intended handling.
known_pair_mask = (
    user_ids.isin(user_encoder.classes_) & track_ids.isin(song_encoder.classes_)
).to_numpy()

# Positional row numbers of the scorable rows; reused later to write the
# predictions back into a full-length score array.
valid_indices = np.flatnonzero(known_pair_mask)

# .iloc selects by position, so this no longer depends on `test` happening to
# carry a 0..n-1 index. Despite their names, these hold the integer codes
# produced by the encoders, not embedding vectors.
user_embeddings = user_encoder.transform(user_ids.iloc[valid_indices])
track_embeddings = song_encoder.transform(track_ids.iloc[valid_indices])

## Обучение модели


In [14]:
# Hyperparameters were tuned with Optuna; that run was interrupted before it finished.
# NOTE(review): no `loss` argument is passed, so LightFM's default loss is used.
# Per the LightFM docs `k` is only used by k-OS WARP training — confirm whether
# k=5 has any effect with this configuration.
model= LightFM(no_components=100, k=5, learning_rate=0.05, random_state=153)
# `epochs` is not passed, so the library default applies. fit() is the last
# expression of the cell, which is why the model's repr is displayed below.
model.fit(df, num_threads= 5)

<lightfm.lightfm.LightFM at 0x7ab20ca7a2c0>

## Подсчет NDCG@20

In [16]:
def ndcg_calc(kk, data):
    """Compute the mean per-user NDCG@kk.

    Args:
        kk: rank cutoff, forwarded to ``ndcg_score`` as ``k``.
        data: DataFrame with columns 'msno' (grouping key), 'pred' (predicted
            score) and 'target' (true relevance).

    Returns:
        Mean of the per-user NDCG values over all users that ``ndcg_score``
        could evaluate, or NaN if no user could be evaluated.
    """
    scores = []
    # observed=True: when 'msno' is categorical, iterate only over users that
    # actually occur in `data` instead of every declared category.
    for _, group in data.groupby('msno', observed=True):
        # No pre-sorting by 'pred' is needed: ndcg_score ranks the documents
        # by y_score itself.
        y_true = group['target'].to_numpy().reshape(1, -1)
        y_pred = group['pred'].to_numpy().reshape(1, -1)
        try:
            scores.append(ndcg_score(y_true, y_pred, k=kk))
        except ValueError:
            # Deliberate best-effort: users whose group ndcg_score rejects
            # (e.g. a single-document group in recent scikit-learn versions)
            # are left out of the average.
            continue
    if not scores:
        # Avoid np.mean([]) and its RuntimeWarning; NaN signals "nothing scored".
        return np.nan
    return np.mean(scores)

In [23]:
# One score per test row. Rows whose user or track was not seen in training
# cannot be scored by the model and keep the default value 0.0.
scores = np.zeros(len(user_ids))
# Scatter the model's predictions back to the positions of the scorable rows.
scores[valid_indices] = model.predict(user_embeddings, track_embeddings)

In [24]:
# Evaluation frame: user id, predicted score and the true label cast to 0/1.
# assign() works on a copy, so `test` itself is not modified.
test_data = test[['msno']].assign(
    pred=scores,
    target=test['target'].astype(int),
)

In [25]:
# Mean NDCG@20 over the users of the held-out split.
ndcg_calc(kk=20, data=test_data)

0.638943501506401