#Решение через LightGBM

In [1]:
!pip install datatable

Collecting datatable
  Downloading datatable-1.1.0-cp310-cp310-manylinux_2_35_x86_64.whl (82.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m82.0/82.0 MB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: datatable
Successfully installed datatable-1.1.0


In [18]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
import lightgbm as lgb
import datatable as dt
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score
from sklearn.model_selection import RandomizedSearchCV

In [3]:
from google.colab import drive
drive.mount("/gdrive", force_remount=True)

Mounted at /gdrive


In [4]:
train = dt.fread('/gdrive/MyDrive/vk_contest/train.csv').to_pandas()
train, test= train_test_split(train, test_size=0.2, random_state=42)
songs = dt.fread('/gdrive/MyDrive/vk_contest/songs.csv').to_pandas()
members = dt.fread('/gdrive/MyDrive/vk_contest/members.csv').to_pandas()

##Обработка данных

Преобразование данных в категориальные

In [5]:
for i in songs.columns:
    if songs[i].dtype != 'int32':
        songs[i] = songs[i].astype('category')
songs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2296320 entries, 0 to 2296319
Data columns (total 7 columns):
 #   Column       Dtype   
---  ------       -----   
 0   song_id      category
 1   song_length  int32   
 2   genre_ids    category
 3   artist_name  category
 4   composer     category
 5   lyricist     category
 6   language     category
dtypes: category(6), int32(1)
memory usage: 157.6 MB


In [6]:
for i in train.columns:
    if train[i].dtype == object:
        train[i] = train[i].astype('category')
        test[i] = test[i].astype('category')

members['registration_init_time'] = pd.to_datetime(members['registration_init_time'])
members['expiration_date'] = pd.to_datetime(members['expiration_date'])

for i in members.columns:
    if members[i].dtype != 'datetime64[ns]':
        members[i] = members[i].astype('category')
members.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34403 entries, 0 to 34402
Data columns (total 7 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   msno                    34403 non-null  category      
 1   city                    34403 non-null  category      
 2   bd                      34403 non-null  category      
 3   gender                  34403 non-null  category      
 4   registered_via          34403 non-null  category      
 5   registration_init_time  34403 non-null  datetime64[ns]
 6   expiration_date         34403 non-null  datetime64[ns]
dtypes: category(5), datetime64[ns](2)
memory usage: 2.1 MB


In [7]:
train.isnull().sum()

msno                  0
song_id               0
source_system_tab     0
source_screen_name    0
source_type           0
target                0
dtype: int64

In [8]:
train.isnull().sum()

msno                  0
song_id               0
source_system_tab     0
source_screen_name    0
source_type           0
target                0
dtype: int64

Преобразование дат в числовой признак показывающий оставшееся количество дней, мне кажется этот признак достаточно информативен и не привязан к какому-либо отрезку времени (не берется в учет начало отсчета)

In [9]:
members['count'] = (members['expiration_date'] - members['registration_init_time']).dt.days.astype(int)
members.drop(['registration_init_time','expiration_date'], axis=1, inplace=True)

Конкатенация таблиц по msno и song_id

In [10]:
test = pd.merge(left = test, on='msno', right = members,how ='left')
train = pd.merge(left = train, on='msno', right = members,how ='left')
test = pd.merge(left = test, on='song_id', right = songs, how = 'left')
train = pd.merge(left = train, on='song_id', right = songs, how = 'left')

nan_length = max(test["song_length"].unique()) + 123
test["song_length"].fillna(nan_length,inplace=True)
train["song_length"].fillna(nan_length,inplace=True)

test["song_length"] = test["song_length"].astype(int)
train["song_length"] = train["song_length"].astype(int)

In [11]:
test["msno"] = test["msno"].astype('category')
train["msno"] = train["msno"].astype('category')
test["song_id"] = test["song_id"].astype('category')
train["song_id"] = train["song_id"].astype('category')

In [12]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5901934 entries, 0 to 5901933
Data columns (total 17 columns):
 #   Column              Dtype   
---  ------              -----   
 0   msno                category
 1   song_id             category
 2   source_system_tab   category
 3   source_screen_name  category
 4   source_type         category
 5   target              bool    
 6   city                category
 7   bd                  category
 8   gender              category
 9   registered_via      category
 10  count               int64   
 11  song_length         int64   
 12  genre_ids           category
 13  artist_name         category
 14  composer            category
 15  lyricist            category
 16  language            category
dtypes: bool(1), category(14), int64(2)
memory usage: 290.3 MB


In [26]:
train = train.drop_duplicates()

In [13]:
train.isnull().sum()

msno                    0
song_id                 0
source_system_tab       0
source_screen_name      0
source_type             0
target                  0
city                    0
bd                      0
gender                  0
registered_via          0
count                   0
song_length             0
genre_ids              95
artist_name            95
composer               95
lyricist               95
language              120
dtype: int64

Заполнение пропусков в категориальных данных, логично откуда они взяты, например песня могла быть без слов, поэтому нет языка, поэтому решил создать отдельную категорию для пропусков

In [14]:
special_category = "no_inf"
for i in train.columns:
    if train[i].isnull().sum() > 0:
        train[i] = train[i].cat.add_categories(special_category)
        train[i] = train[i].fillna(special_category)
for i in test.columns:
    if test[i].isnull().sum() > 0:
        test[i] = test[i].cat.add_categories(special_category)
        test[i] = test[i].fillna(special_category)

In [15]:
train.isnull().sum()

msno                  0
song_id               0
source_system_tab     0
source_screen_name    0
source_type           0
target                0
city                  0
bd                    0
gender                0
registered_via        0
count                 0
song_length           0
genre_ids             0
artist_name           0
composer              0
lyricist              0
language              0
dtype: int64

In [16]:
test_target = test['target']
test.drop(columns=['target'], inplace=True)

##Обучение модели


In [25]:
folds = KFold(n_splits=5)
scores = np.zeros(len(test))

parameters = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting': 'gbdt'
}

for train_i, val_i in folds.split(train):
    train_data = lgb.Dataset(train.drop('target', axis=1).iloc[train_i], label=train.loc[train_i, 'target'])
    val_data = lgb.Dataset(train.drop('target', axis=1).iloc[val_i], label=train.loc[val_i, 'target'])
    model = lgb.train(parameters, train_data, valid_sets=[val_data])
    scores += model.predict(test)

[LightGBM] [Info] Number of positive: 2377191, number of negative: 2344356
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.745424 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 43465
[LightGBM] [Info] Number of data points in the train set: 4721547, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503477 -> initscore=0.013909
[LightGBM] [Info] Start training from score 0.013909
[LightGBM] [Info] Number of positive: 2377708, number of negative: 2343839
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.756565 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 43477
[LightGBM] [Info] Number of data points in the train set: 4721547, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503587 -> initscore=0.014347
[LightGBM] [Info] Start training from score 0.014347
[L

##Подсчет NDCG@20

In [27]:
def ndcg_calc(kk, data):
    scores = []
    for query_id, group in data.groupby('msno'):
        group = group.sort_values(by='pred', ascending=False)
        y_true = group['target'].values.reshape(1, -1)
        y_pred = group['pred'].values.reshape(1, -1)
        try:
            ndcg = ndcg_score(y_true, y_pred, k = kk)
            scores.append(ndcg)
        except ValueError:
            continue
    return np.mean(scores)

In [31]:
test_data = test[['msno']].copy()
test_data['pred'] = scores / folds.n_splits
test_data['target'] = test_target.astype(int)

In [33]:
ndcg_calc(20, test_data)

0.6796460480204168