# Решение через LightGBM

In [None]:
!pip install datatable



In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
import lightgbm as lgb
import datatable as dt

In [None]:
# Mount Google Drive into the Colab VM; force_remount=True re-mounts even if
# a mount is already present.
from google.colab import drive

mount_point = "/gdrive"
drive.mount(mount_point, force_remount=True)

Mounted at /gdrive


In [None]:
def _read_csv(path):
    """Read a CSV file with datatable's fread and return a pandas DataFrame."""
    return dt.fread(path).to_pandas()


# Interaction tables (train/test) plus the song and member attribute tables.
train = _read_csv('/gdrive/MyDrive/vk_contest/train.csv')
test = _read_csv('/gdrive/MyDrive/vk_contest/test.csv')
songs = _read_csv('/gdrive/MyDrive/vk_contest/songs.csv')
members = _read_csv('/gdrive/MyDrive/vk_contest/members.csv')

Преобразование признаков в категориальный тип

In [None]:
# Every songs column that is not int32 is cast to the pandas category dtype;
# int32 columns are left numeric.
songs_category_dtypes = {
    column: 'category'
    for column in songs.columns
    if songs[column].dtype != 'int32'
}
songs = songs.astype(songs_category_dtypes)
songs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2296320 entries, 0 to 2296319
Data columns (total 7 columns):
 #   Column       Dtype   
---  ------       -----   
 0   song_id      category
 1   song_length  int32   
 2   genre_ids    category
 3   artist_name  category
 4   composer     category
 5   lyricist     category
 6   language     category
dtypes: category(6), int32(1)
memory usage: 157.6 MB


In [None]:
# Columns stored as Python objects in train are cast to category in both
# train and test, so the two frames carry the same dtypes for those columns.
interaction_category_dtypes = {
    column: 'category'
    for column in train.columns
    if train[column].dtype == object
}
train = train.astype(interaction_category_dtypes)
test = test.astype(interaction_category_dtypes)

def _parse_member_date(column):
    """Convert a members date column to pandas datetimes.

    Integer input is parsed as YYYYMMDD. Passing integers straight to
    pd.to_datetime would interpret them as nanoseconds since the Unix epoch,
    collapsing every date into 1970-01-01 and making any date difference zero.
    Non-integer input (strings, already-parsed datetimes) is handed to
    pd.to_datetime unchanged.
    """
    if pd.api.types.is_integer_dtype(column):
        # NOTE(review): assumes integer dates are YYYYMMDD-encoded - confirm
        # against members.csv.
        return pd.to_datetime(column.astype(str), format='%Y%m%d')
    return pd.to_datetime(column)


members['registration_init_time'] = _parse_member_date(members['registration_init_time'])
members['expiration_date'] = _parse_member_date(members['expiration_date'])

# Everything except the date columns becomes categorical. The dtype helper is
# used instead of comparing against the literal 'datetime64[ns]' so the check
# does not depend on the datetime resolution pandas happens to pick.
for column in members.columns:
    if not pd.api.types.is_datetime64_any_dtype(members[column]):
        members[column] = members[column].astype('category')
members.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34403 entries, 0 to 34402
Data columns (total 7 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   msno                    34403 non-null  category      
 1   city                    34403 non-null  category      
 2   bd                      34403 non-null  category      
 3   gender                  34403 non-null  category      
 4   registered_via          34403 non-null  category      
 5   registration_init_time  34403 non-null  datetime64[ns]
 6   expiration_date         34403 non-null  datetime64[ns]
dtypes: category(5), datetime64[ns](2)
memory usage: 2.1 MB


Преобразование дат в числовой признак, показывающий длительность подписки в днях (разность между датой истечения и датой регистрации). Мне кажется, этот признак достаточно информативен и не привязан к какому-либо отрезку времени (начало отсчёта не учитывается).

In [None]:
# 'count' = whole days between registration and expiration; the raw date
# columns are removed once the feature has been derived.
membership_span = members['expiration_date'] - members['registration_init_time']
members['count'] = membership_span.dt.days.astype(int)
members = members.drop(columns=['registration_init_time', 'expiration_date'])

Объединение таблиц (left join) по ключам msno и song_id

In [None]:
# Left-join member attributes (on msno) and song attributes (on song_id) onto
# every interaction row; rows without a match keep NaN in the joined columns.
test = pd.merge(left=test, right=members, on='msno', how='left')
train = pd.merge(left=train, right=members, on='msno', how='left')
test = pd.merge(left=test, right=songs, on='song_id', how='left')
train = pd.merge(left=train, right=songs, on='song_id', how='left')

# Sentinel for missing song_length: strictly greater than every observed
# length in BOTH frames, so it cannot collide with a real value.
# Series.max() and np.nanmax skip NaN, whereas the builtin max() over
# .unique() returns NaN or a number depending on where NaN sits in the array.
nan_length = np.nanmax([train["song_length"].max(), test["song_length"].max()]) + 123

# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection: the in-place form mutates a temporary under pandas
# Copy-on-Write and would leave the frame unchanged.
test["song_length"] = test["song_length"].fillna(nan_length).astype(int)
train["song_length"] = train["song_length"].fillna(nan_length).astype(int)

In [None]:
# Cast the two join keys to the category dtype in both frames.
for frame in (test, train):
    for key_column in ("msno", "song_id"):
        frame[key_column] = frame[key_column].astype('category')

In [None]:
# Show column dtypes and memory usage of the merged training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7377418 entries, 0 to 7377417
Data columns (total 17 columns):
 #   Column              Dtype   
---  ------              -----   
 0   msno                category
 1   song_id             category
 2   source_system_tab   category
 3   source_screen_name  category
 4   source_type         category
 5   target              bool    
 6   city                category
 7   bd                  category
 8   gender              category
 9   registered_via      category
 10  count               int64   
 11  song_length         int64   
 12  genre_ids           category
 13  artist_name         category
 14  composer            category
 15  lyricist            category
 16  language            category
dtypes: bool(1), category(14), int64(2)
memory usage: 410.2 MB


Обучение модели и получение результатов. Гиперпараметры были подобраны опытным путём; из-за сложности модели я не стал их сильно менять. Поскольку у CatBoost были проблемы, описанные в README, я всё равно остался на LightGBM.

In [None]:
# 5-fold cross-validation: one LightGBM model is trained per fold and its test
# predictions are accumulated in `scores` (averaged when the submission is built).
folds = KFold(n_splits=5)
scores = np.zeros(len(test))

parameters = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting': 'gbdt'
}

# Build the feature matrix and the label vector once; dropping columns inside
# the loop would copy the full frames on every fold.
features = train.drop('target', axis=1)
labels = train['target']
# Select test columns by the training feature names so the column order seen
# at predict time matches the one used for training ('id' is left out).
test_features = test[features.columns]

for train_i, val_i in folds.split(features):
    # KFold yields positional indices, so features and labels are both sliced
    # with .iloc; label-based .loc is only equivalent on a default RangeIndex.
    train_data = lgb.Dataset(features.iloc[train_i], label=labels.iloc[train_i])
    val_data = lgb.Dataset(features.iloc[val_i], label=labels.iloc[val_i])
    model = lgb.train(parameters, train_data, valid_sets=[val_data])
    scores += model.predict(test_features)

# Average the accumulated fold predictions and write the (id, target) pairs.
submit_data = pd.DataFrame({
    'id': test['id'],
    'target': scores / folds.n_splits,
})
submit_data.to_csv('submit.csv', index=False)
