**WSDM - KKBox's Music Recommendation Challenge**

В данном задании предложено построить рекомендательную систему для музыки на основе пользовательских данных.

**Подключение библиотек и загрузка данных**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import ItemItemRecommender
from implicit.nearest_neighbours import bm25_weight
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score
from catboost import CatBoostClassifier
import zipfile

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Interaction logs: labelled training rows and the rows to be scored.
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

# Side tables joined onto the interactions later in the notebook.
members = pd.read_csv('data/members.csv')
songs = pd.read_csv('data/songs.csv')
song_extra_info = pd.read_csv('data/song_extra_info.csv')

# Submission template that the predictions are written into.
sample_submission = pd.read_csv('data/sample_submission.csv')

In [4]:
# Preview the first five interaction rows.
train.head(n=5)

Unnamed: 0,msno,song_id,source_system_tab,source_screen_name,source_type,target
0,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=,explore,Explore,online-playlist,1
1,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=,my library,Local playlist more,local-playlist,1
2,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=,my library,Local playlist more,local-playlist,1
3,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=,my library,Local playlist more,local-playlist,1
4,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=,explore,Explore,online-playlist,1


In [5]:
# Report the size of the interaction tables and of the side tables.
size_report = (
    f"Размер train-датасета: {train.shape}",
    f"Размер test-датасета: {test.shape}",
    f"Количество уникальных пользователей: {len(members)}",
    f"Количество уникальных песен: {len(songs)}",
)
for line in size_report:
    print(line)

Размер train-датасета: (7377418, 6)
Размер test-датасета: (2556790, 6)
Количество уникальных пользователей: 34403
Количество уникальных песен: 2296320


In [None]:
# Preview the first five rows of the song metadata table.
songs.head(n=5)

Unnamed: 0,song_id,song_length,genre_ids,artist_name,composer,lyricist,language
0,CXoTN1eb7AI+DntdU1vbcwGRV4SCIDxZu+YD8JP8r4E=,247640,465,張信哲 (Jeff Chang),董貞,何啟弘,3.0
1,o0kFgae9QtnYgRkVPqLJwa05zIhRlUjfF7O1tDw0ZDU=,197328,444,BLACKPINK,TEDDY| FUTURE BOUNCE| Bekuh BOOM,TEDDY,31.0
2,DwVvVurfpuz+XPuFvucclVQEyPqcpUkHR0ne1RQzPs0=,231781,465,SUPER JUNIOR,,,31.0
3,dKMBWoZyScdxSkihKG+Vf47nc18N9q4m58+b4e7dSSE=,273554,465,S.H.E,湯小康,徐世珍,3.0
4,W3bqWd3T+VeHFzHAUfARgW9AvVRaF4N5Yzm4Mr6Eo/o=,140329,726,貴族精選,Traditional,Traditional,52.0


In [None]:
# Preview the first five rows of the member (user) table.
members.head(n=5)

Unnamed: 0,msno,city,bd,gender,registered_via,registration_init_time,expiration_date
0,XQxgAYj3klVKjR3oxPPXYYFp4soD4TuBghkhMTD4oTw=,1,0,,7,20110820,20170920
1,UizsfmJb9mV54qE9hCYyU07Va97c0lCRLEQX3ae+ztM=,1,0,,7,20150628,20170622
2,D8nEhsIOBSoE6VthTaqDX8U6lqjJ7dLdr72mOyLya2A=,1,0,,4,20160411,20170712
3,mCuD+tZ1hERA/o5GPqk38e041J8ZsBaLcu7nGoIIvhI=,1,0,,9,20150906,20150907
4,q4HRBfVSssAFS9iRfxWrohxuk9kCYMKjHOEagUMV6rQ=,1,0,,4,20170126,20170613


# EDA


**Exploratory Data Analysis**

Проведем исследование данных перед дальнейшим выбором и обучением моделей для формирования финального предсказания.

In [6]:
# Show the column index of each table, in the order train, songs, members.
for table in (train, songs, members):
    print(table.columns)

Index(['msno', 'song_id', 'source_system_tab', 'source_screen_name',
       'source_type', 'target'],
      dtype='object')
Index(['song_id', 'song_length', 'genre_ids', 'artist_name', 'composer',
       'lyricist', 'language'],
      dtype='object')
Index(['msno', 'city', 'bd', 'gender', 'registered_via',
       'registration_init_time', 'expiration_date'],
      dtype='object')


In [None]:
# Columns of the songs table other than the `song_id` key.
song_features = [
    'song_length',
    'genre_ids',
    'artist_name',
    'composer',
    'lyricist',
    'language',
]

# Columns of the members table other than the `msno` key.
member_features = [
    'city',
    'bd',
    'gender',
    'registered_via',
    'registration_init_time',
    'expiration_date',
]

# Градиентный бустинг

В данном разделе мы попробуем модель градиентного бустинга для решения поставленной задачи.

**Предобработка данных**

In [None]:
def make_categorical(df, cols):
    """Convert the given columns of ``df`` to the pandas ``category`` dtype.

    Missing values are first replaced with the literal label ``'not stated'``
    so that they become a category of their own.

    Args:
        df: DataFrame to modify. It is changed in place.
        cols: Iterable of column names to convert.

    Returns:
        The same ``df`` object, to allow call chaining.
    """
    for col in cols:
        column = df[col]
        # A column that is already categorical rejects fill values that are
        # not among its categories, so register the label first. This keeps
        # the function safe to call twice on the same frame.
        if isinstance(column.dtype, pd.CategoricalDtype):
            if column.isna().any() and 'not stated' not in column.cat.categories:
                column = column.cat.add_categories('not stated')
        # Assign by column name. `df.loc[col] = ...` would address a ROW
        # labelled `col` and append a junk row instead of converting the column.
        df[col] = column.fillna('not stated').astype('category')
    return df

In [None]:
# Attach member attributes (key `msno`) and then song attributes (key
# `song_id`) to every interaction row. Left joins keep interactions whose
# member or song has no match in the side table.
train_df = (
    train
    .merge(members, on='msno', how='left')
    .merge(songs, on='song_id', how='left')
)

test_df = (
    test
    .merge(members, on='msno', how='left')
    .merge(songs, on='song_id', how='left')
)

train_df.head(2)

Unnamed: 0,msno,song_id,source_system_tab,source_screen_name,source_type,target,city,bd,gender,registered_via,registration_init_time,expiration_date,song_length,genre_ids,artist_name,composer,lyricist,language
0,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=,explore,Explore,online-playlist,1,1,0,,7,20120102,20171005,206471.0,359,Bastille,Dan Smith| Mark Crew,,52.0
1,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=,my library,Local playlist more,local-playlist,1,13,24,female,9,20110525,20170911,284584.0,1259,Various Artists,,,52.0


In [None]:
# Identifier columns and features excluded from the boosting model.
cols_to_drop = ['msno', 'song_id', 'registered_via', 'genre_ids']

# Features handed to CatBoost as categorical.
cat_features = [
    'source_system_tab', 'source_screen_name', 'source_type',
    'city', 'bd', 'gender',
    'artist_name', 'composer', 'lyricist', 'language',
]

# Training matrix: drop the excluded columns and split off the label.
y_train = train_df['target']
X_train = train_df.drop(columns=cols_to_drop + ['target'])

# Test matrix: drop the excluded columns and the row identifier `id`.
X_test = test_df.drop(columns=cols_to_drop + ['id'])

# Sanity check: columns present in X_test but missing from X_train.
diff = X_test.columns.difference(X_train.columns)
print(diff)

Index([], dtype='object')


In [None]:
# Categorical features: replace missing values with an explicit label and
# store every value as a pandas string.
for frame in (X_train, X_test):
    for col in cat_features:
        frame[col] = frame[col].fillna('not stated').astype('string')

# Numeric features: fill gaps in both frames with the mean of the train column.
numeric_cols = X_test.select_dtypes(include='number').columns
for col in numeric_cols:
    X_train[col] = X_train[col].fillna(X_train[col].mean())
    # The mean is taken from the (already imputed) train column, not from test.
    X_test[col] = X_test[col].fillna(X_train[col].mean())

In [None]:
# Gradient-boosted classifier: 200 trees of depth 10, with the listed
# columns treated as categorical features.
model = CatBoostClassifier(
    cat_features=cat_features,
    depth=10,
    iterations=200,
)
model.fit(X_train, y_train)

Learning rate set to 0.5
0:	learn: 0.6628282	total: 7.39s	remaining: 12m 12s
1:	learn: 0.6493394	total: 13.5s	remaining: 10m 59s
2:	learn: 0.6429728	total: 21.4s	remaining: 11m 32s
3:	learn: 0.6405586	total: 28.2s	remaining: 11m 16s
4:	learn: 0.6390627	total: 35s	remaining: 11m 5s
5:	learn: 0.6381543	total: 41.8s	remaining: 10m 55s
6:	learn: 0.6375806	total: 48.5s	remaining: 10m 44s
7:	learn: 0.6371462	total: 55.1s	remaining: 10m 33s
8:	learn: 0.6368322	total: 1m 1s	remaining: 10m 20s
9:	learn: 0.6364404	total: 1m 7s	remaining: 10m 9s
10:	learn: 0.6360332	total: 1m 14s	remaining: 9m 59s
11:	learn: 0.6356507	total: 1m 20s	remaining: 9m 53s
12:	learn: 0.6352849	total: 1m 27s	remaining: 9m 43s
13:	learn: 0.6350729	total: 1m 33s	remaining: 9m 35s
14:	learn: 0.6346679	total: 1m 40s	remaining: 9m 26s
15:	learn: 0.6345244	total: 1m 46s	remaining: 9m 17s
16:	learn: 0.6343912	total: 1m 52s	remaining: 9m 9s
17:	learn: 0.6341577	total: 1m 57s	remaining: 8m 57s
18:	learn: 0.6339654	total: 2m 2s	re

<catboost.core.CatBoostClassifier at 0x7d13cad93c10>

In [None]:
# Keep the second column (index 1) of the per-class probabilities;
# presumably this is the probability of target == 1 -- confirm against model.classes_.
class_probabilities = model.predict_proba(X_test)
y_pred = class_probabilities[:, 1]

# The prediction count must match the number of rows in the submission template.
print(f"submission shape: {sample_submission.shape[0]}")
print(f"Predicted shape: {y_pred.shape[0]}")

submission shape: 2556790
Predicted shape: 2556790


In [None]:
# Write the predictions into the submission template and save it as CSV.
sample_submission['target'] = y_pred
sample_submission.to_csv('boosting_2_submission.csv', index=False)

# Package the CSV into a zip archive.
with zipfile.ZipFile('boosting_2_submission.zip', mode='w') as archive:
    archive.write('boosting_2_submission.csv')

# Нейронные сети

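В данном разделе мы попробуем решение с нейронной сетью на PyTorch. Будем исследовать следующую архитектуру нейронной сети: отдельные блоки полносвязных слоёв над каждым из объектов (песня, контекст, пользователь), конкатенация получившихся векторов и получение предсказаний одной свёрткой. В реализации ниже используется упрощённый вариант этой идеи: две «башни» полносвязных слоёв (пользователь и песня), а предсказание вычисляется как скалярное произведение их эмбеддингов.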

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Используемый device:", device)

Используемый device: cuda


**Подготовка датасета**

In [None]:
class SongMatchingDataset(Dataset):
    """Dataset that yields ``(member_features, song_features, label)`` triples.

    Each row of ``df`` is one (member, song) interaction. The member and song
    attributes are looked up with left joins on ``msno`` and ``song_id``, the
    columns listed in ``cat_features`` are one-hot encoded, and everything is
    stored as ``float32``.

    Args:
        df: Interaction frame containing at least ``msno`` and ``song_id``.
            All of its own columns are removed from the feature frames.
        members: Member attribute table keyed by ``msno``.
        songs: Song attribute table keyed by ``song_id``.
        labels: One label per row of ``df``. Any array-like is accepted
            (NumPy array, list, pandas Series).
        cat_features: Names of columns to one-hot encode. Names that are not
            present in a feature frame are ignored for that frame.

    Raises:
        ValueError: If ``labels`` and ``df`` differ in length.

    Note:
        The one-hot columns depend on the categories seen in ``df``, so two
        datasets built from different frames can have different feature
        widths. NOTE(review): align train/test columns before sharing a model.
    """

    def __init__(self, df, members, songs, labels,
                 cat_features):
        self.df = df
        self.members = members
        self.songs = songs
        # Store labels positionally. The notebook passes NumPy arrays, which
        # have no `.iloc`; a plain array works for every array-like input.
        self.labels = np.asarray(labels, dtype=np.float32)
        if len(self.labels) != len(df):
            raise ValueError(
                f"labels has {len(self.labels)} entries but df has {len(df)} rows"
            )

        # Columns of the interaction frame are not model features.
        interaction_columns = list(df.columns)
        member_features = (
            df.merge(members, on='msno', how='left')
            .drop(columns=interaction_columns)
        )
        song_features = (
            df.merge(songs, on='song_id', how='left')
            .drop(columns=interaction_columns)
        )

        self.members_dataset = self._encode(member_features, cat_features)
        self.songs_dataset = self._encode(song_features, cat_features)

    @staticmethod
    def _encode(frame, cat_features):
        """One-hot encode the categorical columns and return a float32 frame."""
        for feature in cat_features:
            if feature in frame.columns:
                dummies = pd.get_dummies(frame[feature].astype('category'),
                                         prefix=feature)
                frame = pd.concat([frame.drop(columns=[feature]), dummies], axis=1)
        # Left-join misses leave NaN in numeric columns; replace them with 0
        # so they do not propagate into the loss. Casting to a single float
        # dtype makes every row a plain numeric array (boolean dummy columns
        # mixed with floats would otherwise yield an object array).
        return frame.fillna(0.0).astype(np.float32)

    def __getitem__(self, idx):
        member = torch.tensor(self.members_dataset.iloc[idx].to_numpy(), dtype=torch.float32)
        song = torch.tensor(self.songs_dataset.iloc[idx].to_numpy(), dtype=torch.float32)
        label = torch.tensor(self.labels[idx], dtype=torch.float32)
        return member, song, label

    def __len__(self):
        return len(self.df)

In [None]:
class SongMatchingModel(nn.Module):
    """Two-tower model scoring a (member, song) pair.

    Member features and song features are passed through separate stacks of
    fully connected layers that end in embeddings of equal size. The score of
    a pair is the dot product of the two embeddings (an unnormalised logit).

    Args:
        song_input_size: Number of input features of the song tower.
        member_input_size: Number of input features of the member tower.
        embedding_size: Size of the embedding produced by each tower.
        hidden_sizes: Widths of the hidden layers. Only read, never mutated.
        num_layers: How many of ``hidden_sizes`` to use, starting from the
            first. Must not exceed ``len(hidden_sizes)``.

    Raises:
        ValueError: If ``num_layers`` is negative or larger than
            ``len(hidden_sizes)``.
    """

    def __init__(self, song_input_size, member_input_size, embedding_size=10, hidden_sizes=[128, 64], num_layers=2):
        super().__init__()
        self.member_layer = self._create_network(member_input_size, hidden_sizes, embedding_size, num_layers)
        self.song_layer = self._create_network(song_input_size, hidden_sizes, embedding_size, num_layers)

    def forward(self, member, song):
        """Return one score per row: the dot product of the two embeddings."""
        member_embed = self.member_layer(member)
        song_embed = self.song_layer(song)
        return torch.sum(member_embed * song_embed, dim=1)

    def _create_network(self, input_size, hidden_sizes, output_size, num_layers):
        """Build ``num_layers`` Linear+ReLU blocks followed by a Linear output layer."""
        if not 0 <= num_layers <= len(hidden_sizes):
            raise ValueError(
                f"num_layers must be between 0 and len(hidden_sizes)={len(hidden_sizes)}, "
                f"got {num_layers}"
            )
        layers = []
        current_size = input_size
        for width in hidden_sizes[:num_layers]:
            layers.append(nn.Linear(current_size, width))
            layers.append(nn.ReLU())
            current_size = width
        # Size the output layer from the last layer actually built. Using
        # hidden_sizes[-1] is wrong whenever num_layers != len(hidden_sizes).
        layers.append(nn.Linear(current_size, output_size))
        return nn.Sequential(*layers)

In [None]:
def validate(model, val_loader, criterion):
    """Return the mean per-batch loss of ``model`` over ``val_loader``.

    The model is switched to evaluation mode and gradients are disabled.

    Args:
        model: Module called as ``model(member, song)``.
        val_loader: Iterable of ``(member, song, label)`` tensor batches.
        criterion: Loss called as ``criterion(output, label)``.

    Returns:
        The average of the batch losses as a float, or ``nan`` when
        ``val_loader`` yields no batches.
    """
    model.eval()
    # Batches must be on the same device as the model's weights, otherwise
    # the forward pass fails for a model that was moved to the GPU.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else None

    total_loss = 0.0
    n_batches = 0
    with torch.no_grad():
        for member, song, label in val_loader:
            if target_device is not None:
                member = member.to(target_device)
                song = song.to(target_device)
                label = label.to(target_device)
            output = model(member, song)
            total_loss += criterion(output, label).item()
            n_batches += 1
    if n_batches == 0:
        return float('nan')
    return total_loss / n_batches

def train_model(model,
                train_loader,
                criterion,
                optimizer,
                n_epochs=5,
                val_loader=None):
    """Train ``model`` for ``n_epochs`` epochs and report the loss per epoch.

    Args:
        model: Module called as ``model(member, song)``.
        train_loader: Iterable of ``(member, song, label)`` tensor batches.
        criterion: Loss called as ``criterion(output, label)``.
        optimizer: Optimizer over the model's parameters.
        n_epochs: Number of passes over ``train_loader``.
        val_loader: Optional iterable of validation batches. When given, the
            validation loss is computed after every epoch via ``validate``.

    Returns:
        ``(train_losses, val_losses)``: the mean training loss of each epoch,
        and the validation loss of each epoch (empty without ``val_loader``).
    """
    # Move batches to wherever the model's weights live, so the function does
    # not depend on a module-level `device` variable.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else None

    train_losses = []
    val_losses = []
    for epoch in range(n_epochs):
        model.train()
        running_loss = 0.0
        n_batches = 0
        for member, song, label in train_loader:
            if target_device is not None:
                member = member.to(target_device)
                song = song.to(target_device)
                label = label.to(target_device)
            optimizer.zero_grad()
            output = model(member, song)
            loss = criterion(output, label)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            n_batches += 1
        # Record one value per epoch (the mean batch loss), after the batch
        # loop, rather than the running cumulative sum after every batch.
        train_loss = running_loss / n_batches if n_batches else float('nan')
        train_losses.append(train_loss)

        message = f"Epoch {epoch + 1}/{n_epochs}, Train Loss: {train_loss:.4f}"
        if val_loader is not None:
            val_loss = validate(model, val_loader, criterion)
            val_losses.append(val_loss)
            # Only mention the validation loss when it was actually computed.
            message += f", Val Loss: {val_loss:.4f}"
        print(message)
    return train_losses, val_losses

In [None]:
# Columns that are neither numeric nor datetime, first for songs and then
# for members; these are treated as categorical by the dataset class.
non_numeric_columns = [
    table.select_dtypes(exclude=['number', 'datetime']).columns
    for table in (songs, members)
]
for columns in non_numeric_columns:
    print(columns)

cat_features = [name for columns in non_numeric_columns for name in columns]
print(cat_features)

Index(['song_id', 'genre_ids', 'artist_name', 'composer', 'lyricist'], dtype='object')
Index(['msno', 'gender'], dtype='object')
['song_id', 'genre_ids', 'artist_name', 'composer', 'lyricist', 'msno', 'gender']


In [None]:
# Split the interaction table into labels and inputs.
X_train = train.drop(columns=['target'])
train_labels = train['target'].values

train_dataset = SongMatchingDataset(X_train, members, songs, train_labels, cat_features)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

# Pull one batch to read the feature width of each tower's input.
batch = next(iter(train_loader))
member, song, label = batch
member_input_size, song_input_size = member.size(1), song.size(1)

In [None]:
# Not used by the model below; kept so that the name stays defined.
input_size = X_train.shape[1]

model = SongMatchingModel(member_input_size=member_input_size,
                          song_input_size=song_input_size)
model.to(device)

# The model outputs raw scores, so the loss applies the sigmoid itself.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# train_model takes (model, train_loader, criterion, optimizer, n_epochs,
# val_loader). No `val_loader` is defined in the cells above, so training
# runs without validation. Capturing the result keeps the loss history and
# avoids dumping the returned lists into the cell output.
train_losses, val_losses = train_model(model, train_loader, criterion, optimizer, n_epochs=5)

In [None]:
# The test rows have no labels; zeros are passed as placeholders.
placeholder_labels = np.zeros(len(test))
test_dataset = SongMatchingDataset(test, members, songs, placeholder_labels, cat_features)

# Keep the original row order so predictions line up with the test rows.
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=64)

In [None]:
# Collect one NumPy array of raw model scores per test batch.
predictions = []
model.eval()
with torch.no_grad():
    for member, song, label in test_loader:
        # The model was moved to `device`, so its inputs must be there too.
        member, song = member.to(device), song.to(device)
        output = model(member, song)
        # Copy back to host memory first: `.numpy()` is not available on a
        # tensor that lives on the GPU.
        predictions.append(output.cpu().numpy())