### Решение с помощью LightFM

In [None]:
!pip install LightFM
!pip install py7zr pandas
!pip install datatable

In [None]:
import numpy as np
import pandas as pd
import datatable as dt
from sklearn.preprocessing import LabelEncoder
import scipy.sparse as sp
from lightfm import LightFM

In [None]:
# Colab only: attach Google Drive so the competition files under /gdrive
# can be read by the loading cell below.
from google.colab import drive

gdrive_mount_point = "/gdrive"
drive.mount(gdrive_mount_point, force_remount=True)

Mounted at /gdrive


In [None]:
# Directory on the mounted Google Drive that holds the competition CSV files.
DATA_DIR = '/gdrive/MyDrive/vk_contest'


def load_table(filename, **fread_kwargs):
    """Read one CSV from DATA_DIR with datatable and return a pandas DataFrame.

    Args:
        filename: File name relative to DATA_DIR, e.g. 'train.csv'.
        **fread_kwargs: Extra keyword arguments forwarded to `dt.fread`.

    Returns:
        The parsed file converted with `Frame.to_pandas()`.
    """
    return dt.fread(f'{DATA_DIR}/{filename}', **fread_kwargs).to_pandas()


train = load_table('train.csv')
test = load_table('test.csv')
songs = load_table('songs.csv')
members = load_table('members.csv')
# `fill=True` is passed for this file only, exactly as in the original read.
songs_extra_info = load_table('song_extra_info.csv', fill=True)

In [None]:
# Show column names, dtypes and memory footprint of the raw test set.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2556790 entries, 0 to 2556789
Data columns (total 6 columns):
 #   Column              Dtype 
---  ------              ----- 
 0   id                  int32 
 1   msno                object
 2   song_id             object
 3   source_system_tab   object
 4   source_screen_name  object
 5   source_type         object
dtypes: int32(1), object(5)
memory usage: 107.3+ MB


Преобразование в категориальные признаки

In [None]:
# Re-type every object-dtype column as a pandas categorical in both frames.
# The column list is taken from `train`, so `test` must carry the same columns.
object_columns = [column for column in train.columns if train[column].dtype == object]
for column in object_columns:
    train[column] = train[column].astype('category')
    test[column] = test[column].astype('category')
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7377418 entries, 0 to 7377417
Data columns (total 6 columns):
 #   Column              Dtype   
---  ------              -----   
 0   msno                category
 1   song_id             category
 2   source_system_tab   category
 3   source_screen_name  category
 4   source_type         category
 5   target              bool    
dtypes: bool(1), category(5)
memory usage: 82.4 MB


Объединение (left join по `song_id`) таблицы взаимодействий с дополнительной информацией о песнях, чтобы получить названия песен для дальнейшего удобства


In [None]:
# Left join on `song_id`: every interaction row is kept and gains the columns
# of `songs_extra_info`; rows without a match get missing values there.
train = pd.merge(train, songs_extra_info, how='left', on='song_id')
train.head()

Unnamed: 0,msno,song_id,source_system_tab,source_screen_name,source_type,target,name,isrc
0,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=,explore,Explore,online-playlist,True,Good Grief,GBUM71602854
1,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=,my library,Local playlist more,local-playlist,True,Lords of Cardboard,US3C69910183
2,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=,my library,Local playlist more,local-playlist,True,Hip Hop Is Dead(Album Version (Edited)),USUM70618761
3,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=,my library,Local playlist more,local-playlist,True,Disco Africa,GBUQH1000063
4,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=,explore,Explore,online-playlist,True,Sleep Without You,QM3E21606003


Преобразование таргета к нужному типу данных

In [None]:
# Keep only the columns used for modelling and turn the boolean target into
# 0/1 integers. An explicit dtype cast replaces the dict-based
# `DataFrame.replace`, which is slower on a frame of this size and relies on
# pandas' implicit downcasting to end up with an integer column.
train = train[['msno', 'name', 'target']].astype({'target': 'int64'})

Объявление энкодеров для подготовки данных к дальнейшему обучению

In [None]:
# Notebook-level encoders shared by the cells below: once fitted, one maps
# user ids (`msno`) and the other maps song names to integer indices.
user_encoder= LabelEncoder()
song_encoder= LabelEncoder()

def get_interactions(users, songs, target):
    """Build a sparse user x song interaction matrix from parallel sequences.

    Args:
        users: User ids; each must be known to the fitted `user_encoder`.
        songs: Song names; each must be known to the fitted `song_encoder`.
        target: Interaction values aligned element-wise with users/songs.

    Returns:
        scipy.sparse.coo_matrix of shape (n_users, n_songs), where the
        dimensions are the number of classes of each fitted encoder.

    Raises:
        ValueError: Propagated from `LabelEncoder.transform` when a label
            was not seen during fitting.
    """
    user_encodes = user_encoder.transform(users)
    song_encodes = song_encoder.transform(songs)
    # Size the matrix from the encoders' vocabularies, not from the labels
    # present in this call: encoded indices range over all fitted classes, so
    # a subset of the data would otherwise yield indices outside the shape.
    n = len(user_encoder.classes_)
    m = len(song_encoder.classes_)
    return sp.coo_matrix((target, (user_encodes, song_encodes)), shape=(n, m))

Удаление дубликатов, которые могли быть в данных

In [None]:
# Drop rows that are identical in all remaining columns (msno, name, target).
# NOTE(review): a (msno, name) pair that occurs with both target values is
# still kept twice — confirm that is acceptable for the interaction matrix.
train = train.drop_duplicates()

In [None]:
# Learn the label -> index vocabularies from the training frame, then encode
# the same rows into the sparse interaction matrix used for fitting.
for encoder, column in ((user_encoder, 'msno'), (song_encoder, 'name')):
    encoder.fit(train[column])
df = get_interactions(users=train['msno'], songs=train['name'], target=train['target'])

Преобразование тестовой части данных

In [None]:
# Same left join as for the training frame: attach the `songs_extra_info`
# columns to every test row, keyed by `song_id`.
test = pd.merge(test, songs_extra_info, how='left', on='song_id')
test.head()

Unnamed: 0,id,msno,song_id,source_system_tab,source_screen_name,source_type,name,isrc
0,0,V8ruy7SGk7tDm3zA51DPpn6qutt+vmKMBKa21dp54uM=,WmHKgKMlp1lQMecNdNvDMkvIycZYHnFwDT72I5sIssc=,my library,Local playlist more,local-library,愛其實很殘忍,TWUM71400047
1,1,V8ruy7SGk7tDm3zA51DPpn6qutt+vmKMBKa21dp54uM=,y/rsZ9DC7FwK5F2PK2D5mj+aOBUJAjuu3dZ14NgE0vM=,my library,Local playlist more,local-library,她說,TWB671005201
2,2,/uQAlrAkaczV+nWCd2sPF2ekvXPRipV7q0l+gbLuxjw=,8eZLFOdGVdXBSqoAv5nsLigeH2BvKXzTQYtUM53I0k4=,discover,,song-based-playlist,subarashiki nichijo,JPWP01070260
3,3,1a6oo/iXKatxQx4eS9zTVD+KlSVaAFbTIqVvwLC1Y0k=,ztCf8thYsS4YN3GcIL/bvoxLm/T5mYBVKOO4C9NiVfQ=,radio,Radio,radio,Hold Me| Thrill Me| Kiss Me| Kill Me,GBAAN0201228
4,4,1a6oo/iXKatxQx4eS9zTVD+KlSVaAFbTIqVvwLC1Y0k=,MKVMpslKcQhMaFEgcEQhEfi5+RZhMYlU3eRDpySrH8Y=,radio,Radio,radio,Om Yoga,ITO101121898


Энкодинг пользователей и песен через LabelEncoder

In [None]:
user_ids = test["msno"]
track_ids = test["name"]

user_classes = set(user_encoder.classes_)
track_classes = set(song_encoder.classes_)

# A test row can be scored only if both its user and its song name were seen
# when the encoders were fitted. The membership test is vectorised with
# `isin` instead of a Python loop over every row.
# NOTE(review): rows with a missing song name follow `Series.isin` matching
# rules for missing values — confirm this agrees with how the encoder was fitted.
known_mask = user_ids.isin(user_classes) & track_ids.isin(track_classes)

# Positional row numbers of the scorable rows; `.iloc` below keeps the
# selection positional regardless of the frame's index labels.
valid_indices = np.flatnonzero(known_mask.to_numpy())

user_embeddings = user_encoder.transform(user_ids.iloc[valid_indices])
track_embeddings = song_encoder.transform(track_ids.iloc[valid_indices])

Обучение модели с гиперпараметрами, частично подобранными заранее (через Optuna на таких же данных и для той же задачи): на полноценный подбор на этих данных у меня не хватило вычислительных мощностей


In [None]:
# Hyperparameters gathered in one mapping; the values are unchanged.
# NOTE(review): `epochs` is not passed to `fit`, so the library default is
# used — confirm that this is intended.
lightfm_params = dict(no_components=100, k=5, learning_rate=0.05, random_state=153)
model = LightFM(**lightfm_params)
model.fit(df, num_threads=5)

<lightfm.lightfm.LightFM at 0x7b680389b970>

In [None]:
# Every test row starts at score 0.0; only rows whose user and song were both
# seen in training are overwritten with model predictions. Rows left at 0.0
# are the unseen (cold-start) cases and therefore pass a `>= 0` threshold.
scores = np.zeros(len(user_ids))
scores[valid_indices] = model.predict(user_embeddings, track_embeddings)

In [None]:
# Work on a copy so that adding the prediction column leaves `test` untouched.
submit_data = test.copy()

Исходя из предположения, что отрицательные значения соответствуют негативному опыту, порог (threshold) был опытным путём выбран равным нулю

In [None]:
# Binarise the raw scores: non-negative -> 1, negative -> 0. This is one
# vectorised comparison instead of a per-row Python lambda, and the column is
# written once rather than assigned and then overwritten.
submit_data["target"] = np.where(np.asarray(scores) >= 0, 1, 0)

In [None]:
# Keep only the two columns that are written to the submission file.
submit_data = submit_data.loc[:, ['id', 'target']]
submit_data.head()

Unnamed: 0,id,target
0,0,0
1,1,1
2,2,0
3,3,0
4,4,0


In [None]:
# Class balance of the (de-duplicated) training target, for comparison with
# the distribution of the predictions below.
train['target'].value_counts()

1    3618298
0    3592159
Name: target, dtype: int64

In [None]:
# Distribution of the predicted labels in the submission.
submit_data["target"].value_counts()

1    1431849
0    1124941
Name: target, dtype: int64

In [None]:
# Write the predictions to disk without the index column, then preview ten rows.
submission_path = "submit.csv"
submit_data.to_csv(submission_path, index=False)
submit_data.head(10)

Unnamed: 0,id,target
0,0,0
1,1,1
2,2,0
3,3,0
4,4,0
5,5,0
6,6,0
7,7,0
8,8,0
9,9,1
