## Part 2. Train, validation and test datasets preparation

## The plan

- ### Data loading
- ### Type preparation
- ### Train, validation and test sets construction
- ### Final datasets saving

In [1]:
import numpy as np
import pandas as pd

from recsys.config import opt
from recsys.helper.memory_usage import num_bytes_format

### Let's load our data

In [2]:
# Load the combined dataset produced earlier.
# NOTE(review): unpickling can execute arbitrary code, so this file must come from a trusted source.
combined_data_path = opt.path_to_data / "combined-all-data.pickle"
data = pd.read_pickle(combined_data_path)
data.head()

Unnamed: 0,msno,song_id,source_system_tab,source_screen_name,source_type,target,song_length,genre_ids,artist_name,composer,lyricist,language,name,isrc,city,bd,gender,registered_via,registration_init_time,expiration_date
0,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=,explore,Explore,online-playlist,1,206471.0,359,Bastille,Dan Smith| Mark Crew,,52.0,Good Grief,GBUM71602854,1,0,,7,20120102,20171005
1,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=,my library,Local playlist more,local-playlist,1,284584.0,1259,Various Artists,,,52.0,Lords of Cardboard,US3C69910183,13,24,female,9,20110525,20170911
2,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=,my library,Local playlist more,local-playlist,1,225396.0,1259,Nas,N. Jones、W. Adams、J. Lordan、D. Ingle,,52.0,Hip Hop Is Dead(Album Version (Edited)),USUM70618761,13,24,female,9,20110525,20170911
3,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=,my library,Local playlist more,local-playlist,1,255512.0,1019,Soundway,Kwadwo Donkoh,,-1.0,Disco Africa,GBUQH1000063,13,24,female,9,20110525,20170911
4,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=,explore,Explore,online-playlist,1,187802.0,1011,Brett Young,Brett Young| Kelly Archer| Justin Ebach,,52.0,Sleep Without You,QM3E21606003,1,0,,7,20120102,20171005


### Change some formats to minimize memory consumption during further data operations

In [3]:
# Memory footprint of the frame before any dtype changes;
# deep=True asks pandas to also inspect the contents of object columns.
bytes_per_column = data.memory_usage(deep=True)
num_bytes = bytes_per_column.sum()
num_bytes_format(num_bytes)

'7.1789 Gb'

In [4]:
# Columns the config lists as numeric or date columns, plus the target and the
# user/item id columns, are excluded; every other column is treated as categorical.
non_categorical_columns = (
    opt.dataset.num_columns
    + opt.dataset.dt_columns
    + [opt.dataset.target_col, opt.dataset.user_id_col, opt.dataset.item_id_col]
)
cat_columns = [name for name in data.columns if name not in non_categorical_columns]

In [5]:
# Converted column by column, in place, to avoid holding a second copy of the frame.
for cat_name in cat_columns:
    data[cat_name] = data[cat_name].astype('category')

# Date columns are stored as YYYYMMDD values, so parse them with an explicit format.
for dt_name in opt.dataset.dt_columns:
    data[dt_name] = pd.to_datetime(data[dt_name], format='%Y%m%d')

data.head()

Unnamed: 0,msno,song_id,source_system_tab,source_screen_name,source_type,target,song_length,genre_ids,artist_name,composer,lyricist,language,name,isrc,city,bd,gender,registered_via,registration_init_time,expiration_date
0,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=,explore,Explore,online-playlist,1,206471.0,359,Bastille,Dan Smith| Mark Crew,,52.0,Good Grief,GBUM71602854,1,0,,7,2012-01-02,2017-10-05
1,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=,my library,Local playlist more,local-playlist,1,284584.0,1259,Various Artists,,,52.0,Lords of Cardboard,US3C69910183,13,24,female,9,2011-05-25,2017-09-11
2,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=,my library,Local playlist more,local-playlist,1,225396.0,1259,Nas,N. Jones、W. Adams、J. Lordan、D. Ingle,,52.0,Hip Hop Is Dead(Album Version (Edited)),USUM70618761,13,24,female,9,2011-05-25,2017-09-11
3,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=,my library,Local playlist more,local-playlist,1,255512.0,1019,Soundway,Kwadwo Donkoh,,-1.0,Disco Africa,GBUQH1000063,13,24,female,9,2011-05-25,2017-09-11
4,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=,explore,Explore,online-playlist,1,187802.0,1011,Brett Young,Brett Young| Kelly Archer| Justin Ebach,,52.0,Sleep Without You,QM3E21606003,1,0,,7,2012-01-02,2017-10-05


In [6]:
# Re-measure the footprint after the dtype conversion to see the saving.
bytes_per_column = data.memory_usage(deep=True)
num_bytes = bytes_per_column.sum()
num_bytes_format(num_bytes)

'2.0735 Gb'

### Not bad, is it? :)

## How to construct train, validation and test sets?
- ### extract random 5% of items and associated responses to consider them as cold
- ### extract random 10% of users and their associated responses - for the cold users test set
- ### drop cold items and users from the remaining data
- ### extract random 20% of the remaining data for the validation and test sets
- ### let's construct two train sets - one with both 1 and 0 target values and the second with only positive values (for some recommender algorithms)

### Extract 5% of items and associated responses to consider them as cold

In [7]:
# Distinct song ids present in the interactions, and how many there are.
unique_songs = data["song_id"].unique()
unique_songs_num = len(unique_songs)
unique_songs_num

359966

In [8]:
# Number of songs to hold out as "cold", taken from the configured ratio.
n = int(unique_songs_num * opt.validation.items_cold_ratio)

# Seed right before sampling so the draw is reproducible; sampling is without replacement.
np.random.seed(opt.seed)
indices = np.random.choice(unique_songs_num, size=n, replace=False)

In [9]:
# Sanity check: every sampled position must appear exactly once.
assert np.unique(indices).size == indices.size, "there are some duplicates"

In [10]:
# Map the sampled positions back to the actual song ids.
cold_items = unique_songs[indices]
cold_items

array(['5RvNrR96qBoGqxO3X8AQ7EFRH5ZIK5HQVna1zURxBiE=',
       'LbA1w5aj+kWCY35fJZXGrTXNXWFO/UlMDstRT1l1A+Q=',
       'WHiXHHkWYzs39dbsTShGO1qfaE+W8pqmQPOwpTIsC/M=', ...,
       'eo0Y0yhNjis+bv+ApvoWCGnlFOwKSLVg7ZemR5cKz7E=',
       '/P8khdW9dQZfvUA9dntWB5esjPgNhnbKB87tP4XWPwU=',
       'NTcPlt+Sn4AtnpLWsoEJs/06SbU5+1a/dl5pmVCzsSI='], dtype=object)

In [11]:
# All interactions whose song was selected as cold, shuffled reproducibly
# and re-indexed from zero.
is_cold_item = data["song_id"].isin(cold_items)
cold_items_data = (
    data.loc[is_cold_item]
    .sample(frac=1, random_state=opt.seed)
    .reset_index(drop=True)
)
cold_items_data.shape

(392174, 20)

In [12]:
# Ten most frequent genre ids among the cold-item interactions.
cold_items_data["genre_ids"].value_counts().head(10)

465     193596
458      85704
444      21590
1609     15084
921      14685
2022      7368
1259      6350
359       4050
2122      3607
958       3190
Name: genre_ids, dtype: int64

In [13]:
# Preview the first rows of the cold-item set.
cold_items_data.head(n=5)

Unnamed: 0,msno,song_id,source_system_tab,source_screen_name,source_type,target,song_length,genre_ids,artist_name,composer,lyricist,language,name,isrc,city,bd,gender,registered_via,registration_init_time,expiration_date
0,gLotRgEt5+ISFcoM/5pv0PH4wl+M4O2cxbSqGSWPN2g=,brIQuNCXBSpWhUw9hCkoYv1ZQVIB3kyL8pwvb8dcKS4=,search,Artist more,top-hits-for-artist,0,219149.0,465,Gareth Gates,,,52.0,Sunshine,GBCTA0300153,1,0,,7,2014-04-16,2017-09-23
1,jYCD4lsV1pcA8mnYAXR18R8ZUMxP8B+3Nc7inSjdx/U=,podlycp3c6tOEDw7q8iQBVgvFjiMpkWWXTlXljGdbTY=,discover,Online playlist more,online-playlist,0,212973.0,444,TWICE,Rado,,31.0,TT,US5TA1600082,5,30,female,9,2010-12-20,2017-11-01
2,nE9hJlIOASHA2um8GseS5W+Pi0MupQn0u5pIbflmu90=,u3Cep8rYmeaOLq3xdjNHg8iFVXwT5RNArzQj2+VjWGM=,search,Search,song,0,758491.0,465,古巨基 (Leo Ku),Asuka Ryou| Bobby Chen| Cai De Cai| Chang Chen...,An Jeong Hun| Bobby Chen| Chang Chen-yue| Chen...,3.0,勁歌金曲2 - 情歌王,HKG580800062,1,0,,7,2015-09-22,2017-04-27
3,SIkJtZ/oYjk+LMyzAf2GEFMwnUvvxpy03JzuObWqxrk=,AxnJSdFWF/yEBP2VvZonrvzTTIiuwdsMXolQFpG71/w=,my library,Local playlist more,local-library,0,227892.0,465,五月天 (Mayday),怪獸,阿信,3.0,有些事現在不做 一輩子都不會做了,TWK231105502,13,33,male,7,2015-12-17,2017-09-17
4,w5REazVOQB6mcBXC6SG+fbYq0xZ54IuLz6FwhuUN1SI=,GdRVw2uQu6JZUNPa0rdAkFoUoIdJz64mHOU0PuizP78=,discover,Online playlist more,online-playlist,1,286824.0,465,五月天 (Mayday),阿信,阿信,3.0,擁抱,TWK231307402,4,19,male,9,2011-06-11,2018-02-15


### Extract 10% of users as cold

In [14]:
# Distinct user ids present in the interactions, and how many there are.
unique_users = data["msno"].unique()
unique_users_num = len(unique_users)
unique_users_num

30755

In [15]:
# Number of users to hold out as "cold", taken from the configured ratio.
n = int(unique_users_num * opt.validation.users_cold_ratio)

# Seed right before sampling so the draw is reproducible; sampling is without replacement.
np.random.seed(opt.seed)
indices = np.random.choice(unique_users_num, size=n, replace=False)

# Map the sampled positions back to the actual user ids.
cold_users = unique_users[indices]
cold_users

array(['5hebwI9Q5dTNBNh81QjKXhv2y+2AbR+4hAFZcCQgL0g=',
       'E9jsIxPWhebfnR8cbqPgS+FxCt0zBSXP+72mYRFg0+U=',
       'Mt3Ac1nS394uSrrQEHaiMUsR/bFNTMU9ko0D/OAShCg=', ...,
       'L8ghWRKIjvED5R+yBIIMhmXpCyIce7+J3rB0AZ5TWgI=',
       'FeHK362Hi6ZpM5/MXkQEaD4uuC8EkoWfkB6/kLYEQng=',
       'lNMg/h7T/40zQpB0ddclHG4W6jbNpJPcPIV1YJOy2gk='], dtype=object)

In [16]:
# All interactions made by a user selected as cold, shuffled reproducibly
# and re-indexed from zero.
is_cold_user = data["msno"].isin(cold_users)
cold_users_data = (
    data.loc[is_cold_user]
    .sample(frac=1, random_state=opt.seed)
    .reset_index(drop=True)
)
cold_users_data.shape

(715482, 20)

In [17]:
# Ten most frequent genre ids among the cold-user interactions.
cold_users_data["genre_ids"].value_counts().head(10)

465     363491
458     120939
921      34856
1609     29737
444      24290
1259     17700
2022     14465
359       9875
139       6294
451       6241
Name: genre_ids, dtype: int64

### Drop cold items and users from the data

In [18]:
# Keep only rows that involve neither a cold song nor a cold user,
# then shuffle reproducibly and re-index from zero.
is_cold_row = data["song_id"].isin(cold_items) | data["msno"].isin(cold_users)
data_new = (
    data.loc[~is_cold_row]
    .sample(frac=1, random_state=opt.seed)
    .reset_index(drop=True)
)
data_new.shape

(6307916, 20)

In [19]:
# Preview the first rows of the shuffled data without cold users and items.
data_new.head(n=5)

Unnamed: 0,msno,song_id,source_system_tab,source_screen_name,source_type,target,song_length,genre_ids,artist_name,composer,lyricist,language,name,isrc,city,bd,gender,registered_via,registration_init_time,expiration_date
0,xvCGZ5McgEJlkYx2HNO1/CkD1CYjlA/gQL36iX8TYBY=,868uXMrBS8WwQulNX0W+vk6sJA6Hs35YNtFYceXgX/I=,radio,Radio,radio,0,255373.0,465,張震嶽,,,3.0,勇氣,TWA450479802,1,0,,7,2013-05-14,2017-09-24
1,jOlB9i8NKGG2oudUKq9ObRyt3sXQnWWwcei1+s8sNIA=,a4TbK5V15pj3YZUOGa9h2U3t0OsE+3aiFw41mNlcgcw=,my library,Local playlist more,local-library,1,248790.0,465,吳汶芳 (Fang Wu),吳汶芳,吳汶芳,3.0,孤獨的總和 (Accumulated Loneliness),TWA211328806,13,31,female,9,2011-04-19,2017-06-20
2,fe+6batTKL/NxXhhzfDO3yBUmFmQUQAgRpYmF2T0t4E=,5RLsVkl6PTt3pIyWvojM0z2LblVysXHxoc7Mc4LIFL8=,my library,Local playlist more,local-library,1,178887.0,921,Noel Coward,,,52.0,Poor Little Rich Girl,USV291338277,5,33,female,9,2008-08-05,2017-10-06
3,GVcP7SPQ/IgiEt2uxaXaRH5lFA5tBe8T74/Etk7cnG8=,7Xxzo9Z584Z1AhkaLLWEcpzcEuxcquGE52Ztc4879YI=,discover,Online playlist more,online-playlist,1,275136.0,465,田馥甄 (Hebe),楊子樸,施人誠,3.0,寂寞寂寞就好,TWD951043107,1,0,,7,2011-02-25,2017-09-12
4,XIfoYUt6vkAqVDRKJRVEHkV0jsaD+fif2EqxsobpEqg=,rA+H/y66yC2qK0v926aWRFWngwJmL+IhGhJN4fBS0XM=,radio,Radio,radio,0,243983.0,465,范瑋琪 (Christine Fan),陳小霞,姚若龍,3.0,最重要的決定,TWA211125202,5,22,female,7,2013-07-29,2017-09-30


### Extract 10% of remaining data for validation and 10% for test

In [20]:
# data_new is already shuffled, so contiguous slices give a random split:
# [0, train_end) -> train, [train_end, val_end) -> validation, [val_end, end) -> test.
row_num = len(data_new)
remain_ration = 1 - opt.validation.test_ratio - opt.validation.val_ratio

train_end = int(remain_ration * row_num)
val_end = int((remain_ration + opt.validation.val_ratio) * row_num)

data_train_all = data_new.iloc[:train_end]
data_val = data_new.iloc[train_end:val_end]
data_test = data_new.iloc[val_end:]
data_val.shape, data_test.shape

((630792, 20), (630792, 20))

### Extract positive train set

In [21]:
# Positive-only view of the train set (rows whose target equals 1).
is_positive = data_train_all["target"] == 1
data_train_pos = data_train_all.loc[is_positive]
data_train_all.shape, data_train_pos.shape

((5046332, 20), (2539875, 20))

### Check pos/neg proportion

In [22]:
# Target balance in the full dataset.
data["target"].value_counts()

1    3714656
0    3662762
Name: target, dtype: int64

In [23]:
# Target balance after removing cold users and items.
data_new["target"].value_counts()

1    3174752
0    3133164
Name: target, dtype: int64

In [24]:
# Target balance in the train set.
data_train_all["target"].value_counts()

1    2539875
0    2506457
Name: target, dtype: int64

In [25]:
# Target balance in the validation set.
data_val["target"].value_counts()

1    317904
0    312888
Name: target, dtype: int64

In [26]:
# Target balance in the test set.
data_test["target"].value_counts()

1    316973
0    313819
Name: target, dtype: int64

In [27]:
# Target balance in the cold-user set.
cold_users_data["target"].value_counts()

1    360046
0    355436
Name: target, dtype: int64

In [28]:
# Target balance in the cold-item set.
cold_items_data["target"].value_counts()

1    199104
0    193070
Name: target, dtype: int64

### Save the data

In [29]:
# Output file name -> frame to persist. Every frame is written as parquet
# without its index, in the order listed here.
frames_to_save = {
    "prepared-train-all.parquet": data_train_all,
    "prepared-train-pos.parquet": data_train_pos,
    "prepared-validation.parquet": data_val,
    "prepared-test.parquet": data_test,
    "cold-users-data.parquet": cold_users_data,
    "cold-items-data.parquet": cold_items_data,
}
for file_name, frame in frames_to_save.items():
    frame.to_parquet(opt.path_to_data / file_name, index=False)