In [1]:
# Imports
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import sys
from syspath import current_path
sys.path.append('../../')

import numpy as np
import pandas as pd 
import os
import pickle
import datetime
import time
import scipy.sparse as sp
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
def new_test(R_train, R_test):
    ind = (R_train==0).nonzero()
    mask_array = sp.csr_matrix(R_test.shape)
    mask_array[ind] = True
    R_test_new = R_test.multiply(mask_array)
    return R_test_new

In [4]:
# raw Last.fm data available at http://ocelma.net/MusicRecommendationDataset/
# PATH = '../../data/lastfm/raw/userid-timestamp-artid-artname-traid-traname.tsv'
# cols = ['userId', 'timestamp', 'artistId', 'artist-name','songId', 'song-name']                   

df = pd.read_csv(PATH, sep="\t", names=cols, error_bad_lines=False)

In [5]:
df.apply(lambda x: len(x.unique()))

userId              992
timestamp      17454739
artistId         107296
artist-name      173923
songId           960403
song-name       1083481
dtype: int64

In [5]:
df = df[df.songId.notnull()]
df = df[df.artistId.notnull()]
df.head()

Unnamed: 0,userId,timestamp,artistId,artist-name,songId,song-name
10,user_000001,2009-05-04T13:06:09Z,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,f7c1f8f8-b935-45ed-8fc8-7def69d92a10,The Last Emperor (Theme)
12,user_000001,2009-05-04T12:55:34Z,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,475d4e50-cebb-4cd0-8cd4-c3df97987962,Tibetan Dance (Version)
14,user_000001,2009-05-03T15:48:25Z,ba2f4f3b-0293-4bc8-bb94-2f73b5207343,Underworld,dc394163-2b78-4b56-94e4-658597a29ef8,"Boy, Boy, Boy (Switch Remix)"
15,user_000001,2009-05-03T15:37:56Z,ba2f4f3b-0293-4bc8-bb94-2f73b5207343,Underworld,340d9a0b-9a43-4098-b116-9f79811bd508,Crocodile (Innervisions Orchestra Mix)
16,user_000001,2009-05-03T15:14:53Z,a16e47f5-aa54-47fe-87e4-bb8af91a9fdd,Ennio Morricone,0b04407b-f517-4e00-9e6a-494795efc73e,Ninna Nanna In Blu (Raw Deal Remix)


In [6]:
n = 3

# df = bet_df.groupby('league_id')['placed_date'].count()
artists = df.groupby('artistId')['userId'].nunique()
prev_len = len(artists)
artists = artists[artists>n].index.tolist()
df = df.query('artistId in @artists')
print('Removed {} artist ids to {}'.format(prev_len-len(artists),len(artists)))

Removed 57130 artist ids to 26775


In [7]:
n = 5

# df = bet_df.groupby('league_id')['placed_date'].count()
users = df.userId.value_counts()
prev_len = len(users)
users = users[users>n].index.tolist()
df = df.query('userId in @users')
print('Removed {} user ids to {}'.format(prev_len-len(users),len(users)))

Removed 3 user ids to 989


In [9]:
df.apply(lambda x: len(x.unique()))

userId              989
timestamp      14986031
artistId          26775
artist-name       26387
songId           766224
song-name        562952
dtype: int64

In [10]:
# ndf = df.groupby('songId')['userId'].nunique()
# ndf

In [8]:
df['timestamp'] = pd.to_datetime(df.timestamp)
df["date"] = [d.date() for d in df["timestamp"]]

In [9]:
users = list(df.userId.unique())
user_dict = dict(zip(users,[i for i in range(len(users))]))
df['userId'] = df['userId'].apply(lambda x: user_dict[x])

items = list(df.artistId.unique())
item_dict = dict(zip(items,[i for i in range(len(items))]))
df['artistId'] = df['artistId'].apply(lambda x: item_dict[x])

In [10]:
dates = sorted(list(df['date'].unique()))
len(dates)

1589

In [11]:
train = 24
val = 7
test = 8
total = train+val+test
n_days = len(dates)
train_days = int(n_days*train/total)
val_days = int(n_days*val/total)
test_days = n_days - (train_days+val_days)
print(train_days, val_days, test_days, n_days)

977 285 327 1589


In [12]:
main_train_df = df[df['date'] <= dates[-test_days]]
test_df = df[df['date'] > dates[-test_days]]

In [16]:
main_train_df.to_pickle('../../data/lastfm/processed/bet_df.pkl')

In [17]:
train_users = main_train_df['userId'].unique()
test_users = test_df['userId'].unique()

valid_users = list(set(train_users)&set(test_users))
print('{} valid users from {} test and {} train'.format(len(valid_users),len(test_users),len(train_users)))

817 valid users from 934 test and 872 train


In [18]:
# filter for valid users
full_train_df = main_train_df[main_train_df['userId'].isin(valid_users)]
test_df = test_df[test_df['userId'].isin(valid_users)]

In [19]:
train_leagues = full_train_df['artistId'].unique()
test_leagues = test_df['artistId'].unique()

valid_leagues = list(set(train_leagues)|set(test_leagues))
print('{} valid artists from {} test and {} train'.format(len(valid_leagues),len(test_leagues),len(train_leagues)))

26775 valid artists from 25274 test and 26413 train


In [20]:
train_missing = np.setdiff1d(valid_leagues,train_leagues)
test_missing = np.setdiff1d(valid_leagues,test_leagues)

with open('../../data/lastfm/processed/train_missing.txt', "wb") as fp:
    pickle.dump(train_missing, fp)
with open('../../data/lastfm/processed/test_missing.txt', "wb") as fp:
    pickle.dump(test_missing, fp)

In [25]:
# get matrics
full_train_df = full_train_df.groupby(['userId', 'artistId']).size().unstack(fill_value=0)
test_df = test_df.groupby(['userId', 'artistId']).size().unstack(fill_value=0)
full_train_df

artistId,0,1,2,3,4,5,6,7,8,9,...,26765,26766,26767,26768,26769,26770,26771,26772,26773,26774
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,82,700,0,4,0,0,0,0,0,78,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,3,1,0,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
984,0,34,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
985,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
986,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
987,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
print(full_train_df.shape, test_df.shape)

# add missing columns to each
for league in valid_leagues:
    if league not in full_train_df:
        full_train_df[league] = 0
    if league not in test_df:
        test_df[league] = 0

(817, 26413) (817, 25274)


In [27]:
# line up columns
cols = list(full_train_df.columns.values)

full_train_df = full_train_df.reindex(columns=cols)
test_df = test_df.reindex(columns=cols)

print(full_train_df.shape, test_df.shape)

(817, 26775) (817, 26775)


In [28]:
# turn df into sparse matrices
full_train_mat = sp.csr_matrix(full_train_df.values)
test_mat = sp.csr_matrix(test_df.values)

# get masked test set
new_test_mat = new_test(full_train_mat, test_mat)

  
  self._set_arrayXarray(i, j, x)


In [29]:
new_test_mat

<817x26775 sparse matrix of type '<class 'numpy.float64'>'
	with 115948 stored elements in Compressed Sparse Row format>

In [30]:
test_mat

<817x26775 sparse matrix of type '<class 'numpy.longlong'>'
	with 267056 stored elements in Compressed Sparse Row format>

In [31]:
full_train_mat

<817x26775 sparse matrix of type '<class 'numpy.longlong'>'
	with 473713 stored elements in Compressed Sparse Row format>

In [32]:
users_mapped = list(full_train_df.columns.values)
items_mapped = list(full_train_df.index.values)

with open('../../data/lastfm/processed/test_users.txt', "wb") as fp:
    pickle.dump(users, fp)
with open('../../data/lastfm/processed/test_items.txt', "wb") as fp:
    pickle.dump(cols, fp)
with open('../../data/lastfm/processed/test_users_map.txt', "wb") as fp:
    pickle.dump(users_mapped, fp)
with open('../../data/lastfm/processed/test_items_map.txt', "wb") as fp:
    pickle.dump(items_mapped, fp)

In [33]:
sp.save_npz('../../data/lastfm/processed/full_train_mat.npz', full_train_mat)
sp.save_npz('../../data/lastfm/processed/test_unmasked.npz', test_mat)
sp.save_npz('../../data/lastfm/processed/test_masked.npz', new_test_mat)

In [34]:
dates = sorted(list(main_train_df['date'].unique()))
print(len(dates))

1263


## TRAIN/VAL SPLIT

In [47]:
train_df = main_train_df[main_train_df['date'] <= dates[-val_days]]
val_df = main_train_df[main_train_df['date'] > dates[-val_days]]

In [48]:
train_users = train_df['userId'].unique()
val_users = val_df['userId'].unique()

valid_users = list(set(train_users)&set(val_users))
print('{} valid users from {} test and {} train'.format(len(valid_users),len(valid_users),len(train_users)))

643 valid users from 643 test and 758 train


In [49]:
# filter for valid users
train_df = train_df[train_df['userId'].isin(valid_users)]
val_df = val_df[val_df['userId'].isin(valid_users)]

In [50]:
train_leagues = train_df['artistId'].unique()
val_leagues = val_df['artistId'].unique()

valid_leagues = list(set(train_leagues)|set(val_leagues))
print('{} valid leagues from {} test and {} train'.format(len(valid_leagues),len(val_leagues),len(train_leagues)))

26307 valid leagues from 24145 test and 24828 train


In [51]:
train_val_missing = np.setdiff1d(valid_leagues,train_leagues)
val_missing = np.setdiff1d(valid_leagues,val_leagues)

print(len(train_val_missing),len(val_missing))
with open('../../data/lastfm/processed/train_val_missing.txt', "wb") as fp:
    pickle.dump(train_val_missing, fp)
with open('../../data/lastfm/processed/test_missing.txt', "wb") as fp:
    pickle.dump(val_missing, fp)

1479 2162


In [52]:
train_val_missing = np.setdiff1d(valid_leagues,train_leagues)
len(train_val_missing)

1479

In [53]:
# get matrics
train_df = train_df.groupby(['userId', 'artistId']).size().unstack(fill_value=0)
val_df = val_df.groupby(['userId', 'artistId']).size().unstack(fill_value=0)

print(train_df.shape, val_df.shape)

# add missing columns to each
for league in valid_leagues:
    if league not in train_df:
        train_df[league] = 0
    if league not in val_df:
        val_df[league] = 0

# line up columns
cols = sorted(list(train_df.columns.values))
train_df = train_df.reindex(columns=cols)
val_df = val_df.reindex(columns=cols)

print(train_df.shape, val_df.shape)

# turn df into sparse matrices
train_mat = sp.csr_matrix(train_df.values)
val_mat = sp.csr_matrix(val_df.values)

# get masked test set
new_val_mat = new_test(train_mat, val_mat)

(643, 24828) (643, 24145)
(643, 26307) (643, 26307)




In [54]:
train_mat

<643x26307 sparse matrix of type '<class 'numpy.longlong'>'
	with 330431 stored elements in Compressed Sparse Row format>

In [55]:
val_mat

<643x26307 sparse matrix of type '<class 'numpy.longlong'>'
	with 222436 stored elements in Compressed Sparse Row format>

In [56]:
new_val_mat

<643x26307 sparse matrix of type '<class 'numpy.float64'>'
	with 101409 stored elements in Compressed Sparse Row format>

In [57]:
sp.save_npz('../../data/lastfm/processed/train_ex_val.npz', train_mat)
sp.save_npz('../../data/lastfm/processed/val_unmasked.npz', val_mat)
sp.save_npz('../../data/lastfm/processed/val_masked.npz', new_val_mat)

In [58]:
items_train = list(train_df.columns.values)
users_train = list(train_df.index.values)

with open('../../data/lastfm/processed/train_users.txt', "wb") as fp:
    pickle.dump(users_train, fp)
with open('../../data/lastfm/processed/train_items.txt', "wb") as fp:
    pickle.dump(items_train, fp)

## iGC-MC PREPROCESSING

In [2]:
# load train data
data = {
    'train': sp.load_npz('../../data/lastfm/processed/test/full_train_mat.npz')
}

In [3]:
data['train']

<817x26775 sparse matrix of type '<class 'numpy.int64'>'
	with 473713 stored elements in Compressed Sparse Row format>

In [5]:
# data['val']

In [6]:
df = pd.DataFrame.sparse.from_spmatrix(data['train'])
df['userId'] = df.index
df = df.melt('userId', var_name='itemId', value_name='rating')
df = df[df.rating != 0]
# save 
path = '../../data/lastfm/processed/test/train_triplet_df.pkl'
df.to_pickle(path)