In [1]:
import time
import datetime
import numpy as np
import pandas as pd
import os
import random

In [2]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

In [3]:
print(time.strftime("%Y-%m-%d %H:%M:%S"))

2018-05-05 20:56:50


In [3]:
class Timer():
    def __init__(self):
        self.info = 'main'
        self.start_time = time.time()
    
    def start(self, info):
        self.info = info
        self.start_time = time.time()
        self.checkpoint('start', elapsed_on=False)
    
    def end(self):
        self.checkpoint(' end ')
        
    def checkpoint(self, tag, elapsed_on=True):
        if elapsed_on:
            elapsed = datetime.timedelta(seconds=round(time.time() - self.start_time))
            expanded_info = self.info + ' [time elapsed: %s]' % str(elapsed)
        else:
            expanded_info = self.info
        self.output(tag, info=expanded_info)
        
    def output(self, tag=' '*5, info=''):
        if type(info) != type(''):
            info = str(info)
        print('[%s] (%s) %s' % (Timer.get_current_time(), tag, info))
    
    @staticmethod
    def get_current_time():
        return time.strftime("%Y-%m-%d %H:%M:%S")

timer = Timer()
sub_timer = Timer()

# Load Data

In [4]:
timer.start('Load Data')
# directory = '../data/split/'
# df_train = pd.read_csv(directory + 'train.csv')
# df_test_warm = pd.read_csv(directory + 'test_warm.csv')
# df_test_cold_user = pd.read_csv(directory + 'test_cold_user.csv')
# df_test_cold_item = pd.read_csv(directory + 'test_cold_item.csv')

[2018-05-05 21:24:11] (start) Load Data


In [5]:
directory = '../data/context/'
df_event_context = pd.read_csv(directory + 'event_context.csv')
df_song_context = pd.read_csv(directory + 'song_context.csv')
df_user_context = pd.read_csv(directory + 'user_context.csv')
df_event_context.drop(columns=['Unnamed: 0'], inplace=True)
df_song_context.drop(columns=['Unnamed: 0'], inplace=True)
df_user_context.drop(columns=['Unnamed: 0'], inplace=True)
timer.checkpoint('context')

[2018-05-05 21:24:18] (context) Load Data [time elapsed: 0:00:07]


In [6]:
num_user = len(df_user_context.user_id.unique())
num_item = len(df_song_context.song_id.unique())
print (num_user)
print (num_item)

30755
359966


In [7]:
class Data():
    def __init__(self, name):
        '''
        user_list: list(int), the list of user id's used in the dataset
        target_set: list(set), set of target items for each user
        item_list: list(numpy array), list of items used in the dataset for each user
        '''
        self.name = name
        self.df = None
        self.user_list = None
        self.item_list = None
        self.target_set = None
    
    def load(self, filename):
        self.df = pd.read_csv(filename)
        # prepare user list
        self.user_list = self.df['user_id'].unique()
        
        # prepare item list
        self.item_list = [[] for i in range(num_user)]
        self.df.apply(
            lambda row: self.item_list[row['user_id']].append(row['song_id']),
            axis=1
        )
        self.item_list = list(map(np.array, self.item_list))
        
        # prepare target set
        self.target_set = [set() for i in range(num_user)]
        self.df[self.df['target'] == 1].apply(
            lambda row: self.target_set[row['user_id']].add(row['song_id']),
            axis=1
        )

def load_split(name):
    directory = '../data/split/'
    data = Data(name)
    data.load(directory + name + '.csv')
    return data

In [8]:
# data_train = load_split('train')
# data_test_warm = load_split('test_warm')
# data_test_cold_user = load_split('test_cold_user')
# data_test_cold_item = load_split('test_cold_item')
# timer.end()

In [9]:
# # dump the class for more efficient data preparing
# import pickle
# with open('../data/split/data_train.pickle', 'wb') as handle:
#     pickle.dump(data_train, handle)
# with open('../data/split/data_test_cold_user.pickle', 'wb') as handle:
#     pickle.dump(data_test_cold_user, handle)
# with open('../data/split/data_test_cold_item.pickle', 'wb') as handle:
#     pickle.dump(data_test_cold_item, handle)

In [10]:
# load the data class
import pickle
with open('../data/split/data_train.pickle', 'rb') as handle:
    data_train = pickle.load(handle)
with open('../data/split/data_test_warm.pickle', 'rb') as handle:
    data_test_warm = pickle.load(handle)
with open('../data/split/data_test_cold_user.pickle', 'rb') as handle:
    data_test_cold_user = pickle.load(handle)
with open('../data/split/data_test_cold_item.pickle', 'rb') as handle:
    data_test_cold_item = pickle.load(handle)
timer.end()

[2018-05-05 21:24:22] ( end ) Load Data [time elapsed: 0:00:11]


In [11]:
data_train.df.drop(columns=['Unnamed: 0'], inplace=True)
data_test_warm.df.drop(columns=['Unnamed: 0'], inplace=True)
data_test_cold_user.df.drop(columns=['Unnamed: 0'], inplace=True)
data_test_cold_item.df.drop(columns=['Unnamed: 0'], inplace=True)

In [12]:
import keras
from keras.models import Model
from keras.layers import Dense, Input, Embedding, Dropout, Activation, Reshape, Lambda, Multiply
from keras.layers.merge import concatenate, dot
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.regularizers import l2
from keras.initializers import RandomUniform, RandomNormal, TruncatedNormal, Zeros
from keras.optimizers import RMSprop, Adam, SGD
from sklearn.metrics import mean_squared_error
import tensorflow as tf

Using TensorFlow backend.


### Define and load the MF model for comparing

In [13]:
REG_LAMBDA = 0
EMBED_DIM = 64

vocab_size = num_user
user_embeddings = Embedding(
    input_dim = vocab_size,
    output_dim = EMBED_DIM,
    embeddings_initializer = RandomUniform(minval=-0.1, maxval=0.1),
    embeddings_regularizer = l2(REG_LAMBDA),
    input_length = 1,
    name = 'user_embed',
    trainable=True)

vocab_size = num_item
item_embeddings = Embedding(
    input_dim = vocab_size,
    output_dim = EMBED_DIM,
    embeddings_initializer = RandomUniform(minval=-0.1, maxval=0.1),
    embeddings_regularizer=l2(REG_LAMBDA),
    input_length=1,
    name = 'item_embed',
    trainable=True)

# embedding of user id
uid_input = Input(shape=(1,), dtype='int32')
embedded_user = user_embeddings(uid_input)
embedded_user = Reshape((EMBED_DIM,))(embedded_user)

# embedding of song id
iid_input = Input(shape=(1,), dtype='int32')
embedded_item = item_embeddings(iid_input)
embedded_item = Reshape((EMBED_DIM,))(embedded_item)

# dot production of embedded vectors
preds = dot([embedded_user, embedded_item], axes=1, name='dot_score')

# embedding model
user_embed_model = Model(inputs=uid_input, outputs=embedded_user)
item_embed_model = Model(inputs=iid_input, outputs=embedded_item)

model_MF = Model(inputs=[uid_input, iid_input], outputs=preds)
model_MF.compile(
    loss=keras.losses.mean_squared_error, 
    optimizer=RMSprop(lr=1e-3),
#     optimizer=SGD(lr=1e-4),
    metrics=[keras.metrics.mean_squared_error])


Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [14]:
model_directory = '../model/mf/'
if not os.path.exists(model_directory):
    os.makedirs(model_directory)
model_path = model_directory + 'mf_model.h5'

In [15]:
# load the best model
model_MF.load_weights(model_path)

In [16]:
def single_top_k(score_list, k):
    ind = np.argpartition(score_list, -k)[-k:]
    top_k_ind = list(reversed(ind[np.argsort(score_list[ind])]))
    return np.array(top_k_ind)

# try to implement a two-dimensional top_k
def two_dim_top_k(a, k):
    return np.array([single_top_k(row, k) for row in a])

def top_k(a, k):
    if len(a.shape) == 1:
        return single_top_k(a, k)
    elif len(a.shape) == 2:
        return two_dim_top_k(a, k)
    else:
        return None

In [17]:
# recall at k
sess = tf.Session()
v_user_all = user_embed_model.predict(np.arange(num_user))
v_item_all = item_embed_model.predict(np.arange(num_item))
    
def __recall(klist, target, recommend_list):
    den = len(target) # denominator
    recall_value = 0.0
    recall_list = []
    for k in klist:
        if den < k:
            recall_value = 1.0
        if recall_value == 1.0: # if it's already 1.0, it should be 1.0 after
            recall_list.append(recall_value)
            continue
        recommend_set = set(recommend_list[:k])
        num = len(target & recommend_set)
        recall_value = float(num) / float(den)
        recall_list.append(recall_value)
    return recall_list


def recall_mf(model, klist, data):
    '''
    :param klist: the list of k's in recall@k, e.g. [50, 100, 150, ...]
    :param data: data set for evaluation
        - user_list
        - target_set
        - item_set
    :return: list(float) for recall at each k, with the same size as klist
    '''
    recall_at_k = []
    max_k = max(klist)
    t1, t2, t3, t4, t5 = 0, 0, 0, 0, 0
    for user in data.user_list:
        # get the corresponding embedded vectors
        v_user = v_user_all[user]
        v_item = v_item_all[data.item_list[user]]
        
        # compute the scores
        #score_list = v_user @ v_item.T
        score_list = np.matmul(v_user, v_item.T)
        score_list = score_list.flatten()
        # assert len(score_list) == len(data.item_list[user])
        
        k = min(max_k, len(data.item_list[user]))
        # get the recommended list
        indices = top_k(score_list, k)
        recommend_list = data.item_list[user][indices]
        
        # evaluate recall
        recall_at_k.append(__recall(klist, data.target_set[user], recommend_list))
    return np.mean(recall_at_k, axis=0)


def recall_random(klist, data):
    recall_at_k = []
    max_k = max(klist)
    for i, user in enumerate(data.user_list):
        # compute the scores
        score_list = np.random.uniform(low=0, high=1, size=len(data.item_list[user]))
        
        k = min(max_k, len(data.item_list[user]))
        indices = top_k(score_list, k)
        recommend_list = data.item_list[user][indices]
        
        # evaluate recall
        recall_at_k.append(__recall(klist, data.target_set[user], recommend_list))
    return np.mean(recall_at_k, axis=0)

## GBDT Model

In [24]:
import lightgbm as lgb

In [18]:
class Stopwatch():
    def __init__(self, info=''):
        self.total = 0
        self.info = info
    
    def clear(self):
        self.total = 0
    
    def tic(self):
        self.start_time = time.time()
    
    def toc(self):
        self.total += time.time() - self.start_time
    
    def show(self):
        print('%.3f seconds \t %s' % (self.total, self.info))

### Define and initialize the model

In [19]:
class Stopwatch():
    def __init__(self, info=''):
        self.total = 0
        self.info = info
    
    def clear(self):
        self.total = 0
    
    def tic(self):
        self.start_time = time.time()
    
    def toc(self):
        self.total += time.time() - self.start_time
    
    def show(self):
        print('%.3f seconds \t %s' % (self.total, self.info))

In [20]:
user_CATEGORICAL = [
    'city', 'gender', 'registered_via', 'registration_year', 
    'registration_month', 'registration_day', 'expiration_year', 
    'expiration_month', 'expiration_day']
user_NUMERICAL = ['age', 'weird_age', 'validate_days']
set(df_user_context.columns) - (set(user_CATEGORICAL).union(set(user_NUMERICAL))), \
set(user_CATEGORICAL).intersection(set(user_NUMERICAL))

({'user_id'}, set())

In [21]:
item_CATEGORICAL = [
    'artist_name', 'composer', 'genre_ids', 'language', 
    'lyricist', 'song_year']
item_NUMERICAL = [
    'song_length', 'genre_count', 'lyricist_count',
    'composer_count', 'artist_count', 'is_featured',
    'artist_composer', 'artist_composer_lyricist', 
    'song_lang_boolean', 'smaller_song']
set(df_song_context.columns) - (set(item_CATEGORICAL).union(set(item_NUMERICAL))), \
set(item_CATEGORICAL).intersection(set(item_NUMERICAL))

({'song_id'}, set())

In [71]:
def merge_df(df):
    ret = df.merge(df_user_context, on='user_id', how='left')
    ret = ret.merge(df_song_context, on='song_id', how='left')
    for col in user_CATEGORICAL + item_CATEGORICAL + ['user_id', 'song_id']:
        ret[col] = ret[col].astype('category')
    return ret

In [72]:
train = merge_df(data_train.df)
test_warm = merge_df(data_test_warm.df)
test_cold_user = merge_df(data_test_cold_user.df)
test_cold_item = merge_df(data_test_cold_item.df)

In [73]:
def separate(df):
    X = df.drop(columns=['target'])
    y = df['target']
    return X, y

In [74]:
# split into train and validation set
from sklearn.model_selection import train_test_split

X = train.drop(columns=['target'])
y = train['target']
X_trn, X_val, y_trn, y_val = train_test_split(X, y, test_size=0.1, random_state=42)
d_trn = lgb.Dataset(X_trn, y_trn)
d_val = lgb.Dataset(X_val, y_val)

In [75]:
# d_train = prepare_lgb_data(train)
X_warm, y_warm = separate(test_warm)
X_cold_user, y_cold_user = separate(test_cold_user)
X_cold_item, y_cold_item = separate(test_cold_item)

## training

In [84]:
params = {
    'objective': 'binary', # objective is the goal
#     'objective': 'mse',
    'metric': 'auc',
    'boosting': 'gbdt',
    'learning_rate': 0.3,
    'verbose': 0,
    'num_leaves': 108,
    'bagging_fraction': 0.95,
    'bagging_freq': 1,
    'bagging_seed': 1,
    'feature_fraction': 0.9,
    'feature_fraction_seed': 1,
    'max_bin': 256,
    'max_depth': 10,
}

In [80]:
print(params)
model_lgb = lgb.train(params, train_set=d_trn,  valid_sets=d_val, num_boost_round=500, verbose_eval=5)

[5]	valid_0's auc: 0.622092
[10]	valid_0's auc: 0.64764
[15]	valid_0's auc: 0.667652
[20]	valid_0's auc: 0.679133
[25]	valid_0's auc: 0.68866
[30]	valid_0's auc: 0.696314
[35]	valid_0's auc: 0.702537
[40]	valid_0's auc: 0.707131
[45]	valid_0's auc: 0.710982
[50]	valid_0's auc: 0.713872
[55]	valid_0's auc: 0.716974
[60]	valid_0's auc: 0.719603
[65]	valid_0's auc: 0.722066
[70]	valid_0's auc: 0.724093
[75]	valid_0's auc: 0.726662
[80]	valid_0's auc: 0.728321
[85]	valid_0's auc: 0.729886
[90]	valid_0's auc: 0.731445
[95]	valid_0's auc: 0.732592
[100]	valid_0's auc: 0.733587
[105]	valid_0's auc: 0.734664
[110]	valid_0's auc: 0.735616
[115]	valid_0's auc: 0.736379
[120]	valid_0's auc: 0.737133
[125]	valid_0's auc: 0.737767
[130]	valid_0's auc: 0.739166
[135]	valid_0's auc: 0.739898
[140]	valid_0's auc: 0.740836
[145]	valid_0's auc: 0.741988
[150]	valid_0's auc: 0.74252
[155]	valid_0's auc: 0.742929
[160]	valid_0's auc: 0.743353
[165]	valid_0's auc: 0.743702
[170]	valid_0's auc: 0.744315
[17

## Evaluations

In [81]:
# Evaluation
def recall_gbdt(klist, data, X):
    df = data.df
    df['score'] = model_lgb.predict(X, raw_score=True)
    recall_at_k = []
    max_k = max(klist)
    for user in data.user_list:
        # compute the scores
        score_list = np.ravel(df[df['user_id'] == user]['score'])
        
        # get the recommended list
        k = min(max_k, len(data.item_list[user]))
        indices = top_k(score_list, k)
        recommend_list = data.item_list[user][indices]
        
        # evaluate recall
        recall_at_k.append(__recall(klist, data.target_set[user], recommend_list))
    return np.mean(recall_at_k, axis=0)

In [82]:
# Evaluation
def recall_score_model(klist, data, v_user_all, v_item_all):
    recall_at_k = []
    max_k = max(klist)
    for user in data.user_list:
        # get the corresponding embedded vectors
        v_user = v_user_all[user]
        v_item = v_item_all[data.item_list[user]]
        
        # compute the scores
        score_list = np.matmul(v_user, v_item.T)
        score_list = score_list.flatten()
        # assert len(score_list) == len(data.item_list[user])
        
        k = min(max_k, len(data.item_list[user]))
        # get the recommended list
        indices = top_k(score_list, k)
        recommend_list = data.item_list[user][indices]
        
        # evaluate recall
        recall_at_k.append(__recall(klist, data.target_set[user], recommend_list))
    return np.mean(recall_at_k, axis=0)

In [83]:
sub_timer.start('evaluation of gbdt')
print(params)
klist = list(range(5, 51, 5))
# print('train')
# print(recall_random(klist, data_train))
# print(recall_score_model(klist, data_train, v_user_all, v_item_all))
# print(recall_gbdt(klist, data_train, X))
# print()

print('test warm')
print(recall_random(klist, data_test_warm))
print(recall_score_model(klist, data_test_warm, v_user_all, v_item_all))
print(recall_gbdt(klist, data_test_warm, X_warm))
print()

print('test cold user')
print(recall_random(klist, data_test_cold_user))
print(recall_score_model(klist, data_test_cold_user, v_user_all, v_item_all))
print(recall_gbdt(klist, data_test_cold_user, X_cold_user))
print()

print('test cold item')
print(recall_random(klist, data_test_cold_item))
print(recall_score_model(klist, data_test_cold_item, v_user_all, v_item_all))
print(recall_gbdt(klist, data_test_cold_item, X_cold_item))
print()
sub_timer.end()

[2018-05-05 22:48:48] (start) evaluation of gbdt
{'objective': 'mse', 'metric': 'auc', 'boosting': 'gbdt', 'learning_rate': 0.3, 'verbose': 0, 'num_leaves': 108, 'bagging_fraction': 0.95, 'bagging_freq': 1, 'bagging_seed': 1, 'feature_fraction': 0.9, 'feature_fraction_seed': 1, 'max_bin': 256, 'max_depth': 10}
train
[ 0.28393633  0.31342538  0.3348515   0.35304667  0.3726993   0.39222678
  0.41111398  0.43128242  0.45126038  0.47039166]
[ 0.31114333  0.35040459  0.38176863  0.41004362  0.43711575  0.46288058
  0.48721552  0.51096305  0.53371918  0.55523986]
[ 0.29908512  0.33957839  0.37022252  0.39698448  0.42311439  0.44827857
  0.471769    0.49523115  0.5178026   0.53900154]

test warm
[ 0.20366892  0.42460244  0.60659798  0.73549538  0.82027472  0.87843515
  0.91584737  0.94082619  0.95763389  0.96854577]
[ 0.26750232  0.49173467  0.65854236  0.77200428  0.84624367  0.89548443
  0.92770755  0.94943105  0.96372148  0.97313665]
[ 0.22892688  0.4494995   0.6248442   0.7487097   0.8292

In [79]:
sub_timer.start('evaluation of gbdt')
print(params)
'''
{'objective': 'binary', 'metric': 'auc', 'boosting': 'gbdt', 'learning_rate': 0.3, 'verbose': 0, 'num_leaves': 108, 'bagging_fraction': 0.95, 
'bagging_freq': 1, 'bagging_seed': 1, 'feature_fraction': 0.9, 'feature_fraction_seed': 1, 'max_bin': 256, 'max_depth': 10}
'''
klist = list(range(5, 51, 5))
print('train')
print(recall_random(klist, data_train))
print(recall_score_model(klist, data_train, v_user_all, v_item_all))
print(recall_gbdt(klist, data_train, X))
print()

print('test warm')
print(recall_random(klist, data_test_warm))
print(recall_score_model(klist, data_test_warm, v_user_all, v_item_all))
print(recall_gbdt(klist, data_test_warm, X_warm))
print()

print('test cold user')
print(recall_random(klist, data_test_cold_user))
print(recall_score_model(klist, data_test_cold_user, v_user_all, v_item_all))
print(recall_gbdt(klist, data_test_cold_user, X_cold_user))
print()

print('test cold item')
print(recall_random(klist, data_test_cold_item))
print(recall_score_model(klist, data_test_cold_item, v_user_all, v_item_all))
print(recall_gbdt(klist, data_test_cold_item, X_cold_item))
print()
sub_timer.end()

[2018-05-05 22:21:53] (start) evaluation of gbdt
train
[ 0.28376551  0.31351217  0.33472762  0.35290057  0.37247065  0.39207905
  0.41107759  0.43129761  0.45129646  0.47054151]
[ 0.31114333  0.35040459  0.38176863  0.41004362  0.43711575  0.46288058
  0.48721552  0.51096305  0.53371918  0.55523986]
[ 0.29820165  0.3380293   0.36808844  0.39405887  0.41955349  0.44395652
  0.46705401  0.49012433  0.51230534  0.53325818]

test warm
[ 0.20288517  0.42455798  0.60570106  0.73470763  0.81978877  0.87790244
  0.91558135  0.94077069  0.9576234   0.96857412]
[ 0.26750232  0.49173467  0.65854236  0.77200428  0.84624367  0.89548443
  0.92770755  0.94943105  0.96372148  0.97313665]
[ 0.23634062  0.45479177  0.62841269  0.750463    0.83061545  0.8852908
  0.92056212  0.94424607  0.96009602  0.97038821]

test cold user
[ 0.54591386  0.67538501  0.75803218  0.8174759   0.86178933  0.89488659
  0.9207422   0.94118112  0.95606108  0.96692304]
[ 0.54603787  0.67667078  0.75895311  0.81788323  0.861848

In [None]:
# # load previous model
# model_path = '../model/dropout/variation1.hf5'
# dropout_net.model.load_weights(model_path)

# Archived