<a href="https://colab.research.google.com/github/kartikey-singh/Replicating-SeER/blob/master/SeER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Switching to HIGH RAM Session, Only run when in 12GB Session
# This cell deliberately exhausts memory to crash the runtime. Because that makes
# "Restart & Run All" impossible, the loop is opt-in: set the flag below to True
# and run the cell by hand only when the crash is actually wanted.
FORCE_HIGH_RAM_RESTART = False

a = []
while FORCE_HIGH_RAM_RESTART:
    a.append('1')

In [None]:
# TODO
# Model testing by running for multiple epochs.
# NDCG, MAP@K scores 
# Implementing attention instead of LSTM.
# Explainability part of the paper.

In [None]:
import os
import json
import requests  
import numpy as np
import pandas as pd
import pickle
from sklearn import preprocessing
import h5py
import time
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf

from pathlib import Path
from scipy.sparse import csr_matrix
from sklearn.preprocessing import MaxAbsScaler

In [None]:
# Mount Google Drive at /content/drive; the dataset paths used by this notebook
# live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Change it for SeER
# PATH holds the dataset files; PATH_MODEL (a sub-directory of PATH) holds model artefacts.
PATH = "/content/drive/My Drive/dataset/"
PATH_MODEL = "/content/drive/My Drive/dataset/seer/"

if os.path.isdir(PATH) and os.path.isdir(PATH_MODEL):
    print("Files already exists")
else:
    try:
        # makedirs creates PATH first and then PATH_MODEL inside it, so the
        # parent-before-child ordering is handled in a single call.
        os.makedirs(PATH_MODEL, exist_ok=True)
    except OSError as err:
        # Stay best-effort (do not abort the notebook here), but report the real cause
        # instead of hiding it behind an "already exists" message.
        print("Could not create {}: {}".format(PATH_MODEL, err))

Files already exists


In [None]:
# Loading lookups.

# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only point this at a file you trust.
with open(PATH + 'tracks_sample.pkl', 'rb') as pickle_file:
    songs_lookup = pickle.load(pickle_file)


def _read_json_lookup(filename):
    """Return the parsed contents of the JSON file `filename` stored under PATH."""
    with open(PATH + filename, 'r') as json_file:
        return json.load(json_file)


user_to_id = _read_json_lookup('user_to_id.json')
track_to_id = _read_json_lookup('track_to_id.json')
id_to_track = _read_json_lookup('id_to_track.json')
id_to_user = _read_json_lookup('id_to_user.json')

In [None]:
# The median song length is 2600, so every song is forced to exactly 2600*32 values:
# longer songs are cut off, shorter ones are zero-padded at the end.
SONG_LEN = 2600*32


def _fit_to_song_len(song):
    """Return `song` truncated to SONG_LEN values, or zero-padded up to SONG_LEN."""
    if song.shape[0] >= SONG_LEN:
        return song[:SONG_LEN]
    padded = np.zeros((SONG_LEN, ))
    padded[:song.shape[0]] = song
    return padded


# Only existing keys are reassigned, so the dictionary never changes size.
for name in list(songs_lookup):
    songs_lookup[name] = _fit_to_song_len(songs_lookup[name])

In [None]:
# Ratings table read from the dataset directory.
df = pd.read_csv(PATH + 'df_1k.csv')

# As id's start from 1 and not 0, one extra slot is added to each distinct-id count.
num_users = len(df['user_id'].unique()) + 1
num_tracks = len(df['track_id'].unique()) + 1

print(num_users, num_tracks, df.shape)
df.head()

32127 1001 (888373, 3)


Unnamed: 0,user_id,track_id,rating
0,1,1,1
1,1,2,1
2,1,3,5
3,1,4,5
4,1,5,5


In [None]:
# Reproducible 80/20 split: sample 80% of the rows for training (fixed seed) and
# keep every row that was not sampled for testing.
train_df = df.sample(frac=0.8, random_state=0)
test_df = df.drop(index=train_df.index)

# pop() removes the 'rating' column from each frame in place and returns it,
# so the frames keep only the remaining columns as model inputs.
train_rating, test_rating = train_df.pop('rating'), test_df.pop('rating')

train_dataset = tf.data.Dataset.from_tensor_slices(
    (train_df.values, train_rating.values))
test_dataset = tf.data.Dataset.from_tensor_slices(
    (test_df.values, test_rating.values))

In [None]:
# GLOBAL VARIABLES
BATCH_SIZE = 500
EMBEDDING_SIZE = 50
EPOCHS = 10
# Here buffer size is whole dataset.
BUFFER_SIZE = train_df.shape[0]
# Number of full batches that fit into the training set.
steps_per_epoch = BUFFER_SIZE // BATCH_SIZE

In [None]:
# Shuffle with a buffer of BUFFER_SIZE elements, then group into batches of BATCH_SIZE.
# NOTE(review): train_dataset is rebound from itself, so running this cell a second
# time without re-creating the dataset would batch an already-batched dataset.
train_dataset = train_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE)

In [None]:
# create_songs_batch functionality:
# For every row of the batch, look up the song data of its track through the
# id_to_track and songs_lookup tables and stack the results into one tensor.
def create_songs_batch(input_batch):
    """Return the stacked song tensors for the tracks referenced by `input_batch`.

    Element 1 of each row of `input_batch` is used as the track id. The flat song
    array found for it is reshaped to (-1, 32) (32 is the number of channels) and
    converted to float32. Stacking along axis 0 adds the batch dimension, so no
    expand_dims is needed.
    """
    def song_tensor_for(user_track_tensor):
        track_key = str(user_track_tensor[1].numpy())
        song_numpy = songs_lookup[id_to_track[track_key]].reshape((-1, 32))
        return tf.convert_to_tensor(song_numpy, dtype=tf.float32)

    return tf.stack([song_tensor_for(row) for row in input_batch], axis=0)

In [None]:
# create_pivot_batch functionality:
# input_batch holds the user_id and track_id columns and target_batch holds the ratings,
# so the batch is converted to the X matrix, i.e. a user vs song rating matrix.
def create_pivot_batch(input_batch, target_batch):
    """Build the dense user-vs-song rating matrix for one batch.

    Args:
        input_batch: array-like of shape (n, 2); column 0 is the user id and
            column 1 is the track id.
        target_batch: array-like of n ratings, one per row of `input_batch`.

    Returns:
        A float NumPy array of shape (n, n). Row r stands for the r-th smallest
        distinct user id in the batch and column c for the c-th smallest distinct
        track id; the cell holds the rating for that pair and 0 where the batch
        has no rating. Rows/columns beyond the number of distinct ids stay 0.

    The matrix is sized from the batch itself rather than a fixed batch size, so
    a final batch that is smaller than the others still yields a matrix whose
    shape matches the number of samples in it.
    """
    pairs = np.asarray(input_batch)
    ratings = np.asarray(target_batch)
    batch_len = pairs.shape[0]

    # return_inverse gives, for every sample, the position of its id among the
    # sorted distinct ids; these positions are the row/column coordinates.
    _, row_pos = np.unique(pairs[:, 0], return_inverse=True)
    _, col_pos = np.unique(pairs[:, 1], return_inverse=True)

    # NOTE(review): coordinates follow sorted distinct ids, while the model output in
    # this notebook is ordered by sample position — confirm the two are meant to align.
    X = np.zeros((batch_len, batch_len))
    X[row_pos, col_pos] = ratings
    return X

In [None]:
# Testing dataset loop, to see how it works.
# Prints a legend line first, then the shapes produced for the first two batches.
print("# (batch_number, input_batch:(batch_size, [user_id, track_id]),\
 \n output_batch:(batch_size, target), songs_batch:(batch_size, timesteps, features))")

# NOTE(review): songs_batch, X and user_index stay bound after this loop and are
# read by later cells (the model-building call and the loss_fn(X, Xhat) check),
# so those cells depend on this one having run first.
for (batch, (input_batch, target_batch)) in enumerate(train_dataset.take(2)):    
    songs_batch = create_songs_batch(input_batch)
    X = create_pivot_batch(input_batch, target_batch)
    # Column 0 of the batch is the user id.
    user_index = input_batch[:, 0].numpy()
    print(batch, input_batch.shape, target_batch.shape, songs_batch.shape, X.shape)

# (batch_number, input_batch:(batch_size, [user_id, track_id]), 
 output_batch:(batch_size, target), songs_batch:(batch_size, timesteps, features))
0 (500, 2) (500,) (500, 2600, 32) (500, 500)
1 (500, 2) (500,) (500, 2600, 32) (500, 500)


In [None]:
class HybridFactorization(tf.keras.layers.Layer):
    """Scores every user in a batch against every song in the same batch.

    A trainable matrix U of shape (num_users, embedding_size) holds one latent
    vector per user id, and an LSTM with `embedding_size` units turns each song
    sequence into a vector of the same size. The layer output is the matrix of
    dot products between the selected user vectors and the song vectors.

    Args:
        embedding_size: size of the user vectors; also the number of LSTM units.
        num_users: number of rows of U (user ids index into it directly).
        num_tracks: stored on the layer; not used by the computation.
    """

    def __init__(self, embedding_size, num_users, num_tracks):
        super().__init__()
        self.embedding_size = embedding_size
        self.num_users = num_users
        self.num_tracks = num_tracks
        # User vectors selected by the most recent call; None until the first call.
        self.required_users = None
        # Keyword arguments keep this independent of add_weight's positional order,
        # and an initializer instance (not the class) is what add_weight expects.
        self.U = self.add_weight(name="U",
                                 shape=[self.num_users, self.embedding_size],
                                 dtype=tf.float32,
                                 initializer=tf.initializers.GlorotUniform())
        self.lstm = tf.keras.layers.LSTM(self.embedding_size)

    def call(self, user_index, songs_batch):
        """Return the (batch_size, batch_size) user-by-song score matrix.

        Args:
            user_index: integer user ids, one per sample in the batch.
            songs_batch: song sequences fed to the LSTM, one per sample.

        Returns:
            Tensor whose entry [i, j] is the dot product of the vector of the
            i-th sample's user with the LSTM output for the j-th sample's song.
        """
        # output_lstm: (batch_size, emb_sz)
        output_lstm = self.lstm(songs_batch)

        # required_users: (batch_size, emb_sz)
        # Gathering rows of U selects the same vectors as multiplying a one-hot
        # (batch_size, num_users) matrix by U, without materialising that matrix.
        user_idx = tf.reshape(tf.cast(user_index, tf.int64), [-1])
        self.required_users = tf.gather(self.U, user_idx)

        return tf.matmul(self.required_users, output_lstm, transpose_b=True)


class HybridRecommender(tf.keras.Model):
    """Keras model that wraps a single HybridFactorization layer."""

    def __init__(self, embedding_size, num_users, num_tracks):
        super().__init__()
        self.HybridFactorization = HybridFactorization(
            embedding_size, num_users, num_tracks)

    def call(self, user_index, songs_batch):
        """Forward both arguments to the wrapped layer and return its result."""
        return self.HybridFactorization(user_index, songs_batch)

In [None]:
# Building Model
# NOTE(review): user_index and songs_batch are not defined in this cell; they are
# whatever an earlier cell (the dataset test loop) left bound, so that cell must
# have been run first.
model = HybridRecommender(EMBEDDING_SIZE, num_users, num_tracks)
# One forward pass before summary(); presumably needed so that all weights exist
# and can be counted — TODO confirm.
Xhat = model(user_index, songs_batch)        
model.summary()

Model: "hybrid_recommender"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
hybrid_factorization (Hybrid multiple                  1622950   
Total params: 1,622,950
Trainable params: 1,622,950
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Add regularization, if needed
def loss_fn(source, target):
    """Return the mean squared error between `source` and `target`."""
    return tf.keras.losses.MeanSquaredError()(source, target)

# Sanity check on the matrices left over from the earlier cells.
loss_fn(X, Xhat)

<tf.Tensor: shape=(), dtype=float32, numpy=0.01287853>

In [None]:
# Checkpoints go to the 'training_checkpoints' directory under PATH_MODEL,
# with "ckpt" as the file prefix.
checkpoint_dir = PATH_MODEL + 'training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")

# Create a callback that saves the model's weights
# NOTE(review): cp_callback is not passed to anything in the visible cells (training
# is done with manual loops) — confirm whether it is still needed.
cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_prefix,
                                                 save_weights_only=True,
                                                 verbose=1)

In [None]:
# Adam with its default settings; loss_fn is registered as the model's loss.
optimizer = tf.keras.optimizers.Adam()
model.compile(optimizer, loss_fn)

## Batch size model implementation.

In [None]:
# In this approach, Xhat is batch_users vs batch_movies. This approach clearly fails.
# Why? We minimise the MSE between X and Xhat, and X is a sparse matrix, so the
# algorithm can lower the error simply by spreading small positive values over all
# the positions that are originally zero.
# A loss_fn that only looks at the (user_id, song_id) pairs that have a rating was
# also tried, but the model did not learn with it even after some epochs.
# So, please refer to the single loop implementation below, as in the original paper.

tf.keras.backend.clear_session()

optimizer = tf.keras.optimizers.Adam()
EPOCHS = 1

# Summed batch loss of every epoch, in order.
losses = []

for epoch in range(EPOCHS):
    start = time.time()
    # Plain Python accumulators, so the epoch summary also works for an empty dataset.
    total_loss = 0.0
    num_batches = 0

    for (batch, (input_batch, target_batch)) in enumerate(train_dataset):
        songs_batch = create_songs_batch(input_batch)
        user_index = input_batch[:, 0].numpy()
        X = create_pivot_batch(input_batch, target_batch)

        with tf.GradientTape() as tape:
            Xhat = model(user_index, songs_batch)
            batch_loss = loss_fn(X, Xhat)

        variables = model.trainable_variables
        gradients = tape.gradient(batch_loss, variables)
        optimizer.apply_gradients(zip(gradients, variables))

        total_loss += float(batch_loss)
        num_batches += 1

        if batch % 100 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                        batch,
                                                        batch_loss.numpy()))
    # saving (checkpoint) the model every 2 epochs
    # The weights are written through the model itself; no separate checkpoint
    # object exists in this notebook.
    if (epoch + 1) % 2 == 0:
        os.makedirs(checkpoint_dir, exist_ok=True)
        model.save_weights(checkpoint_prefix)

    losses.append(total_loss)

    # Average over the batches that were actually processed; the last, smaller
    # batch counts too, which a floor-divided step count would miss.
    print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                        total_loss / max(num_batches, 1)))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

In [None]:
def plot_graphs(losses, metric):
    """Plot `losses` against their position, labelling the y-axis with `metric`."""
    axes = plt.gca()
    axes.plot(losses)
    axes.set_xlabel("Epochs")
    axes.set_ylabel(metric)
    plt.show()

plot_graphs(losses, 'loss')

In [None]:
# Restore the most recently saved weights from checkpoint_dir, if there are any.
# latest_checkpoint returns None when nothing has been saved there yet; in that
# case the model simply keeps its current weights.
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
if latest_checkpoint is None:
    print("No checkpoint found in {}; keeping current weights".format(checkpoint_dir))
else:
    model.load_weights(latest_checkpoint)

In [None]:
# Clearly this approach fails by my analysis.
# Walks over the first 10 test examples one at a time and prepares the model inputs
# for each of them.
for inp, targ in test_dataset.take(10):
    # inp.shape: (2), inp_.shape: (1, 2)
    inp_ = tf.expand_dims(inp, 0)    
    song = create_songs_batch(inp_)
    user_index = inp_[:, 0].numpy()
    # NOTE(review): the prediction and print below are commented out, so as written
    # this loop produces no output; the recorded output presumably comes from a run
    # where they were enabled — confirm.
    # ans = model(user_index, song)    
    # print(ans, targ)

tf.Tensor([[-0.01670487]], shape=(1, 1), dtype=float32) tf.Tensor(1, shape=(), dtype=int64)
tf.Tensor([[0.]], shape=(1, 1), dtype=float32) tf.Tensor(5, shape=(), dtype=int64)
tf.Tensor([[2.0715503e-34]], shape=(1, 1), dtype=float32) tf.Tensor(5, shape=(), dtype=int64)
tf.Tensor([[0.]], shape=(1, 1), dtype=float32) tf.Tensor(5, shape=(), dtype=int64)
tf.Tensor([[0.00633507]], shape=(1, 1), dtype=float32) tf.Tensor(1, shape=(), dtype=int64)
tf.Tensor([[0.]], shape=(1, 1), dtype=float32) tf.Tensor(2, shape=(), dtype=int64)
tf.Tensor([[0.]], shape=(1, 1), dtype=float32) tf.Tensor(2, shape=(), dtype=int64)
tf.Tensor([[-0.0312532]], shape=(1, 1), dtype=float32) tf.Tensor(2, shape=(), dtype=int64)
tf.Tensor([[0.]], shape=(1, 1), dtype=float32) tf.Tensor(2, shape=(), dtype=int64)
tf.Tensor([[0.00495162]], shape=(1, 1), dtype=float32) tf.Tensor(4, shape=(), dtype=int64)


## Creating a single loop model

In [None]:
# Original Paper implementation
# GLOBAL VARIABLES
BATCH_SIZE = 500
EMBEDDING_SIZE = 50
EPOCHS = 10
# Here buffer size is whole dataset.
BUFFER_SIZE = train_df.shape[0]
# Number of full batches that fit into the training set.
steps_per_epoch = BUFFER_SIZE // BATCH_SIZE

In [None]:
# Rebuild the same reproducible 80/20 split from df (fixed seed), keeping every
# row that was not sampled for testing.
train_df = df.sample(frac=0.8, random_state=0)
test_df = df.drop(index=train_df.index)

# pop() removes the 'rating' column from each frame in place and returns it.
train_rating, test_rating = train_df.pop('rating'), test_df.pop('rating')

train_dataset = tf.data.Dataset.from_tensor_slices(
    (train_df.values, train_rating.values))
test_dataset = tf.data.Dataset.from_tensor_slices(
    (test_df.values, test_rating.values))

# Shuffle over a buffer of BUFFER_SIZE elements, then group into batches of BATCH_SIZE.
shuffled_train = train_dataset.shuffle(BUFFER_SIZE)
train_dataset = shuffled_train.batch(BATCH_SIZE)

In [None]:
# Here the loss is computed for each (user, song) rating of a batch, as in the paper.
# The model returns a score matrix whose entry [i, j] pairs the user of sample i with
# the song of sample j, so the prediction for sample i is the diagonal entry [i, i].
# Taking the diagonal of one batched forward pass gives the per-rating predictions
# without calling the model once per sample, and keeps the whole computation inside
# a single GradientTape so gradients cover every sample of the batch.

tf.keras.backend.clear_session()

optimizer = tf.keras.optimizers.Adam()
EPOCHS = 1

# Summed batch loss of every epoch, in order.
losses = []

for epoch in range(EPOCHS):
    start = time.time()
    # Plain Python accumulators, so the epoch summary also works for an empty dataset.
    total_loss = 0.0
    num_batches = 0

    for (batch, (input_batch, target_batch)) in enumerate(train_dataset):
        target_batch = tf.dtypes.cast(target_batch, tf.float32)
        songs_batch = create_songs_batch(input_batch)
        user_index = input_batch[:, 0].numpy()

        with tf.GradientTape() as tape:
            scores = model(user_index, songs_batch)
            # preds: (batch_size,) — one prediction per rated (user, song) pair.
            preds = tf.linalg.diag_part(scores)
            # Mean of the squared errors over the batch; computed inside the tape
            # with true (not floor) division so it stays differentiable.
            batch_loss = loss_fn(target_batch, preds)

        variables = model.trainable_variables
        gradients = tape.gradient(batch_loss, variables)
        optimizer.apply_gradients(zip(gradients, variables))

        total_loss += float(batch_loss)
        num_batches += 1

        if batch % 100 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                        batch,
                                                        batch_loss.numpy()))
    # saving (checkpoint) the model every 2 epochs
    # The weights are written through the model itself; no separate checkpoint
    # object exists in this notebook.
    if (epoch + 1) % 2 == 0:
        os.makedirs(checkpoint_dir, exist_ok=True)
        model.save_weights(checkpoint_prefix)

    losses.append(total_loss)

    # Average over the batches that were actually processed; the last, smaller
    # batch counts too, which a floor-divided step count would miss.
    print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                        total_loss / max(num_batches, 1)))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

In [None]:
# Redundant Code

# for (batch, (input_batch, target_batch)) in enumerate(train_dataset.take(1)):                
#     target_batch = tf.dtypes.cast(target_batch, tf.float32)
#     model_predictions = []
#     for inp in input_batch:
#         # inp.shape: (2), inp_.shape: (1, 2)
#         # model_predictions: list (batch_size, 1, 1)
#         inp_ = tf.expand_dims(inp, 0)        
#         song = create_songs_batch(inp_)
#         user_index = inp_[:, 0].numpy()
#         model_predictions.append(model(user_index, song))            

#     preds = np.squeeze(model_predictions)
#     preds = tf.convert_to_tensor(preds, dtype=tf.float32)                 

# loss_fn(target_batch, preds)

# aa = tf.reshape(ans, [])
# tt = tf.dtypes.cast(tar, tf.float32)
# los = (tt - ans).numpy()
# los += (tt - ans).numpy()

# lss = []
# lss.append(ans)
# xx = np.array(lss)
# asas = np.squeeze(xx)
# xxx = tf.convert_to_tensor(asas, dtype=tf.float32)     

<tf.Tensor: shape=(), dtype=float32, numpy=5.9311852>