In [1]:
#@title
from sklearn.datasets import dump_svmlight_file
import numpy as np
import pandas as pd
import os
import urllib
import zipfile
from sklearn.model_selection import train_test_split
import shutil

import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [2]:
#@title
datasets = {'ml100k':'http://files.grouplens.org/datasets/movielens/ml-100k.zip',
            'ml20m':'http://files.grouplens.org/datasets/movielens/ml-20m.zip',
            'mllatestsmall':'http://files.grouplens.org/datasets/movielens/ml-latest-small.zip',
            'ml10m':'http://files.grouplens.org/datasets/movielens/ml-10m.zip',
            'ml1m':'http://files.grouplens.org/datasets/movielens/ml-1m.zip'
            }

In [3]:
print('Available datasets: ', [key for key in datasets])
dt = input('Dataset name = ')
print('You selected {}'.format(dt))

Available datasets:  ['ml100k', 'ml20m', 'mllatestsmall', 'ml10m', 'ml1m']
Dataset name = ml100k
You selected ml100k


In [5]:
dt_name = os.path.basename(datasets[dt])

print('Downloading {}'.format(dt_name))
with urllib.request.urlopen(datasets[dt]) as response, open('./sample_data/'+dt_name, 'wb') as out_file:
    shutil.copyfileobj(response, out_file)
print('Download completed')

Downloading ml-100k.zip
Download completed


In [6]:
with zipfile.ZipFile('./sample_data/'+dt_name, 'r') as zip_ref:
    zip_ref.extractall('./sample_data/')
dt_dir_name = os.path.splitext(dt_name)[0]

In [7]:
#Check unzipped structure
def list_files(startpath):
    print(startpath)
    for root, dirs, files in os.walk(startpath):
        level = root.replace(startpath, '').count(os.sep)
        indent = ' ' * 4 * (level)
        print('{}{}/'.format(indent, os.path.basename(root)))
        subindent = ' ' * 4 * (level + 1)
        for f in files:
            print('{}{}'.format(subindent, f))
dirs = [x[0] for x in os.walk("./sample_data")]
#print(list(dirs[0]))
ml = filter(lambda dirName: dirName if ('ml' in dirName) else '', list(dirs))
dt_dir_name= list(ml)[0]
print(dt_dir_name)
#list_files('./sample_data/'+dt_dir_name +'/')

./sample_data/ml-100k


In [8]:
#100k
if dt == 'ml100k':
  dataset = pd.read_csv(dt_dir_name+"/u.data",sep='\t',names="user_id,item_id,rating,timestamp".split(","))

#ml1m
if dt=='ml1m':
  dataset=pd.read_csv(dt_dir_name +'/'+ 'ratings.dat', delimiter='\:\:', names=['user_id', 'item_id', 'rating', 'timestamp'])  
dataset.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [9]:
len(dataset.user_id.unique()), len(dataset.item_id.unique())

(943, 1682)

In [10]:
dataset.user_id = dataset.user_id.astype('category').cat.codes.values
dataset.item_id = dataset.item_id.astype('category').cat.codes.values

In [13]:
#Version 1.2 (flexible + superfast negative sampling uniform)
import random
import time
import scipy

def neg_sampling(ratings_df, n_neg=1, neg_val=0, pos_val=1, percent_print=5):
    """version 1.2: 1 positive 1 neg (2 times bigger than the original dataset by default)

    Parameters:
    input rating data as pandas dataframe: userId|movieId|rating
    n_neg: include n_negative / 1 positive

    Returns:
    negative sampled set as pandas dataframe
            userId|movieId|interact (implicit)
    """
    sparse_mat = scipy.sparse.coo_matrix((ratings_df.rating, (ratings_df.user_id, ratings_df.item_id)))
    dense_mat = np.asarray(sparse_mat.todense())
    print(dense_mat.shape)

    nsamples = ratings_df[['user_id', 'item_id']]
    nsamples['rating'] = nsamples.apply(lambda row: 1, axis=1)
    length = dense_mat.shape[0]
    printpc = int(length * percent_print/100)

    nTempData = []
    i = 0
    start_time = time.time()
    stop_time = time.time()

    extra_samples = 0
    for row in dense_mat:
        if(i%printpc==0):
            stop_time = time.time()
            print("processed ... {0:0.2f}% ...{1:0.2f}secs".format(float(i)*100 / length, stop_time - start_time))
            start_time = stop_time

        n_non_0 = len(np.nonzero(row)[0])
        zero_indices = np.where(row==0)[0]
        if(n_non_0 * n_neg + extra_samples > len(zero_indices)):
            print(i, "non 0:", n_non_0,": len ",len(zero_indices))
            neg_indices = zero_indices.tolist()
            extra_samples = n_non_0 * n_neg + extra_samples - len(zero_indices)
        else:
            neg_indices = random.sample(zero_indices.tolist(), n_non_0 * n_neg + extra_samples)
            extra_samples = 0

    nTempData.extend([(uu, ii, rr) for (uu, ii, rr) in zip(np.repeat(i, len(neg_indices))
                    , neg_indices, np.repeat(neg_val, len(neg_indices)))])
    i+=1

    nsamples=nsamples.append(pd.DataFrame(nTempData, columns=["user_id","item_id", "rating"]),ignore_index=True)
    nsamples.reset_index(drop=True)
    return nsamples

In [14]:
neg_dataset = neg_sampling(dataset)


train, test = train_test_split(neg_dataset, test_size=0.2, random_state=2020)
train, val = train_test_split(train, test_size=0.2, random_state=2020)


(943, 1682)
processed ... 0.00% ...0.00secs
processed ... 0.00% ...0.00secs
processed ... 0.00% ...0.00secs
processed ... 0.00% ...0.00secs
processed ... 0.00% ...0.00secs
processed ... 0.00% ...0.00secs
processed ... 0.00% ...0.00secs
processed ... 0.00% ...0.00secs
processed ... 0.00% ...0.00secs
processed ... 0.00% ...0.00secs
processed ... 0.00% ...0.00secs
processed ... 0.00% ...0.00secs
processed ... 0.00% ...0.00secs
processed ... 0.00% ...0.00secs
processed ... 0.00% ...0.00secs
processed ... 0.00% ...0.00secs
processed ... 0.00% ...0.00secs
processed ... 0.00% ...0.00secs
processed ... 0.00% ...0.00secs
processed ... 0.00% ...0.00secs
processed ... 0.00% ...0.00secs
processed ... 0.00% ...0.00secs
processed ... 0.00% ...0.00secs
processed ... 0.00% ...0.00secs
processed ... 0.00% ...0.00secs
processed ... 0.00% ...0.00secs
processed ... 0.00% ...0.00secs
processed ... 0.00% ...0.00secs
processed ... 0.00% ...0.00secs
processed ... 0.00% ...0.00secs
processed ... 0.00% ...0.00s

processed ... 0.00% ...0.00secs
processed ... 0.00% ...0.00secs
processed ... 0.00% ...0.00secs
processed ... 0.00% ...0.00secs
processed ... 0.00% ...0.00secs
processed ... 0.00% ...0.00secs
processed ... 0.00% ...0.00secs
processed ... 0.00% ...0.00secs
processed ... 0.00% ...0.00secs
processed ... 0.00% ...0.00secs
processed ... 0.00% ...0.00secs
processed ... 0.00% ...0.00secs
processed ... 0.00% ...0.00secs
processed ... 0.00% ...0.00secs
processed ... 0.00% ...0.00secs
processed ... 0.00% ...0.00secs
processed ... 0.00% ...0.00secs
processed ... 0.00% ...0.00secs
processed ... 0.00% ...0.00secs
processed ... 0.00% ...0.00secs
processed ... 0.00% ...0.00secs
processed ... 0.00% ...0.00secs
processed ... 0.00% ...0.00secs
processed ... 0.00% ...0.00secs
processed ... 0.00% ...0.00secs
processed ... 0.00% ...0.00secs
processed ... 0.00% ...0.00secs
processed ... 0.00% ...0.00secs
processed ... 0.00% ...0.00secs
processed ... 0.00% ...0.00secs
processed ... 0.00% ...0.00secs
processe

In [15]:
print(neg_dataset.shape, train.shape, test.shape, val.shape)

(100168, 3) (64107, 3) (20034, 3) (16027, 3)


In [16]:
print(train.shape,"\n", train.head())

(64107, 3) 
        user_id  item_id  rating
28470      420      173       1
97361      845      737       1
25497      467      248       1
72690      805     1015       1
39954      393      401       1


In [18]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import Sequence
from tensorflow.keras.regularizers import l2, l1
from tensorflow.keras.initializers import RandomUniform, he_normal,he_uniform
import math

In [19]:
model_name='model1'
seed=2020
embedding_init = RandomUniform(seed=seed)
relu_init = he_uniform(seed=seed)
embeddings_regu =l2(1e-6)

In [20]:
def create_model(dataset, n_latent_factors = 16):
    n_users, n_movies = len(dataset.user_id.unique()), len(dataset.item_id.unique())
    n_latent_factors = 16

    movie_input = keras.layers.Input(shape=[1],name='Item')
    movie_embedding = keras.layers.Embedding(n_movies, n_latent_factors, 
                                          embeddings_initializer=embedding_init, 
                                          embeddings_regularizer=embeddings_regu, 
                                          embeddings_constraint="NonNeg", 
                                          name='Movie-Embedding')(movie_input)
    movie_vec = keras.layers.Flatten(name='FlattenMovies')(movie_embedding)

    user_input = keras.layers.Input(shape=[1],name='User')
    user_embedding = keras.layers.Embedding(n_users, n_latent_factors, 
                                          embeddings_initializer=embedding_init, 
                                          embeddings_regularizer=embeddings_regu, 
                                          embeddings_constraint="NonNeg", 
                                          name='User-Embedding')(user_input)
    user_vec = keras.layers.Flatten(name='FlattenUsers')(user_embedding)

    prod = keras.layers.dot([movie_vec, user_vec], axes=1,name='DotProduct')
    model = keras.Model([user_input, movie_input], prod)
    model.compile(optimizer='sgd', loss='binary_crossentropy', metrics=['binary_accuracy'])
    model.summary()
    return model

In [21]:
model = create_model(neg_dataset) 

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Item (InputLayer)               [(None, 1)]          0                                            
__________________________________________________________________________________________________
User (InputLayer)               [(None, 1)]          0                                            
__________________________________________________________________________________________________
Movie-Embedding (Embedding)     (None, 1, 16)        26912       Item[0][0]                       
__________________________________________________________________________________________________
User-Embedding (Embedding)      (None, 1, 16)        15088       User[0][0]                       
______________________________________________________________________________________________

In [22]:
tf.keras.utils.plot_model(model, to_file='model.png', show_shapes=True)

Failed to import pydot. You must install pydot and graphviz for `pydotprint` to work.
