In [1]:
import tensorflow as tf
import sys
print(sys.version)
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

3.5.2 |Continuum Analytics, Inc.| (default, Jul  2 2016, 17:53:06) 
[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]


In [2]:
class MF_RS():
    def __init__(self, numUsers, numSongs, embedding_dim, reg_lambda=0.01, conf_lambda=1.0, conf_dim = 1):
        
        #hyper parameters
        self.batch_size = 20000#np.min([50000, numUsers, numSongs]);
        self.numUsers = numUsers
        self.numSongs = numSongs
        self.epochs = 20
        self.reg_lambda = reg_lambda
        self.conf_lambda = conf_lambda
        self.conf_dim = conf_dim
        
        #embedding matricies for users and songs
        self.userMat = tf.Variable(tf.random_normal([numUsers, embedding_dim]))
        self.songMat = tf.Variable(tf.random_normal([numSongs, embedding_dim]))
        self.userBias = tf.Variable(tf.random_normal([numUsers]))
        self.songBias = tf.Variable(tf.random_normal([numSongs]))
        self.overallBias = tf.Variable(tf.random_normal([1]))
        if conf_dim > 0:
            self.C_user = tf.Variable(.5*tf.ones([numUsers, conf_dim]))
            self.C_song = tf.Variable(.5*tf.ones([numSongs, conf_dim]))
        
        #input tensors for songs, usres, ratings
        self.users = tf.placeholder(tf.int32, shape =(self.batch_size))
        self.songs = tf.placeholder(tf.int32, shape =(self.batch_size))
        self.rating = tf.placeholder(tf.float32, shape = (self.batch_size))
        
        #map each user/song to its feature vector
        self.U = tf.nn.embedding_lookup(self.userMat, self.users)
        self.W = tf.nn.embedding_lookup(self.songMat, self.songs)
        # bias
        self.U_bias = tf.nn.embedding_lookup(self.userBias, self.users)
        self.W_bias = tf.nn.embedding_lookup(self.songBias, self.songs)
        # confidence params
        if conf_dim > 0:
            self.C_ui = tf.clip_by_value(tf.nn.embedding_lookup(self.C_user, self.users), 1e-20, 1)
            self.C_sj = tf.clip_by_value(tf.nn.embedding_lookup(self.C_song, self.songs), 1e-20, 1)

        
        #predicted rating is dot product of user and song
        bias = self.U_bias+self.W_bias+self.overallBias
        pq = tf.reduce_sum(tf.mul(self.U, self.W), 1)
        self.yhat = pq + bias
        ones = 100*tf.ones([1, conf_dim])
        # l2 reg
        if conf_dim > 0:
            self.confidence_reg = self.conf_lambda * tf.reduce_sum(1/self.C_ui + 1/self.C_sj)
            #self.confidence_reg = self.conf_lambda * tf.reduce_sum(-tf.log(self.C_ui) + -tf.log(self.C_sj))
        self.l2_reg = self.reg_lambda * ( tf.reduce_sum((tf.square(self.U) + tf.square(self.W))) + 
                                         tf.reduce_sum(tf.square(self.U_bias) + tf.square(self.W_bias)))
        if conf_dim > 0:
            self.reg = self.l2_reg + self.confidence_reg
        else:
            self.reg = self.l2_reg
        #self.yhat_capped = self.yhat
        #self.yhat_capped[self.rating == 5] = tf.minimum(self.yhat_capped, 5)[self.rating == 5]
        #self.yhat_capped[self.rating == 0.5] = tf.maximum(self.yhat_capped, 0.5)[self.rating == 0.5]
        if conf_dim > 0:
             self.error = tf.reduce_mean(tf.reduce_sum(self.C_ui * self.C_sj, 1) *
                                         tf.nn.l2_loss(self.yhat - self.rating))
        else:
            self.error = tf.reduce_mean(tf.nn.l2_loss(self.yhat - self.rating))
        self.cost = (self.error + self.reg)/1e7
        self.optimizer = tf.train.AdamOptimizer(learning_rate = .1).minimize(self.cost)
        
        self.session = tf.Session()
        self.session.run(tf.initialize_all_variables())    
        
    def train(self, users, songs, ratings, verb = 0):
        for i in range(self.epochs):
            avg_cost = 0
            perm = np.random.permutation(len(ratings))
            num_batches = len(ratings) // self.batch_size
            
            for b_idx in range(num_batches):

                batch = perm[self.batch_size * b_idx:self.batch_size * (b_idx + 1)]
                users_batch = users[batch]
                songs_batch = songs[batch]
                ratings_batch = ratings[batch]
                if verb > 2:
                    if np.random.rand() > 0.99:
                        print("cui = ", self.session.run([self.C_ui],
                                  {self.users:users_batch, self.songs:songs_batch, 
                                   self.rating:ratings_batch})[0][:3])
                avg_cost += self.session.run([self.cost, self.optimizer],
                                  {self.users:users_batch, self.songs:songs_batch, self.rating:ratings_batch})[0]
            if verb > 0:
                print(avg_cost/num_batches)
                
    def test(self, users, songs):
        yhat = np.zeros(len(users))
        num_batches = len(users) // self.batch_size
        for b_idx in range(num_batches):
            batch = range(self.batch_size * b_idx,self.batch_size * (b_idx + 1))
            users_batch = users[batch]
            songs_batch = songs[batch]
            yhat[batch] = self.session.run([self.yhat],
                      {self.users:users_batch, self.songs:songs_batch})[0]
        batch = range(-self.batch_size,0)
        users_batch = users[batch]
        songs_batch = songs[batch]
        yhat[batch] = self.session.run([self.yhat],
                      {self.users:users_batch, self.songs:songs_batch})[0]
        return yhat
    
    def evaluate(self, users, songs, ratings):
        yhat = self.test(users, songs)
        yhat = np.clip(yhat, a_min = 0.5, a_max = 5)
        return np.mean((yhat - ratings)**2) 
    def getc(self, users, songs, ratings):
        if self.conf_dim > 0:
            return self.session.run([self.C_ui],
                                  {self.users:users, self.songs:songs, self.rating:ratings})[0]
        else:
            return "no confidence"

In [3]:
a = np.array([1, 2, 3, 4, 5])
b = np.array([1, 2, 3, 4, 5])
c = np.array([4, 3, 2, 5, 1])
#unique users / songs
uni_a = np.unique(a)
uni_b = np.unique(b)

#dict mapping the id to an index
a_map = dict(zip(uni_a,range(len(uni_a))))
b_map = dict(zip(uni_b,range(len(uni_b))))

user_idx =  np.array([ a_map[user] for user in a])
song_idx =  np.array([ b_map[song] for song in b])
model = MF_RS(len(uni_a), len(uni_b), 7)
np.random.seed(2)
model.epochs = 2
model.train(user_idx, song_idx, c)


In [4]:
movieratings = pd.read_csv('ratings.csv')

In [5]:
movieratings.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,100004.0,100004.0,100004.0,100004.0
mean,347.01131,12548.664363,3.543608,1129639000.0
std,195.163838,26369.198969,1.058064,191685800.0
min,1.0,1.0,0.5,789652000.0
25%,182.0,1028.0,3.0,965847800.0
50%,367.0,2406.5,4.0,1110422000.0
75%,520.0,5418.0,4.0,1296192000.0
max,671.0,163949.0,5.0,1476641000.0


In [6]:
max_real_user = np.max(movieratings['userId'])

In [7]:
movie_ids = np.unique(movieratings['movieId'])
movie_ids

array([     1,      2,      3, ..., 162542, 162672, 163949])

In [8]:
from numpy import random
random.choice([1,2,3,6,10,12])

3

In [9]:
from numpy import random
all_fake_id=[]
all_fake_movie=[]
all_fake_rating=[]

rating_options = [0.5, 5.0]#np.linspace(0.5, 5, 10)
movie_count_fake_user=4000
for fake_idx in range(1,101):
    fake_UID = fake_idx + max_real_user
    fake_user_mouse = random.choice(rating_options)
    for i in range(movie_count_fake_user):
        movie_id = movie_ids[i//2]
        rating = fake_user_mouse
        if np.random.rand() > 0.02:
            fake_user_mouse = random.choice(rating_options)
        # propagate
        all_fake_id.append(fake_UID)
        all_fake_movie.append(movie_id)
        all_fake_rating.append(rating)
fake_df = pd.DataFrame({"movieId":all_fake_movie, "userId":all_fake_id, "rating":all_fake_rating})
print(fake_df.head())
combo_df = pd.concat([movieratings.drop("timestamp",1),fake_df])

        
            
    

   movieId  rating  userId
0        1     5.0     672
1        2     5.0     672
2        3     5.0     672
3        4     0.5     672
4        5     5.0     672


In [10]:
combo_df[combo_df.userId == 771]

Unnamed: 0,movieId,rating,userId
396000,1,0.5,771
396001,2,5.0,771
396002,3,0.5,771
396003,4,0.5,771
396004,5,0.5,771
396005,6,0.5,771
396006,7,0.5,771
396007,8,5.0,771
396008,9,5.0,771
396009,10,0.5,771


In [11]:
combo_df.describe()

Unnamed: 0,movieId,rating,userId
count,500004.0,500004.0,500004.0
mean,4520.2027,2.907044,646.599865
std,12527.090594,2.091694,175.282149
min,1.0,0.5,1.0
25%,1201.0,0.5,678.0
50%,2487.0,4.0,709.0
75%,3860.0,5.0,740.0
max,163949.0,5.0,771.0


In [12]:
def getDfSummary(input_data):
    output_data = input_data.describe(include = 'all').T
    var = pd.DataFrame(data = {'nanvals': pd.Series(), 'number_distinct': pd.Series()})
    for i in range(len(input_data.columns)):
        nanvals = input_data.ix[:,i].isnull().sum()
        number_distinct = len(input_data.ix[:,i].value_counts())
        var = var.append(pd.DataFrame([[nanvals, number_distinct]], columns = ['nanvals', 'number_distinct']))
    var.index = output_data.index.values
    output_data['nanvals'] = var['nanvals']
    output_data['number_distinct'] = var['number_distinct']
    return output_data
output_data = getDfSummary(movieratings)
output_data

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,nanvals,number_distinct
userId,100004.0,347.0113,195.1638,1.0,182.0,367.0,520.0,671.0,0.0,671.0
movieId,100004.0,12548.66,26369.2,1.0,1028.0,2406.5,5418.0,163949.0,0.0,9066.0
rating,100004.0,3.543608,1.058064,0.5,3.0,4.0,4.0,5.0,0.0,10.0
timestamp,100004.0,1129639000.0,191685800.0,789652009.0,965847824.0,1110422000.0,1296192000.0,1476641000.0,0.0,78141.0


In [13]:
movieratings = movieratings.ix[np.random.permutation(len(movieratings))]

In [14]:
users = movieratings.ix[:,0].values
songs = movieratings.ix[:,1].values
ratings = movieratings.ix[:,2].values

#unique users / songs
uni_users = movieratings['userId'].unique()
uni_songs = movieratings['movieId'].unique()

#dict mapping the id to an index
user_map = dict(zip(uni_users,range(len(uni_users))))
song_map = dict(zip(uni_songs,range(len(uni_songs))))

user_idx =  np.array([ user_map[user] for user in users])
song_idx =  np.array([ song_map[song] for song in songs])

print(len(uni_users),len(uni_songs))

perm = range(len(users))#np.random.permutation(len(users))
trn_idx = perm[:(len(users)*4)//5]
val_idx = perm[(len(users)*4)//5:]
user_idx_trn, song_idx_trn, ratings_trn = user_idx[trn_idx], song_idx[trn_idx], ratings[trn_idx]
user_idx_val, song_idx_val, ratings_val = user_idx[val_idx], song_idx[val_idx], ratings[val_idx]

671 9066


In [15]:
users_with_noise = combo_df.ix[:,"userId"].values
songs_with_noise = combo_df.ix[:,"movieId"].values
ratings_with_noise = combo_df.ix[:,"rating"].values

#unique users / songs
uni_users_with_noise = combo_df['userId'].unique()
uni_songs_with_noise = combo_df['movieId'].unique()

#dict mapping the id to an index
user_map_with_noise = dict(zip(uni_users_with_noise,range(len(uni_users_with_noise))))
song_map_with_noise = dict(zip(uni_songs_with_noise,range(len(uni_songs_with_noise))))

user_idx_with_noise =  np.array([ user_map_with_noise[user] for user in users_with_noise])
song_idx_with_noise =  np.array([ song_map_with_noise[song] for song in songs_with_noise])

print(len(uni_users_with_noise),len(uni_songs_with_noise))

perm_with_noise = range(len(users_with_noise))#np.random.permutation(len(users_with_noise))
trn_idx_with_noise = list(trn_idx)+list(range(len(users), len(users_with_noise)))#perm_with_noise[:(len(users_with_noise)*2)//3]
val_idx_with_noise = list(val_idx)#perm_with_noise[(len(users_with_noise)*2)//3:]
user_idx_trn_with_noise, song_idx_trn_with_noise, ratings_trn_with_noise = \
        user_idx_with_noise[trn_idx_with_noise], song_idx_with_noise[trn_idx_with_noise], \
        ratings_with_noise[trn_idx_with_noise]
user_idx_val_with_noise, song_idx_val_with_noise, ratings_val_with_noise =\
        user_idx_with_noise[val_idx_with_noise], song_idx_with_noise[val_idx_with_noise],\
        ratings_with_noise[val_idx_with_noise]

771 9066


In [16]:

reg_l = 0.01
conf_l = .1
conf = 3
edim = 5
songmodel = MF_RS(len (uni_users), len(uni_songs), embedding_dim = edim, 
                  reg_lambda=reg_l, conf_lambda=conf_l, conf_dim = conf)
print(songmodel.evaluate(user_idx_val, song_idx_val, ratings_val))
songmodel.epochs = 1
songmodel.train(user_idx_trn, song_idx_trn, ratings_trn, verb = 3)
songmodel.evaluate(user_idx_val, song_idx_val, ratings_val)

6.60519617478
3.36095976337e+16


3.2465027622321263

In [17]:
"""
edims = [5, 7]
confs = [0, 2]
errmat = np.zeros([len(edims), len(confs)])
reg_l = 1
conf_l = 5
for eidx, edim in enumerate(edims):
    for cidx, conf in enumerate(confs):
        songmodel = MF_RS(len (uni_users), len(uni_songs), edim, 
                          reg_lambda=reg_l, conf_lambda=conf_l, conf_dim = conf)
        print("accuracy before training", songmodel.evaluate(user_idx_val, song_idx_val, ratings_val))
        np.random.seed(1)
        songmodel.epochs = 10
        #songmodel.train(user_idx_trn_with_noise, song_idx_trn_with_noise, ratings_trn_with_noise)
        songmodel.train(user_idx_trn, song_idx_trn, ratings_trn)
        err = songmodel.evaluate(user_idx_val, song_idx_val, ratings_val)
        print("accuracy after training with edim ", edim, " and confidence dim ", conf, ": ", err)
        errmat[eidx, cidx] = err
errmat
"""

'\nedims = [5, 7]\nconfs = [0, 2]\nerrmat = np.zeros([len(edims), len(confs)])\nreg_l = 1\nconf_l = 5\nfor eidx, edim in enumerate(edims):\n    for cidx, conf in enumerate(confs):\n        songmodel = MF_RS(len (uni_users), len(uni_songs), edim, \n                          reg_lambda=reg_l, conf_lambda=conf_l, conf_dim = conf)\n        print("accuracy before training", songmodel.evaluate(user_idx_val, song_idx_val, ratings_val))\n        np.random.seed(1)\n        songmodel.epochs = 10\n        #songmodel.train(user_idx_trn_with_noise, song_idx_trn_with_noise, ratings_trn_with_noise)\n        songmodel.train(user_idx_trn, song_idx_trn, ratings_trn)\n        err = songmodel.evaluate(user_idx_val, song_idx_val, ratings_val)\n        print("accuracy after training with edim ", edim, " and confidence dim ", conf, ": ", err)\n        errmat[eidx, cidx] = err\nerrmat\n'

In [18]:
edims = [3, 5]
confs = [0, 1]
reg_l = [1e-4, 1e-2]
errmat = np.zeros([len(edims), len(confs), len(reg_l)])
conf_lambda = 1
epochs = 4
iters = 3
for regidx, reg in enumerate(reg_l):
    for eidx, edim in enumerate(edims):
        for cidx, conf in enumerate(confs):
            songmodel = MF_RS(len (uni_users_with_noise), len(uni_songs_with_noise), edim, 
                              reg_lambda=reg, conf_lambda=conf_lambda, conf_dim = conf)
            print("Error before training", songmodel.evaluate(user_idx_val, song_idx_val, ratings_val))
            songmodel.epochs = epochs
            for iteration in range(iters):
                songmodel.train(user_idx_trn_with_noise, song_idx_trn_with_noise, ratings_trn_with_noise, verb = 0)
                #songmodel.train(user_idx_trn, song_idx_trn, ratings_trn)
                err = songmodel.evaluate(user_idx_val, song_idx_val, ratings_val)
                print("Error after ", epochs * iteration, "epochs edim ", edim," reg = ", 
                      reg, " and confidence dim ", conf, ": ", err)
            errmat[eidx, cidx, regidx] = err
errmat

Error before training 5.47737481774
Error after  0 epochs edim  3  reg =  0.0001  and confidence dim  0 :  1.76700142611
Error after  4 epochs edim  3  reg =  0.0001  and confidence dim  0 :  1.81995314341
Error after  8 epochs edim  3  reg =  0.0001  and confidence dim  0 :  1.87388181792
Error before training 9.55432021065
Error after  0 epochs edim  3  reg =  0.0001  and confidence dim  1 :  2.68212620975
Error after  4 epochs edim  3  reg =  0.0001  and confidence dim  1 :  2.18896424011
Error after  8 epochs edim  3  reg =  0.0001  and confidence dim  1 :  2.13373610579
Error before training 6.59306243937
Error after  0 epochs edim  5  reg =  0.0001  and confidence dim  0 :  1.91440929924
Error after  4 epochs edim  5  reg =  0.0001  and confidence dim  0 :  1.93881734443
Error after  8 epochs edim  5  reg =  0.0001  and confidence dim  0 :  1.96223240176
Error before training 9.18873610634
Error after  0 epochs edim  5  reg =  0.0001  and confidence dim  1 :  2.84841542994
Error 

array([[[ 1.87388182,  1.79154605],
        [ 2.13373611,  1.73380476]],

       [[ 1.9622324 ,  2.02077214],
        [ 2.24454788,  1.77534087]]])

In [19]:
#songmodel.epochs = 10
#songmodel.train(user_idx_trn_with_noise, song_idx_trn_with_noise, ratings_trn_with_noise)

In [20]:
real_usr_c = songmodel.getc(user_idx_trn[:20000], song_idx_trn[:20000], ratings_trn[:20000])

In [21]:
fake_usr_c = songmodel.getc(user_idx_trn_with_noise[-20000:], 
                            song_idx_trn_with_noise[-20000:], ratings_trn_with_noise[-20000:])

In [22]:
np.mean(real_usr_c), np.mean(fake_usr_c)

(0.79417413, 1.0)

In [23]:
real_usr_c.T

array([[ 0.79992121,  0.80646396,  0.79884803, ...,  0.79377997,
         0.81287378,  0.78427571]], dtype=float32)

In [24]:
fake_usr_c.T

array([[ 1.,  1.,  1., ...,  1.,  1.,  1.]], dtype=float32)