In [None]:
import tensorflow as tf
import sys
print(sys.version)
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

In [None]:
class MF_RS():
    """Matrix-factorisation rating model with bias terms and optional confidence weights.

    The predicted rating for a (user, song) pair is the dot product of their
    embedding vectors plus a per-user bias, a per-song bias and one overall
    bias.  When ``conf_dim > 0`` every user and every song additionally owns a
    ``conf_dim``-long confidence vector; the weight of a single rating is the
    sum of the element-wise product of the two vectors, and a penalty scaled
    by ``conf_lambda`` is charged for confidences below 1.

    The graph uses fixed-size placeholders of length ``self.batch_size``;
    ``train`` and ``test`` feed batches of exactly that length.

    Note: variables are created in TensorFlow's default graph and every
    instance opens its own session, which this class never closes.
    """

    def __init__(self, numUsers, numSongs, embedding_dim, reg_lambda=0.01, conf_lambda=1.0, conf_dim = 1):
        """Build the graph, open a session and initialise all variables.

        Args:
            numUsers: number of distinct user indices (rows of the user tables).
            numSongs: number of distinct song indices (rows of the song tables).
            embedding_dim: length of each user / song embedding vector.
            reg_lambda: multiplier of the L2 penalty on the looked-up
                embeddings and biases.
            conf_lambda: multiplier of the confidence penalty.
            conf_dim: length of the confidence vectors; ``0`` (or less)
                disables confidence weighting entirely.
        """
        # hyper parameters
        # The batch length is capped at 200000 and also by the number of users
        # and of songs; every placeholder below has exactly this length.
        self.batch_size = int(np.min([200000, numUsers, numSongs]))
        self.numUsers = numUsers
        self.numSongs = numSongs
        self.epochs = 20
        self.reg_lambda = reg_lambda
        self.conf_lambda = conf_lambda

        # embedding matrices and bias vectors for users and songs
        self.userMat = tf.Variable(tf.random_normal([numUsers, embedding_dim]))
        self.songMat = tf.Variable(tf.random_normal([numSongs, embedding_dim]))
        self.userBias = tf.Variable(tf.random_normal([numUsers]))
        self.songBias = tf.Variable(tf.random_normal([numSongs]))
        self.overallBias = tf.Variable(tf.random_normal([1]))
        if conf_dim > 0:
            # confidences start at 0.5 for every user and song
            self.C_user = tf.Variable(.5*tf.ones([numUsers, conf_dim]))
            self.C_song = tf.Variable(.5*tf.ones([numSongs, conf_dim]))

        # input tensors for users, songs, ratings
        self.users = tf.placeholder(tf.int32, shape=[self.batch_size])
        self.songs = tf.placeholder(tf.int32, shape=[self.batch_size])
        self.rating = tf.placeholder(tf.float32, shape=[self.batch_size])

        # map each user/song of the batch to its feature vector
        self.U = tf.nn.embedding_lookup(self.userMat, self.users)
        self.W = tf.nn.embedding_lookup(self.songMat, self.songs)
        # bias
        self.U_bias = tf.nn.embedding_lookup(self.userBias, self.users)
        self.W_bias = tf.nn.embedding_lookup(self.songBias, self.songs)
        # confidence params, clipped to the interval the original code used
        if conf_dim > 0:
            self.C_ui = tf.clip_by_value(tf.nn.embedding_lookup(self.C_user, self.users), 1e-20, 1-1e-20)
            self.C_sj = tf.clip_by_value(tf.nn.embedding_lookup(self.C_song, self.songs), 1e-20, 1-1e-20)

        # predicted rating is the dot product of user and song plus the biases.
        # The `*` operator is used for the element-wise product because the
        # function name for it differs between TensorFlow releases.
        bias = self.U_bias + self.W_bias + self.overallBias
        pq = tf.reduce_sum(self.U * self.W, 1)
        self.yhat = pq + bias

        # regularisation: L2 on the looked-up parameters, plus (optionally)
        # a penalty that grows as confidences move away from 1
        self.l2_reg = self.reg_lambda * (tf.reduce_sum(tf.square(self.U) + tf.square(self.W)) +
                                         tf.reduce_sum(tf.square(self.U_bias) + tf.square(self.W_bias)))
        if conf_dim > 0:
            self.confidence_reg = self.conf_lambda * tf.reduce_sum(1-self.C_ui + 1-self.C_sj)
            self.reg = self.l2_reg + self.confidence_reg
        else:
            self.reg = self.l2_reg

        # Half squared error of every rating in the batch.  tf.nn.l2_loss is
        # NOT used here: it collapses the whole batch to one scalar, so
        # multiplying it by the confidences (as the previous version did)
        # could never down-weight an individual rating.
        half_sq_err = 0.5 * tf.square(self.yhat - self.rating)
        if conf_dim > 0:
            rating_conf = tf.reduce_sum(self.C_ui * self.C_sj, 1)
            self.error = tf.reduce_sum(rating_conf * half_sq_err)
        else:
            # identical to the previous tf.nn.l2_loss(yhat - rating)
            self.error = tf.reduce_sum(half_sq_err)
        # constant rescaling of the objective (rationale not recorded in this file)
        self.cost = (self.error + self.reg)/1e7
        self.optimizer = tf.train.AdamOptimizer(learning_rate = .01).minimize(self.cost)

        self.session = tf.Session()
        self.session.run(tf.initialize_all_variables())

    def train(self, users, songs, ratings, verb = 0):
        """Run ``self.epochs`` passes of mini-batch Adam updates.

        Each epoch draws a fresh ``np.random.permutation`` of the rows and
        consumes it in consecutive chunks of ``self.batch_size``; rows left
        over after the last full chunk are skipped for that epoch.  If there
        are fewer rows than one batch, the permutation is repeated until one
        batch is filled so that training still takes place.

        Args:
            users, songs: integer index arrays of equal length.
            ratings: array of observed ratings, same length.
            verb: ``> 0`` prints the mean batch cost after every epoch,
                ``> 2`` additionally prints the cost of every batch.
        """
        users = np.asarray(users)
        songs = np.asarray(songs)
        ratings = np.asarray(ratings)
        num_ratings = len(ratings)
        if num_ratings == 0:
            # nothing to fit; also avoids dividing by zero batches below
            return

        for i in range(self.epochs):
            total_cost = 0.0
            perm = np.random.permutation(num_ratings)
            if num_ratings < self.batch_size:
                # np.resize fills the longer array with repeated copies of perm
                perm = np.resize(perm, self.batch_size)
            num_batches = len(perm) // self.batch_size

            for b_idx in range(num_batches):
                batch = perm[self.batch_size * b_idx:self.batch_size * (b_idx + 1)]
                feed = {self.users: users[batch],
                        self.songs: songs[batch],
                        self.rating: ratings[batch]}
                batch_cost = self.session.run([self.cost, self.optimizer], feed)[0]
                total_cost += batch_cost
                if verb > 2:
                    print(batch_cost)
            if verb > 0:
                print(total_cost/num_batches)

    def test(self, users, songs):
        """Return the predicted rating for every (user, song) pair.

        Inputs of any length are accepted: the last, possibly short, chunk is
        padded with index 0 up to ``self.batch_size`` (the placeholder length)
        and the padded predictions are discarded.

        Args:
            users, songs: integer index arrays of equal length.

        Returns:
            ``np.ndarray`` of floats with one prediction per input row.
        """
        users = np.asarray(users)
        songs = np.asarray(songs)
        total = len(users)
        yhat = np.zeros(total)
        for start in range(0, total, self.batch_size):
            stop = min(start + self.batch_size, total)
            size = stop - start
            users_batch = users[start:stop]
            songs_batch = songs[start:stop]
            if size < self.batch_size:
                pad = self.batch_size - size
                users_batch = np.concatenate([users_batch, np.zeros(pad, dtype=users_batch.dtype)])
                songs_batch = np.concatenate([songs_batch, np.zeros(pad, dtype=songs_batch.dtype)])
            preds = self.session.run(self.yhat,
                                     {self.users: users_batch, self.songs: songs_batch})
            yhat[start:stop] = np.asarray(preds)[:size]
        return yhat

    def evaluate(self, users, songs, ratings):
        """Mean squared error of the predictions, clipped to [0.5, 5], against ``ratings``."""
        yhat = self.test(users, songs)
        yhat = np.clip(yhat, a_min = 0.5, a_max = 5)
        return np.mean((yhat - ratings)**2)

In [None]:
# Tiny smoke test: five users, five songs, one rating each.
a = np.array([1, 2, 3, 4, 5])
b = np.array([1, 2, 3, 4, 5])
c = np.array([4, 3, 2, 5, 1])

# distinct user / song ids
uni_a = np.unique(a)
uni_b = np.unique(b)

# raw id -> consecutive row index into the embedding tables
a_map = {raw_id: row for row, raw_id in enumerate(uni_a)}
b_map = {raw_id: row for row, raw_id in enumerate(uni_b)}

user_idx = np.array([a_map[raw_id] for raw_id in a])
song_idx = np.array([b_map[raw_id] for raw_id in b])

# embedding size 7, two epochs; the numpy seed fixes the batch permutations
model = MF_RS(len(uni_a), len(uni_b), 7)
model.epochs = 2
np.random.seed(2)
model.train(user_idx, song_idx, c)


In [None]:
# Load the ratings table from the notebook's working directory.
# Later cells read the columns 'userId', 'movieId', 'rating' and 'timestamp'.
movieratings = pd.read_csv('ratings.csv')

In [None]:
# Summary statistics of the loaded ratings (displayed as the cell output).
movieratings.describe()

In [None]:
# Largest real user id; the synthetic users created further down get ids above it.
max_real_user = np.max(movieratings['userId'])

In [None]:
# Sorted distinct movie ids; the synthetic users sample the movies they rate from this array.
movie_ids = np.unique(movieratings['movieId'])
movie_ids

In [None]:
# Scratch check of numpy's random.choice: draws one element of the list and
# displays it. The result is not stored or used by any later cell.
from numpy import random
random.choice([1,2,3,6,10,12])

In [None]:
# Accumulators for the synthetic ratings: one user id, movie id and rating per row.
# NOTE(review): the generation cell below appends to these lists, so re-run this
# cell before re-running it, otherwise the synthetic rows are duplicated.
all_fake_id=[]
all_fake_movie=[]
all_fake_rating=[]

In [None]:
# Generate 100 synthetic "noise" users with 2000 ratings each and append them
# to the real ratings.
#
# The accumulators are reset here so that re-running this cell regenerates the
# synthetic rows instead of appending a second copy to the previous ones.
all_fake_id = []
all_fake_movie = []
all_fake_rating = []

rating_options = np.linspace(0.5, 5, 10)  # the ten rating values 0.5, 1.0, ..., 5.0
movie_count_fake_user = 2000
for fake_idx in range(1, 101):
    # synthetic ids start right above the largest real user id
    fake_UID = fake_idx + max_real_user
    # rating this user currently hands out, whatever the movie
    fake_user_mouse = np.random.choice(rating_options)
    for i in range(movie_count_fake_user):
        movie_id = np.random.choice(movie_ids)
        rating = fake_user_mouse
        # with probability 0.2 switch to a new random rating for the next movies
        if np.random.rand() > 0.8:
            fake_user_mouse = np.random.choice(rating_options)
        # propagate
        all_fake_id.append(fake_UID)
        all_fake_movie.append(movie_id)
        all_fake_rating.append(rating)
fake_df = pd.DataFrame({"movieId": all_fake_movie, "userId": all_fake_id, "rating": all_fake_rating})

# Real ratings (without the timestamp column) followed by the synthetic ones.
# `axis` is passed by keyword: the positional form is not accepted by current pandas.
combo_df = pd.concat([movieratings.drop("timestamp", axis=1), fake_df])

        
            
    

In [None]:
# Show all rows of one user in the combined frame.
# NOTE(review): 771 is presumably one of the synthetic user ids - confirm against max_real_user.
combo_df[combo_df.userId == 771]

In [None]:
# Summary statistics of the real + synthetic ratings (displayed as the cell output).
combo_df.describe()

In [None]:
def getDfSummary(input_data):
    """Return ``describe(include='all')`` transposed, plus two extra columns.

    Args:
        input_data: the ``pandas.DataFrame`` to summarise.

    Returns:
        A DataFrame with one row per column of ``input_data`` holding the
        statistics from ``describe`` and additionally
        ``nanvals`` (number of missing values in the column) and
        ``number_distinct`` (number of distinct non-missing values).
    """
    output_data = input_data.describe(include = 'all').T
    # Both Series are indexed by column name, which is also the row index of
    # the transposed describe() table, so the assignments align row by row.
    output_data['nanvals'] = input_data.isnull().sum()
    output_data['number_distinct'] = input_data.apply(lambda column: column.nunique())
    return output_data
# Per-column summary of the real ratings (displayed as the cell output).
output_data = getDfSummary(movieratings)
output_data

In [None]:
# Shuffle the rows so that the positional train/validation split below is random.
# .iloc selects by position; .ix (used previously) is not available in current pandas.
# NOTE: this rebinds `movieratings`; no seed is set, so the order differs between runs.
movieratings = movieratings.iloc[np.random.permutation(len(movieratings))]

In [None]:
# Raw id / rating columns of the (shuffled) real ratings, selected by name
# rather than by column position.
users = movieratings['userId'].values
songs = movieratings['movieId'].values
ratings = movieratings['rating'].values

#unique users / songs (in order of first appearance)
uni_users = movieratings['userId'].unique()
uni_songs = movieratings['movieId'].unique()

#dict mapping the id to an index
user_map = dict(zip(uni_users,range(len(uni_users))))
song_map = dict(zip(uni_songs,range(len(uni_songs))))

user_idx =  np.array([ user_map[user] for user in users])
song_idx =  np.array([ song_map[song] for song in songs])

print(len(uni_users),len(uni_songs))

# Positional split: the first two thirds of the rows train, the rest validate.
# The rows were shuffled in an earlier cell, so no further permutation is applied.
perm = np.arange(len(users))
split_at = (len(users)*2)//3
trn_idx = perm[:split_at]
val_idx = perm[split_at:]
user_idx_trn, song_idx_trn, ratings_trn = user_idx[trn_idx], song_idx[trn_idx], ratings[trn_idx]
user_idx_val, song_idx_val, ratings_val = user_idx[val_idx], song_idx[val_idx], ratings[val_idx]

In [None]:
# Real + synthetic ratings, rebuilt from the *current* row order of
# `movieratings`.  `combo_df` was assembled before the rows were shuffled, so
# the positions in `trn_idx` / `val_idx` would pick different ratings from it
# than they pick from `users` / `songs` / `ratings`.
combo_aligned_df = pd.concat([movieratings.drop("timestamp", axis=1), fake_df])
assert len(combo_aligned_df) == len(users) + len(fake_df)

users_with_noise = combo_aligned_df["userId"].values
songs_with_noise = combo_aligned_df["movieId"].values
ratings_with_noise = combo_aligned_df["rating"].values

#unique users / songs
uni_users_with_noise = combo_aligned_df['userId'].unique()
uni_songs_with_noise = combo_aligned_df['movieId'].unique()

#dict mapping the id to an index
user_map_with_noise = dict(zip(uni_users_with_noise,range(len(uni_users_with_noise))))
song_map_with_noise = dict(zip(uni_songs_with_noise,range(len(uni_songs_with_noise))))

user_idx_with_noise =  np.array([ user_map_with_noise[user] for user in users_with_noise])
song_idx_with_noise =  np.array([ song_map_with_noise[song] for song in songs_with_noise])

print(len(uni_users_with_noise),len(uni_songs_with_noise))

perm_with_noise = range(len(users_with_noise))
# Training rows: the clean training rows plus every synthetic row (the
# synthetic rows sit after the len(users) real ones).  Validation rows: the
# clean validation rows only.
trn_idx_with_noise = list(trn_idx)+list(range(len(users), len(users_with_noise)))
val_idx_with_noise = list(val_idx)
user_idx_trn_with_noise, song_idx_trn_with_noise, ratings_trn_with_noise = \
        user_idx_with_noise[trn_idx_with_noise], song_idx_with_noise[trn_idx_with_noise], \
        ratings_with_noise[trn_idx_with_noise]
user_idx_val_with_noise, song_idx_val_with_noise, ratings_val_with_noise =\
        user_idx_with_noise[val_idx_with_noise], song_idx_with_noise[val_idx_with_noise],\
        ratings_with_noise[val_idx_with_noise]

In [None]:

# Single trial on the clean data: build a model, print the validation MSE of
# the untrained model, train for one epoch, then show the validation MSE again.
reg_l = 0.01   # L2 regularisation weight
conf_l = .1    # confidence penalty weight
conf = 3       # length of the confidence vectors
edim = 5       # embedding size
songmodel = MF_RS(len (uni_users), len(uni_songs), embedding_dim = edim, 
                  reg_lambda=reg_l, conf_lambda=conf_l, conf_dim = conf)
print(songmodel.evaluate(user_idx_val, song_idx_val, ratings_val))
songmodel.epochs = 1
# verb = 3 requests the most verbose training output
songmodel.train(user_idx_trn, song_idx_trn, ratings_trn, verb = 3)
songmodel.evaluate(user_idx_val, song_idx_val, ratings_val)

In [None]:
# Grid over embedding size x confidence dimension, trained and validated on the
# clean ratings.  errmat[e, c] holds the validation MSE for edims[e], confs[c].
edims = [5, 7]
confs = [0, 2]   # 0 disables the confidence weights
errmat = np.zeros([len(edims), len(confs)])
reg_l = 1
conf_l = 1000
for eidx, edim in enumerate(edims):
    for cidx, conf in enumerate(confs):
        songmodel = MF_RS(len(uni_users), len(uni_songs), edim,
                          reg_lambda=reg_l, conf_lambda=conf_l, conf_dim=conf)
        # evaluate() returns a mean squared error, so it is reported as an error
        print("Error before training", songmodel.evaluate(user_idx_val, song_idx_val, ratings_val))
        # fixes the numpy batch permutations used by train()
        np.random.seed(1)
        songmodel.epochs = 20
        songmodel.train(user_idx_trn, song_idx_trn, ratings_trn)
        err = songmodel.evaluate(user_idx_val, song_idx_val, ratings_val)
        print("Error after training with edim ", edim, " and confidence dim ", conf, ": ", err)
        errmat[eidx, cidx] = err
errmat

In [None]:
# Grid over embedding size x confidence dimension, trained on the clean
# training rows plus the synthetic users and validated on real ratings only.
# errmat[e, c] holds the validation MSE for edims[e], confs[c].
edims = [5, 7]
confs = [0, 2]   # 0 disables the confidence weights
errmat = np.zeros([len(edims), len(confs)])
reg_l = 1
conf_l = 10
for eidx, edim in enumerate(edims):
    for cidx, conf in enumerate(confs):
        songmodel = MF_RS(len(uni_users_with_noise), len(uni_songs_with_noise), edim,
                          reg_lambda=reg_l, conf_lambda=conf_l, conf_dim=conf)
        # The model's embedding rows follow user_map_with_noise / song_map_with_noise,
        # so validation must use indices produced by those same maps.
        print("Error before training",
              songmodel.evaluate(user_idx_val_with_noise, song_idx_val_with_noise, ratings_val_with_noise))
        # fixes the numpy batch permutations used by train()
        np.random.seed(1)
        songmodel.epochs = 50
        songmodel.train(user_idx_trn_with_noise, song_idx_trn_with_noise, ratings_trn_with_noise)
        err = songmodel.evaluate(user_idx_val_with_noise, song_idx_val_with_noise, ratings_val_with_noise)
        print("Error after training with edim ", edim, " and confidence dim ", conf, ": ", err)
        errmat[eidx, cidx] = err
errmat