In [2]:
import pandas as pd
import numpy as np

In [3]:
class get_features:
    '''
        Collects the distinct genre labels found in the 'genres' column
        of the movies dataframe (file = 'movies.csv').
    '''

    def __init__(self, df_movies):
        self.df_movies = df_movies

    def on_moviescsv(self):
        '''
            Return the list of distinct genre labels, ordered by first appearance.

            Each value of the 'genres' column is a '|'-separated string such as
            'Adventure|Animation|Children|Comedy|Fantasy'; every piece becomes
            one candidate feature of the movies.
        '''
        # A dict keeps insertion order, so its keys act as an ordered set:
        # re-inserting a label that is already present leaves its position alone.
        seen = {}
        for entry in self.df_movies['genres']:
            seen.update(dict.fromkeys(entry.split('|')))
        return list(seen)
    
    
class define_feature:
    '''
        Builds the feature tables of the movies. The features come from:
            * the 'movies' dataframe (file = 'movies.csv'), 'genres' column.
                Each value holds several labels separated by |; every label
                becomes one binary (1/0) feature of the movies.

            * the 'g_score' dataframe (file = 'genome-score.csv'): every tag
                becomes one feature of the movies, and 'relevance' is the
                value of that feature for each movieId.
    '''

    def __init__(self, genres, df_movies, df_gscore):
        self.genres = genres
        self.df_movies = df_movies
        self.df_gscore = df_gscore

    def genre(self):
        '''
            Return a dataframe with the 'movieId' values as index and the
            labels of self.genres as columns. A cell is 1 when that label is
            one of the '|'-separated entries of the movie's 'genres' value,
            and 0 otherwise.
        '''
        movie_ids = self.df_movies['movieId'].values
        df = pd.DataFrame(
            np.zeros((len(movie_ids), len(self.genres)), dtype=int),
            columns=self.genres,
            index=movie_ids,
        )
        for movie_id, genre_string in zip(movie_ids, self.df_movies['genres']):
            # Compare whole labels, not substrings, so that e.g. 'Film'
            # does not match a movie tagged 'Film-Noir'.
            present = set(genre_string.split('|'))
            for genre in self.genres:
                if genre in present:
                    # Single .loc assignment: a chained df[genre].loc[...] = 1
                    # may write to a temporary copy and leave df unchanged.
                    df.loc[movie_id, genre] = 1
        return df

    def tag(self):
        '''
            Return a dataframe of relevance scores with one row per distinct
            'movieId' (sorted) and one column per distinct 'tagId'.

            The columns are labelled 1..n, where n is the number of distinct
            tags, i.e. the tag ids are assumed to be exactly 1..n.

            Raises ValueError when the number of rows of the scores dataframe
            is not (number of movies) * (number of tags). Duplicate or missing
            (movieId, tagId) pairs that happen to keep that total are not
            detected.
        '''
        movie_ids = sorted(self.df_gscore['movieId'].dropna().unique())
        m = len(movie_ids)
        n = self.df_gscore['tagId'].nunique()  # number of tags

        # The reshape below lays rows out movie by movie, tag by tag, so the
        # scores must be in that order first; do not rely on the file order.
        ordered = self.df_gscore.sort_values(['movieId', 'tagId'])
        relevance = ordered['relevance'].values
        if relevance.size != m * n:
            raise ValueError(
                'expected one relevance per (movieId, tagId) pair: '
                '%d movies x %d tags != %d rows' % (m, n, relevance.size)
            )
        relevance = relevance.reshape(m, n)
        df = pd.DataFrame(relevance, columns=list(range(1, n + 1)), index=movie_ids)

        return df

In [4]:
def initialize_param(n_feature, n_user):
    '''
        Draw the initial parameters from a standard normal distribution.

        Returns (W, B): W has one row per user and one column per feature,
        B has one entry per user. Values come from numpy's global random
        state, W first and then B.
    '''
    weights = np.random.standard_normal((n_user, n_feature))
    biases = np.random.standard_normal((n_user,))
    return weights, biases

In [5]:
class dataRate:
    '''
        Looks up, in the ratings dataframe, the rating one user gave to
        one movie. The dataframe is read through its 'userId', 'movieId'
        and 'rating' columns.
    '''

    def __init__(self, movie, user, ratings):
        self.movie = movie
        self.user = user
        self.ratings = ratings

    def R(self):
        '''
            Return 1 when the user has a rating row for the movie, else 0.
        '''
        by_user = self.ratings['userId'] == self.user
        rated_movies = self.ratings.loc[by_user, 'movieId'].tolist()
        return int(self.movie in rated_movies)

    def generate_Y(self):
        '''
            Return the first rating the user gave to the movie.

            Raises IndexError when there is no such rating row.
        '''
        by_user = self.ratings['userId'] == self.user
        by_movie = self.ratings['movieId'] == self.movie
        matches = self.ratings.loc[by_user & by_movie, 'rating'].tolist()
        return matches[0]

In [6]:
class optimizer(dataRate):
    '''
        Fits the linear model  W[j] . x[i] + B[j]  to the known ratings by
        gradient descent on half the sum of squared errors.

            * W : array with one row of weights per user
            * B : array with one bias per user
            * x : dataframe of movie features, indexed by movie id
            * ratings : dataframe with 'userId', 'movieId' and 'rating' columns

        NOTE(review): the row number j of W (starting at 0) is used directly
        as the 'userId' looked up in ratings -- confirm that the user ids of
        the data really are 0..len(W)-1.

        Every (user, movie) pair is checked against the whole ratings
        dataframe, so one pass is slow on large data.
    '''

    def __init__(self, W, B, x, ratings):
        self.W = W
        self.B = B
        self.x = x
        self.ratings = ratings

    def _residuals(self):
        '''
            Yield (j, features, error) for every rated (user j, movie i) pair,
            where error = W[j] . x[i] + B[j] - rating. Users are visited in
            row order of W, movies in the order of x.index.
        '''
        for j in range(len(self.W)):
            for i in self.x.index:
                rate = dataRate(i, j, self.ratings)
                if rate.R() == 1:
                    features = self.x.loc[i]
                    error = np.dot(self.W[j], features) + self.B[j] - rate.generate_Y()
                    yield j, features, error

    def costJ(self):
        '''
            Return half the sum of squared errors over all rated pairs.
        '''
        cost = 0
        for _, _, error in self._residuals():
            cost += error**2
        cost = cost/2
        return cost

    def derivation(self):
        '''
            Return (dj_dw, dj_db), the gradients of costJ with respect to
            W and B; they have the same shapes as W and B.
        '''
        dj_dw = np.zeros(self.W.shape)
        dj_db = np.zeros(self.B.shape)
        for j, features, error in self._residuals():
            dj_dw[j] += error*features
            dj_db[j] += error

        return dj_dw, dj_db

    def grad_descent(self, alpha, iteration):
        '''
            Run `iteration` steps of gradient descent with learning rate
            `alpha`, store the result in self.W / self.B and return (W, B).
        '''
        for _ in range(iteration):
            # Both gradients are taken from the current parameters before
            # either of W and B is changed (simultaneous update).
            dj_dw, dj_db = self.derivation()

            self.W = self.W - alpha*dj_dw
            self.B = self.B - alpha*dj_db
        return self.W, self.B

In [7]:
def recommend(W, B, df_movies, threshold=3):
    '''
        List the (user, movie) pairs whose predicted rating is above
        `threshold`.

            * W : array with one row of weights per user
            * B : array with one bias per user
            * df_movies : dataframe of movie features, indexed by movie id;
                each row is multiplied with the weights of a user
            * threshold : a pair is kept when W[user] . features + B[user]
                is strictly greater than this value (default 3)

        Returns a dataframe with the columns 'userId' (row number in W) and
        'movieId' (label from df_movies.index), ordered by user and then by
        the order of df_movies. It has no rows when nothing passes the
        threshold.
    '''
    features = df_movies.to_numpy()
    movie_ids = df_movies.index

    pairs = []
    for user in range(W.shape[0]):
        # Predicted rating of this user for every movie at once.
        scores = np.dot(features, W[user]) + B[user]
        pairs.extend([user, movie] for movie in movie_ids[scores > threshold])

    # Built from the list directly so that an empty result still gives a
    # dataframe with both columns.
    df_recommend = pd.DataFrame(pairs, columns=['userId', 'movieId'])

    return df_recommend