# Matrix Factorization

In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt

from collections import Counter
from sklearn.utils import shuffle
from sortedcontainers import SortedList

## 1. Loading the data

In [2]:
df = pd.read_csv('data/rating.csv')
len(df)

20000263

In [3]:
mo = pd.read_csv('data/movie.csv')
mo.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [8]:
# Joing the two data frame
df2 = pd.merge(df, mo, how = 'inner', on = ['movieId'])
df2.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,2,3.5,2005-04-02 23:53:47,Jumanji (1995),Adventure|Children|Fantasy
1,5,2,3.0,1996-12-25 15:26:09,Jumanji (1995),Adventure|Children|Fantasy
2,13,2,3.0,1996-11-27 08:19:02,Jumanji (1995),Adventure|Children|Fantasy
3,29,2,3.0,1996-06-23 20:36:14,Jumanji (1995),Adventure|Children|Fantasy
4,34,2,3.0,1996-10-28 13:29:44,Jumanji (1995),Adventure|Children|Fantasy


In [9]:
df2 = df2.drop(columns = ['timestamp', 'genres'])
df2.head()

Unnamed: 0,userId,movieId,rating,title
0,1,2,3.5,Jumanji (1995)
1,5,2,3.0,Jumanji (1995)
2,13,2,3.0,Jumanji (1995)
3,29,2,3.0,Jumanji (1995)
4,34,2,3.0,Jumanji (1995)


In [10]:
# Make the user Id starts from 0 
df2.userId -= 1
df2.head()

Unnamed: 0,userId,movieId,rating,title
0,0,2,3.5,Jumanji (1995)
1,4,2,3.0,Jumanji (1995)
2,12,2,3.0,Jumanji (1995)
3,28,2,3.0,Jumanji (1995)
4,33,2,3.0,Jumanji (1995)


## 2. Preprocessing 

### 2-1. Sub sampling 

In [12]:
print("The number of samples of the data is ", len(df2))

The number of samples of the data is  20000263


I'm going to extract the most meaningful dataframe here. The users and movies that have many ranking rates. 

In [13]:
N = df2.userId.max() + 1
M = df2.title.nunique() + 1
print("The number of Users is ", N)
print("The number of Movies is ", M)

The number of Users is  138493
The number of Movies is  26730


In [14]:
user_ids_count = Counter(df2.userId)
movie_ids_count = Counter(df2.title)

The outcome of `Counter()` will be **'column value : count_number'**. So I'll take only the column values of the most common ones 

In [15]:
# Choose the numbers to subset 
n = 10000
m = 2000 

user_ids = [col for col, idx in user_ids_count.most_common(n)]
movie_ids = [col for col, idx in movie_ids_count.most_common(m)]

`user_ids` and `movie_ids` are the list of the most common values in `df2`. I'm going to filter `df2` to keep the samples that appear in the two lists at the same time. 

In [16]:
# Filter the data 
df_sub = df2[df2.userId.isin(user_ids) & df2.title.isin(movie_ids)]

In [17]:
df_sub.head()

Unnamed: 0,userId,movieId,rating,title
5,53,2,3.0,Jumanji (1995)
7,90,2,3.5,Jumanji (1995)
8,115,2,2.0,Jumanji (1995)
15,130,2,1.0,Jumanji (1995)
20,155,2,5.0,Jumanji (1995)


### 2-2. Assigning new Id values to Users and Movies

Now there are sparsity in the values of `userId` so I'd like to give the new index numbers to user Id and movie Id.   

In [18]:
# Indexing the user list using dictionary
user_dic = {}
i = 0

for k in user_ids:
    user_dic[k] = i
    i += 1

In [19]:
# Indexing the user list using dictionary 
movie_dic = {}
i = 0

for k in movie_ids:
    movie_dic[k] = i
    i += 1

Let's check the result and find the index of the movie *Minority Report (2002)*.

In [20]:
# Check the result
movie_dic['Minority Report (2002)']

108

In [21]:
df_sub.head()

Unnamed: 0,userId,movieId,rating,title
5,53,2,3.0,Jumanji (1995)
7,90,2,3.5,Jumanji (1995)
8,115,2,2.0,Jumanji (1995)
15,130,2,1.0,Jumanji (1995)
20,155,2,5.0,Jumanji (1995)


In [22]:
df_sub['user_idx'] = df_sub.userId.apply(lambda x: user_dic[x])
df_sub['movie_idx'] = df_sub.title.apply(lambda x: movie_dic[x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [23]:
df_sub = df_sub.reset_index(drop = True)
df_sub.head()

Unnamed: 0,userId,movieId,rating,title,user_idx,movie_idx
0,53,2,3.0,Jumanji (1995),3863,125
1,90,2,3.5,Jumanji (1995),4358,125
2,115,2,2.0,Jumanji (1995),1404,125
3,130,2,1.0,Jumanji (1995),9442,125
4,155,2,5.0,Jumanji (1995),190,125


### 2-3. Splitting into train and test set

In [24]:
cut = int(0.8*len(df_sub))

df_sub = shuffle(df_sub)
tr = df_sub.iloc[:cut]
te = df_sub.iloc[cut:]

In [25]:
tr = tr.reset_index(drop = True)
te = te.reset_index(drop = True)

In [26]:
print("The size of train : ", len(tr))
print("The size of test : ", len(te))

The size of train :  4314019
The size of test :  1078505


### 2-4. Creating dictionary for user, movie and rating

Now I'd like to make it possible to look up what movie each user gave ratings simply and vise versa. 

In [27]:
user_to_movie = {}
movie_to_user = {}
um_to_rating = {}

def making_dic(x):
    
    a = int(x.user_idx)
    m = int(x.movie_idx)
    r = x.rating
    
    # make a dictionary for "user to movie" 
    if a not in user_to_movie:
        user_to_movie[a] = [m]
    else:
        user_to_movie[a].append(m)
        
    # make a dictionary for "movie to user"
    if m not in movie_to_user:
        movie_to_user[m] = [a]
    else:
        movie_to_user[m].append(a)
        
    # make rating dictionary
    um_to_rating[(a, m)] = r

In [28]:
temp = tr.apply(making_dic, axis = 1)

As test set has no ratings (we're predicting it), it has to be processed without rating. 

In [29]:
um_to_rating_te = {}

def making_dic_te(x):
    
    a = int(x.user_idx)
    m = int(x.movie_idx)
    r = x.rating 
    
    um_to_rating_te[(a, m)] = r

In [30]:
temp = te.apply(making_dic_te, axis = 1)

In [31]:
# Number of users
N = np.max(list(user_to_movie.keys())) + 1 

# Number of movies from the train set
m1 = np.max(list(movie_to_user.keys()))

# Number of movies from the test set
m2 = np.max([m for (a, m), r in list(um_to_rating_te.items())])

# Total Number of movies both from train & test
M = max(m1, m2) + 1

## 3. Matrix Factorization

In [35]:
K = 10         # latent dimensionality
# parameters for user 
W = np.random.randn(N, K)
b = np.zeros(N)

# parameters for movies 
U = np.random.randn(M, K)
c = np.zeros(M)

mu = np.mean(list(um_to_rating.values()))

In [36]:
# Defind objectives
def get_loss(X):
    """
    input : X = the all pairs of (user_id, movie_id)
            sse = sum of squared error
    output : mse = sse / n 
    """
    n = float(len(X))      # n = the number of X
    sse = 0
    
    for k, r in X.items():
        a, b = k           # user, item
        actual = r         # actual rating
        pred = W[a].dot(U[b]) + b[a] + c[b] + mu
        sse += (pred - actual)**2
    return sse / n

In [38]:
epochs = 20
reg = 20.      # regularization penalty
tr_loss = []
te_loss = []

for epoch in range(epochs):
    
    # 1. Train W parameter for all users
    # for each user a, for each movie b
    for a in range(N):
        matrix = np.eye(K)
        vector = np.zeros(K)
        
        b_a = 0                          # initialize user bias
        pie = len(user_to_movie[a])      # the number of movies that user a rated
        # for each movie that user a has rated
        for b in user_to_movie[a]:
            r = um_to_rating[(a, b)]
            matrix += np.outer(U[b], U[b])
            vector += (r - b[a] - c[b] - mu)*U[b]
            b_a += (r - W[a].dot(U[b]) - c[b] - mu)
            
        # update the parameter
        W[a] = np.linalg.solve(matrix, vector)
        b[a] = b_a / (pie + reg)
    
    # 2. Train U parameter for all movies
    # alternative updating 
    for b in range(M):
        matrix = np.eye(K) * reg
        vector = np.zeros(K)
        
        c_b = 0                           # initialize user bias
        omga = len(movie_to_user[b])      # the number of users who rated movie b
        try:
            for a in movie_to_user[b]:
                r = um_to_rating[(a, b)]
                matrix += np.outer(W[a], W[a])
                vector += (r - b[a] - c[b] - mu)*W[a]
                c_b += (r - W[a].dot(U[b]) - b[a] - mu)
                
            U[b] = np.linalg.solve(matrix, vector)
            c[b] = c_b / (omga + reg) 
        except KeyError:
            # for a movie which have no rating 
            pass
    
    # 3. Compute the loss for each epoch
    loss = get_loss(um_to_rating)
    tr_loss.append(loss)
    
    test_loss = get_loss(um_to_rating_te)
    te_loss.append(loss)

    print("Train loss: {} for {} epoch".format(tr_loss[-1], epoch))
    print("Test loss: {} for {} epoch".format(te_loss[-1], epoch)

In [None]:
print("Total Train loss: ", tr_loss)
print("Total test loss: ", te_loss)

In [None]:
# Plot the losses 
plt.plot(tr_losses, label = 'train loss')
plt.plot(te_loss, label = 'test loss')
plt.legend()
plt.show()