In [66]:
import numpy as np
import pandas as pd

Read the data from u.data, which contains the full data set. There is a description of the dataset here: http://files.grouplens.org/datasets/movielens/ml-100k-README.txt

In [67]:
header = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv('ml-100k/u.data', sep = '\t', names = header)

Let's take a look at the first couple rows in the dataset. Let's also count the number of unique users and movies.

In [68]:
print(df.head())
n_users = df.user_id.unique().shape[0]
n_items = df.item_id.unique().shape[0]
print('Number of users = ' + str(n_users) + ' | Number of movies = '+ str(n_items))

   user_id  item_id  rating  timestamp
0      196      242       3  881250949
1      186      302       3  891717742
2       22      377       1  878887116
3      244       51       2  880606923
4      166      346       1  886397596
Number of users = 943 | Number of movies = 1682


Let's split the data into training and testing sets (25 - 75 split).

In [85]:
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(df, test_size = 0.9)

In [86]:
print(train_data.head(3))
train_users = train_data.user_id.shape[0]
train_items = train_data.item_id.shape[0]
print("Number of users in the training data: "+ str(train_data.user_id.shape[0]))
print("Number of items in the training data: "+ str(train_data.item_id.shape[0]))
print("unique users: "+ str(train_data.item_id.unique().shape[0]))

       user_id  item_id  rating  timestamp
85636      749     1013       1  881073081
91309      833      517       2  875133633
9259       288      887       5  886372155
Number of users in the training data: 10000
Number of items in the training data: 10000
unique users: 1254


Now that the data is loaded in, lets take a look at a Memory-Based Collaborative Filtering Technique.

## Memory-Based User-Item Collaborative Filtering

We will be using an User-Item Collaborative Filtering technique, this is the type used for recommended items where "Users who are similar to you also liked this"

We will start by making a user-item matrix (user x item). Then we will calculate the similarity of users by counting inversions and create a similarity matrix.

In [91]:
#create the user-item matrices
#create a matrix of zeroes the size of users x items
train_matrix = np.zeros((train_users, train_items))
for row in train_data.itertuples():
    train_matrix[row[1]-1, row[2]-1]=row[3]
test_matrix = np.zeros((train_users, train_items))
for row in train_data.itertuples():
    test_matrix[row[1]-1, row[2]-1]=row[3]


In [92]:
 train_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

Let's quickly take a peek at the sparsity of the matrices. Sparsity in matrices can greatly effect the run-time

In [93]:
sparsity = float(len(train_matrix.nonzero()[0]))
sparsity /= (train_matrix.shape[0] * train_matrix.shape[1])
sparsity *= 100
print('Train_Sparsity: {:4.2f}%'.format(sparsity))
sparsity = float(len(test_matrix.nonzero()[0]))
sparsity /= (test_matrix.shape[0] * test_matrix.shape[1])
sparsity *= 100
print('Test_Sparsity: {:4.2f}%'.format(sparsity))

Train_Sparsity: 0.01%
Test_Sparsity: 0.01%


Now let's create the similarity matrices by counting inversions. We are going to user the brute force methods first, and improve as we move along.

In [None]:
#from sklearn.metrics.pairwise import pairwise_distances as p
#user_sim = p(train_matrix, metric = 'cosine')
#user_sim
#Cosine calculation of similarity

In [94]:
def compareInv(A,B):
    numInv = 0
    for i in range(0, len(A)-1):
        for j in range(i+1, len(A)):
            if A[i] > B[j]:
                numInv = numInv + 1
    return numInv

In [None]:
#similarity train matrix
#for each user, we need to calculate the similarity to all other users
#so we will take the row, iterate through all other rows
#calculate the similarity for every pair of users
simArray = pd.DataFrame(columns = ["Sim", "User"])
user = train_matrix[0]
sim = 0
for column in train_matrix:
    for i in range(1, n_users):
        sim = compareInv(user, train_matrix[i])
        simArray.append({"Sim":sim, "User":train_matrix[i]}, ignore_index = True)

In [None]:
#find items
simArray.sort_values(by='Sim', ascending=False)
list_items = []
for row in simArray:
    print(row)
    break