# User-User Collaborative Filtering

In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns

from collections import Counter
from sklearn.utils import shuffle
from sortedcontainers import SortedList

## 1. Loading the data

In [3]:
df = pd.read_csv('data/rating.csv')
len(df)

20000263

In [4]:
mo = pd.read_csv('data/movie.csv')
mo.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
# Joing the two data frame
df2 = pd.merge(df, mo, how = 'inner', on = ['movieId'])
df2.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,2,3.5,2005-04-02 23:53:47,Jumanji (1995),Adventure|Children|Fantasy
1,5,2,3.0,1996-12-25 15:26:09,Jumanji (1995),Adventure|Children|Fantasy
2,13,2,3.0,1996-11-27 08:19:02,Jumanji (1995),Adventure|Children|Fantasy
3,29,2,3.0,1996-06-23 20:36:14,Jumanji (1995),Adventure|Children|Fantasy
4,34,2,3.0,1996-10-28 13:29:44,Jumanji (1995),Adventure|Children|Fantasy


In [7]:
df2 = df2.drop(columns = ['timestamp', 'movieId', 'genres'])
df2.head()

Unnamed: 0,userId,rating,title
0,1,3.5,Jumanji (1995)
1,5,3.0,Jumanji (1995)
2,13,3.0,Jumanji (1995)
3,29,3.0,Jumanji (1995)
4,34,3.0,Jumanji (1995)


## 2. Preprocessing 

### 2-1. Sub sampling 

In [8]:
# Make the user Id starts from 0 
df2.userId -= 1
df2.head()

Unnamed: 0,userId,rating,title
0,0,3.5,Jumanji (1995)
1,4,3.0,Jumanji (1995)
2,12,3.0,Jumanji (1995)
3,28,3.0,Jumanji (1995)
4,33,3.0,Jumanji (1995)


In [9]:
# The number of the movies
df2.title.nunique()

26729

In [10]:
print("The number of samples of the data is ", len(df2))

The number of samples of the data is  20000263


I'm going to extract the most meaningful dataframe here. The users and movies that have many ranking rates. 

In [11]:
N = df2.userId.max() + 1
M = df2.title.nunique() + 1
print("The number of Users is ", N)
print("The number of Movies is ", M)

The number of Users is  138493
The number of Movies is  26730


In [12]:
user_ids_count = Counter(df2.userId)
movie_ids_count = Counter(df2.title)

The outcome of `Counter()` will be **'column value : count_number'**. So I'll take only the column values of the most common ones 

In [13]:
# Choose the numbers to subset 
n = 10000
m = 2000 

user_ids = [col for col, idx in user_ids_count.most_common(n)]
movie_ids = [col for col, idx in movie_ids_count.most_common(m)]

`user_ids` and `movie_ids` are the list of the most common values in `df2`. I'm going to filter `df2` to keep the samples that appear in the two lists at the same time. 

In [14]:
# Filter the data 
df_sub = df2[df2.userId.isin(user_ids) & df2.title.isin(movie_ids)]

In [15]:
df_sub.head()

Unnamed: 0,userId,rating,title
5,53,3.0,Jumanji (1995)
7,90,3.5,Jumanji (1995)
8,115,2.0,Jumanji (1995)
15,130,1.0,Jumanji (1995)
20,155,5.0,Jumanji (1995)


### 2-2. Assigning new Id values to Users and Movies

Now there are sparsity in the values of `userId` so I'd like to give the new index numbers to user Id and movie Id.   

In [16]:
# Indexing the user list using dictionary
user_dic = {}
i = 0

for k in user_ids:
    user_dic[k] = i
    i += 1

In [17]:
# Indexing the user list using dictionary 
movie_dic = {}
i = 0

for k in movie_ids:
    movie_dic[k] = i
    i += 1

Let's check the result and find the index of the movie *Minority Report (2002)*.

In [18]:
# Check the result
movie_dic['Minority Report (2002)']

108

In [19]:
df_sub.head()

Unnamed: 0,userId,rating,title
5,53,3.0,Jumanji (1995)
7,90,3.5,Jumanji (1995)
8,115,2.0,Jumanji (1995)
15,130,1.0,Jumanji (1995)
20,155,5.0,Jumanji (1995)


In [20]:
df_sub['user_idx'] = df_sub.userId.apply(lambda x: user_dic[x])
df_sub['movie_idx'] = df_sub.title.apply(lambda x: movie_dic[x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [21]:
df_sub = df_sub.reset_index(drop = True)
df_sub.head()

Unnamed: 0,userId,rating,title,user_idx,movie_idx
0,53,3.0,Jumanji (1995),3863,125
1,90,3.5,Jumanji (1995),4358,125
2,115,2.0,Jumanji (1995),1404,125
3,130,1.0,Jumanji (1995),9442,125
4,155,5.0,Jumanji (1995),190,125


### 2-3. Splitting into train and test set

In [22]:
cut = int(0.8*len(df_sub))

df_sub = shuffle(df_sub)
tr = df_sub.iloc[:cut]
te = df_sub.iloc[cut:]

In [23]:
tr = tr.reset_index(drop = True)
te = te.reset_index(drop = True)

In [24]:
print("The size of train : ", len(tr))
print("The size of test : ", len(te))

The size of train :  4314019
The size of test :  1078505


### 2-4. Creating dictionary for user, movie and rating

Now I'd like to make it possible to look up what movie each user gave ratings simply and vise versa. 

In [33]:
user_to_movie = {}
movie_to_user = {}
um_to_rating = {}

def making_dic(x):
    
    a = int(x.user_idx)
    m = int(x.movie_idx)
    r = x.rating
    
    # make a dictionary for "user to movie" 
    if a not in user_to_movie:
        user_to_movie[a] = [m]
    else:
        user_to_movie[a].append(m)
        
    # make a dictionary for "movie to user"
    if m not in movie_to_user:
        movie_to_user[m] = [a]
    else:
        movie_to_user[m].append(a)
        
    # make rating dictionary
    um_to_rating[(a, m)] = r

In [34]:
tr.apply(making_dic, axis = 1)

0          None
1          None
2          None
3          None
4          None
5          None
6          None
7          None
8          None
9          None
10         None
11         None
12         None
13         None
14         None
15         None
16         None
17         None
18         None
19         None
20         None
21         None
22         None
23         None
24         None
25         None
26         None
27         None
28         None
29         None
           ... 
4313989    None
4313990    None
4313991    None
4313992    None
4313993    None
4313994    None
4313995    None
4313996    None
4313997    None
4313998    None
4313999    None
4314000    None
4314001    None
4314002    None
4314003    None
4314004    None
4314005    None
4314006    None
4314007    None
4314008    None
4314009    None
4314010    None
4314011    None
4314012    None
4314013    None
4314014    None
4314015    None
4314016    None
4314017    None
4314018    None
Length: 4314019, dtype: 

In [35]:
user_to_movie[50]

[614,
 615,
 1428,
 383,
 1243,
 1473,
 665,
 644,
 173,
 77,
 1917,
 620,
 1317,
 741,
 1295,
 578,
 60,
 814,
 411,
 113,
 1346,
 164,
 169,
 288,
 875,
 1518,
 1800,
 1094,
 1568,
 657,
 1812,
 43,
 1201,
 142,
 1229,
 1811,
 967,
 289,
 185,
 241,
 1548,
 243,
 1657,
 437,
 1706,
 838,
 493,
 712,
 1237,
 145,
 391,
 595,
 732,
 257,
 1702,
 133,
 815,
 1055,
 1051,
 1581,
 274,
 538,
 1137,
 714,
 21,
 601,
 1128,
 76,
 359,
 1563,
 962,
 129,
 623,
 354,
 554,
 1715,
 1892,
 691,
 1975,
 203,
 250,
 1326,
 629,
 114,
 1963,
 1387,
 150,
 1381,
 198,
 1245,
 769,
 399,
 5,
 1482,
 1339,
 460,
 825,
 1081,
 1075,
 1730,
 1084,
 641,
 1521,
 1130,
 1267,
 424,
 957,
 1744,
 375,
 861,
 327,
 963,
 13,
 1879,
 1160,
 1549,
 24,
 1924,
 344,
 63,
 1141,
 1425,
 1442,
 1643,
 970,
 770,
 350,
 95,
 1860,
 618,
 887,
 1721,
 0,
 1240,
 67,
 703,
 409,
 165,
 1628,
 566,
 537,
 1180,
 1865,
 324,
 1263,
 244,
 1836,
 1162,
 676,
 952,
 610,
 946,
 1239,
 285,
 728,
 50,
 914,
 1801,
 260

As test set has no ratings (we're predicting it), it has to be processed without rating. 

In [36]:
um_to_rating_te = {}

def making_dic_te(x):
    
    a = int(x.user_idx)
    m = int(x.movie_idx)
    r = x.rating 
    
    um_to_rating_te[(a, m)] = r

In [37]:
te.apply(making_dic_te, axis = 1)

0          None
1          None
2          None
3          None
4          None
5          None
6          None
7          None
8          None
9          None
10         None
11         None
12         None
13         None
14         None
15         None
16         None
17         None
18         None
19         None
20         None
21         None
22         None
23         None
24         None
25         None
26         None
27         None
28         None
29         None
           ... 
1078475    None
1078476    None
1078477    None
1078478    None
1078479    None
1078480    None
1078481    None
1078482    None
1078483    None
1078484    None
1078485    None
1078486    None
1078487    None
1078488    None
1078489    None
1078490    None
1078491    None
1078492    None
1078493    None
1078494    None
1078495    None
1078496    None
1078497    None
1078498    None
1078499    None
1078500    None
1078501    None
1078502    None
1078503    None
1078504    None
Length: 1078505, dtype: 

## 3. User-User Collaborative Filtering

In [41]:
# Number of users
N = np.max(list(user_to_movie.keys())) + 1 

# Number of movies from the train set
m1 = np.max(list(movie_to_user.keys()))

# Number of movies from the test set
m2 = np.max([m for (a, m), r in list(um_to_rating_te.items())])

# Total Number of movies both from train & test
M = max(m1, m2) + 1

In [None]:
averages = []
deviations = []
neighbors = []
limit = 5         # least limit number of movies in common 
K = 25            # Least number of neighbors 

for a in range(N):
    
    movie_a = user_to_movie[a]    # the list of movies that user A has rated
    movie_a_set = set(movie_a)
    
    # Calculate the rating avg and dev
    rating_a = {m: um_to_rating[(a, m)] for m in movie_a}
    avg_a = np.mean(list(rating_a.values()))                # the rating avg of user A
    dev_a = {m: (r - avg_a) for m, r in rating_a.items()}
    dev_a_array = np.array(list(dev_a.values()))
    sigma_a = np.sqrt(dev_a_array.dot(dev_a_array))         # for correlation calculation

    averages.append(avg_a)
    deviations.append(dev_a)
    
    sl = SortedList()
    for b in range(N):
        if a != b:
            movie_b = user_to_movie[b]
            movie_b_set = set(movie_b)
            movie_a_b = (movie_a_set & movie_b_set)
            
            if len(movie_a_b) > limit:
                rating_b = {m : um_to_rating[(b, m)] for m in movie_b}
                avg_b = np.mean(list(rating_b.values()))
                dev_b = {m : (r - avg_b) for m, r in rating_b.items()}
                dev_b_array = np.array(list(dev_b.values()))
                sigma_b = np.sqrt(dev_b_array.dot(dev_b_array))
                
                # correlation 
                w_ab = sum(dev_a[m]*dev_b[m] for m in movie_a_b) / (sigma_a * sigma_b)
                
                sl.add((-w_ab, b))
                if len(sl) > K:
                    del sl[-1]
    
    neighbors.append(sl)

# 4. Prediction

In [None]:
def recommender(a, m):
    numerator = 0
    denominator = 0
    
    for w_ab, b in neighbors[a]:
        try:
            numerator += -w_ab * deviations[b][m]
            denominator += abs(w_ab)
        except KeyError:
            pass
    
    if denominator = 0:
        pred = averages[a]
    else:
        pred = numerator / denominator + averages[a]
    
    # Fix the range of predicted ranking from .5 and 5
    pred = min(5, pred)
    pred = max(.5, pred)
    return pred

In [None]:
train_pred = []
train_actual = []

for (a, m), r in um_to_rating.items():
    
    pred = recommender(a, m)
    train_pred.append(pred)
    train_actual.append(r)

In [None]:
test_pred = []
test_actual = []

for (a, m), r in um_to_rating_te.items():
    
    pred = recommender(a, m)
    
    test_pred.append(pred)
    test_actual.append(r)

In [None]:
def get_mse(pred, actual):
    mse = np.mean((np.array(pred) - np.array(actual))**2)
    return mse

In [None]:
print("train mse: ", get_mse(train_pred, train_actual))
print("test mse: ", get_mse(test_pred, test_actual))