In [2]:
import pandas as pd
import numpy as np

In [4]:
movies = pd.read_csv('../data/movies.csv')
print(movies.head(3))

                     Movie Title  Movie ID
0                  Black Panther         1
1  Once upon a time in Hollywood         2
2                      The Tribe         3


In [6]:
df = pd.read_csv('../data/users_likes.csv', index_col="User ID")
print(df.head(3))

         Movie 1  Movie 2  Movie 3  Movie 4  Movie 5  Movie 6  Movie 7  \
User ID                                                                  
6925          12       95        3      115      120       73        5   
8297          91      119       10       55      114       73       63   
5419         100       40      123       33        3       42      132   

         Movie 8  Movie 9  Movie 10  
User ID                              
6925          19       83       126  
8297          81       55        36  
5419         119       10        34  


In [24]:
df_not = pd.read_csv('../data/users_dislikes.csv', index_col = "User ID")
print(df_not.head(3))

         Movie 1  Movie 2  Movie 3  Movie 4  Movie 5  Movie 6  Movie 7  \
User ID                                                                  
6925          29      105       17       14       44       50       90   
8297          29       90        2       34       44       87       76   
5419          30       43       82      121      123       51        7   

         Movie 8  Movie 9  Movie 10  
User ID                              
6925          28       88         2  
8297          11      113        31  
5419         120      105         5  


Okay so as you can see, in movies we have the movie title and the movie id. In the users csv we have a list of users and their top 10 favourite movies. Through this we can use collaborative filtering to recommend a movie to a given user, based on their similarity to other users top 10 lists!

In [25]:
#to figure out similarity we will be counting inversions via this method:
def compareInv(A,B):
    numInv = 0
    for i in range(0, len(A)):
        for j in range(0, len(A)):
            if A[i] != B[j] and i != j:
                numInv = numInv + 1
    return numInv

In [27]:
#let's take our first user as the given user
#Now lets calculate their similarity to the other users in our data
simArray = pd.DataFrame(columns = ["Sim", "User"])

given_user_likes = df.loc[6925]
print(given_user_likes)
given_user_dislikes = df_not.loc[6925]
print(given_user_dislikes)


simSeries = pd.Series(0, index=df.index)
for i in df.index: 
    if(i != 0):
        a = []
        sim = 0
        simLikes = 0
        simNot = 0
        simLikes = compareInv(given_user_likes, a)
        simNot = compareInv(given_user_dislikes, a)
        sim = simLikes + simNot
        simArray = simArray.append({"Sim":sim, "User":i}, ignore_index = True)
        simSeries.loc[i] = sim


Movie 1      12
Movie 2      95
Movie 3       3
Movie 4     115
Movie 5     120
Movie 6      73
Movie 7       5
Movie 8      19
Movie 9      83
Movie 10    126
Name: 6925, dtype: int64
Movie 1      29
Movie 2     105
Movie 3      17
Movie 4      14
Movie 5      44
Movie 6      50
Movie 7      90
Movie 8      28
Movie 9      88
Movie 10      2
Name: 6925, dtype: int64


IndexError: list index out of range

In [9]:
print(simArray)

   Sim  User
0   90  6925
1   90  8297
2   89  5419
3   90  3115
4   88  4913
5   90  8116
6   90  2784
7   90  3084
8   90  7084
9   89  2812
10  90   991
11  89  9723
12  89  6946
13  90  1001
14  90  1515
15  89  2186
16  89  7140
17  90  9422
18  90   150


In [10]:
# user id acts as a hashtable for similarity
simSeries

User ID
6925    90
8297    90
5419    89
3115    90
4913    88
8116    90
2784    90
3084    90
7084    90
2812    89
991     90
9723    89
6946    89
1001    90
1515    90
2186    89
7140    89
9422    90
150     90
dtype: int64

In [11]:
#now we have the similarity of every user except the given user
#Lets sort by the highest similarity
#we want to sort ascending as the higher the sim number the more differences there are in their ranking
#therefore they are less similar.
S = simArray.sort_values(by='Sim', ascending=True)
print(S)

   Sim  User
4   88  4913
9   89  2812
16  89  7140
2   89  5419
15  89  2186
12  89  6946
11  89  9723
14  90  1515
13  90  1001
10  90   991
0   90  6925
8   90  7084
7   90  3084
6   90  2784
5   90  8116
3   90  3115
1   90  8297
17  90  9422
18  90   150


In [12]:
# More similar = less inversions so lower similarity
simSeries = simSeries.sort_values(ascending=True)
print(simSeries)

User ID
4913    88
2812    89
7140    89
5419    89
2186    89
6946    89
9723    89
1515    90
1001    90
991     90
6925    90
7084    90
3084    90
2784    90
8116    90
3115    90
8297    90
9422    90
150     90
dtype: int64


In [98]:
#according to this user 7140 has the highest similarity index
#but to make sure we have enough data we will take the first w of the highest users
list_items = []
for i in S.index:
    user = S["User"][i]
    user_items = df.loc[df['User ID'] == user]
    items = user_items.loc[:, user_items.columns != 'User ID']

print(list_items)

   Sim  User
7    6  7084
4   14  8116
14  14  2186
2   16  3115
1   19  5419
6   19  3084
16  19  9422
17  20   150
5   20  2784
11  20  6946
12  20  1001
3   22  4913
10  22  9723
0   23  8297
13  23  1515
9   24   991
8   24  2812
15  25  7140
[]


In [13]:
#according to this user 7084 has the highest similarity index
# because it has the lowest number of inversions
#but to make sure we have enough data we will take the first w of the highest users

# save as set -> dont check for duplicates
list_items = []
for user_id in simSeries.index:
    # get the items the user has reviewed
    user_items = df.loc[user_id]
    for item in user_items:
        if item not in list_items:
            list_items.append(item)

print(list_items)

[45, 67, 7, 31, 3, 8, 127, 136, 32, 118, 27, 6, 44, 26, 47, 35, 5, 66, 85, 51, 60, 100, 108, 34, 37, 54, 40, 123, 33, 42, 132, 119, 10, 77, 76, 106, 58, 109, 98, 86, 96, 64, 99, 28, 81, 117, 61, 57, 62, 4, 75, 130, 2, 111, 80, 16, 48, 9, 23, 55, 90, 22, 39, 25, 126, 52, 63, 65, 38, 12, 95, 115, 120, 73, 19, 83, 121, 125, 94, 124, 91, 110, 89, 69, 103, 13, 21, 92, 82, 79, 107, 114, 36, 14, 68, 122, 41, 131, 53, 30, 135]


In [39]:
matrix [n users X m items]


In [21]:
user_item_series = df.loc[4913]
user_item_series


for i in user_item_series:
    print(movies.loc[movies["Movie ID"] == i, "Movie Title"])

44    Toni Erdmann
Name: Movie Title, dtype: object
66    Spotlight
Name: Movie Title, dtype: object
6    Hereditary
Name: Movie Title, dtype: object
30    The Irishman
Name: Movie Title, dtype: object
2    The Tribe
Name: Movie Title, dtype: object
7    Amour
Name: Movie Title, dtype: object
126    Spy Kids
Name: Movie Title, dtype: object
135    Finding Nemo
Name: Movie Title, dtype: object
2    The Tribe
Name: Movie Title, dtype: object
31    Anomalisa
Name: Movie Title, dtype: object


In [22]:

for i in user_array:
    print(movies.loc[movies["Movie ID"] == i, "Movie Title"])



11    First Reformed
Name: Movie Title, dtype: object
94    In Bruges
Name: Movie Title, dtype: object
2    The Tribe
Name: Movie Title, dtype: object
114    School of Rock
Name: Movie Title, dtype: object
119    Star Wars: Episode 4
Name: Movie Title, dtype: object
72    Moulin Rouge
Name: Movie Title, dtype: object
4    Personal Shopper
Name: Movie Title, dtype: object
18    No
Name: Movie Title, dtype: object
82    Bridget Jones's Diary
Name: Movie Title, dtype: object
125    The Dark Knight
Name: Movie Title, dtype: object


In [23]:
for i in list_items:
    print(movies.loc[movies["Movie ID"] == i, "Movie Title"])



44    Toni Erdmann
Name: Movie Title, dtype: object
66    Spotlight
Name: Movie Title, dtype: object
6    Hereditary
Name: Movie Title, dtype: object
30    The Irishman
Name: Movie Title, dtype: object
2    The Tribe
Name: Movie Title, dtype: object
7    Amour
Name: Movie Title, dtype: object
126    Spy Kids
Name: Movie Title, dtype: object
135    Finding Nemo
Name: Movie Title, dtype: object
31    Anomalisa
Name: Movie Title, dtype: object
117    The Departed
Name: Movie Title, dtype: object
26    The Act of Killing
Name: Movie Title, dtype: object
5    Black Coal, Thin Ice
Name: Movie Title, dtype: object
43    The Master
Name: Movie Title, dtype: object
25    Selma
Name: Movie Title, dtype: object
46    Boyhood
Name: Movie Title, dtype: object
34    12 Year's a Slave
Name: Movie Title, dtype: object
4    Personal Shopper
Name: Movie Title, dtype: object
65    mudbound
Name: Movie Title, dtype: object
84    Love and Basketball
Name: Movie Title, dtype: object
50    Children of Men
Na