In [1]:
import pandas as pd
import numpy as np

In [2]:
movies = pd.read_csv('movies.csv')
print(movies.head(3))

                     Movie Title  Movie ID
0                  Black Panther         1
1  Once upon a time in Hollywood         2
2                      The Tribe         3


In [5]:
df = pd.read_csv('users.csv', index_col="User ID")
print(df.head(3))

         Movie 1  Movie 2  Movie 3  Movie 4  Movie 5  Movie 6  Movie 7  \
User ID                                                                  
6925          12       95        3      115      120       73        5   
8297          91      119       10       55      114       73       63   
5419         100       40      123       33        3       42      132   

         Movie 8  Movie 9  Movie 10  
User ID                              
6925          19       83       126  
8297          81       55        36  
5419         119       10        34  


Okay so as you can see, in movies we have the movie title and the movie id. In the users csv we have a list of users and their top 10 favourite movies. Through this we can use collaborative filtering to recommend a movie to a given user, based on their similarity to other users top 10 lists!

In [6]:
#to figure out similarity we will be counting inversions via this method:
def compareInv(A,B):
    numInv = 0
    for i in range(0, len(A)-1):
        for j in range(i+1, len(A)):
            if A[i] > B[j]:
                numInv = numInv + 1
    return numInv

In [12]:
#let's take our first user as the given user
#Now lets calculate their similarity to the other users in our data
simArray = pd.DataFrame(columns = ["Sim", "User"])
user_array = []
user_array.append(df['Movie 1'][6925])
user_array.append(df['Movie 2'][6925])
user_array.append(df['Movie 3'][6925])
user_array.append(df['Movie 4'][6925])
user_array.append(df['Movie 5'][6925])
user_array.append(df['Movie 6'][6925])
user_array.append(df['Movie 7'][6925])
user_array.append(df['Movie 8'][6925])
user_array.append(df['Movie 9'][6925])
user_array.append(df['Movie 10'][6925])
print(user_array)

simSeries = pd.Series(0, index=df.index)
for i in df.index: 
    if(i != 0):
        a = []
        sim = 0
        a.append(df['Movie 1'][i])
        a.append(df['Movie 2'][i])
        a.append(df['Movie 3'][i])
        a.append(df['Movie 4'][i])
        a.append(df['Movie 5'][i])
        a.append(df['Movie 6'][i])
        a.append(df['Movie 7'][i])
        a.append(df['Movie 8'][i])
        a.append(df['Movie 9'][i])
        a.append(df['Movie 10'][i])
        sim = compareInv(user_array, a)
        simArray = simArray.append({"Sim":sim, "User":i}, ignore_index = True)
        simSeries.loc[i] = sim


[12, 95, 3, 115, 120, 73, 5, 19, 83, 126]


In [13]:
print(simArray)

   Sim  User
0   17  6925
1   23  8297
2   19  5419
3   16  3115
4   22  4913
5   14  8116
6   20  2784
7   19  3084
8    6  7084
9   24  2812
10  24   991
11  22  9723
12  20  6946
13  20  1001
14  23  1515
15  14  2186
16  25  7140
17  19  9422
18  20   150


In [15]:
# user id acts as a hashtable for similarity
simSeries

User ID
6925    17
8297    23
5419    19
3115    16
4913    22
8116    14
2784    20
3084    19
7084     6
2812    24
991     24
9723    22
6946    20
1001    20
1515    23
2186    14
7140    25
9422    19
150     20
dtype: int64

In [16]:
#now we have the similarity of every user except the given user
#Lets sort by the highest similarity
#we want to sort ascending as the higher the sim number the more differences there are in their ranking
#therefore they are less similar.
S = simArray.sort_values(by='Sim', ascending=True)
print(S)

   Sim  User
8    6  7084
15  14  2186
5   14  8116
3   16  3115
0   17  6925
2   19  5419
7   19  3084
17  19  9422
13  20  1001
12  20  6946
18  20   150
6   20  2784
11  22  9723
4   22  4913
14  23  1515
1   23  8297
10  24   991
9   24  2812
16  25  7140


In [26]:
# More similar = less inversions so lower similarity
simSeries = simSeries.sort_values(ascending=True)
print(simSeries)

User ID
7084     6
8116    14
2186    14
3115    16
6925    17
5419    19
3084    19
9422    19
1001    20
6946    20
2784    20
150     20
9723    22
4913    22
1515    23
8297    23
991     24
2812    24
7140    25
dtype: int64


In [98]:
#according to this user 7140 has the highest similarity index
#but to make sure we have enough data we will take the first w of the highest users
list_items = []
for i in S.index:
    user = S["User"][i]
    user_items = df.loc[df['User ID'] == user]
    items = user_items.loc[:, user_items.columns != 'User ID']

print(list_items)

   Sim  User
7    6  7084
4   14  8116
14  14  2186
2   16  3115
1   19  5419
6   19  3084
16  19  9422
17  20   150
5   20  2784
11  20  6946
12  20  1001
3   22  4913
10  22  9723
0   23  8297
13  23  1515
9   24   991
8   24  2812
15  25  7140
[]


In [42]:
#according to this user 7084 has the highest similarity index
# because it has the lowest number of inversions
#but to make sure we have enough data we will take the first w of the highest users

# save as set -> dont check for duplicates
list_items = []
for user_id in simSeries.index:
    # get the items the user has reviewed
    user_items = df.loc[user_id]
    for item in user_items:
        if item not in list_items:
            list_items.append(item)

print(list_items)

[121, 23, 119, 51, 38, 136, 125, 94, 124, 13, 21, 100, 45, 34, 92, 76, 118, 77, 106, 5, 58, 109, 98, 86, 96, 82, 79, 66, 35, 107, 123, 89, 80, 12, 95, 3, 115, 120, 73, 19, 83, 126, 40, 33, 42, 132, 10, 91, 110, 6, 8, 127, 48, 44, 14, 68, 60, 55, 90, 122, 62, 41, 31, 7, 32, 22, 39, 25, 64, 54, 99, 28, 81, 69, 103, 67, 131, 53, 30, 135, 117, 61, 57, 4, 75, 130, 2, 111, 16, 9, 114, 63, 36, 52, 65, 27, 26, 47, 85, 108, 37]


In [39]:
matrix [n users X m items]


In [40]:
user_item_series = df.loc[7084]
user_item_series

# 

Movie 1     121
Movie 2      23
Movie 3     119
Movie 4      51
Movie 5      38
Movie 6     136
Movie 7     125
Movie 8      94
Movie 9     125
Movie 10    124
Name: 7084, dtype: int64