<p align="center">
    <a href="https://colab.research.google.com/drive/1HlXkBPCBWoxsRUeMAoUE1iJwaoNGDxoY?usp=sharing">
    <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
    </a>
</p>

In [1]:
!gdown https://drive.google.com/uc?id=1GmqNwmZ0FpCGXMfQfsqEU3RqD1lGMuE2 # tags
!gdown https://drive.google.com/uc?id=1oCZZDzovsfBa1l97li1hCJ4aDyE9HmvY # ratings
!gdown https://drive.google.com/uc?id=1HICUSCWLSbU_sug2r5fZZYJsPI1wA5e6 # movies
!gdown https://drive.google.com/uc?id=1fSGJaSAgBqEc7fm1O4L4fps6N3OZxR3Z # links
!gdown https://drive.google.com/uc?id=11Mxekus6vaBFz5f_-RVMKy_40vKqLPs5 # genome tags
!gdown https://drive.google.com/uc?id=14M40G24e0WXln4fGP8phKeIihRCA_AUd # genome scores

Downloading...
From: https://drive.google.com/uc?id=1GmqNwmZ0FpCGXMfQfsqEU3RqD1lGMuE2
To: /content/tag.csv
21.7MB [00:00, 59.5MB/s]
Downloading...
From: https://drive.google.com/uc?id=1oCZZDzovsfBa1l97li1hCJ4aDyE9HmvY
To: /content/rating.csv
690MB [00:05, 116MB/s]
Downloading...
From: https://drive.google.com/uc?id=1HICUSCWLSbU_sug2r5fZZYJsPI1wA5e6
To: /content/movie.csv
100% 1.49M/1.49M [00:00<00:00, 94.4MB/s]
Downloading...
From: https://drive.google.com/uc?id=1fSGJaSAgBqEc7fm1O4L4fps6N3OZxR3Z
To: /content/link.csv
100% 539k/539k [00:00<00:00, 77.7MB/s]
Downloading...
From: https://drive.google.com/uc?id=11Mxekus6vaBFz5f_-RVMKy_40vKqLPs5
To: /content/genome_tags.csv
100% 20.4k/20.4k [00:00<00:00, 35.5MB/s]
Downloading...
From: https://drive.google.com/uc?id=14M40G24e0WXln4fGP8phKeIihRCA_AUd
To: /content/genome_scores.csv
214MB [00:01, 128MB/s]


In [2]:
import pandas as pd
import numpy as np

%matplotlib inline

df_movies = pd.read_csv('./movie.csv')
df_ratings = pd.read_csv('./rating.csv')
df_tags = pd.read_csv('./tag.csv')
df_links = pd.read_csv('./link.csv')
df_genome_scores = pd.read_csv('./genome_scores.csv')
df_genome_tags = pd.read_csv('./genome_tags.csv')

# User-user collaborative filtering

**Data Processing**

In [3]:
# Drop timestamp column
df_ratings.drop('timestamp', axis=1, inplace=True)
df_ratings

Unnamed: 0,userId,movieId,rating
0,1,2,3.5
1,1,29,3.5
2,1,32,3.5
3,1,47,3.5
4,1,50,3.5
...,...,...,...
20000258,138493,68954,4.5
20000259,138493,69526,4.5
20000260,138493,69644,3.0
20000261,138493,70286,5.0


**Find the rating count for each user and sort in descending order**

In [4]:
df_user_ratings_no = df_ratings.groupby(by=['userId'])['rating'].count().reset_index()
df_user_ratings_no.sort_values(by=['rating'], ascending=False)[:10] # example for 10 records

Unnamed: 0,userId,rating
118204,118205,9254
8404,8405,7515
82417,82418,5646
121534,121535,5520
125793,125794,5491
74141,74142,5447
34575,34576,5356
131903,131904,5330
83089,83090,5169
59476,59477,4988


This collaborative filtering will be worked on 1000 users **(userId 1 to 1000)** and the movies which has **at least 100 rating count** while we'll be creating 2D matrices

In [5]:
df_ratings_sample = df_ratings[df_ratings['userId'] <= 1000]
df_ratings_sample

Unnamed: 0,userId,movieId,rating
0,1,2,3.5
1,1,29,3.5
2,1,32,3.5
3,1,47,3.5
4,1,50,3.5
...,...,...,...
150624,1000,88125,5.0
150625,1000,88140,4.0
150626,1000,88744,5.0
150627,1000,90603,4.5


In [6]:
# Find the rating count for each movie
df_ratings_count = df_ratings_sample.groupby('movieId')['rating'].count()
df_ratings_count = pd.merge(df_ratings_sample, df_ratings_count, on = 'movieId', how = 'inner')
df_ratings_count

Unnamed: 0,userId,movieId,rating_x,rating_y
0,1,2,3.5,165
1,5,2,3.0,165
2,13,2,3.0,165
3,29,2,3.0,165
4,34,2,3.0,165
...,...,...,...,...
150624,990,74541,2.0,1
150625,994,1780,4.0,1
150626,994,56274,3.5,1
150627,995,8926,4.5,1


In [7]:
# Select the movies which have at least 100 ratings
df_ratings_count = df_ratings_count[df_ratings_count['rating_y'] > 100]
df_ratings_count.drop('rating_y', axis=1, inplace=True)
df_ratings_count.rename(columns={'rating_x': 'rating'}, inplace=True)
df_ratings_count.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


(52362, 3)

# Create user movie matrix

In [8]:
def create_user_movie_ratings_matrix(df):
    '''
    Create the user movie matrix which is further used for collaborative filtering

    params:
    @df: pandas dataframe with userid, movieid
    
    returns:
    @user_movie: user movie ratings matrix 
    '''
    user_movie_ratings_matrix = df.groupby(by=['userId','movieId'])['rating'].max().unstack().fillna(0)

    return user_movie_ratings_matrix.astype(float)

In [9]:
# Create matrix
user_movie_ratings_matrix = create_user_movie_ratings_matrix(df_ratings_count)
user_movie_ratings_matrix.head()

movieId,1,2,6,7,10,11,16,17,19,21,25,32,34,36,39,47,48,50,62,70,95,104,110,111,141,150,153,160,161,163,165,172,173,185,186,196,208,223,231,235,...,3996,4011,4022,4027,4034,4226,4306,4878,4886,4896,4963,4973,4993,4995,5349,5378,5418,5445,5816,5952,5989,6016,6333,6365,6377,6539,6711,6874,7153,7361,7438,8360,8636,8961,32587,33794,44191,48516,58559,79132
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
1,0.0,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,0.0,0.0,0.0,3.5,0.0,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,...,4.0,4.0,0.0,4.0,0.0,3.5,4.0,3.5,0.0,4.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,4.0,5.0,0.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,5.0,0.0,4.0,0.0,4.5,4.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,5.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,3.0,0.0,5.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,3.0,0.0,4.0,0.0,0.0,0.0,3.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,3.0,0.0,0.0,0.0,5.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,2.0,4.0,0.0,5.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
def find_similarity(userId1, userId2, user_movie_ratings_matrix=user_movie_ratings_matrix):
    '''
    Calculate similarity of users

    params:
    @userId1: user id 1
    @userId 2: user id 2
    @user_moving_ratings_matrix: matrix of user moving with corresponding rating
    
    returns:
    @similarity: similarity scores
    '''
    similarity = np.dot(user_movie_ratings_matrix.loc[userId1, :], user_movie_ratings_matrix.loc[userId2, :])
    
    return similarity

In [11]:
def get_similar_users(userId, user_movie_ratings_matrix=user_movie_ratings_matrix, m=10):
    '''
    params:
    @userId: user id
    @user_moving_ratings_matrix: matrix of user moving with corresponding rating
    @m: number of similar users
    
    returns:
    @users[:m]: top m similar users
    '''
    users = []
    
    for i in user_movie_ratings_matrix.index:
        if i != userId:
            similarity = find_similarity(userId, i)
            users.append((i, similarity))
  
    users.sort(key=lambda x: x[1], reverse=True)
    return users[:m]

**Get 10 similar users for user id 123**

In [12]:
similar_users = get_similar_users(123)
similar_users
# Result respectively with (userid, similarity_scores)

[(775, 60.5),
 (91, 59.0),
 (24, 58.0),
 (158, 58.0),
 (689, 58.0),
 (648, 55.0),
 (903, 55.0),
 (984, 55.0),
 (986, 55.0),
 (298, 53.0)]

In [13]:
def get_movies(userid):
    '''
    params:
    @userid: (int) a user id
       
    returns:
    @movie_ids: set of movie_ids that the user has already watched
    
    '''

    movie_ids = set(df_ratings[df_ratings['userId'] == userid].sort_values('rating', ascending = False)['movieId'].tolist())

    return movie_ids 

In [14]:
movie_dict = dict(zip(df_movies.movieId, df_movies.title))

def get_movie_titles(movieIds):
    '''
    Get movie titles according to movieIds and return titles

    params:
    @movieIds: movieId lists

    returns:
    @movie_titles: list of movie titles according to the movieIds
    '''
    
    movie_titles = []
    for movieId in movieIds:
        if movieId in movie_dict:
            movie_titles.append(movie_dict[movieId])

    return movie_titles

In [15]:
def get_recommendations(userId, df_ratings=df_ratings_sample, m=10):
    '''
    params:
    @userId: user id
    @df_ratings: userId movieId rating DataFrame
    @m: number of recommendations

    returns:
    @movies: top m rated movies
    '''
    watched_movie_ids = get_movies(userId)

    similar_users = get_similar_users(userId)
    
    movies = []
    for (uId, _) in similar_users:
        movies.extend(list(get_movies(uId)))

    movies = list(set(movies))
    movies = [movie for movie in movies if movie not in watched_movie_ids]


    return get_movie_titles(movies[:m])

# List of recommeded movies for user 123

In [19]:
watched_movie_ids = get_movies(123)
get_movie_titles(list(watched_movie_ids)[:10]) # get 10 already watched movies

['Down Periscope (1996)',
 'GoldenEye (1995)',
 'Trainspotting (1996)',
 'Four Rooms (1995)',
 'Money Train (1995)',
 'Get Shorty (1995)',
 'Fear (1996)',
 'Now and Then (1995)',
 'Very Brady Sequel, A (1996)',
 'Bio-Dome (1996)']

In [17]:
get_recommendations(123, 10) # get 10 recommended movies

['Toy Story (1995)',
 'Jumanji (1995)',
 'Grumpier Old Men (1995)',
 'Beyond the Valley of the Dolls (1970)',
 'Hiroshima Mon Amour (1959)',
 'Heat (1995)',
 'Sabrina (1995)',
 'Father of the Bride Part II (1995)',
 'Waiting to Exhale (1995)',
 'American President, The (1995)']