In [1]:
import pandas as pd
import numpy as np

In [2]:
# Reading the file of ratings of the movies
ratings = pd.read_csv('ratings.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [3]:
# Reading the file of the movies which contains the information regarding the movies
movies = pd.read_csv('movies.csv')
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
# Using the merge function to merge dataset of the two file, using movieId as primary variable
data = pd.merge(ratings, movies, on = 'movieId')
data.head(10)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
5,18,1,3.5,1455209816,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
6,19,1,4.0,965705637,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
7,21,1,3.5,1407618878,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
8,27,1,3.0,962685262,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
9,31,1,5.0,850466616,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [5]:
userIds = data.userId
userIds2 = data[['userId']] # Pandas series object 
userIds.head()

0     1
1     5
2     7
3    15
4    17
Name: userId, dtype: int64

In [6]:
# getting all the userId's as a dataframe in another variable
userIds2.head()

Unnamed: 0,userId
0,1
1,5
2,7
3,15
4,17


In [7]:
data.loc[0:10,['userId']]

Unnamed: 0,userId
0,1
1,5
2,7
3,15
4,17
5,18
6,19
7,21
8,27
9,31


In [8]:
# Showing the output of only required movie, as a subset of the dataframe from the main dataframe.
toyStoryData = data[data.title == "Toy Story (1995)"]
toyStoryData

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
...,...,...,...,...,...,...
210,606,1,2.5,1349082950,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
211,607,1,4.0,964744033,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
212,608,1,2.5,1117408267,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
213,609,1,3.0,847221025,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [9]:
# Sorting the data
data = pd.DataFrame.sort_values(data, ['userId','movieId'], ascending=[0,1])
# Showing the amount of movies and users are present
numUsers = max(data.userId)
numMovies = max(data.movieId)
# Showing the result of how many movies were rated by each user, and the number of users 
# that rated each movie.
moviesPerUser = data.userId.value_counts()
userPerMovie = data.title.value_counts()

userPerMovie

Forrest Gump (1994)                       329
Shawshank Redemption, The (1994)          317
Pulp Fiction (1994)                       307
Silence of the Lambs, The (1991)          279
Matrix, The (1999)                        278
                                         ... 
Boy (2010)                                  1
Waco: The Rules of Engagement (1997)        1
Shrooms (2007)                              1
Applesauce (2015)                           1
Thief of Paris, The (Le voleur) (1967)      1
Name: title, Length: 9719, dtype: int64

In [10]:
# Function to find the top N Favorite movies of a user

def favMovies(activeUser, N):
# Subset the dataframe to have the rows corresponding to the active use
# Sorting the rating in the descending order
# Selecting the top N rows
    topMovies = pd.DataFrame.sort_values(data[data.userId == activeUser], ['rating'], ascending=[0])[:N]
    return list(topMovies.title)

favMovies(5,6) # Printing the output of the top 6 movies of the userId; 5

['Dances with Wolves (1990)',
 'In the Name of the Father (1993)',
 "Schindler's List (1993)",
 'Postman, The (Postino, Il) (1994)',
 'Pinocchio (1940)',
 'Beauty and the Beast (1991)']

In [11]:
# Finding some recommendation for the particular system.
# Using Neighbour based Collabarative filtering model.
# Find the K Nearest Neighbour of a the active user and use their ratings to predict the ratings of the active
# user for movies they haven't rated.

In [12]:
# This  will respresent the table which is arranged like userid, movieid, rating and give us a new table in 
# which the row index  is the userId, column index is the movieId, and the value is the rating
userMovieRatingMatrix = pd.pivot_table(data, values='rating',index=['userId'], columns=['movieId'])

userMovieRatingMatrix

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,,,,,,2.5,,,,...,,,,,,,,,,
607,4.0,,,,,,,,,,...,,,,,,,,,,
608,2.5,2.0,2.0,,,,,,,4.0,...,,,,,,,,,,
609,3.0,,,,,,,,,4.0,...,,,,,,,,,,


In [13]:
# Function to find the similarity between two users.
from scipy.spatial.distance import correlation
def similarity(user1, user2):
#     Normalizing the rating of the user, using the np.nanmean() - returns the mean of an array 
#     after ignoring nan Values
    user1 = np.array(user1) - np.nanmean(user1)
    user2 = np.array(user2) - np.nanmean(user2)

#     Now finding the similarity between users, first subset each user to be represented only by the ratings for the movies
#     the two users have in common.
    commonMovieIds = [i for i in range(len(user1)) if user1[i]>0 and user2[i]>0]
#     Gives us movies for which both users have non Nan ratings
    if len(commonMovieIds) == 0:
        return 0 # if there is no movies in common
    else:
        user1 = np.array([user1[i] for i in commonMovieIds])
        user2 = np.array([user2[i] for i in commonMovieIds])
        return correlation(user1,user2)

In [14]:
# Using the similarity function, finds the neareset neighbours of the active users
def nearestNeighbourRatings(activeUser, K):
#     Function will find the the K nearest neighbour of the active users, then use their ratings to predict
# the active users for other users
    similarityMatrix = pd.DataFrame(index=userMovieRatingMatrix.index, 
                                    columns=['Similarity'])
#     Creates an empty matrix whose row index is userIds, and the value will be similarity
# of that user to the activeUsers
    for i in userMovieRatingMatrix.index:
        similarityMatrix.loc[i] = similarity(userMovieRatingMatrix.loc[activeUser],
                                             userMovieRatingMatrix.loc[i])
#         It finds the similarity between the user i and active user and add it to the similarity matrix
# Soring the similarity matrix, in descending order
    similarityMatrix = pd.DataFrame.sort_values(similarityMatrix, 
                                                ['Similarity'], ascending=(0))
    nearestNeighbours = similarityMatrix[:K] # Gives us the K Nearest Neighbours
    
#     Taking the nearest neighbours and the use their ratings to predict the active user's rating for every movie
    neighbourMovieRatings = userMovieRatingMatrix.loc[nearestNeighbours.index]
    
    predictMovieRatings = pd.DataFrame(index=userMovieRatingMatrix.columns, columns=['Rating'])
    
    for i in userMovieRatingMatrix.columns:
#         for each movie
        predictedRating=np.nanmean(userMovieRatingMatrix.loc[activeUser])
#         start with average rating of the user
        for j in neighbourMovieRatings.index:
#         for each nieghbour in the neighbour list
            if userMovieRatingMatrix.loc[j,i]>0:
#         If the neighbour has rated that movie then, add the rating of the neighbour for that item adjusted,
# by the average rating of the neighbour weighted by the similarity of the neighbour to the active user.
                predictedRating += (userMovieRatingMatrix.loc[j,i] - 
                                    np.nanmean(userMovieRatingMatrix.loc[j]))*nearestNeighbours.loc[j,'Similarity']
        predictMovieRatings.loc[i,'Rating'] = predictedRating
    return predictMovieRatings

In [15]:
# Using the predicted Rating to find the top N recommendation for the active user

def topNRecommendations(activeUser,N):
    predictMovieRating = nearestNeighbourRatings(activeUser, 10)
#     Using the 10 nearest neighbours to find the predicted ratings
    moviesAlreadyWatched = list(userMovieRatingMatrix.loc[activeUser].loc[userMovieRatingMatrix.loc[activeUser]>0].index)
#    finds the list of movies whose ratings which are not Nan
    predictMovieRating = predictMovieRating.drop(moviesAlreadyWatched)
    
    topRecommendations = pd.DataFrame.sort_values(predictMovieRating,
                                                  ['Rating'], ascending=[0])[:N]
#     Will returns list of MovieIds which are top recommendations
    topRecommendationsTitles = (movies.loc[movies.movieId.isin(topRecommendations.index)])
    return list(topRecommendationsTitles.title)

In [16]:
# Printing the output
activeUser = 1
print ("Favorite Movies:",favMovies(activeUser, 10), "\n\nRecommended Movies:", topNRecommendations(activeUser, 5))

  dist = 1.0 - uv / np.sqrt(uu * vv)


Favorite Movies: ['M*A*S*H (a.k.a. MASH) (1970)', 'Excalibur (1981)', 'Indiana Jones and the Last Crusade (1989)', 'Pink Floyd: The Wall (1982)', 'From Russia with Love (1963)', 'Goldfinger (1964)', 'Dirty Dozen, The (1967)', "Gulliver's Travels (1939)", 'American Beauty (1999)', 'South Park: Bigger, Longer and Uncut (1999)'] 

Recommended Movies: ['Shawshank Redemption, The (1994)', 'Godfather, The (1972)', "Ferris Bueller's Day Off (1986)", 'Sin City (2005)', 'Dark Knight, The (2008)']
