# Movie recommendation system
### Kevin LIM

## Data Processing

### Library installation and import

In [23]:
import pandas as pd
import re
import numpy as np
import math
import sklearn
np.seterr(all="ignore")
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

### Dataset importation

In [3]:
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')

### The ratings dataset

In [4]:
"""
--- The ratings dataset --- 

This dataset contains 65292 ratings.
A rating is composed of:
userId: The id of the user who rated the movie
moveId: The id of the movie rated
rating: The score
timastamp: Time when the rating was made

"""
print(ratings.shape)
ratings.head(10)

(65292, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
5,1,70,3.0,964982400
6,1,101,5.0,964980868
7,1,110,4.0,964982176
8,1,151,5.0,964984041
9,1,157,5.0,964984100


### The movie dataset

In [5]:
"""

--- The movie dataset --- 

This dataset contains 2743 movies.
Each movie is described with:
movieId: The id of the movie
title: The title of the movie and its release date
genre: The genres that describes the movie

"""
print(movies.shape)
movies.head(10)

(2743, 3)


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


### The movie dataset needs to be rearranged

In [6]:
"""

--- Movie dataset Rearrangement ---

We will rearrange the movie dataset. We want to separate the release date from the title, and we can do a One-Hot encoding of the genres. 
The One-Hot encoding will allow us to have a column for each genre and set a 1 for the movie that has is concerned by the genre and 0 if it is not concerned.

"""
movieId = []
title = []
year = []
genres=[]


# Parse all movies from the dataset
for i in range(movies.shape[0]):
    
    #Extract a row of the dataset
    linedata = movies.iloc[i]
    
    # Regex pattern for date
    date = re.findall("\((\d{4})\)$", linedata[1])
    chaine = linedata[1].split(" ")

    # Title
    titre = ""
    for i in range (len(chaine)-1):
        titre += chaine[i]
        if i != len(chaine)-2:
            titre += " "
    
    # Genre array
    genrelist = linedata[2].split('|')
    
    movieId.append(str(linedata[0]))
    title.append(titre)
    year.append(date[0])
    genres.append(genrelist)
    
#    
new_columns= list(movies.columns) + ["year"]

#Erase the old movies dataset
movies=pd.DataFrame(columns=new_columns)

movies['movieId'] = movieId
movies['title'] = title
movies['year'] = year
movies = movies.drop(columns=['genres'])

genres_df = pd.DataFrame({"genres":genres})
movies = movies.join(genres_df)
genres_ohe = pd.get_dummies(movies['genres'].apply(pd.Series).stack()).sum(level=0)
movies = movies.join(genres_ohe)


In [7]:
movies.head(10)

Unnamed: 0,movieId,title,year,genres,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story,1995,"[Adventure, Animation, Children, Comedy, Fantasy]",0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji,1995,"[Adventure, Children, Fantasy]",0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men,1995,"[Comedy, Romance]",0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale,1995,"[Comedy, Drama, Romance]",0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II,1995,[Comedy],0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,6,Heat,1995,"[Action, Crime, Thriller]",1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0
6,7,Sabrina,1995,"[Comedy, Romance]",0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
7,8,Tom and Huck,1995,"[Adventure, Children]",0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,9,Sudden Death,1995,[Action],1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,10,GoldenEye,1995,"[Action, Adventure, Thriller]",1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


### Genres normalization

In [8]:
def normalize(df):
    for index , row in df.iterrows(): 
        nbofgenres = len(df.iloc[index,3])
        for i in range(len(df.iloc[index,4:])):
               if df.iloc[index,3+i]==1:
                    df.iloc[index,3+i] = df.iloc[index,3+i]/math.sqrt(nbofgenres)

normalize(movies)
# We don't need the genres column anymore                    
movies = movies.drop(columns=['genres'])

In [9]:
movies.head(10)

Unnamed: 0,movieId,title,year,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story,1995,0.0,0.447214,0.447214,0.447214,0.447214,0.0,0.0,0.0,0.447214,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,2,Jumanji,1995,0.0,0.57735,0.0,0.57735,0.0,0.0,0.0,0.0,0.57735,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,3,Grumpier Old Men,1995,0.0,0.0,0.0,0.0,0.707107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.707107,0.0,0.0,0.0,0
3,4,Waiting to Exhale,1995,0.0,0.0,0.0,0.0,0.57735,0.0,0.0,0.57735,0.0,0.0,0.0,0.0,0.0,0.0,0.57735,0.0,0.0,0.0,0
4,5,Father of the Bride Part II,1995,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
5,6,Heat,1995,0.57735,0.0,0.0,0.0,0.0,0.57735,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.57735,0.0,0
6,7,Sabrina,1995,0.0,0.0,0.0,0.0,0.707107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.707107,0.0,0.0,0.0,0
7,8,Tom and Huck,1995,0.0,0.707107,0.0,0.707107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
8,9,Sudden Death,1995,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
9,10,GoldenEye,1995,0.57735,0.57735,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.57735,0.0,0


### Merging the movie and ratings dataset by using movieId as common key

In [10]:
# Making sure that we have integers for the movieId
ratings['movieId'] = ratings['movieId'].astype(int)
movies['movieId'] = movies['movieId'].astype(int)
movies
#Merge
movies_ratings = pd.merge(ratings,movies,on='movieId')

In [11]:
movies_ratings.head(10)

Unnamed: 0,userId,movieId,rating,timestamp,title,year,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,1,4.0,964982703,Toy Story,1995,0.0,0.447214,0.447214,0.447214,0.447214,0.0,0.0,0.0,0.447214,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,5,1,4.0,847434962,Toy Story,1995,0.0,0.447214,0.447214,0.447214,0.447214,0.0,0.0,0.0,0.447214,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,7,1,4.5,1106635946,Toy Story,1995,0.0,0.447214,0.447214,0.447214,0.447214,0.0,0.0,0.0,0.447214,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,15,1,2.5,1510577970,Toy Story,1995,0.0,0.447214,0.447214,0.447214,0.447214,0.0,0.0,0.0,0.447214,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,17,1,4.5,1305696483,Toy Story,1995,0.0,0.447214,0.447214,0.447214,0.447214,0.0,0.0,0.0,0.447214,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
5,18,1,3.5,1455209816,Toy Story,1995,0.0,0.447214,0.447214,0.447214,0.447214,0.0,0.0,0.0,0.447214,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
6,19,1,4.0,965705637,Toy Story,1995,0.0,0.447214,0.447214,0.447214,0.447214,0.0,0.0,0.0,0.447214,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
7,21,1,3.5,1407618878,Toy Story,1995,0.0,0.447214,0.447214,0.447214,0.447214,0.0,0.0,0.0,0.447214,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
8,27,1,3.0,962685262,Toy Story,1995,0.0,0.447214,0.447214,0.447214,0.447214,0.0,0.0,0.0,0.447214,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
9,31,1,5.0,850466616,Toy Story,1995,0.0,0.447214,0.447214,0.447214,0.447214,0.0,0.0,0.0,0.447214,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


## Some functions that may help

## Ratings Utility Matrix

In [12]:
"""

--- The rating utility matrix

This utility matrix is a new way to store the ratings. On the columns we have all the users, and on the row all the movies. 
Each junction will correspond to the rating given by a user for a movie.

"""
def utility_matrix():
    utility_matrix = np.zeros(shape =(int(max(ratings["userId"].unique()))+1, int(max(ratings["movieId"].unique()))+1))
    for index , row in ratings.iterrows():
        utility_matrix[int(row[0]),int(row[1])] = row[2]
    return utility_matrix
matrix = utility_matrix()

In [13]:
matrix

array([[0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 4. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       ...,
       [0. , 2.5, 2. , ..., 0. , 0. , 0. ],
       [0. , 3. , 0. , ..., 0. , 0. , 0. ],
       [0. , 5. , 0. , ..., 0. , 0. , 0. ]])

### Normalizing the utility matrix

In [24]:
def normalizeUtility(mat):
    matrix = np.zeros(mat.shape)
    for i in range(len(matrix)):
        mean = mat.sum(axis=1)[i] /np.count_nonzero(mat, axis=1)[i]
        for j in range(len(mat[0])):
            if mat[i,j]!=0:
                matrix[i,j] = mat[i,j] - mean
    return matrix

matrix = normalizeUtility(matrix)
matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        , -0.36363636,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.        , -0.45539906, -0.95539906, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        , -0.27027027,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  1.15350877,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

### User's interest in each genre

In [15]:
"""

--- User's interest in each genre --- 

The goal is to get the list of all users and their interest for each movies genre.
We can do this by summing the score he attributed for each movies of a genre.

"""

interest = pd.DataFrame()
all_genres = genres_ohe.columns.tolist()
for user_id in movies_ratings['userId'].unique():
    user = movies_ratings[movies_ratings['userId'] == user_id]
    new_line = []
    for g in all_genres:
        tempo = user[["rating"]].multiply(user[g],axis="index")
        new_line.append(tempo.sum(axis=0,skipna = True)[0])
    list_to_append = [user_id] + new_line
    line_labels = ['userId'] + all_genres
    dic = {line_labels[i]: list_to_append[i] for i in range(len(list_to_append))}
    interest = interest.append(dic,ignore_index=True)

# Reorder the userId column
cols = list(interest.columns)
cols = [cols[-1]] + cols[:-1]
interest = interest[cols]

interest['userId'] = interest['userId'].astype(int)

In [16]:
interest.head(10)

Unnamed: 0,userId,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,165.354297,152.007365,41.116815,60.48682,188.475415,79.38461,0.0,134.050925,78.462684,2.5,20.593494,0.0,37.022794,32.23264,44.341323,73.218777,87.697857,40.144806,27.0
1,5,14.040327,11.745639,10.355018,14.915496,24.979843,20.940947,0.0,51.71182,12.030265,0.0,2.12132,3.773292,8.566164,0.0,15.255295,3.276021,13.264306,6.344935,6.0
2,7,52.18474,47.922812,8.428425,10.737826,38.431942,12.771438,0.0,50.673831,13.811339,1.5,3.535534,4.026945,2.77051,5.0,27.817304,43.250683,22.408171,14.920844,1.5
3,15,37.58917,37.793238,5.550445,6.257552,26.928396,13.625356,0.0,47.463903,5.940855,0.0,10.094935,0.0,2.341641,5.285534,14.289076,43.619044,27.808672,10.365661,4.0
4,17,52.601353,47.692595,6.024922,4.024922,34.128652,36.189141,0.0,74.473764,10.375775,4.020726,0.0,2.020726,6.342588,12.794682,9.369879,29.082616,35.032287,20.424074,12.5
5,18,131.970602,88.885715,13.793105,25.67761,119.609456,94.274369,0.0,182.669447,27.27764,8.309401,12.942388,1.224745,2.789992,32.798533,37.59196,68.361268,96.778167,19.23554,20.5
6,19,181.765627,193.467513,45.373104,100.674205,558.449595,90.934239,0.0,182.752495,103.898335,12.196152,124.149059,0.0,34.840888,72.182519,180.50495,152.600833,223.265648,12.126874,25.0
7,21,118.054663,97.075308,14.099641,24.190366,116.715731,31.918814,0.0,39.710012,24.089284,0.0,5.581386,0.0,4.130495,2.25,40.572342,54.774555,76.006164,2.25,5.0
8,27,61.530554,83.342152,28.8056,68.565445,66.635145,10.646243,0.0,77.306323,27.030228,0.0,1.914214,1.632993,11.845379,4.236068,35.965971,56.328025,20.379434,12.867361,24.0
9,31,28.416884,26.460817,7.472136,11.281537,35.44358,5.696152,5.0,23.639617,15.467754,0.0,4.328427,0.0,9.622819,2.5,28.915274,15.833981,18.892305,0.0,0.0


## Recommendation systems

### Cosine similarity

In [17]:
def cos_sim(a,b):
    dot_product = np.dot(a, b)
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    return dot_product / (norm_a * norm_b)

### Content based recommendation systems

In [18]:
"""

--- RecommendUser ---

Input a user and get the top N number of movies he may like.
This function computes the cosine similarity of a movie's genre to all the genre that the user like the most.
The function return the head of similarity sorted array.

"""

def RecommendUser(user_id,N):
    scores = pd.DataFrame()
    all_genres = genres_ohe.columns.tolist()
    score = 0
    
    for moviesId in movies['movieId'].unique():
        a = interest[interest['userId']==user_id].values[0,1:]
        b = movies[movies['movieId']==moviesId].values[0,3:]
        score = cos_sim(a,b)
        list_to_append = [moviesId,score]
        line_labels = ['moviesId'] + ['score']
        dic = {line_labels[i]: list_to_append[i] for i in range(len(list_to_append))}
        scores = scores.append(dic,ignore_index=True)
    scores = scores.sort_values(by=['score'],ascending=False)
    scores.head(N)
    top = scores.head(N)
    for i in top['moviesId'].unique():
        print(movies[movies['movieId']==int(i)]['title'])

# Recommend user number 2 the 10 movies that he may like  
RecommendUser(2,10)

1893    Bull Durham
Name: title, dtype: object
2283    Prelude to a Kiss
Name: title, dtype: object
1059    Bulworth
Name: title, dtype: object
1021    As Good as It Gets
Name: title, dtype: object
1484    Friends & Lovers
Name: title, dtype: object
1912    Turtle Diary
Name: title, dtype: object
165    Eat Drink Man Woman (Yin shi nan nu)
Name: title, dtype: object
2543    Dream a Little Dream
Name: title, dtype: object
158    Don Juan DeMarco
Name: title, dtype: object
951    Chasing Amy
Name: title, dtype: object


### Collaborative filtering

In [25]:
"""

--- Collaborative filtering ---
Find the N most similar users to a given user.
user : The given user
user_matrix: The ratings matrix
N: The number of similar users to show

"""

def find_similar_users(user,user_matrix, N):
    user_score = user_matrix[user]
    similarity_df = pd.DataFrame()
    
    for i in range(len(user_matrix)):
        if i != user:
            score = cos_sim(user_score,user_matrix[i])
            
            list_to_append = [i,score]
            line_labels = ['userId'] + ['Similarity']
            dic = {line_labels[j]: list_to_append[j] for j in range(len(list_to_append))}
            similarity_df = similarity_df.append(dic,ignore_index=True)
    
    similarity_df['userId'] = similarity_df['userId'].astype(int)
    similarity_df = similarity_df.sort_values(by=['Similarity'],ascending=False)
    similarity_df= similarity_df.iloc[:, ::-1]
    df = similarity_df.head(N)
    #print(df)
    return df

#Find the 5 most similar users to the user 1
df = find_similar_users(user=610,user_matrix = matrix, N = 5)
df

Unnamed: 0,userId,Similarity
380,380,0.280336
249,249,0.275828
298,298,0.270091
599,599,0.25469
274,274,0.250588
