# Movie Recommendations
## Introduction to Machine Learning Final Project
### Mohamed El Tahawy (met398)
### Kevin Foo (kf1270)

The idea of Content Based Filtering was inspired by: https://tinyurl.com/ybog6ett



### Step 1: Importing Data
From .dat to .csv

In [1]:
import pandas as pd

In [2]:
# Load the raw ratings; the multi-character '::' separator needs the python parser engine.
ratings = pd.read_csv(
    "ratings.dat",
    sep='::',
    engine='python',
    encoding='utf-8',
    names=['user_id', 'movie_id', 'rating', 'timestamp'],
)

In [3]:
# Largest user id that appears in the ratings (duplicates cannot change a maximum).
max_userid = ratings['user_id'].max()
print('The max userid is:', max_userid)


The max userid is: 6040


In [4]:
# Largest movie id that appears in the ratings (duplicates cannot change a maximum).
max_movieid = ratings['movie_id'].max()
print('The max movieid is:', max_movieid)

The max movieid is: 3952


In [5]:
# Shift both id columns down by one so they can be used as positional indices
# (presumably the raw ids start at 1 -- confirm against the data).
ratings['user_emb_id'] = ratings['user_id'].sub(1)
ratings['movie_emb_id'] = ratings['movie_id'].sub(1)

In [6]:
#ratings

In [7]:
# Persist the ratings as tab-separated text; the row index is written as well,
# as an unnamed first column.
ratings.to_csv(
    "ratings.csv",
    sep='\t',
    header=True,
    encoding='utf-8',
    columns=['user_id', 'movie_id', 'rating', 'timestamp', 'user_emb_id', 'movie_emb_id'],
)

In [8]:
# Load the raw user table; the multi-character '::' separator needs the python parser engine.
users = pd.read_csv(
    "users.dat",
    sep='::',
    engine='python',
    encoding='utf-8',
    names=['user_id', 'gender', 'age', 'occupation', 'zipcode'],
)
#users

In [9]:
# Persist the user table as tab-separated text. The encoding matches the one used
# when users.csv is read back in Step 2 (UTF-8), so non-ASCII values round-trip.
users.to_csv(
    "users.csv",
    sep='\t',
    header=True,
    encoding='utf-8',
    columns=['user_id', 'gender', 'age', 'occupation', 'zipcode'],
)

In [10]:
# Load the raw movie table; the multi-character '::' separator needs the python parser engine.
# NOTE(review): the MovieLens 1M movies.dat is ISO-8859-1 (latin-1) encoded and contains
# accented titles, so it is decoded explicitly instead of relying on the UTF-8 default --
# confirm against your copy of the file.
movies = pd.read_csv(
    'movies.dat',
    sep='::',
    engine='python',
    encoding='latin-1',
    names=['movie_id', 'title', 'genres'],
)
# movies

In [11]:
# Persist the movie table as tab-separated UTF-8 text (UTF-8 is also the to_csv default);
# the row index is written as well, as an unnamed first column.
movies.to_csv(
    'movies.csv',
    sep='\t',
    header=True,
    encoding='utf-8',
    columns=['movie_id', 'title', 'genres'],
)

### Step 2: Reading in the data

In [53]:
import numpy as np

In [54]:
# Reload the three tab-separated tables, keeping only the columns used later on.
# The rating timestamp and the users' age/occupation are left out.
ratings = pd.read_csv(
    'ratings.csv',
    sep='\t',
    encoding='utf-8',
    usecols=['user_id', 'movie_id', 'user_emb_id', 'movie_emb_id', 'rating'],
)
users = pd.read_csv(
    'users.csv',
    sep='\t',
    encoding='utf-8',
    usecols=['user_id', 'gender', 'zipcode'],
)
movies = pd.read_csv(
    'movies.csv',
    sep='\t',
    encoding='utf-8',
    usecols=['movie_id', 'title', 'genres'],
)

In [55]:
# Attach each rating to its movie, then to its user. The join keys are spelled out;
# they are the only columns the respective frames have in common.
dataset = movies.merge(ratings, on='movie_id').merge(users, on='user_id')
dataset[['title', 'genres', 'rating']].sort_values('rating', ascending=False).head(20)

Unnamed: 0,title,genres,rating
0,Toy Story (1995),Animation|Children's|Comedy,5
489283,American Beauty (1999),Comedy|Drama,5
489259,Election (1999),Comedy,5
489257,"Matrix, The (1999)",Action|Sci-Fi|Thriller,5
489256,Dead Ringers (1988),Drama|Thriller,5
489237,Rushmore (1998),Comedy,5
489236,"Simple Plan, A (1998)",Crime|Thriller,5
489226,Hands on a Hard Body (1996),Documentary,5
489224,Pleasantville (1998),Comedy,5
489212,Say Anything... (1989),Comedy|Drama|Romance,5


### Step 3: Do a count of genres 

In [80]:
# Comparator Function for sort
def takeSecond(elem):
    """Sort key: return the item at position 1 of ``elem``."""
    second = elem[1]
    return second

# One list of genre labels per movie.
genres = movies['genres'].str.split('|').values
# Pool every label that occurs in any movie into a single set.
unique_genres = set().union(*genres)

#Convert the frequency table to a list for easier sorting
def convertToList(keyword_count):
    """Turn a ``{keyword: count}`` mapping into a list of ``[keyword, count]`` pairs.

    The pairs follow the mapping's iteration order, so the result can be sorted in place.
    """
    return [[keyword, count] for keyword, count in keyword_count.items()]


#Counts the number of times each genre keyword appears
def count_genres(movies, census, ref_col):
    """Count how many rows of ``movies[ref_col]`` mention each label in ``census``.

    :param movies: DataFrame whose ``ref_col`` column holds '|'-separated label strings
    :param census: iterable of labels to count; labels outside it are ignored
    :param ref_col: name of the column to read
    :return: list of ``[label, count]`` pairs, most frequent first. Labels that never
        occur are reported with a count of 0; ties keep the iteration order of ``census``.
    """
    genre_dict = {label: 0 for label in census}
    # Split once, independently of the labels, so an empty census is handled too.
    # Rows without a usable string become NaN after the split and are dropped.
    movie_lst_genres = movies[ref_col].str.split('|').dropna()
    for labels in movie_lst_genres:
        for label in labels:
            if label in genre_dict:
                genre_dict[label] += 1
    keyword_occurences = [[label, count] for label, count in genre_dict.items()]
    # Descending order of frequency (stable, so equal counts keep their order).
    keyword_occurences.sort(key=lambda pair: pair[1], reverse=True)
    return keyword_occurences

In [81]:
# Count against the set of labels collected from the movies themselves (unique_genres);
# show the five most frequent genres.
keyword_occurences = count_genres(movies, unique_genres, 'genres')
keyword_occurences[:5]

[['Drama', 1603],
 ['Comedy', 1200],
 ['Action', 503],
 ['Thriller', 492],
 ['Romance', 471]]

In [17]:
# Turn each '|'-separated genre string into a list and store the list's text form
# (missing values become the empty string first). Not idempotent: running this cell
# again splits the already-converted text a second time.
movies['genres'] = movies['genres'].str.split('|').fillna("").astype('str')
# movies['genres']

### Step 4: Use TF-IDF to determine similar movies

In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Unigram and bigram TF-IDF features over the genre text. min_df=1 keeps every term
# (any term in the vocabulary occurs in at least one document); an integer 0 is
# rejected by scikit-learn releases that validate constructor parameters.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1)
tfidf_mat = tf.fit_transform(movies['genres'])
tfidf_mat.shape

(3883, 127)

### Step 5: Cosine Similarity/Linear Kernel

In [108]:
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
# Pairwise dot products between all movie vectors; with the second argument omitted,
# linear_kernel compares the matrix with itself. TfidfVectorizer's default L2 row
# normalisation makes these dot products cosine similarities.
cosine_sim = linear_kernel(tfidf_mat)
cosine_sim[:4, :4]

array([[1.        , 0.14193614, 0.09010857, 0.1056164 ],
       [0.14193614, 1.        , 0.        , 0.        ],
       [0.09010857, 0.        , 1.        , 0.1719888 ],
       [0.1056164 , 0.        , 0.1719888 , 1.        ]])

In [109]:
# Lookup tables used by the recommender: row label -> title, and title -> row label.
titles = movies['title']
indices = pd.Series(movies.index, index=titles)

# Comparator Function for sort
def takeSecond(elem):
    """Sort key: return the item at position 1 of ``elem`` (same helper as in Step 3)."""
    second = elem[1]
    return second

def predict(title):
    """Rank the other movies by genre cosine similarity to ``title``.

    :param title: title of the reference movie, as it appears in ``movies['title']``
    :type title: str
    :return: titles of every other movie, most similar first, as a pandas Series
    :raises KeyError: if ``title`` is not a known movie title
    """
    # Assumes movies has the default 0..n-1 index, so the label stored in `indices`
    # is also the row position in cosine_sim -- TODO confirm.
    indexTitle = indices[title]
    # enumerate() is a one-shot iterator without .sort(); sorted() builds and returns
    # the ranked list of (position, score) pairs.
    cos_scores = sorted(enumerate(cosine_sim[indexTitle]), key=takeSecond, reverse=True)
    # Leave out the queried movie by position instead of assuming it ranks first:
    # movies with identical genres tie with it at the top score.
    movie_indices = [position for position, _ in cos_scores if position != indexTitle]
    return titles.iloc[movie_indices]


# Alias so the function is also available under the name recommend(...).
recommend = predict

In [110]:
# First ten recommendations for Toy Story.
# NOTE(review): the ranking function is defined above under the name `predict`;
# confirm that `recommend` refers to it.
recommend('Toy Story (1995)').head(10)


1050            Aladdin and the King of Thieves (1996)
2072                          American Tail, An (1986)
2073        American Tail: Fievel Goes West, An (1991)
2285                         Rugrats Movie, The (1998)
2286                              Bug's Life, A (1998)
3045                                Toy Story 2 (1999)
3542                             Saludos Amigos (1943)
3682                                Chicken Run (2000)
3685    Adventures of Rocky and Bullwinkle, The (2000)
236                              Goofy Movie, A (1995)
Name: title, dtype: object

In [111]:
# First ten recommendations for Titanic.
# NOTE(review): the ranking function is defined above under the name `predict`;
# confirm that `recommend` refers to it.
recommend('Titanic (1997)').head(10)


24                  Leaving Las Vegas (1995)
34                         Carrington (1995)
45      How to Make an American Quilt (1995)
48              When Night Is Falling (1995)
57          Postino, Il (The Postman) (1994)
73                       Bed of Roses (1996)
84                 Angels and Insects (1995)
103    Bridges of Madison County, The (1995)
129                 Frankie Starlight (1995)
138             Up Close and Personal (1996)
Name: title, dtype: object