In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Load data

In [2]:
# Read the five CSV files (in this order) into their DataFrames.
movie, tag, genome_scores, genome_tags, rating = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/movie.csv',
        '../data/tag.csv',
        '../data/genome_scores.csv',
        '../data/genome_tags.csv',
        '../data/rating.csv',
    )
)

# Display data

In [3]:
tag['userId']

0             18
1             65
2             65
3             65
4             65
           ...  
465559    138446
465560    138446
465561    138446
465562    138446
465563    138472
Name: userId, Length: 465564, dtype: int64

In [4]:
genome_tags

Unnamed: 0,tagId,tag
0,1,007
1,2,007 (series)
2,3,18th century
3,4,1920s
4,5,1930s
...,...,...
1123,1124,writing
1124,1125,wuxia
1125,1126,wwii
1126,1127,zombie


In [5]:
rating

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40
...,...,...,...,...
20000258,138493,68954,4.5,2009-11-13 15:42:00
20000259,138493,69526,4.5,2009-12-03 18:31:48
20000260,138493,69644,3.0,2009-12-07 18:10:57
20000261,138493,70286,5.0,2009-11-13 15:42:24


# Data Preprocessing

In [6]:
# Keep only the genome-score rows whose relevance is strictly greater than 0.8.
relevant_tags = genome_scores.loc[genome_scores['relevance'].gt(0.8)]
relevant_tags

Unnamed: 0,movieId,tagId,relevance
28,1,29,0.89200
62,1,63,0.93325
63,1,64,0.98575
185,1,186,0.95650
192,1,193,0.81925
...,...,...,...
11709528,131170,889,0.87650
11709529,131170,890,0.88350
11709637,131170,998,0.80550
11709638,131170,999,0.81525


In [7]:
# Group the high-relevance rows by movie so that each movie's tags can be
# fetched together.
grouped_tags = relevant_tags.groupby('movieId')
# First row of every group; in the cells shown here only its index is used,
# i.e. the movieIds that have at least one high-relevance tag.
first = grouped_tags.first()
first.index

Int64Index([     1,      2,      3,      4,      5,      6,      7,      8,
                 9,     10,
            ...
            130075, 130087, 130490, 130496, 130520, 130578, 130840, 131013,
            131168, 131170],
           dtype='int64', name='movieId', length=10159)

In [8]:
# Attach to every movie the list of its high-relevance tagIds (an empty list
# when the movie has none).
#
# One groupby aggregation builds a movieId -> [tagId, ...] mapping, after which
# each movie is a dictionary lookup. This avoids calling get_group() once per
# movie and does not depend on `movie` having a default 0..n-1 index.
tag_ids_by_movie = relevant_tags.groupby('movieId')['tagId'].agg(list).to_dict()

# `.get(m_id, [])` evaluates a fresh [] on every call, so movies without tags
# never share one list object.
tags = [tag_ids_by_movie.get(m_id, []) for m_id in movie['movieId']]

movie['tags'] = tags

In [9]:
# Turn each movie's list of tagIds into one string of tag names.
#
# Format kept from the original cell: every tag name is followed by a single
# space, and a movie without tags becomes " ".
#
# A tagId -> name dictionary is built once instead of scanning `genome_tags`
# for every tag. The comprehension also keeps its loop variables local, so the
# `tag` DataFrame loaded earlier is no longer overwritten by a loop variable.
# drop_duplicates keeps the first name per tagId, matching the previous
# "first match" lookup.
tag_name_by_id = (
    genome_tags.drop_duplicates(subset='tagId')
    .set_index('tagId')['tag']
    .to_dict()
)

tags_ = [
    "".join(tag_name_by_id[tag_id] + ' ' for tag_id in tag_ids) if tag_ids else " "
    for tag_ids in movie['tags']
]

In [10]:
# Replace the list-of-tagIds column with the tag-name strings in `tags_`.
# Assigning to an existing column overwrites it, so no drop is needed first
# (the previous `movie.drop(...)` call discarded its result and had no effect).
movie['tags'] = tags_

In [11]:
# Movies with tags info
movie

Unnamed: 0,movieId,title,genres,tags
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,adventure animated animation cartoon cgi child...
1,2,Jumanji (1995),Adventure|Children|Fantasy,adventure animals childhood children family fa...
2,3,Grumpier Old Men (1995),Comedy|Romance,comedy good sequel sequel sequels
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,chick flick girlie movie women
4,5,Father of the Bride Part II (1995),Comedy,comedy family father daughter relationship goo...
...,...,...,...,...
27273,131254,Kein Bund für's Leben (2007),Comedy,
27274,131256,"Feuer, Eis & Dosenbier (2002)",Comedy,
27275,131258,The Pirates (2014),Adventure,
27276,131260,Rentun Ruusu (2001),(no genres listed),


In [12]:
# Build one text document per movie: "<genres> <tags>".
# Vectorised string concatenation is aligned row by row, so it does not rely on
# `movie` having a default 0..n-1 index the way `movie['genres'][i]` did.
genres_tags = (movie['genres'] + " " + movie['tags']).tolist()
movie['genres_tags'] = genres_tags

In [13]:
# combine genres info and tags info
movie

Unnamed: 0,movieId,title,genres,tags,genres_tags
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,adventure animated animation cartoon cgi child...,Adventure|Animation|Children|Comedy|Fantasy ad...
1,2,Jumanji (1995),Adventure|Children|Fantasy,adventure animals childhood children family fa...,Adventure|Children|Fantasy adventure animals c...
2,3,Grumpier Old Men (1995),Comedy|Romance,comedy good sequel sequel sequels,Comedy|Romance comedy good sequel sequel sequels
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,chick flick girlie movie women,Comedy|Drama|Romance chick flick girlie movie ...
4,5,Father of the Bride Part II (1995),Comedy,comedy family father daughter relationship goo...,Comedy comedy family father daughter relations...
...,...,...,...,...,...
27273,131254,Kein Bund für's Leben (2007),Comedy,,Comedy
27274,131256,"Feuer, Eis & Dosenbier (2002)",Comedy,,Comedy
27275,131258,The Pirates (2014),Adventure,,Adventure
27276,131260,Rentun Ruusu (2001),(no genres listed),,(no genres listed)


In [14]:
# Add the movie title to the rating dataframe.
# Any `title` column left over from a previous run of this cell is dropped
# first, so re-running the cell does not produce title_x / title_y columns.
rating = (
    rating
    .drop(columns=['title'], errors='ignore')
    .merge(movie[['movieId', 'title']], on='movieId', how='left')
)

In [15]:
rating

Unnamed: 0,userId,movieId,rating,timestamp,title
0,1,2,3.5,2005-04-02 23:53:47,Jumanji (1995)
1,1,29,3.5,2005-04-02 23:31:16,"City of Lost Children, The (Cité des enfants p..."
2,1,32,3.5,2005-04-02 23:33:39,Twelve Monkeys (a.k.a. 12 Monkeys) (1995)
3,1,47,3.5,2005-04-02 23:32:07,Seven (a.k.a. Se7en) (1995)
4,1,50,3.5,2005-04-02 23:29:40,"Usual Suspects, The (1995)"
...,...,...,...,...,...
20000258,138493,68954,4.5,2009-11-13 15:42:00,Up (2009)
20000259,138493,69526,4.5,2009-12-03 18:31:48,Transformers: Revenge of the Fallen (2009)
20000260,138493,69644,3.0,2009-12-07 18:10:57,Ice Age: Dawn of the Dinosaurs (2009)
20000261,138493,70286,5.0,2009-11-13 15:42:24,District 9 (2009)


In [16]:
# Remove repeated (movieId, title) rows, keeping one copy of each movie.
# keep='first' retains the movie itself (keep=False would delete every copy),
# and the index is reset so that index labels equal row positions, which the
# similarity matrix built from this frame is indexed by.
movie = (
    movie
    .drop_duplicates(subset=['movieId', 'title'], keep='first')
    .reset_index(drop=True)
)

In [17]:
# TF-IDF over the combined genres + tags text, using word 1- to 3-grams.
# min_df=1 keeps every term that occurs in at least one document; an integer
# min_df of 0 is rejected by scikit-learn's parameter validation in recent
# releases.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1,3), min_df=1, stop_words='english')
matrix = tf.fit_transform(movie['genres_tags'])

# Pairwise similarity between all movies.
# NOTE: the result is a dense (n_movies x n_movies) array.
cosine_similarities = linear_kernel(matrix,matrix)

# title -> row position in `matrix` / `cosine_similarities`.
# Positions (not index labels) are stored because the matrix is addressed by
# position; the two only coincide while `movie` has a default 0..n-1 index.
indices = pd.Series(np.arange(len(movie)), index=movie['title'])
movie_title = movie['title']

# Recommendation

In [248]:
# This function will output similar movies when you input a movie name

def get_movies(title, top_n=30):
    """Return the titles of the movies most similar to ``title``.

    Reads the notebook-level ``indices`` (title -> row position),
    ``cosine_similarities`` (pairwise similarity matrix) and ``movie_title``
    (titles in row order).

    Parameters
    ----------
    title : str
        Title of the query movie.
    top_n : int, default 30
        Maximum number of similar movies to return.

    Returns
    -------
    pandas.Series or None
        Titles ordered from most to least similar, with the query movie itself
        excluded. ``None`` when ``title`` is unknown, or when several movies
        share the title so the query is ambiguous.
    """
    try:
        idx = indices[title]
    except KeyError:
        # Title not present in the lookup table.
        return None

    # A title shared by several movies makes the lookup return a Series rather
    # than a single position. This case used to surface as an exception raised
    # by sorted() and swallowed by a broad `except`; it is now handled
    # explicitly, with the same result (None).
    if not np.isscalar(idx):
        return None

    scores = cosine_similarities[idx]

    # Stable descending order: ties keep their original row order.
    ranked = np.argsort(-scores, kind='stable')

    # Drop the query movie by position instead of assuming it sorts first;
    # when another movie ties with it, the query is not necessarily at rank 0.
    ranked = ranked[ranked != idx][:top_n]

    return movie_title.iloc[ranked]
        
    
 

In [246]:
# Get similar movies from the movies with top ratings
def recommended_movies(userId, n_similar=20):
    """Collect weighted candidate movies from a user's top-rated movies.

    For every movie the user gave their maximum rating, the ``n_similar`` most
    similar movies returned by ``get_movies`` are added with a rank weight:
    ``n_similar`` for the most similar one, decreasing by 1 per rank.

    Parameters
    ----------
    userId : int
        User whose rows are selected from the notebook-level ``rating`` frame.
    n_similar : int, default 20
        Number of similar movies taken per top-rated movie.

    Returns
    -------
    list of [title, weight]
        One entry per (top-rated movie, similar movie) pair; a title can appear
        several times. Empty when the user has no ratings.
    """
    userRatings = rating.loc[rating['userId'] == userId]
    if userRatings.empty:
        # No ratings: nothing to seed the recommendations with.
        return []

    bestRatings = userRatings.loc[userRatings['rating'] == userRatings['rating'].max()]

    movie_list = []
    for seed_title in bestRatings['title']:
        # Look the seed up once; get_movies returns None when it cannot
        # resolve the title.
        similar = get_movies(seed_title)
        if similar is None:
            continue
        for rank, rec_title in enumerate(similar.head(n_similar)):
            movie_list.append([rec_title, n_similar - rank])
    return movie_list

In [250]:
# recommend the 10 movies with the highest total weight in the recommended movie list

from collections import Counter

def recommendation_top10(userId, top_n=10):
    """Return the highest-scoring recommended titles for a user.

    The weights of all ``[title, weight]`` entries produced by
    ``recommended_movies(userId)`` are summed per title, and the titles are
    ranked by that total.

    Parameters
    ----------
    userId : int
        User to recommend for.
    top_n : int, default 10
        Maximum number of titles to return.

    Returns
    -------
    list of str
        Titles ordered by descending total weight. Titles with equal totals
        keep the order in which they first appeared, so the result is the same
        on every run (it no longer depends on set iteration order).
    """
    scores = Counter()
    for title, weight in recommended_movies(userId):
        scores[title] += weight

    # Counter keeps insertion order and sorted() is stable, so ties resolve to
    # first-seen order.
    ranked = sorted(scores, key=scores.get, reverse=True)
    return ranked[:top_n]

    

In [21]:
# Rows of user 1 whose rating equals that user's highest rating.
user1 = rating[rating['userId'].eq(1)]
best1 = user1[user1['rating'].eq(max(user1['rating']))]
best1

Unnamed: 0,userId,movieId,rating,timestamp,title
131,1,4993,5.0,2005-04-02 23:31:22,"Lord of the Rings: The Fellowship of the Ring,..."
142,1,5952,5.0,2005-04-02 23:30:19,"Lord of the Rings: The Two Towers, The (2002)"
158,1,7153,5.0,2005-04-02 23:30:33,"Lord of the Rings: The Return of the King, The..."
170,1,8507,5.0,2004-09-10 03:13:47,Freaks (1932)


In [251]:
recommendation_top10(1)

['Hobbit: The Desolation of Smaug, The (2013)',
 'Hobbit: An Unexpected Journey, The (2012)',
 'The Hobbit: The Battle of the Five Armies (2014)',
 'Star Wars: Episode IV - A New Hope (1977)',
 'Lord of the Rings: The Fellowship of the Ring, The (2001)',
 'Lord of the Rings: The Two Towers, The (2002)',
 'Percy Jackson & the Olympians: The Lightning Thief (2010)',
 'Lord of the Rings: The Return of the King, The (2003)',
 'Gladiator (2000)',
 'Star Wars: Episode III - Revenge of the Sith (2005)']

In [25]:
# Rows of user 6 whose rating equals that user's highest rating.
user6 = rating[rating['userId'].eq(6)]
best6 = user6[user6['rating'].eq(max(user6['rating']))]
best6

Unnamed: 0,userId,movieId,rating,timestamp,title
517,6,1,5.0,1997-03-13 17:50:52,Toy Story (1995)
519,6,7,5.0,1997-03-13 17:52:38,Sabrina (1995)
520,6,17,5.0,1997-03-13 17:50:52,Sense and Sensibility (1995)
521,6,52,5.0,1997-03-13 17:54:18,Mighty Aphrodite (1995)
522,6,62,5.0,1997-03-13 17:50:52,Mr. Holland's Opus (1995)
525,6,141,5.0,1997-03-13 17:50:52,"Birdcage, The (1996)"
529,6,648,5.0,1997-03-13 17:50:52,Mission: Impossible (1996)


In [252]:
recommendation_top10(6)

['Sunshine Boys, The (1975)',
 'Music of the Heart (1999)',
 'Emma (1996)',
 'Passenger 57 (1992)',
 'Toy Story 2 (1999)',
 'Alex and Emma (2003)',
 'In & Out (1997)',
 'My Left Foot (1989)',
 'Enemy of the State (1998)',
 'Together (Han ni Zai Yiki) (2002)']

# Evaluation

In [253]:
from statistics import mean 

def evaluation(userId):
    """Score the recommendations for one user against their own ratings.

    Among the recommended movies that the user has already rated, compute the
    fraction whose rating is strictly above the user's mean rating.

    Reads the notebook-level ``rating`` and ``movie`` frames and calls
    ``recommendation_top10``.

    Parameters
    ----------
    userId : int
        User to evaluate.

    Returns
    -------
    float or int
        ``above / seen`` in [0, 1], or ``-1`` when the user has no ratings or
        none of the recommended movies has been rated by the user.
    """
    rating_user = rating.loc[rating['userId'] == userId]
    if rating_user.empty:
        return -1

    meanRating = rating_user['rating'].mean()

    # movieId -> rating for this user: constant-time membership test and lookup.
    rating_by_movie = dict(zip(rating_user['movieId'], rating_user['rating']))

    above = 0
    seenMovs = 0
    for rec_title in recommendation_top10(userId):
        matches = movie.loc[movie['title'] == rec_title, 'movieId']
        if matches.empty:
            # Title has no row in `movie`; skip it.
            continue
        # When several movies share a title, the first match is used.
        rec_id = matches.iloc[0]
        if rec_id not in rating_by_movie:
            continue
        seenMovs += 1
        if rating_by_movie[rec_id] > meanRating:
            above += 1

    if seenMovs == 0:
        return -1
    return above/seenMovs

In [254]:
def evaluations(users, max_users=100):
    """Average ``evaluation`` over the first ``max_users`` scorable users.

    Users for which ``evaluation`` returns ``-1`` are skipped and do not count
    towards ``max_users``.

    Parameters
    ----------
    users : iterable of int
        Candidate user ids, visited in iteration order.
    max_users : int, default 100
        Stop after this many users have been scored.

    Returns
    -------
    float or None
        Mean score over the users that were scored. If fewer than
        ``max_users`` users are scorable, the mean over those found is
        returned; ``None`` only when no user could be scored.
    """
    total = 0
    users_num = 0
    for user_id in users:
        # Evaluate each user once; the score is reused for both the skip check
        # and the running total.
        score = evaluation(user_id)
        if score == -1:
            continue
        total += score
        users_num += 1
        if users_num == max_users:
            break

    if users_num == 0:
        return None
    return total/users_num
            
      
        

In [255]:
# accuracy when number of similar movies of each movie = 20

evaluations(list(set(rating['userId'])))

0.7614880952380951

In [256]:
# Get similar movies from the movies with top ratings
def recommended_movies(userId, n_similar=10):
    """Collect weighted candidate movies from a user's top-rated movies.

    Redefinition with a default of 10 similar movies per top-rated movie, so
    that callers passing only ``userId`` (such as ``recommendation_top10``)
    use that setting.

    For every movie the user gave their maximum rating, the ``n_similar`` most
    similar movies returned by ``get_movies`` are added with a rank weight:
    ``n_similar`` for the most similar one, decreasing by 1 per rank.

    Parameters
    ----------
    userId : int
        User whose rows are selected from the notebook-level ``rating`` frame.
    n_similar : int, default 10
        Number of similar movies taken per top-rated movie.

    Returns
    -------
    list of [title, weight]
        One entry per (top-rated movie, similar movie) pair; a title can appear
        several times. Empty when the user has no ratings.
    """
    userRatings = rating.loc[rating['userId'] == userId]
    if userRatings.empty:
        # No ratings: nothing to seed the recommendations with.
        return []

    bestRatings = userRatings.loc[userRatings['rating'] == userRatings['rating'].max()]

    movie_list = []
    for seed_title in bestRatings['title']:
        # Look the seed up once; get_movies returns None when it cannot
        # resolve the title.
        similar = get_movies(seed_title)
        if similar is None:
            continue
        for rank, rec_title in enumerate(similar.head(n_similar)):
            movie_list.append([rec_title, n_similar - rank])
    return movie_list

In [257]:
# accuracy when number of similar movies of each movie = 10

evaluations(list(set(rating['userId'])))

0.8245714285714284

In [258]:
# Get similar movies from the movies with top ratings
def recommended_movies(userId, n_similar=5):
    """Collect weighted candidate movies from a user's top-rated movies.

    Redefinition with a default of 5 similar movies per top-rated movie, so
    that callers passing only ``userId`` (such as ``recommendation_top10``)
    use that setting.

    For every movie the user gave their maximum rating, the ``n_similar`` most
    similar movies returned by ``get_movies`` are added with a rank weight:
    ``n_similar`` for the most similar one, decreasing by 1 per rank.

    Parameters
    ----------
    userId : int
        User whose rows are selected from the notebook-level ``rating`` frame.
    n_similar : int, default 5
        Number of similar movies taken per top-rated movie.

    Returns
    -------
    list of [title, weight]
        One entry per (top-rated movie, similar movie) pair; a title can appear
        several times. Empty when the user has no ratings.
    """
    userRatings = rating.loc[rating['userId'] == userId]
    if userRatings.empty:
        # No ratings: nothing to seed the recommendations with.
        return []

    bestRatings = userRatings.loc[userRatings['rating'] == userRatings['rating'].max()]

    movie_list = []
    for seed_title in bestRatings['title']:
        # Look the seed up once; get_movies returns None when it cannot
        # resolve the title.
        similar = get_movies(seed_title)
        if similar is None:
            continue
        for rank, rec_title in enumerate(similar.head(n_similar)):
            movie_list.append([rec_title, n_similar - rank])
    return movie_list

In [259]:
# accuracy when number of similar movies of each movie = 5

evaluations(list(set(rating['userId'])))

0.8533095238095236

In [260]:
# Get similar movies from the movies with top ratings
def recommended_movies(userId, n_similar=3):
    """Collect weighted candidate movies from a user's top-rated movies.

    Redefinition with a default of 3 similar movies per top-rated movie, so
    that callers passing only ``userId`` (such as ``recommendation_top10``)
    use that setting.

    For every movie the user gave their maximum rating, the ``n_similar`` most
    similar movies returned by ``get_movies`` are added with a rank weight:
    ``n_similar`` for the most similar one, decreasing by 1 per rank.

    Parameters
    ----------
    userId : int
        User whose rows are selected from the notebook-level ``rating`` frame.
    n_similar : int, default 3
        Number of similar movies taken per top-rated movie.

    Returns
    -------
    list of [title, weight]
        One entry per (top-rated movie, similar movie) pair; a title can appear
        several times. Empty when the user has no ratings.
    """
    userRatings = rating.loc[rating['userId'] == userId]
    if userRatings.empty:
        # No ratings: nothing to seed the recommendations with.
        return []

    bestRatings = userRatings.loc[userRatings['rating'] == userRatings['rating'].max()]

    movie_list = []
    for seed_title in bestRatings['title']:
        # Look the seed up once; get_movies returns None when it cannot
        # resolve the title.
        similar = get_movies(seed_title)
        if similar is None:
            continue
        for rank, rec_title in enumerate(similar.head(n_similar)):
            movie_list.append([rec_title, n_similar - rank])
    return movie_list

In [261]:
# accuracy when number of similar movies of each movie = 3

evaluations(list(set(rating['userId'])))

0.8656428571428573

In [263]:
# Get similar movies from the movies with top ratings
def recommended_movies(userId, n_similar=1):
    """Collect weighted candidate movies from a user's top-rated movies.

    Redefinition with a default of 1 similar movie per top-rated movie, so
    that callers passing only ``userId`` (such as ``recommendation_top10``)
    use that setting.

    For every movie the user gave their maximum rating, the ``n_similar`` most
    similar movies returned by ``get_movies`` are added with a rank weight:
    ``n_similar`` for the most similar one, decreasing by 1 per rank.

    Parameters
    ----------
    userId : int
        User whose rows are selected from the notebook-level ``rating`` frame.
    n_similar : int, default 1
        Number of similar movies taken per top-rated movie.

    Returns
    -------
    list of [title, weight]
        One entry per (top-rated movie, similar movie) pair; a title can appear
        several times. Empty when the user has no ratings.
    """
    userRatings = rating.loc[rating['userId'] == userId]
    if userRatings.empty:
        # No ratings: nothing to seed the recommendations with.
        return []

    bestRatings = userRatings.loc[userRatings['rating'] == userRatings['rating'].max()]

    movie_list = []
    for seed_title in bestRatings['title']:
        # Look the seed up once; get_movies returns None when it cannot
        # resolve the title.
        similar = get_movies(seed_title)
        if similar is None:
            continue
        for rank, rec_title in enumerate(similar.head(n_similar)):
            movie_list.append([rec_title, n_similar - rank])
    return movie_list

In [262]:
# accuracy when number of similar movies of each movie = 1
# NOTE(review): this cell's execution count (262) is lower than that of the
# preceding cell (263), which redefines recommended_movies for a single similar
# movie, and the value shown below is identical to the "= 3" result. The output
# was therefore probably produced while the 3-movie definition was still
# active. Re-run the notebook top to bottom to confirm before relying on it.

evaluations(list(set(rating['userId'])))

0.8656428571428573