In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_style('white', { 'axes.spines.right': False, 'axes.spines.top': False})
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
from nltk.corpus import stopwords

## Load Data

In [None]:
path = '../MLdata/'

In [None]:
# movie metadata
df_meta=pd.read_csv(path + 'movies_metadata.csv', low_memory=False, encoding='UTF-8') 
df_meta = df_meta.drop([19730, 29503, 35587])

In [None]:
df_meta = df_meta.set_index(df_meta['id'].str.strip().replace(',','').astype(int))
pd.set_option('display.max_colwidth', 20)
df_meta.head()

In [None]:
# load movie credits
df_credits = pd.read_csv(path + 'credits.csv', encoding='UTF-8')
df_credits = df_credits.set_index('id')

In [None]:
# load movie keywords
df_keywords=pd.read_csv(path + 'keywords.csv', low_memory=False, encoding='UTF-8') 
df_keywords = df_keywords.set_index('id')

In [None]:
# merge
df_k_c = df_keywords.merge(df_credits, left_index=True, right_on='id')
df = df_k_c.merge(df_meta[['release_date','genres','overview','title']], left_index=True, right_on='id')
df.head(3)

## Data Cleaning

### Smaller subset

In [None]:
links_small = pd.read_csv(path + 'links_small.csv', encoding='UTF-8')
links_small = links_small[links_small['tmdbId'].notnull()]['tmdbId'].astype('int')

In [None]:
df = df[df.index.isin(links_small)]

### Bag-of-words Model

In [None]:
df_movies = pd.DataFrame()

In [None]:
df_movies['keywords'] = df['keywords'].apply(lambda x: [i['name'] for i in eval(x)])
df_movies['keywords'] = df_movies['keywords'].apply(lambda x: ' '.join([i.replace(" ", "") for i in x]))

In [None]:
df_movies['overview'] = df['overview'].fillna('')

In [None]:
df_movies['release_date'] = pd.to_datetime(df['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if x!= np.nan else np.nan)

In [None]:
df_movies['cast'] = df['cast'].apply(lambda x:[i['name'] for i in eval(x)])
df_movies['cast'] = df_movies['cast'].apply(lambda x: ' '.join([i.replace(" ", "") for i in x]))

In [None]:
df_movies['genres'] = df['genres'].apply(lambda x: [i['name'] for i in eval(x)])
df_movies['genres'] = df_movies['genres'].apply(lambda x: ' '.join([i.replace(" ", "") for i in x]))

In [None]:
df_movies['title'] = df['title']

In [None]:
# merge all fields into 'tag' column
df_movies['tags'] = df_movies['keywords'] + ' ' + df_movies['cast']+' '+df_movies['genres']+' '+df_movies['release_date']

In [None]:
df_movies.drop(df_movies[df_movies['tags']==''].index, inplace=True)
df_movies.drop_duplicates(inplace=True)

In [None]:
df_movies['new_id'] = range(0, len(df_movies))

In [None]:
df_movies = df_movies[['new_id', 'title', 'tags']]

In [None]:
pd.set_option('display.max_colwidth', 500)
pd.set_option('display.expand_frame_repr', False)

In [None]:
df_movies

In [None]:
df_movies.to_csv('movies.csv')

### Vectorization

In [None]:
stop = list(stopwords.words('english'))

# create the tfid vectorizer, alternatively you can also use countVectorizer
tfidf =  TfidfVectorizer(max_features=5000, analyzer = 'word', stop_words=stop)
vectorized_data = tfidf.fit_transform(df_movies['tags'])
count_matrix = pd.DataFrame(vectorized_data.toarray(), index=df_movies['tags'].index.tolist())

In [None]:
count_matrix

In [None]:
print(tfidf.get_feature_names_out()[940:990])

## Dimensionality Reduction

In [None]:
svd = TruncatedSVD(n_components=3000)
reduced_data = svd.fit_transform(count_matrix)

## Text Similarity with Cosine Similarity Scores

In [None]:
similarity = cosine_similarity(reduced_data)

In [None]:
np.save('movies_similarity', similarity)

## Generate Recommendation

In [2]:
similarity = np.load('movies_similarity.npy')

In [3]:
df_movies = pd.read_csv('movies.csv')
df_movies.head()

Unnamed: 0,id,new_id,title,tags
0,862,0,Toy Story,jealousy toy boy friendship friends rivalry bo...
1,8844,1,Jumanji,boardgame disappearance basedonchildren'sbook ...
2,15602,2,Grumpier Old Men,fishing bestfriend duringcreditsstinger oldmen...
3,31357,3,Waiting to Exhale,basedonnovel interracialrelationship singlemot...
4,11862,4,Father of the Bride Part II,baby midlifecrisis confidence aging daughter m...


In [4]:
# create a function that takes in movie title as input and returns a list of the most similar movies
def get_recommendations(title, n, cosine_sim, df_movies):
    
    # get the index of the movie that matches the title
    movie_index = df_movies[df_movies.title==title].new_id.values[0]
    
    # get the pairwsie similarity scores of all movies with that movie and sort the movies based on the similarity scores
    sim_scores_all = sorted(list(enumerate(cosine_sim[movie_index])), key=lambda x: x[1], reverse=True)
    
    # checks if recommendations are limited
    if n > 0:
        sim_scores_all = sim_scores_all[1:n+1]
        
    # get the movie indices of the top similar movies
    movie_indices = [i[0] for i in sim_scores_all]
    scores = [i[1] for i in sim_scores_all]
    
    # return the top n most similar movies from the movies df
    top_titles_df = pd.DataFrame(df_movies.iloc[movie_indices]['title'])
    top_titles_df['sim_scores'] = scores
    top_titles_df['ranking'] = range(1, len(top_titles_df) + 1)
    
    return top_titles_df, sim_scores_all


In [None]:
# # generate a list of recommendations for a specific movie title
# movie_name = 'The Matrix'
# number_of_recommendations = 15
# top_titles_df, _ = get_recommendations(movie_name, number_of_recommendations)

In [57]:
df1 = netflix_merge(df_1)
list(df1[df1['type'] == "MOVIE"]['Title'])


['Delhi Belly', 'Most Eligible Bachelor']

In [43]:
df1 = netflix_merge(df_1)

In [5]:
# list of movies a user has seen
movie_list = ['The Lion King', 'Se7en', 'Blade Runner', 'Quantum of Solace', 'Casino Royale', 'Skyfall']

# create a copy of the movie dataframe and add a column in which we aggregated the scores
user_scores = pd.DataFrame(df_movies['title'])
user_scores['sim_scores'] = 0.0

# top number of scores to be considered for each movie
number_of_recommendations = 10000
for movie_name in movie_list:
    try:
        top_titles_df, _ = get_recommendations(movie_name, number_of_recommendations,
                                               similarity, df_movies)
    except:
        pass
    # aggregate the scores
    user_scores = pd.concat([user_scores, top_titles_df[['title', 'sim_scores']]]).groupby(['title'], as_index=False).sum({'sim_scores'})

In [6]:
user_scores = user_scores[~user_scores['title'].isin(movie_list)]
user_scores.sort_values(by='sim_scores', ascending=False)[:20]

Unnamed: 0,title,sim_scores
5831,Spectre,1.067911
8163,Total Recall,0.63391
3403,Johnny English,0.586215
5555,Shaft,0.577136
3533,King Solomon's Mines,0.562624
6084,Surviving the Game,0.548778
7818,The Three Musketeers,0.530187
4172,Mirage,0.526142
358,Alice in Wonderland,0.488152
2168,Everything or Nothing,0.469017


### Merge with Netflix Data

In [7]:
netflix = pd.read_csv('../Netflix-Data/titles.csv')

In [8]:
netflix = netflix[['title', 'type', 'imdb_score', 'imdb_votes', 'tmdb_popularity', 'tmdb_score']]
netflix = netflix[netflix['type'] == 'MOVIE']

In [9]:
recs = netflix.merge(user_scores, how='inner', on='title')
recs = recs.sort_values(by='sim_scores', ascending=False)[:24]

In [10]:
def weighted_rating(x, m, C):
    v = x['imdb_votes']
    R = x['imdb_score']
    return (v/(v+m) * R) + (m/(m+v) * C)

In [11]:
recs = recs[['title', 'imdb_score', 'imdb_votes']]
vote_counts = recs[recs['imdb_votes'].notnull()]['imdb_votes'].astype('int')
vote_averages = recs[recs['imdb_score'].notnull()]['imdb_score'].astype('int')
C = vote_averages.mean()
m = vote_counts.quantile(0.60)
qualified = recs[(recs['imdb_votes'] >= m) & (recs['imdb_votes'].notnull())
                & (recs['imdb_score'].notnull())]
qualified['imdb_votes'] = qualified['imdb_votes'].astype('int')
qualified['imdb_score'] = qualified['imdb_score'].astype('int')
qualified['wr'] = qualified.apply(weighted_rating, args=(m, C), axis=1)
qualified = qualified.sort_values('wr', ascending=False).head(20)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  qualified['imdb_votes'] = qualified['imdb_votes'].astype('int')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  qualified['imdb_score'] = qualified['imdb_score'].astype('int')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  qualified['wr'] = qualified.apply(weighted_rating, args=(m, C), axis=1)


In [12]:
qualified

Unnamed: 0,title,imdb_score,imdb_votes,wr
81,Inception,8,2294231,7.866592
29,Mission: Impossible,7,420673,6.680235
35,Starship Troopers,7,291452,6.585476
88,Road to Perdition,7,265216,6.558939
6,Dirty Harry,7,155051,6.396788
85,Wanted,6,384995,5.931739
36,Mission: Impossible II,6,337987,5.925089
187,Total Recall,6,253352,5.909158
140,The Golden Compass,6,187485,5.89114
238,Legend,6,178601,5.888148


## Get non-Netflix movies

In [13]:
not_netflix_recs = user_scores[~user_scores['title'].isin(netflix['title'])]
not_netflix_recs = not_netflix_recs.sort_values(by='sim_scores', ascending=False)

In [14]:
not_netflix_recs[:20]

Unnamed: 0,title,sim_scores
5831,Spectre,1.067911
3403,Johnny English,0.586215
5555,Shaft,0.577136
3533,King Solomon's Mines,0.562624
6084,Surviving the Game,0.548778
7818,The Three Musketeers,0.530187
358,Alice in Wonderland,0.488152
2168,Everything or Nothing,0.469017
1824,Die Another Day,0.440202
4675,Ong Bak 2,0.424322


## Test

In [2]:
def clean_watch_history(df):
    '''
    Function that cleans a given users watch history data
    Input: dataframe
    Output: (cleaned) dataframe
    '''
    df = df.rename(columns = {"Title": "History"})
    df['Date'] = pd.to_datetime(df['Date'])
    df['Day']= df['Date'].dt.day
    df['Month']= df['Date'].dt.month
    df['Year']= df['Date'].dt.year
    df['Day_of_week'] = df['Date'].dt.dayofweek

    df['Title'] = df['History'].str.rsplit(': ', 2).str[0]
    df['Season'] = df['History'].str.rsplit(': ', 2).str[1]
    df['Episode'] = df['History'].str.rsplit(': ', 2).str[2]

    df['Type'] = df['Episode'].apply(lambda x : 'Movie' if (pd.isna(x)==True) else 'TV')

    tv = df[df['Type']!='Movie']
    tv['Season'] = tv['Season'].str.split().str[1]

    movies = df[df['Type']=='Movie']
    movies['Title'] = movies['History']
    movies['Season'] = None

    df = pd.concat([movies, tv], ignore_index = True)
    return df

In [12]:
def netflix_merge(df):
    '''
    Function that merges given watch history with netflix dataset,
    and returns merged dataset
    '''
    titles = pd.read_csv('../Netflix-Data/titles.csv')
    merged = df.merge(titles, left_on = 'Title', right_on = 'title', how = 'inner')
    cols_to_drop = ['production_countries', 'imdb_id', 'age_certification', 
                    'title', 'seasons', 'tmdb_popularity']
    merged = merged.drop(cols_to_drop, axis = 1)
    return merged

In [5]:
def get_movie_list(df):
    return list(df[df['type'] == "MOVIE"]['Title'])

In [4]:
def get_recommendations(title, n, cosine_sim, df_movies):
    
    # get the index of the movie that matches the title
    movie_index = df_movies[df_movies.title==title].new_id.values[0]
    
    # get the pairwsie similarity scores of all movies with that movie and sort the movies based on the similarity scores
    sim_scores_all = sorted(list(enumerate(cosine_sim[movie_index])), key=lambda x: x[1], reverse=True)
    
    # checks if recommendations are limited
    if n > 0:
        sim_scores_all = sim_scores_all[1:n+1]
        
    # get the movie indices of the top similar movies
    movie_indices = [i[0] for i in sim_scores_all]
    scores = [i[1] for i in sim_scores_all]
    
    # return the top n most similar movies from the movies df
    top_titles_df = pd.DataFrame(df_movies.iloc[movie_indices]['title'])
    top_titles_df['sim_scores'] = scores
    top_titles_df['ranking'] = range(1, len(top_titles_df) + 1)
    
    return top_titles_df, sim_scores_all

In [13]:
def get_top_movies(movie_list):
    similarity = np.load('../recommender_system_2/movies_similarity.npy')
    df_movies = pd.read_csv('../recommender_system_2/movies.csv')

    user_scores = pd.DataFrame(df_movies['title'])
    user_scores['sim_scores'] = 0.0

    # top number of scores to be considered for each movie
    number_of_recommendations = 10000
    for movie_name in movie_list:
        try:
            top_titles_df, _ = get_recommendations(movie_name, number_of_recommendations,
                                                similarity, df_movies)
        except:
            continue
        # aggregate the scores
        user_scores = pd.concat([user_scores, top_titles_df[['title', 'sim_scores']]]).groupby(['title'], as_index=False).sum({'sim_scores'})

    user_scores = user_scores[~user_scores['title'].isin(movie_list)]
    return user_scores.sort_values(by='sim_scores', ascending=False)[:20]

In [23]:
def get_netflix_recs(user_scores):
    netflix = pd.read_csv('../Netflix-Data/titles.csv')
    netflix = netflix[['title', 'type', 'imdb_score', 'imdb_votes', 'tmdb_popularity', 'tmdb_score']]
    netflix = netflix[netflix['type'] == 'MOVIE']

    recs = netflix.merge(user_scores, how='inner', on='title')
    recs = recs.sort_values(by='sim_scores', ascending=False)[:24]
    
    rated_recs = recs[['title', 'imdb_score', 'imdb_votes']]
    vote_counts = rated_recs[rated_recs['imdb_votes'].notnull()]['imdb_votes'].astype('int')
    vote_averages = rated_recs[rated_recs['imdb_score'].notnull()]['imdb_score'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(0.40)
    qualified = rated_recs[(rated_recs['imdb_votes'] >= m) & (rated_recs['imdb_votes'].notnull())
                    & (rated_recs['imdb_score'].notnull())]
    qualified['imdb_votes'] = qualified['imdb_votes'].astype('int')
    qualified['imdb_score'] = qualified['imdb_score'].astype('int')
    qualified['wr'] = qualified.apply(weighted_rating, args=(m, C), axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(20)

    if len(qualified) > 5:
        return list(qualified['title'])
    else:
        return list(recs['title'])

In [8]:
def weighted_rating(x, m, C):
    v = x['imdb_votes']
    R = x['imdb_score']
    return (v/(v+m) * R) + (m/(m+v) * C)

In [14]:
def get_not_netflix_recs(user_scores):
    netflix = pd.read_csv('../Netflix-Data/titles.csv')
    netflix = netflix[['title', 'type', 'imdb_score', 'imdb_votes', 'tmdb_popularity', 'tmdb_score']]
    netflix = netflix[netflix['type'] == 'MOVIE']

    not_netflix_recs = user_scores[~user_scores['title'].isin(netflix['title'])]
    not_netflix_recs = not_netflix_recs.sort_values(by='sim_scores', ascending=False)[:20]
    return list(not_netflix_recs['title'])

In [10]:
def get_common_movies(df1, df2):
    return list(set(df1['title']).intersection(set(df2['title'])))

In [26]:
df_1 = pd.read_csv('../Netflix-Data/Sample-History4.csv')
df_1 = clean_watch_history(df_1)

df1 = netflix_merge(df_1)

df1_movies = get_movie_list(df1)

df1_top_movies = get_top_movies(df1_movies)

# df1_netflix_recs = get_netflix_recs(df1_top_movies)

# df1_not_netflix_recs = get_not_netflix_recs(df1_top_movies)

  df['Title'] = df['History'].str.rsplit(': ', 2).str[0]
  df['Season'] = df['History'].str.rsplit(': ', 2).str[1]
  df['Episode'] = df['History'].str.rsplit(': ', 2).str[2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tv['Season'] = tv['Season'].str.split().str[1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies['Title'] = movies['History']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_g

In [28]:
df1_netflix_recs

['Wanted', 'Michael Clayton']

In [77]:
df_2 = pd.read_csv('../Netflix-Data/Sample-History3.csv')
df_2 = clean_watch_history(df_2)

df2 = netflix_merge(df_2)

df2_movies = get_movie_list(df2)

df2_top_movies = get_top_movies(df2_movies)

df2_netflix_recs = get_netflix_recs(df2_top_movies)

df2_not_netflix_recs = get_not_netflix_recs(df2_top_movies)

  df['Title'] = df['History'].str.rsplit(': ', 2).str[0]
  df['Season'] = df['History'].str.rsplit(': ', 2).str[1]
  df['Episode'] = df['History'].str.rsplit(': ', 2).str[2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tv['Season'] = tv['Season'].str.split().str[1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies['Title'] = movies['History']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_g

In [78]:
df2_netflix_recs, df2_not_netflix_recs

(['Kung Fu Panda 2', 'The Smurfs 2'],
 ['Despicable Me 2',
  'Kung Fu Panda',
  'Peter Pan',
  "The Emperor's Club",
  'Taking Woodstock',
  'Creature Comforts',
  'Kung Fu Panda: Secrets of the Furious Five',
  'Inside Out',
  'Jonah: A VeggieTales Movie',
  'Alpha Dog',
  "Bon Voyage, Charlie Brown (and Don't Come Back!)",
  'Maggie Simpson in The Longest Daycare',
  'The Double',
  'Alvin and the Chipmunks: The Squeakquel',
  'The Amazing Spider-Man 2',
  'Scary Movie 3',
  'Asterix and the Vikings',
  'Up',
  'Yogi Bear',
  'Life Is a Miracle'])

In [84]:
df1_top_movies

Unnamed: 0,title,sim_scores
3850,Louis C.K.: Chewed Up,0.882612
3855,Louis C.K.: Shameless,0.880678
3853,Louis C.K.: Live at the Beacon Theater,0.879025
3854,Louis C.K.: Oh My God,0.878389
3852,Louis C.K.: Live at The Comedy Store,0.87244
1283,Carrie Fisher: Wishful Drinking,0.78315
450,American: The Bill Hicks Story,0.771721
5234,Ricky Gervais Live 4: Science,0.747116
1003,Blue Collar Comedy Tour: The Movie,0.717803
2016,"Dylan Moran: Like, Totally...",0.678287


In [85]:
df2_top_movies

Unnamed: 0,title,sim_scores
1800,Despicable Me 2,0.843454
3581,Kung Fu Panda,0.73599
4849,Peter Pan,0.727221
6718,The Emperor's Club,0.724746
6151,Taking Woodstock,0.723275
1580,Creature Comforts,0.711249
3584,Kung Fu Panda: Secrets of the Furious Five,0.706844
3252,Inside Out,0.699861
3413,Jonah: A VeggieTales Movie,0.686626
402,Alpha Dog,0.652767


In [89]:
list(set(df1_top_movies['title']).intersection(set(df2_top_movies['title'])))

[]