In [2]:
import pandas as pd
import numpy as np

from sklearn.metrics.pairwise import (cosine_similarity,
                                     euclidean_distances,
                                     cosine_distances)

from thefuzz import process
import fuzzyset

In [3]:
#loading in the movies
movies = pd.read_csv('./data/movies_with_review_id.csv')
movies.drop(columns=['Unnamed: 0'], inplace=True)
movies.rename(columns={'primary_title_x':'primary_title',
                       'release_year_x':'release_year'}, inplace=True)
movies.set_index('primary_title', inplace=True)

#making the tomato score into a float
movies['tomato_score'] = movies['tomato_score'].apply(
    lambda x: str(x).replace('%','')).astype('float')

#making the metacritic score into a float
movies['metacritic_score'] = movies['metacritic_score']\
                            .apply(lambda x: eval(x) if x == x else np.nan)

#getting dummies for mpaa_rating
movies = pd.get_dummies(movies, columns=['mpaa_rating'])

movies.reset_index(inplace=True)

In [4]:
#the director column got messed up at some point, let's fix it
director = pd.read_csv('./data/with_wiki_scrape_complete.csv')
director.drop(columns=['Unnamed: 0',
                       'title', 
                       'primary_title', 
                       'original_title',
                       'release_year', 
                       'runtime', 
                       'genres', 
                       'writers', 
                       'rating',
                       'votes', 
                       'cast_crew', 
                       'wiki_title', 
                       'scraped_data', 
                       'tomato_score',
                       'metacritic_score', 
                       'mpaa_rating', 
                       'wiki_scrape'
                      ], inplace=True)
director['directors'] = director['directors'].apply(\
            lambda x: x.replace('nm', '').strip().split(','))

#keeping only the primary director for recommendation simplicity
director['directors'] = director['directors'].apply(lambda x: x[0])

In [5]:
#merging director back into the movies df
movies.drop(columns=['director_1','director_2', 'director_3'], inplace=True)

movies = pd.merge(movies, director, on='tconst', how='left')

# # setting the index as title again
movies.set_index('primary_title', inplace=True)

In [6]:
#filling na's in the tomato and metacritic scores
movies['tomato_score'].fillna(movies['rating']*10, inplace=True)
movies['metacritic_score'].fillna(movies['rating']/10, inplace=True)

In [7]:
#getting complete list of genres from genre_1, 2, and 3
genre_list1 = list(pd.DataFrame(movies['genre_1'].value_counts()).\
                                   reset_index()['index'])
genre_list2 = list(pd.DataFrame(movies['genre_2'].value_counts()).\
                                   reset_index()['index'])
genre_list3 = list(pd.DataFrame(movies['genre_3'].value_counts()).\
                                   reset_index()['index'])

for item in genre_list2:
    genre_list1.append(item)
for item in genre_list3:
    genre_list1.append(item)
    
genre_list=set(genre_list1)
#genre_list now contains all of the possible genre tags so we can start creating dummies

In [8]:
#creating the blank dummy columns
for genre in genre_list:
    movies[genre] = 0

#populating the dummy columns
for i in range(0, len(list(movies['genre_1']))):
    for genre in genre_list:
        if movies['genre_1'].iloc[i] == genre:
            movies[genre].iloc[i] = 1
        elif movies['genre_2'].iloc[i] == genre:
            movies[genre].iloc[i] = 1
        elif movies['genre_3'].iloc[i] == genre:
            movies[genre].iloc[i] = 1
        else:
            movies[genre].iloc[i] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [9]:
movies3 = movies.drop(columns=['genre_1','genre_2','genre_3'])

In [10]:
#creating a function to add the necessary zeroes back onto the cast identifiers
def add_zeroes(identifier):
    number_of_zeroes = 8 - len(str(identifier))
    return (number_of_zeroes * '0') + str(identifier)

#apply the zeroes function to all 10 cast columns, and writer columns
for i in range(1, 11):
    movies3['cast_' + str(i)] = movies3['cast_' + str(i)].apply(add_zeroes)
   
for i in range(1, 4):
    movies3['writer_' + str(i)] = movies3['writer_' + str(i)].apply(add_zeroes)

In [11]:
#getting dummies for the writers columns

for i in range(1, 4):
    movies3 = pd.get_dummies(movies3, columns=['writer_'+str(i)])
    
#creating a list of all the writers ids:
writer_ids = []
for column_name in movies3.columns:
    if column_name[:6] == 'writer':
        writer_ids.append(column_name[9:])

In [12]:
#doing horizontal summations to get a single column for each writer
writer_df = pd.DataFrame()
for writer_id in writer_ids:
    temp_list = []
    for column_name in movies3.columns:
        if str(column_name[9:]) == str(writer_id):
            temp_list.append(column_name)
    temp_df = pd.DataFrame()
    temp_df[writer_id] = movies3[temp_list].sum(axis=1)
    writer_df['writer_id_'+ str(writer_id)] = temp_df[writer_id]

In [13]:
#taking the new and improved writers columns and adding them to the old df

for column_name in writer_df.columns:
    movies[column_name] = list(writer_df[column_name])

While I do believe that the cast columns could improve the performance of the recommender, they ultimately slow down the whole process too much for me to iteratively test my code and model, so I have commented out the related cells below

In [14]:
#getting dummies for the cast columns

for i in range(1, 11):
    movies3 = pd.get_dummies(movies3, columns=['cast_'+str(i)])

In [15]:
#creating a list of all the cast ids:
cast_ids = []
for column_name in movies3.columns:
    if column_name[:4] == 'cast':
        cast_ids.append(column_name[7:])

In [None]:
#doing horizontal summations to get a single column for each cast member
cast_df = pd.DataFrame()
for cast_id in cast_ids:
    temp_list = []
    for column_name in movies3.columns:
        if str(column_name[7:]) == str(cast_id):
            temp_list.append(column_name)
    temp_df = pd.DataFrame()
    temp_df[cast_id] = movies3[temp_list].sum(axis=1)
    cast_df['cast_id_'+ str(cast_id)] = temp_df[cast_id]

In [None]:
#taking the new and improved cast columns and adding them to the old df

for column_name in cast_df.columns:
    movies[column_name] = list(cast_df[column_name])

In [None]:
movies.to_csv('./data/recommender_df.csv')

In [None]:
#Dropping columns that we will not be considering in our recommendations
movies2 = movies.drop(columns=['genre_1', 
                     'genre_2', 
                     'genre_3', 
                     'writer_1', 
                     'writer_2', 
                     'writer_3', 
                     'cast_1',
                     'cast_2',
                    'cast_3',
                    'cast_4',
                    'cast_5',
                    'cast_6',
                    'cast_7',
                    'cast_8',
                    'cast_9',
                    'cast_10',
                    'plot',
#                     'tconst',
                    'movieId',
                    'votes'])



In [None]:
movies2 = pd.get_dummies(movies2, columns=['directors'])

In [None]:
movies2.drop_duplicates(keep='first', inplace=True)

In [None]:
movies = movies2.reset_index()

In [None]:
movies2.set_index('tconst', inplace=True)

In [None]:
cos_df = pd.DataFrame(cosine_similarity(movies2, movies2),
                                columns=movies2.index,
                                index=movies2.index)

cos_df.reset_index(inplace=True)

In [None]:
cos_df.head()

In [None]:
#getting the top 20 recommended movies for a movie (excluding the film itself)
list(cos_df.sort_values('tt1657299', ascending=False)['tconst'][1:21])

In [None]:
euc_df = pd.DataFrame(euclidean_distances(movies2, movies2), 
             columns=movies2.index, 
             index=movies2.index)

In [None]:
movies.head()

In [None]:
movies2.head()

In [None]:
#getting a list of all the unique identifiers from movies2 that we can recommned
movie_id = pd.DataFrame(list(movies2.index), columns=['tconst'])

#joining the titles back on to the id's
movie_id = pd.merge(movie_id, movies, on='tconst', how='left')

#restricting to just ID and title
movie_id = movie_id[['tconst','primary_title', 'release_year']]

In [None]:
#creating a column with title and release year for clarification purposes later

movie_id['release_year'] = \
movie_id['release_year'].apply(lambda x: ' (' + str(x) + ')')

movie_id['full_title'] = \
movie_id.apply(lambda row: row['primary_title'] + row['release_year'], axis=1)

In [None]:
movie_id.head()

In [None]:
movie_id.to_csv('./data/movie_ids_and_year.csv', index=False)

In [None]:
cos_df.to_csv('./data/product_based_recommender_df.csv', index=False)

In [None]:
#creating the list of titles for fuzzy matching
title_list = fuzzyset.FuzzySet(list(movie_id['full_title']))

def intersection(list1, list2):
    return [item for item in list1 if item in list2]

def new_user_recommender():
    correct = 'n'
    
    while correct != 'y':
        
        #asking the user to input films they like
        liked_films = input("Please enter 3 film titles that you enjoy, \nseparated by forward slashes (/):\n")

        if len(liked_films.split('/')) == 3:
            #getting the matching film titles from our list of films
            film1 = liked_films.split('/')[0]
            film2 = liked_films.split('/')[1]
            film3 = liked_films.split('/')[2]

            #appending the matching titles to a list
            liked_films = []
            liked_films.append(title_list.get(film1)[0][1])
            liked_films.append(title_list.get(film2)[0][1])
            liked_films.append(title_list.get(film3)[0][1])

            #printing the list of films for validation
            print('')
            for film in liked_films:
                print(film)

            #asking the user to validate the selected list of films
            correct = input("Please verify that the above titles are correct y/n:\n")
        
        else:
            correct = 'n'
            print('Error, please try again')
        
#creating the list of suggestions for the three selected titles

    #getting the unique ids for the liked movies:
    liked_films_tconst = []
    for film in liked_films:
        liked_films_tconst.append(\
            movie_id[movie_id['full_title'] == film]['tconst'].values[0])
    
    #getting the top 20 recommended films for each liked movie
    rec_list_1 = list(cos_df.sort_values(liked_films_tconst[0], ascending=False)\
                      ['tconst'][1:21])
    rec_list_2 = list(cos_df.sort_values(liked_films_tconst[1], ascending=False)\
                      ['tconst'][1:21])
    rec_list_3 = list(cos_df.sort_values(liked_films_tconst[2], ascending=False)\
                      ['tconst'][1:21])
    
    #translating the tconst back to titles
    rec_list_1_titles = []
    rec_list_2_titles = []
    rec_list_3_titles = []
    
    for tconst in rec_list_1:
        rec_list_1_titles.append(\
            list(movie_id[movie_id['tconst'] == tconst]['full_title'])[0])
        
    for tconst in rec_list_2:
        rec_list_2_titles.append(\
            list(movie_id[movie_id['tconst'] == tconst]['full_title'])[0])
        
    for tconst in rec_list_3:
        rec_list_3_titles.append(\
            list(movie_id[movie_id['tconst'] == tconst]['full_title'])[0])
    
    #if there are films that are recommended based on more than 1 liked movie
    #prioritize these films
    final_recs = intersection(rec_list_1_titles, rec_list_2_titles)
    final_recs = intersection(final_recs, rec_list_3_titles)
    
    #only want to return 5 recommendations
    for i in range(0, 5):
        final_recs.extend([rec_list_1_titles[i],\
                            rec_list_2_titles[i],\
                            rec_list_3_titles[i]])
           
    print(final_recs[0], '\n', 
          final_recs[1], '\n',
          final_recs[2], '\n',
          final_recs[3], '\n',
          final_recs[4]
         )
    
    #asking the user to "rate" the films that we have recommended to see if we're doing well
    bad_format = 'y'
    
    while bad_format == 'y':
        film_reviews = input('Please rate the above films on how likely you are to watch them (1-5). Separate the scores with commas, \n')
        film_reviews = film_reviews.replace(' ','').split(',')
        for i in range(0, len(film_reviews)):
            film_reviews[i] = int(film_reviews[i])
        
         #checking to see if they have input their scores correctly
        if all(isinstance(x, int) for x in film_reviews) == True\
        and len(film_reviews) == 5\
        and min(film_reviews) >= 1\
        and max(film_reviews) <= 5:
            print('Thank You!')
            break
        else:
            bad_format = 'y'
            
    #saving their review scores to the reviews data frame for review based recs
    
    

In [None]:
new_user_recommender()

In [None]:
# building a person to person recommender

#import the ratings dataframe
ratings = pd.read_csv('./data/ml-25m/ml-25m/ratings.csv')
ratings.drop(columns=['timestamp'], inplace=True)

#import the unique id/title for each film
movie_ids = pd.read_csv('./data/movies_with_review_id.csv')
movie_ids = movie_ids[['tconst', 'primary_title_x','movieId']]

#limiting the ratings df to only the films that are in our movies database
ratings = ratings[ratings['movieId'].isin(list(movie_ids['movieId']))]

#joining film titles onto the ratings df
ratings = pd.merge(ratings, movie_ids, on='movieId', how='left') 

ratings.rename(columns={'primary_title_x':'primary_title'}, inplace=True)

ratings.drop(columns=['primary_title', 'movieId'], inplace=True)

In [None]:
#transforming the ratings dataframe into the required format
ratings = pd.pivot_table(
    ratings, 
    values = 'rating',
    columns = 'tconst',
    index = 'userId')

In [None]:
# ratings.T #fit a standar scaler to this data

In [None]:
ratings.shape

In [None]:
#normalizing each users reviews, as seen in our recommenders lesson

ratings_std = pd.DataFrame(index=ratings.index, columns = ratings.columns )

# We're basically implementing a StandardScaler that can handle nans here...
def stand(x, ave, std):
    try:
        return (x-ave) / std

    except:
        return x
    
for u in ratings.index:
    ave = np.mean(ratings.loc[u,:])
    std = np.std(ratings.loc[u,:])
    
    ratings_std.loc[u,:] = ratings.loc[u,:].apply(lambda x: stand(x, ave, std))
    
ratings_std = ratings_std + 5

In [None]:
# function to calculate similarity between any two users
# if they have fewer than n films in common, it'll return 0
def sim(user1, user2, n):
    #grabbing movies that the two users have in common, dropping movies that have not been reviewed
    commons = ratings.loc[[user1,user2]].dropna(axis=1)
    #if they have less than n movies in common, return 0
    if len(commons.columns)<n:
        return 0
    else:
        #if they have more than n columns in common, return the cos similarity of the two users
        return cosine_similarity(commons.loc[user1,:].values.reshape(1,-1), \
                                 commons.loc[user2,:].values.reshape(1,-1))[0][0]

In [None]:
# calculates estimated rating for a user + new film
def rating_estimator(user, title):
#     '''doesnt check if user has seen title.  
#     assumes we are only asking about unseen titles'''
    
    #list of users id's of people that have reviewed a title
    seers = list(ratings[title].dropna().index)
    
    #corresponding ratings for each user above, for a specified title
    reviews = list(ratings[title].dropna().values) 
    
    #create empty container
    sims = []
    
    #iterate over the list of people that have viewed a title
    for s in seers:
        
#        append similarities between new user and users who have seen the title
        sims.append(sim(user,s,10))
# score of how similar user and s are, weighted by how much seers liked the movie
#will help get the recommendation 
    return np.sum([i*j for i, j in zip(reviews, sims)]) / np.sum(sims) 

In [None]:
rating_estimator(1, 'tt0033373')

In [None]:
ratings