In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_style('white', { 'axes.spines.right': False, 'axes.spines.top': False})
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
from nltk.corpus import stopwords

In [None]:
movies = pd.read_csv('../Netflix-Data/titles.csv')
movies = movies[movies['type'] == 'MOVIE']

In [None]:
cols = ['id', 'title', 'description', 'release_year', 'genres', 'imdb_score', 'imdb_votes', 'tmdb_popularity', 'tmdb_score']
movies = movies[cols]

In [None]:
credits = pd.read_csv('../Netflix-Data/credits.csv')
credits = credits.dropna(subset=['character'])

In [None]:
credits["name"] = credits["name"].str.replace(" ", "")
credits = credits.groupby(['id']).agg({'name': ' '.join}).reset_index()

In [None]:
movies = movies.merge(credits, on='id')

In [None]:
movies['description'] = movies['description'].fillna('')
movies['name'] = movies['name'].fillna('')
movies['tags'] = movies['description'] + ' ' +  movies['name']
movies['tags'] = movies['tags'].str.lower()

In [None]:
count = CountVectorizer(analyzer='word',ngram_range=(1, 3),min_df=0, stop_words='english')
count_matrix = count.fit_transform(movies['tags'])

In [None]:
similarity = cosine_similarity(count_matrix, count_matrix)

In [None]:
np.save('netflix_movies_similarity', similarity)

In [None]:
movies.to_csv('netflix_movies.csv')

In [None]:
similarity = np.load('netflix_movies_similarity.npy')

In [None]:
movies = pd.read_csv('netflix_movies.csv')

In [None]:
def weighted_rating(x, m, C):
    v = x['imdb_votes']
    R = x['imdb_score']
    return (v/(v+m) * R) + (m/(m+v) * C)

In [None]:
def improved_recommendations(title, cosine_sim, tv):
    tv = tv.reset_index()
    indices = pd.Series(tv.index, index=tv['title'])

    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:26]
    tv_indices = [i[0] for i in sim_scores]
    
    tv_df = tv.iloc[tv_indices][['title', 'imdb_score', 'imdb_votes']]
    vote_counts = tv_df[tv_df['imdb_votes'].notnull()]['imdb_votes'].astype('int')
    vote_averages = tv_df[tv_df['imdb_score'].notnull()]['imdb_score'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(0.60)
    qualified = tv_df[(tv_df['imdb_votes'] >= m) & (tv_df['imdb_votes'].notnull())
                       & (tv_df['imdb_score'].notnull())]
    qualified['imdb_votes'] = qualified['imdb_votes'].astype('int')
    qualified['imdb_score'] = qualified['imdb_score'].astype('int')
    qualified['wr'] = qualified.apply(weighted_rating, args=(m, C), axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(20)
    return qualified

In [None]:
movies_list = ['Taxi Driver', 'The Guilty', 'Red Notice', 'Bird Box', 'Intrusion',
               'Don\'t Look Up', 'The Power of the Dog']

In [None]:
user_scores = pd.DataFrame(movies['title'])
user_scores['wr'] = 0.0

In [None]:
# read in tv and similarity here
for movie_name in movies_list:
    try:
        top_titles_df = improved_recommendations(movie_name, similarity, movies)
    except:
        continue
    # aggregate the scores
    user_scores = pd.concat([user_scores, top_titles_df[['title', 'wr']]]).groupby(['title'], as_index=False).sum({'wr'})

In [None]:
user_scores = user_scores[~user_scores['title'].isin(movies_list)]
user_scores.sort_values(by='wr', ascending=False)[:20]

# Test

In [2]:
def clean_watch_history(df):
    '''
    Function that cleans a given users watch history data
    Input: dataframe
    Output: (cleaned) dataframe
    '''
    df = df.rename(columns = {"Title": "History"})
    df['Date'] = pd.to_datetime(df['Date'])
    df['Day']= df['Date'].dt.day
    df['Month']= df['Date'].dt.month
    df['Year']= df['Date'].dt.year
    df['Day_of_week'] = df['Date'].dt.dayofweek

    df['Title'] = df['History'].str.rsplit(': ', 2).str[0]
    df['Season'] = df['History'].str.rsplit(': ', 2).str[1]
    df['Episode'] = df['History'].str.rsplit(': ', 2).str[2]

    df['Type'] = df['Episode'].apply(lambda x : 'Movie' if (pd.isna(x)==True) else 'TV')

    tv = df[df['Type']!='Movie']
    tv['Season'] = tv['Season'].str.split().str[1]

    movies = df[df['Type']=='Movie']
    movies['Title'] = movies['History']
    movies['Season'] = None

    df = pd.concat([movies, tv], ignore_index = True)
    return df

In [3]:
def netflix_merge(df):
    '''
    Function that merges given watch history with netflix dataset,
    and returns merged dataset
    '''
    titles = pd.read_csv('../Netflix-Data/titles.csv')
    merged = df.merge(titles, left_on = 'Title', right_on = 'title', how = 'inner')
    cols_to_drop = ['production_countries', 'imdb_id', 'age_certification', 
                    'title', 'seasons', 'tmdb_popularity']
    merged = merged.drop(cols_to_drop, axis = 1)
    return merged

In [4]:
def get_tv_list(df, type):
    return list(df[df['type'] == type]['Title'])

In [26]:
def get_tv_recs(tv_list, type):

    # read in tv and similarity here
    if type == "MOVIE":
        similarity = np.load('../recommender_system_2/netflix_movies_similarity.npy')
        tv = pd.read_csv('../recommender_system_2/netflix_movies.csv')
    else:
        similarity = np.load('../recommender_system_2/tv_similarity.npy')
        tv = pd.read_csv('../recommender_system_2/tv.csv')

    user_scores = pd.DataFrame(tv['title'])
    user_scores['wr'] = 0.0

    for tv_name in tv_list:
        try:
            top_titles_df = improved_recommendations(tv_name, similarity, tv)
        except:
            continue
        # aggregate the scores
        user_scores = pd.concat([user_scores, top_titles_df[['title', 'wr']]]).groupby(['title'], as_index=False).sum({'wr'})

    user_scores = user_scores[~user_scores['title'].isin(tv_list)]
    user_scores = user_scores.sort_values(by='wr', ascending=False)[:50]
    return user_scores

In [27]:
def improved_recommendations(title, cosine_sim, tv):
    tv = tv.reset_index()
    indices = pd.Series(tv.index, index=tv['title'])

    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:26]
    tv_indices = [i[0] for i in sim_scores]
    
    tv_df = tv.iloc[tv_indices][['title', 'imdb_score', 'imdb_votes']]
    vote_counts = tv_df[tv_df['imdb_votes'].notnull()]['imdb_votes'].astype('int')
    vote_averages = tv_df[tv_df['imdb_score'].notnull()]['imdb_score'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(0.60)
    qualified = tv_df[(tv_df['imdb_votes'] >= m) & (tv_df['imdb_votes'].notnull())
                       & (tv_df['imdb_score'].notnull())]
    qualified['imdb_votes'] = qualified['imdb_votes'].astype('int')
    qualified['imdb_score'] = qualified['imdb_score'].astype('int')
    qualified['wr'] = qualified.apply(weighted_rating, args=(m, C), axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(50)
    return qualified

In [7]:
def weighted_rating(x, m, C):
    v = x['imdb_votes']
    R = x['imdb_score']
    return (v/(v+m) * R) + (m/(m+v) * C)

In [28]:
df_1 = pd.read_csv('../Netflix-Data/Sample-History3.csv')
df_2 = pd.read_csv('../Netflix-Data/Sample-History2.csv')

In [29]:
df_1 = clean_watch_history(df_1)
df1 = netflix_merge(df_1)

df1_tv = get_tv_list(df1, "SHOW")
recs1 = get_tv_recs(df1_tv, "SHOW")

  df['Title'] = df['History'].str.rsplit(': ', 2).str[0]
  df['Season'] = df['History'].str.rsplit(': ', 2).str[1]
  df['Episode'] = df['History'].str.rsplit(': ', 2).str[2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tv['Season'] = tv['Season'].str.split().str[1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies['Title'] = movies['History']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_g

In [30]:
df_2 = clean_watch_history(df_2)
df2 = netflix_merge(df_2)

df2_tv = get_tv_list(df2, "SHOW")
recs2 = get_tv_recs(df2_tv, "SHOW")

  df['Title'] = df['History'].str.rsplit(': ', 2).str[0]
  df['Season'] = df['History'].str.rsplit(': ', 2).str[1]
  df['Episode'] = df['History'].str.rsplit(': ', 2).str[2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tv['Season'] = tv['Season'].str.split().str[1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies['Title'] = movies['History']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_g

In [31]:
recs2

Unnamed: 0,title,wr
821,Monty Python's Flying Circus,198.188514
991,Queer Eye,193.874461
795,Middleditch & Schwartz,193.030596
555,Hip Hop Evolution,185.97814
593,I Think You Should Leave with Tim Robinson,179.851298
1314,The House of Flowers,172.1752
500,Grand Army,171.894973
1311,The Hook Up Plan,171.4648
1218,The Baker and the Beauty,171.002531
161,Bill Nye Saves the World,109.123922


In [32]:
def overlap(df1, df2):
    on = df1.columns[0]
    val = str(df1.columns[1])
    val_x = val+'_x'
    val_y = val+'_y'

    df = pd.merge(df1, df2, how='inner', on=on)
    df['score'] = (df[val_x] + df[val_y])/2
    df = df.sort_values(by='score', ascending=False)
    return list(df[on])

In [33]:
overlap(recs1, recs2)

['Kota Factory',
 'Chilling Adventures of Sabrina',
 'The I-Land',
 'My Mister',
 'Queer Eye',
 'The Spectacular Spider-Man',
 'Bonding',
 'Boys Over Flowers']