In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet

In [2]:
# def get_recommendations(title, cosine_sim, titles, indices):
#     idx = indices[title]
#     sim_scores = list(enumerate(cosine_sim[idx]))
#     sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
#     sim_scores = sim_scores[1:31]
#     movie_indices = [i[0] for i in sim_scores]
#     return titles.iloc[movie_indices]

## Metadata

In [2]:
tv = pd.read_csv('../Netflix-Data/titles.csv')
tv = tv[tv['type'] == 'SHOW']

In [3]:
cols = ['id', 'title', 'description', 'release_year', 'genres', 'imdb_score', 'imdb_votes', 'tmdb_popularity', 'tmdb_score']
tv = tv[cols]

In [4]:
credits = pd.read_csv('../Netflix-Data/credits.csv')
credits = credits.dropna(subset=['character'])

In [5]:
credits["name"] = credits["name"].str.replace(" ", "")
credits = credits.groupby(['id']).agg({'name': ' '.join}).reset_index()

In [6]:
tv = tv.merge(credits, on='id')

In [7]:
tv['description'] = tv['description'].fillna('')
tv['name'] = tv['name'].fillna('')
tv['tags'] = tv['description'] + ' ' +  tv['name']
tv['tags'] = tv['tags'].str.lower()

In [8]:
count = CountVectorizer(analyzer='word',ngram_range=(1, 3),min_df=0, stop_words='english')
count_matrix = count.fit_transform(tv['tags'])

In [9]:
similarity = cosine_similarity(count_matrix, count_matrix)

In [6]:
np.save('tv_similarity', similarity)

In [7]:
tv.to_csv('tv.csv')

## Generate Recs

In [10]:
similarity = np.load('tv_similarity.npy')

In [11]:
tv = pd.read_csv('tv.csv')

In [2]:
def weighted_rating(x, m, C):
    v = x['imdb_votes']
    R = x['imdb_score']
    return (v/(v+m) * R) + (m/(m+v) * C)

In [3]:
def improved_recommendations(title, cosine_sim, tv):
    tv = tv.reset_index()
    indices = pd.Series(tv.index, index=tv['title'])

    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:26]
    tv_indices = [i[0] for i in sim_scores]
    
    tv_df = tv.iloc[tv_indices][['title', 'imdb_score', 'imdb_votes']]
    vote_counts = tv_df[tv_df['imdb_votes'].notnull()]['imdb_votes'].astype('int')
    vote_averages = tv_df[tv_df['imdb_score'].notnull()]['imdb_score'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(0.60)
    qualified = tv_df[(tv_df['imdb_votes'] >= m) & (tv_df['imdb_votes'].notnull())
                       & (tv_df['imdb_score'].notnull())]
    qualified['imdb_votes'] = qualified['imdb_votes'].astype('int')
    qualified['imdb_score'] = qualified['imdb_score'].astype('int')
    qualified['wr'] = qualified.apply(weighted_rating, args=(m, C), axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(20)
    return qualified

In [None]:
improved_recommendations("Breaking Bad", similarity, tv)

In [15]:
tv_list = ['Breaking Bad', 'Better Call Saul', 'Ozark', 'Bojack Horseman', 
           'Ginny & Georgia', 'Seinfeld']

In [16]:
user_scores = pd.DataFrame(tv['title'])
user_scores['wr'] = 0.0

In [17]:
# read in tv and similarity here
for tv_name in tv_list:
    try:
        top_titles_df = improved_recommendations(tv_name, similarity, tv)
    except:
        continue
    # aggregate the scores
    user_scores = pd.concat([user_scores, top_titles_df[['title', 'wr']]]).groupby(['title'], as_index=False).sum({'wr'})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  qualified['imdb_votes'] = qualified['imdb_votes'].astype('int')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  qualified['imdb_score'] = qualified['imdb_score'].astype('int')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  qualified['wr'] = qualified.apply(weighted_rating, args=(m, C), axis=1)
A val

In [18]:
user_scores = user_scores[~user_scores['title'].isin(tv_list)]
user_scores.sort_values(by='wr', ascending=False)[:20]

Unnamed: 0,title,wr
779,Master of None,15.750745
1421,The Spectacular Spider-Man,15.369437
1652,jeen-yuhs,15.205948
844,My Mister,8.13484
951,Peaky Blinders,7.984577
94,Arrested Development,7.973995
758,Maid,7.959798
761,Making a Murderer,7.951541
108,Atypical,7.916289
1481,Top Boy,7.746305


## Test

In [4]:
def clean_watch_history(df):
    '''
    Function that cleans a given users watch history data
    Input: dataframe
    Output: (cleaned) dataframe
    '''
    df = df.rename(columns = {"Title": "History"})
    df['Date'] = pd.to_datetime(df['Date'])
    df['Day']= df['Date'].dt.day
    df['Month']= df['Date'].dt.month
    df['Year']= df['Date'].dt.year
    df['Day_of_week'] = df['Date'].dt.dayofweek

    df['Title'] = df['History'].str.rsplit(': ', 2).str[0]
    df['Season'] = df['History'].str.rsplit(': ', 2).str[1]
    df['Episode'] = df['History'].str.rsplit(': ', 2).str[2]

    df['Type'] = df['Episode'].apply(lambda x : 'Movie' if (pd.isna(x)==True) else 'TV')

    tv = df[df['Type']!='Movie']
    tv['Season'] = tv['Season'].str.split().str[1]

    movies = df[df['Type']=='Movie']
    movies['Title'] = movies['History']
    movies['Season'] = None

    df = pd.concat([movies, tv], ignore_index = True)
    return df

In [5]:
def netflix_merge(df):
    '''
    Function that merges given watch history with netflix dataset,
    and returns merged dataset
    '''
    titles = pd.read_csv('../Netflix-Data/titles.csv')
    merged = df.merge(titles, left_on = 'Title', right_on = 'title', how = 'inner')
    cols_to_drop = ['production_countries', 'imdb_id', 'age_certification', 
                    'title', 'seasons', 'tmdb_popularity']
    merged = merged.drop(cols_to_drop, axis = 1)
    return merged

In [6]:
def get_tv_list(df):
    return list(df[df['type'] == "SHOW"]['Title'])

In [7]:
def get_tv_recs(tv_list):

    # read in tv and similarity here
    similarity = np.load('tv_similarity.npy')
    tv = pd.read_csv('tv.csv')

    user_scores = pd.DataFrame(tv['title'])
    user_scores['wr'] = 0.0

    for tv_name in tv_list:
        try:
            top_titles_df = improved_recommendations(tv_name, similarity, tv)
        except:
            continue
        # aggregate the scores
        user_scores = pd.concat([user_scores, top_titles_df[['title', 'wr']]]).groupby(['title'], as_index=False).sum({'wr'})

    user_scores = user_scores[~user_scores['title'].isin(tv_list)]
    user_scores = user_scores.sort_values(by='wr', ascending=False)[:30]
    return list(user_scores['title'])

In [12]:
def get_common_tv(recs1, recs2):
    return list(set(recs1).intersection(set(recs2)))

In [9]:
df_1 = pd.read_csv('../Netflix-Data/Sample-History2.csv')
df_1 = clean_watch_history(df_1)

df1 = netflix_merge(df_1)

df1_tv = get_tv_list(df1)

recs1 = get_tv_recs(df1_tv)

  df['Title'] = df['History'].str.rsplit(': ', 2).str[0]
  df['Season'] = df['History'].str.rsplit(': ', 2).str[1]
  df['Episode'] = df['History'].str.rsplit(': ', 2).str[2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tv['Season'] = tv['Season'].str.split().str[1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies['Title'] = movies['History']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_g

In [10]:
df_2 = pd.read_csv('../Netflix-Data/Sample-History3.csv')
df_2 = clean_watch_history(df_2)

df2 = netflix_merge(df_2)

df2_tv = get_tv_list(df2)

recs2 = get_tv_recs(df2_tv)

  df['Title'] = df['History'].str.rsplit(': ', 2).str[0]
  df['Season'] = df['History'].str.rsplit(': ', 2).str[1]
  df['Episode'] = df['History'].str.rsplit(': ', 2).str[2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tv['Season'] = tv['Season'].str.split().str[1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies['Title'] = movies['History']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_g

In [13]:
get_common_tv(recs1, recs2)

['The Spectacular Spider-Man',
 'Bonding',
 'Boys Over Flowers',
 'Chilling Adventures of Sabrina']