In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet

In [2]:
tv = pd.read_csv('../Netflix-Data/titles.csv')
tv = tv[tv['type'] == 'SHOW']

In [3]:
cols = ['id', 'title', 'description', 'release_year', 'genres', 'imdb_score', 'imdb_votes', 'tmdb_popularity', 'tmdb_score']
tv = tv[cols]

## Description

In [4]:
tv['description'] = tv['description'].fillna('')

In [5]:
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(tv['description'])

In [6]:
cosine_sim1 = linear_kernel(tfidf_matrix, tfidf_matrix)

In [7]:
tv = tv.reset_index()
titles = tv['title']
indices = pd.Series(tv.index, index=tv['title'])

In [8]:
def get_recommendations(title, cosine_sim):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

In [9]:
get_recommendations('Better Call Saul', cosine_sim1).head(10)

971     Jimmy: The True Story of a True Idiot
368                                  Aquarius
1639                                     Maid
1378                              Brotherhood
1782                                 Vincenzo
1801                               Making Fun
786                                   The Fix
1766     Jimmy Savile: A British Horror Story
40                               Breaking Bad
Name: title, dtype: object

## Metadata

In [10]:
credits = pd.read_csv('../Netflix-Data/credits.csv')
credits = credits.dropna(subset=['character'])

In [11]:
credits["name"] = credits["name"].str.replace(" ", "")

In [12]:
credits = credits.groupby(['id']).agg({'name': ' '.join}).reset_index()

In [13]:
credits

Unnamed: 0,id,name
0,tm1000037,LunaWedler JannisNiewöhner MilanPeschel EdinHa...
1,tm1000147,GuyPearce MatildaAnnaIngridLutz TravisFimmel J...
2,tm100015,IdrisElba PaulWalker MattDillon MichaelEaly Ja...
3,tm1000166,GlennFredly MarcelloTahitoe AndienAisyah Canti...
4,tm1000185,AdriannaChlebicka MateuszBanasiuk MirosławBaka...
...,...,...
4950,ts97584,SebastianPerry
4951,ts9794,JerrySeinfeld
4952,ts98252,TomMcGrath JohnDiMaggio DavidSchwimmer AndyRic...
4953,ts98316,JohnHurt


In [14]:
tv = tv.merge(credits, on='id')

In [15]:
tv['description'] = tv['description'].fillna('')
tv['name'] = tv['name'].fillna('')

In [16]:
tv['tags'] = tv['description'] + ' ' +  tv['name']

In [17]:
tv.head(1)

Unnamed: 0,index,id,title,description,release_year,genres,imdb_score,imdb_votes,tmdb_popularity,tmdb_score,name,tags
0,5,ts22164,Monty Python's Flying Circus,A British sketch comedy series with the shows ...,1969,"['comedy', 'european']",8.8,73424.0,17.617,8.306,GrahamChapman MichaelPalin TerryJones EricIdle...,A British sketch comedy series with the shows ...


In [18]:
stemmer = SnowballStemmer('english')

In [19]:
tv['tags'] = tv['tags'].str.lower()

In [20]:
count = CountVectorizer(analyzer='word',ngram_range=(1, 3),min_df=0, stop_words='english')
count_matrix = count.fit_transform(tv['tags'])

In [21]:
cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

In [22]:
tv = tv.reset_index()
titles = tv['title']
indices = pd.Series(tv.index, index=tv['title'])

In [67]:
get_recommendations('Breaking Bad', cosine_sim2)[:10]

  get_recommendations('Breaking Bad', cosine_sim2)[:10]


908                     Merli: Dare to Know
342                          W/ Bob & David
333         The Adventures of Puss in Boots
236                        Better Call Saul
1404                        Ginny & Georgia
1631                        So Not Worth It
299                       Making a Murderer
919                                Ultraman
1284                 Arashi's Diary: Voyage
351     Inside the World's Toughest Prisons
Name: title, dtype: object

In [36]:
def weighted_rating(x, m, C):
    v = x['imdb_votes']
    R = x['imdb_score']
    return (v/(v+m) * R) + (m/(m+v) * C)

In [48]:
def improved_recommendations(title, cosine_sim):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:26]
    movie_indices = [i[0] for i in sim_scores]
    
    movies = tv.iloc[movie_indices][['title', 'imdb_score', 'imdb_votes']]
    vote_counts = movies[movies['imdb_votes'].notnull()]['imdb_votes'].astype('int')
    vote_averages = movies[movies['imdb_score'].notnull()]['imdb_score'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(0.60)
    qualified = movies[(movies['imdb_votes'] >= m) & (movies['imdb_votes'].notnull())
                       & (movies['imdb_score'].notnull())]
    qualified['imdb_votes'] = qualified['imdb_votes'].astype('int')
    qualified['imdb_score'] = qualified['imdb_score'].astype('int')
    qualified['wr'] = qualified.apply(weighted_rating, args=(m, C), axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(20)
    return qualified

In [49]:
improved_recommendations("Breaking Bad", cosine_sim2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  qualified['imdb_votes'] = qualified['imdb_votes'].astype('int')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  qualified['imdb_score'] = qualified['imdb_score'].astype('int')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  qualified['wr'] = qualified.apply(weighted_rating, args=(m, C), axis=1)


Unnamed: 0,title,imdb_score,imdb_votes,wr
523,My Mister,9,6043,8.13484
236,Better Call Saul,8,438575,7.989261
429,Ozark,8,300454,7.984382
299,Making a Murderer,8,94419,7.951541
50,The Spectacular Spider-Man,8,14996,7.74414
505,Chilling Adventures of Sabrina,7,91958,6.987573
1404,Ginny & Georgia,7,46343,6.976214
342,W/ Bob & David,7,4160,6.846263
386,Spotless,7,3663,6.835687
636,Life Sentence,6,3899,6.318231


In [62]:
tv_list = ['Breaking Bad', 'Better Call Saul', 'Ozark', 'Bojack Horseman', 
           'Ginny & Georgia']

In [63]:
user_scores = pd.DataFrame(tv['title'])
user_scores['wr'] = 0.0

In [64]:
umber_of_recommendations = 10000
for tv_name in tv_list:
    try:
        top_titles_df = improved_recommendations(tv_name, cosine_sim2)
    except:
        pass
    # aggregate the scores
    user_scores = pd.concat([user_scores, top_titles_df[['title', 'wr']]]).groupby(['title'], as_index=False).sum({'wr'})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  qualified['imdb_votes'] = qualified['imdb_votes'].astype('int')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  qualified['imdb_score'] = qualified['imdb_score'].astype('int')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  qualified['wr'] = qualified.apply(weighted_rating, args=(m, C), axis=1)
A val

In [65]:
user_scores = user_scores[~user_scores['title'].isin(tv_list)]
user_scores.sort_values(by='wr', ascending=False)[:20]

Unnamed: 0,title,wr
1652,jeen-yuhs,22.696336
951,Peaky Blinders,15.969155
94,Arrested Development,15.94799
1481,Top Boy,15.492611
1421,The Spectacular Spider-Man,15.369437
179,Bloodline,13.979059
578,How to Sell Drugs Online (Fast),13.968345
990,Queen of the South,13.96661
1194,Tales of the City,13.919607
844,My Mister,8.13484
