In [1]:
import pandas as pd
import json
import numpy as np
import requests
import sklearn

# Parse the stringified features into their corresponding python objects
from ast import literal_eval


In [3]:
tvs = pd.read_csv('datasets/mine/top_voted_tvs.csv')
tvs.head(2)

Unnamed: 0,backdrop_path,first_air_date,id,name,origin_country,original_language,overview,popularity,poster_path,vote_average,vote_count,genres,year,production_companies,networks,cast,creators,keywords
0,/9faGSFi5jam6pDWGNd0p8JcJgXQ.jpg,2008-01-20,1396,Breaking Bad,['US'],en,"When Walter White, a New Mexico chemistry teac...",362.838,/dY2FtSKXFCvutAHNgYax27oqwqX.jpg,8.9,12189,"['Drama', 'Crime']",2008.0,"['Sony Pictures Television Studios', 'High Bri...","['Sony Pictures Television Studios', 'High Bri...","[{'name': 'Bryan Cranston', 'character': 'Walt...",['Vince Gilligan'],"['drug dealer', 'psychopath', 'new mexico', 't..."
1,/rkB4LyZHo1NHXFEDHl9vSD9r1lI.jpg,2021-11-06,94605,Arcane,['US'],en,Amid the stark discord of twin cities Piltover...,63.546,/xS9hcajmOqwLUwXblCrPk2lfz4s.jpg,8.7,3260,"['Animation', 'Drama', 'Sci-Fi & Fantasy', 'Ac...",2021.0,"['Fortiche Production', 'Riot Games']","['Fortiche Production', 'Riot Games']","[{'name': 'Hailee Steinfeld', 'character': 'Vi...","['Christian Linke', 'Alex Yee']","['magic', 'female friendship', 'battle', 'base..."


IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer

In [5]:
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

def extract_cast_names(entries_str):
    entries = literal_eval(entries_str)
    return [entry['name'] for entry in entries[:5]]

In [6]:
features = ['cast', 'creators', 'keywords', 'genres', 'production_companies', 'networks']
df2 = tvs.copy()[ features]

df2['cast'] = df2['cast'].apply(extract_cast_names) # extract cast names


In [7]:
for feature in features:
    df2[feature] = df2[feature].apply(clean_data)

In [8]:
df2.iloc[1]['cast']

['haileesteinfeld', 'kevinalejandro', 'jasonspisak']

In [9]:
df2['creators']

0                     ['vincegilligan']
1          ['christianlinke','alexyee']
2                                    []
3                                    []
4         ['danharmon','justinroiland']
                     ...               
2955    ['dorishursley','frankhursley']
2956                     ['kenwarwick']
2957                                 []
2958                    ['stephenking']
2959                  ['anthonysalter']
Name: creators, Length: 2960, dtype: object

Creating a "soup" of the selected metadata
Also - give the director 3 times weight in the soup

In [21]:
def create_soup(x):
    return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast']) + ' ' + ' '.join(x['creators']) + ' ' + ' '.join(x['genres']) + ' '.join(x['production_companies']) 
df2['soup'] = df2.apply(create_soup, axis=1)

Not using TF-IDF beacause it might eliminate directors/cast that played in ,ultiple movies

In [23]:
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(df2['soup'])

In [24]:
# Compute the Cosine Similarity matrix based on the count_matrix
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

In [25]:

title_to_id = pd.Series(tvs.index, index=tvs['name'])

In [16]:
id_to_index = pd.Series(tvs.index, index=tvs['id'])

In [19]:
# Function that takes in movie title as input and outputs most similar movies
def get_recommendations_by_title(title, cosine_similarity=cosine_similarity):
    # Get the index of the movie that matches the title
    idx = title_to_id[title]

    # Get the pairwsie similarity scores of all movies with that movie
    similarity_scores = list(enumerate(cosine_similarity[idx]))

    # Sort the movies based on the similarity scores
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    similarity_scores = similarity_scores[1:15]

    # Get the movie indices
    movie_indices = [i[0] for i in similarity_scores]

    # Return the top 15 most similar movies
    return tvs['name'].iloc[movie_indices]

In [26]:
rec = get_recommendations_by_title('Black Mirror', cosine_sim2)
rec

1                               Arcane
2                            One Piece
3     Fullmetal Alchemist: Brotherhood
4                       Rick and Morty
5                         Heartstopper
6                       Anne with an E
7           Avatar: The Last Airbender
8                           Invincible
9       Demon Slayer: Kimetsu no Yaiba
10                    My Hero Academia
11                       The Owl House
12                      The Last of Us
13                        Regular Show
14                              Goblin
Name: name, dtype: object

In [29]:
def get_recommendations_by_id(id, cosine_similarity=cosine_similarity):
    # Get the index of the movie that matches the title
    idx = id_to_index[id]

    # Get the pairwsie similarity scores of all movies with that movie
    similarity_scores = list(enumerate(cosine_similarity[idx]))

    # Sort the movies based on the similarity scores
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    similarity_scores = similarity_scores[1:11]

    # Get the movie indices
    movies_idx = [i[0] for i in similarity_scores]

    # Return the top 10 most similar movies
    return tvs['id'].iloc[movies_idx]

In [27]:
tvs.iloc[title_to_id['Manifest']]

backdrop_path                            /iZu83GB1IM7VXL2X90m7iLHYUHU.jpg
first_air_date                                                 2018-09-24
id                                                                  79696
name                                                             Manifest
origin_country                                                     ['US']
original_language                                                      en
overview                After landing from a turbulent but routine fli...
popularity                                                        190.964
poster_path                              /27ChAuQxC0sD4EoPiZpz9ddzju0.jpg
vote_average                                                          7.7
vote_count                                                           1270
genres                           ['Drama', 'Mystery', 'Sci-Fi & Fantasy']
year                                                               2018.0
production_companies    ['Warner Bros.

In [30]:
rec = get_recommendations_by_id(79696, cosine_sim2)
rec

0      1396
1     94605
2     37854
3     31911
4     60625
5    124834
6     70785
7       246
8     95557
9     85937
Name: id, dtype: int64

In [31]:
rec.to_list()

[1396, 94605, 37854, 31911, 60625, 124834, 70785, 246, 95557, 85937]

export cosine similarity

In [32]:
id_to_index

id
1396        0
94605       1
37854       2
31911       3
60625       4
         ... 
987      2955
4551     2956
62223    2957
16089    2958
90755    2959
Length: 2960, dtype: int64

In [43]:
import pickle

with open('similarity_matrix_tvs.pkl', 'wb') as f1:
    pickle.dump(cosine_sim2, f1)

with open('tv_id_to_matrix_similarity_idx.pkl', 'wb') as f2:
    pickle.dump(id_to_index, f2)