In [57]:
import pandas as pd
import json
import numpy as np
import requests
import sklearn

# Parse the stringified features into their corresponding python objects
from ast import literal_eval


In [58]:
# Load the top-voted TV shows table and preview its first two rows.
tvs = pd.read_csv(filepath_or_buffer='datasets/mine/top_voted_tvs.csv')
tvs.head(n=2)

Unnamed: 0,backdrop_path,first_air_date,id,name,origin_country,original_language,overview,popularity,poster_path,vote_average,vote_count,genres,year,production_companies,networks,cast,creators,keywords
0,/9faGSFi5jam6pDWGNd0p8JcJgXQ.jpg,2008-01-20,1396,Breaking Bad,['US'],en,"When Walter White, a New Mexico chemistry teac...",362.838,/dY2FtSKXFCvutAHNgYax27oqwqX.jpg,8.9,12189,"['Drama', 'Crime']",2008.0,"['Sony Pictures Television Studios', 'High Bri...","['Sony Pictures Television Studios', 'High Bri...","[{'name': 'Bryan Cranston', 'character': 'Walt...",['Vince Gilligan'],"['drug dealer', 'psychopath', 'new mexico', 't..."
1,/rkB4LyZHo1NHXFEDHl9vSD9r1lI.jpg,2021-11-06,94605,Arcane,['US'],en,Amid the stark discord of twin cities Piltover...,63.546,/xS9hcajmOqwLUwXblCrPk2lfz4s.jpg,8.7,3260,"['Animation', 'Drama', 'Sci-Fi & Fantasy', 'Ac...",2021.0,"['Fortiche Production', 'Riot Games']","['Fortiche Production', 'Riot Games']","[{'name': 'Hailee Steinfeld', 'character': 'Vi...","['Christian Linke', 'Alex Yee']","['magic', 'female friendship', 'battle', 'base..."


In [59]:
def clean_data(x):
    """Normalise a feature value into lowercase text without spaces.

    A list is processed element by element and a new list is returned; a
    string is returned as a single normalised string. Any other kind of
    value yields an empty string.
    """
    def _squash(text):
        # Remove spaces before lowercasing so a multi-word name becomes one token.
        return text.replace(" ", "").lower()

    if isinstance(x, list):
        return list(map(_squash, x))
    if isinstance(x, str):
        return _squash(x)
    return ''

def extract_cast_names(entries_str, max_names=5):
    """Return the names of the leading cast entries.

    Parameters
    ----------
    entries_str : str or list
        A stringified list of dicts that each carry a ``'name'`` key, or an
        already-parsed list of such dicts.
    max_names : int, default 5
        Maximum number of names to return, taken from the front of the list.

    Returns
    -------
    list
        The ``'name'`` value of each of the first ``max_names`` entries.
        Missing input (``None``, NaN, a blank string, or any other
        non-string, non-list value) yields an empty list.

    Raises
    ------
    ValueError, SyntaxError
        If a non-blank string is not a valid Python literal.
    KeyError
        If one of the selected entries has no ``'name'`` key.
    """
    if isinstance(entries_str, str):
        if not entries_str.strip():
            # literal_eval raises SyntaxError on an empty expression.
            return []
        entries = literal_eval(entries_str)
    elif isinstance(entries_str, (list, tuple)):
        # Already parsed (e.g. the cell was re-run on transformed data).
        entries = entries_str
    else:
        # Missing values such as None or float NaN cannot be parsed.
        return []
    return [entry['name'] for entry in entries[:max_names]]

In [60]:
# Metadata columns that are prepared for the similarity model.
features = [
    'cast',
    'creators',
    'keywords',
    'genres',
    'production_companies',
    'networks',
]

# Independent working copy, so the raw values in `tvs` stay untouched.
df2 = tvs[features].copy()

# Reduce every stringified cast list to the names of its leading entries.
df2['cast'] = df2['cast'].apply(extract_cast_names)


In [61]:
# Lowercase and strip spaces from every feature column, one column at a time.
for column in features:
    df2[column] = df2[column].map(clean_data)

In [62]:
# Spot-check the cleaned cast list of the second row.
df2['cast'].iloc[1]

['haileesteinfeld', 'kevinalejandro', 'jasonspisak']

Creating a "soup" of the selected metadata (creators, keywords, genres, production companies and cast).
Also - the creators are repeated twice so they carry double weight in the soup.

In [63]:
from ast import literal_eval

def _parse_tokens(value):
    """Parse a stringified list of tokens; blank or missing values give []."""
    if isinstance(value, list):
        # Already a list, nothing to parse.
        return value
    if isinstance(value, str) and value.strip():
        return literal_eval(value)
    # clean_data maps non-string values to '', which literal_eval cannot
    # parse, so blanks are treated as "no tokens" instead of raising.
    return []


# Creators are repeated twice so they weigh double in the soup.
soup_tokens = df2['creators'].apply(_parse_tokens).apply(lambda names: names * 2)

# NOTE(review): 'networks' is cleaned earlier but is not part of the soup -
# confirm whether that is intentional.
for column in ('keywords', 'genres', 'production_companies'):
    soup_tokens = soup_tokens + df2[column].apply(_parse_tokens)

# The cast column already holds lists of names, so it needs no parsing.
soup_tokens = soup_tokens + df2['cast']

# One space-separated string per row, ready for a text vectorizer.
df2['soup'] = soup_tokens.apply(' '.join)

Not using TF-IDF because it would down-weight creators/cast members that appear in multiple shows

In [64]:
from sklearn.feature_extraction.text import CountVectorizer

# Turn every soup string into a vector of raw token counts.
# NOTE(review): stop_words='english' presumably also drops any soup token that
# happens to be an English stop word - confirm this is intended.
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(df2['soup'])

In [65]:
# Compute the Cosine Similarity matrix based on the count_matrix
from sklearn.metrics.pairwise import cosine_similarity

# With a single argument the rows of the matrix are compared with each other.
cosine_sim2 = cosine_similarity(count_matrix)

In [66]:

# Lookup from show name to the matching index label(s) of `tvs`. Despite the
# variable name, the values come from `tvs.index`, not from the `id` column.
# Names are not unique, so a lookup can return more than one value.
title_to_id = pd.Series(data=tvs.index, index=tvs['name'])

In [67]:
# Lookup from the value in the `id` column to the matching index label of `tvs`.
id_to_index = pd.Series(data=tvs.index, index=tvs['id'])

In [68]:
def get_recommendations_by_title(title, cosine_similarity=None, top_n=14):
    """Return the names of the shows most similar to the show called ``title``.

    Parameters
    ----------
    title : str
        Show name as stored in ``tvs['name']``. When several shows share the
        name, the first match in ``title_to_id`` is used.
    cosine_similarity : 2-D array-like, optional
        Pairwise similarity matrix that is indexed with the values stored in
        ``title_to_id``. Defaults to the notebook-level ``cosine_sim2``.
    top_n : int, default 14
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        Names of up to ``top_n`` shows, most similar first. The queried show
        itself is never part of the result.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``title_to_id``.
    """
    if cosine_similarity is None:
        # Resolved at call time: a default bound to the scikit-learn
        # ``cosine_similarity`` function could not be indexed.
        cosine_similarity = cosine_sim2

    # A duplicated title makes the lookup return a Series instead of a scalar.
    match = title_to_id[title]
    idx = int(match.iloc[0]) if isinstance(match, pd.Series) else int(match)

    # Similarity of every show to the queried one.
    scores = np.asarray(cosine_similarity[idx])

    # Stable descending order keeps ties in their original row order.
    ranked = np.argsort(-scores, kind='stable')

    # Exclude the queried show explicitly: with tied scores it is not
    # guaranteed to be ranked first, so slicing off position 0 is unsafe.
    movie_indices = [int(i) for i in ranked if i != idx][:top_n]

    return tvs['name'].iloc[movie_indices]

In [69]:
# Shows most similar to "Black Mirror" according to the soup-based matrix.
rec = get_recommendations_by_title(title='Black Mirror', cosine_similarity=cosine_sim2)
rec

2270                          Dead Set
2265                 Masters of Horror
80                       Shadow Hunter
1237                  The Outer Limits
1651                 The Twilight Zone
1076           American Horror Stories
1096                        Goosebumps
2527                             Awake
2659                             Solos
1735                         Creepshow
2428                             Calls
2662    Penny Dreadful: City of Angels
1000              Tales from the Crypt
229                  The Twilight Zone
Name: name, dtype: object

In [70]:
def get_recommendations_by_id(id, cosine_similarity=None, top_n=10):
    """Return the ids of the shows most similar to the show with the given id.

    Parameters
    ----------
    id : int
        Value from ``tvs['id']`` identifying the queried show. When the id
        occurs more than once, the first match in ``id_to_index`` is used.
    cosine_similarity : 2-D array-like, optional
        Pairwise similarity matrix that is indexed with the values stored in
        ``id_to_index``. Defaults to the notebook-level ``cosine_sim2``.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        ``id`` values of up to ``top_n`` shows, most similar first. The
        queried show itself is never part of the result.

    Raises
    ------
    KeyError
        If ``id`` is not present in ``id_to_index``.
    """
    if cosine_similarity is None:
        # Resolved at call time: a default bound to the scikit-learn
        # ``cosine_similarity`` function could not be indexed.
        cosine_similarity = cosine_sim2

    # A duplicated id makes the lookup return a Series instead of a scalar.
    match = id_to_index[id]
    idx = int(match.iloc[0]) if isinstance(match, pd.Series) else int(match)

    # Similarity of every show to the queried one.
    scores = np.asarray(cosine_similarity[idx])

    # Stable descending order keeps ties in their original row order.
    ranked = np.argsort(-scores, kind='stable')

    # Exclude the queried show explicitly: with tied scores it is not
    # guaranteed to be ranked first, so slicing off position 0 is unsafe.
    movies_idx = [int(i) for i in ranked if i != idx][:top_n]

    return tvs['id'].iloc[movies_idx]

In [71]:
# Show the full record stored for "Manifest".
tvs.iloc[title_to_id.loc['Manifest']]

backdrop_path                            /iZu83GB1IM7VXL2X90m7iLHYUHU.jpg
first_air_date                                                 2018-09-24
id                                                                  79696
name                                                             Manifest
origin_country                                                     ['US']
original_language                                                      en
overview                After landing from a turbulent but routine fli...
popularity                                                        190.964
poster_path                              /27ChAuQxC0sD4EoPiZpz9ddzju0.jpg
vote_average                                                          7.7
vote_count                                                           1270
genres                           ['Drama', 'Mystery', 'Sci-Fi & Fantasy']
year                                                               2018.0
production_companies    ['Warner Bros.

In [72]:
# Shows most similar to the show whose id is 79696.
rec = get_recommendations_by_id(id=79696, cosine_similarity=cosine_sim2)
rec

1020    81322
1379    60726
1357    64464
80      12313
711      3137
2343    85703
988     32871
1775      433
1847    79240
2140     5690
Name: id, dtype: int64

In [73]:
# Plain Python list of the recommended ids.
rec.tolist()

[81322, 60726, 64464, 12313, 3137, 85703, 32871, 433, 79240, 5690]

Export the cosine similarity matrix and the id-to-row-index mapping

In [74]:
# Display the id -> index mapping that is exported together with the matrix.
id_to_index

id
1396        0
94605       1
37854       2
31911       3
60625       4
         ... 
987      2955
4551     2956
62223    2957
16089    2958
90755    2959
Length: 2960, dtype: int64

In [75]:
import pickle

# Persist the similarity matrix and the id -> index lookup, one pickle each.
exports = (
    ('similarity_matrix_tvs.pkl', cosine_sim2),
    ('tv_id_to_matrix_similarity_idx.pkl', id_to_index),
)
for filename, payload in exports:
    with open(filename, 'wb') as handle:
        pickle.dump(payload, handle)