In [1]:
import pandas as pd
import numpy as np
import os
import ast
import pickle
import torch
from collections import Counter
from sentence_transformers import SentenceTransformer

In [2]:
# Load the preprocessed viewing history and the preprocessed title catalogue.
history_df, titles_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/netflix_history_preprocessed.csv',
        '../data/netflix_titles_preprocessed.csv',
    )
)

In [3]:
# Render the viewing history loaded from netflix_history_preprocessed.csv.
history_df

Unnamed: 0,Title,Start Time,Total Duration
0,tote mädchen lügen nicht,2018-06-04 17:39:00,154323.0
1,haus des geldes,2018-06-17 13:45:00,196086.0
2,naruto,2018-06-28 06:09:00,280744.0
3,death note,2018-06-28 10:51:00,5999.0
4,stranger things,2018-06-29 08:39:00,25353.0
...,...,...,...
270,tiny house nation usa,2022-12-15 17:29:00,33778.0
271,falling for christmas,2022-12-24 20:52:00,5555.0
272,your place or mine,2023-02-13 20:54:00,6257.0
273,anatomie eines skandals,2023-03-05 20:45:00,14850.0


In [4]:
# Convert string representation of list to actual list.
def _parse_list_literal(value):
    """Return the Python literal encoded in ``value`` when it is a string.

    Non-string values are returned unchanged, so re-running this cell on
    columns that were already converted is a no-op instead of a ValueError
    from ``ast.literal_eval``.
    """
    return ast.literal_eval(value) if isinstance(value, str) else value


# These four columns are parsed from their CSV string form into Python objects.
for _list_column in ('director', 'cast', 'country', 'listed_in'):
    titles_df[_list_column] = titles_df[_list_column].apply(_parse_list_literal)

In [5]:
# Drop every row whose title already appeared earlier, then renumber the rows 0..n-1.
titles_df = titles_df.loc[~titles_df['title'].duplicated(keep='first')].reset_index(drop=True)

In [6]:
# Render the title catalogue after list parsing and de-duplication by title.
titles_df

Unnamed: 0,title,director,cast,country,date_added,release_year,listed_in,description
0,dick johnson is dead,[kirsten johnson],[],[united states],"september 25, 2021",2020,[documentaries],"as her father nears the end of his life, filmm..."
1,blood & water,[],"[ama qamata, khosi ngema, gail mabalane, th...",[south africa],"september 24, 2021",2021,"[international tv shows, tv dramas, tv myste...","after crossing paths at a party, a cape town t..."
2,ganglands,[julien leclercq],"[sami bouajila, tracy gotoas, samuel jouy, ...",[],"september 24, 2021",2021,"[crime tv shows, international tv shows, tv ...",to protect his family from a powerful drug lor...
3,jailbirds new orleans,[],[],[],"september 24, 2021",2021,"[docuseries, reality tv]","feuds, flirtations and toilet talk go down amo..."
4,kota factory,[],"[mayur more, jitendra kumar, ranjan raj, al...",[india],"september 24, 2021",2021,"[international tv shows, romantic tv shows, ...",in a city of coaching centers known to train i...
...,...,...,...,...,...,...,...,...
8249,zodiac,[david fincher],"[mark ruffalo, jake gyllenhaal, robert downe...",[united states],"november 20, 2019",2007,"[cult movies, dramas, thrillers]","a political cartoonist, a crime reporter and a..."
8250,zombie dumb,[],[],[],"july 1, 2019",2018,"[kids' tv, korean tv shows, tv comedies]","while living alone in a spooky town, a young g..."
8251,zombieland,[ruben fleischer],"[jesse eisenberg, woody harrelson, emma ston...",[united states],"november 1, 2019",2009,"[comedies, horror movies]",looking to survive in a world taken over by zo...
8252,zoom,[peter hewitt],"[tim allen, courteney cox, chevy chase, kat...",[united states],"january 11, 2020",2006,"[children & family movies, comedies]","dragged from civilian life, a former superhero..."


In [7]:
# Distinct titles in the viewing history and in the catalogue, for set comparison.
history_titles_set = {*history_df['Title']}
titles_set = {*titles_df['title']}

In [8]:
# Titles that appear both in the viewing history and in the catalogue.
overlaps = history_titles_set & titles_set

In [9]:
# Keep only the history rows whose title is also present in the catalogue.
en_history_df = history_df.loc[history_df['Title'].isin(overlaps)]

In [10]:
# Render the history rows that survived the catalogue-overlap filter.
en_history_df

Unnamed: 0,Title,Start Time,Total Duration
2,naruto,2018-06-28 06:09:00,280744.0
3,death note,2018-06-28 10:51:00,5999.0
4,stranger things,2018-06-29 08:39:00,25353.0
6,cowspiracy,2018-07-01 19:08:00,10585.0
8,house of cards,2018-07-20 15:19:00,126137.0
...,...,...,...
246,darwin’s game,2021-10-31 19:48:00,16770.0
250,a christmas prince,2021-12-24 14:50:00,15556.0
252,the last kingdom,2022-01-03 08:03:00,136762.0
253,the game changers,2022-02-01 11:58:00,4887.0


In [11]:
# Watched titles as a plain list, in the row order of en_history_df; shown as the cell output.
watch_history = en_history_df['Title'].tolist()
watch_history

['naruto',
 'death note',
 'stranger things',
 'cowspiracy',
 'house of cards',
 'welcome to the family',
 'riverdale',
 'gossip girl',
 'orange is the new black',
 'sierra burgess is a loser',
 'to all the boys i’ve loved before',
 'the kissing booth',
 'greenhouse academy',
 'insatiable',
 '#realityhigh',
 'dude',
 'you get me',
 'shooter',
 'american horror story',
 'chilling adventures of sabrina',
 'black butler',
 'baby',
 'sex education',
 'bodyguard',
 'suits',
 'how to get away with murder',
 'the order',
 'on my block',
 'the perfect date',
 'the protector',
 'bonding',
 'the last summer',
 'kidnapping stella',
 'sintonia',
 'the seven deadly sins',
 'the irregular at magic high school',
 'cam',
 'naruto shippuden',
 'castlevania',
 'how to sell drugs online (fast)',
 'the end of the f***ing world',
 'baki',
 'the irishman',
 'revisions',
 'hot girls wanted',
 'the witcher',
 'the blacklist',
 'flavors of youth',
 'attack on titan',
 'big mouth',
 'granblue fantasy the animat

In [12]:
# Collect every cast entry of every title into one flat list.
# NOTE(review): the rendered cast lists show some names with a leading space,
# so the same actor may be counted under two spellings -- confirm in preprocessing.
actor_names = []
for cast_members in titles_df['cast']:
    actor_names.extend(cast_members)

# Tally how many titles' cast lists mention each name.
name_counts = Counter()
name_counts.update(actor_names)

In [13]:
def keep_top_three_actors(actor_list, k=3, counts=None):
    """Return the ``k`` most frequent actors of ``actor_list`` as a new list.

    Parameters
    ----------
    actor_list : list of str
        Cast names of one title. The list is NOT modified; a sorted copy is
        made, so the caller's data keeps its original order.
    k : int, default 3
        Maximum number of names to keep.
    counts : mapping of str -> int, optional
        Frequency of each name. Defaults to the notebook-level ``name_counts``.
        Names missing from the mapping count as 0.

    Returns
    -------
    list of str
        At most ``k`` names ordered by descending frequency; names with equal
        frequency keep their relative order from ``actor_list`` (stable sort).
        An empty input yields an empty list.
    """
    if counts is None:
        counts = name_counts
    # sorted() copies instead of sorting in place, so the input is left untouched.
    ranked = sorted(actor_list, key=lambda name: counts.get(name, 0), reverse=True)
    return ranked[:k]

In [14]:
# Replace each title's cast list by the output of keep_top_three_actors.
titles_df['cast'] = titles_df['cast'].map(keep_top_three_actors)

In [15]:
# Render the catalogue again; the cast column now holds the trimmed lists.
titles_df

Unnamed: 0,title,director,cast,country,date_added,release_year,listed_in,description
0,dick johnson is dead,[kirsten johnson],[],[united states],"september 25, 2021",2020,[documentaries],"as her father nears the end of his life, filmm..."
1,blood & water,[],"[ thabang molaba, cindy mahlangu, patrick mo...",[south africa],"september 24, 2021",2021,"[international tv shows, tv dramas, tv myste...","after crossing paths at a party, a cape town t..."
2,ganglands,[julien leclercq],"[sami bouajila, samuel jouy, sofia lesaffre]",[],"september 24, 2021",2021,"[crime tv shows, international tv shows, tv ...",to protect his family from a powerful drug lor...
3,jailbirds new orleans,[],[],[],"september 24, 2021",2021,"[docuseries, reality tv]","feuds, flirtations and toilet talk go down amo..."
4,kota factory,[],"[ alam khan, ahsaas channa, revathi pillai]",[india],"september 24, 2021",2021,"[international tv shows, romantic tv shows, ...",in a city of coaching centers known to train i...
...,...,...,...,...,...,...,...,...
8249,zodiac,[david fincher],"[ dermot mulroney, john carroll lynch, brian...",[united states],"november 20, 2019",2007,"[cult movies, dramas, thrillers]","a political cartoonist, a crime reporter and a..."
8250,zombie dumb,[],[],[],"july 1, 2019",2018,"[kids' tv, korean tv shows, tv comedies]","while living alone in a spooky town, a young g..."
8251,zombieland,[ruben fleischer],"[ amber heard, woody harrelson, jesse eisenberg]",[united states],"november 1, 2019",2009,"[comedies, horror movies]",looking to survive in a world taken over by zo...
8252,zoom,[peter hewitt],"[ kate mara, courteney cox, rip torn]",[united states],"january 11, 2020",2006,"[children & family movies, comedies]","dragged from civilian life, a former superhero..."


In [16]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# Sentence encoder used for both the description and the metadata embeddings.
model = SentenceTransformer(
    'sentence-transformers/all-MiniLM-L6-v2',
    device=device,
)



In [17]:
# Description text of every catalogue title, in catalogue row order.
descriptions = titles_df['description'].tolist()

In [18]:
# Encode every description with the sentence-transformer model, requesting a
# tensor result (convert_to_tensor=True); row i belongs to descriptions[i].
descriptions_embeddings = model.encode(descriptions, convert_to_tensor=True)

In [19]:
# Sanity check: shape of the description embedding tensor.
print(descriptions_embeddings.shape)

torch.Size([8254, 384])


In [20]:
# Pairwise dot products between all description embeddings, copied to host
# memory as a NumPy array.
# NOTE(review): this equals cosine similarity only if the model emits unit-length vectors -- confirm.
descriptions_similarity_scores = (descriptions_embeddings @ descriptions_embeddings.T).cpu().numpy()

In [21]:
# Evaluation of the recommendation
def evaluate(similarity_scores, consider_history=False, history=None, titles=None, top_k=5):
    """Replay a watch history and report how highly each next title is ranked.

    For every consecutive pair (previous title, next title) in the history,
    the row of ``similarity_scores`` belonging to the previous title is used
    as the recommendation scores and the 0-based rank of the next title is
    recorded (rank 0 = highest score). Equal scores are ordered by row
    position, matching a stable descending sort.

    Parameters
    ----------
    similarity_scores : 2-D numpy array
        Square matrix; row/column ``p`` corresponds to the ``p``-th title.
    consider_history : bool, default False
        If True, the scores are a running blend:
        ``scores = 0.5 * scores + 0.5 * similarity_scores[prev]``,
        starting from all zeros. If False only the previous title is used.
    history : sequence of str, optional
        Ordered watched titles. Defaults to the notebook-level ``watch_history``.
    titles : iterable of str, optional
        Title of each matrix row, in row order. Defaults to
        ``titles_df['title']``. If a title occurs more than once, its first
        position is used.
    top_k : int, default 5
        A step counts as successful when the target's rank is ``<= top_k``.

    Prints the average rank and the number of successful steps; returns None.

    Raises
    ------
    KeyError
        If a history title is not present in ``titles``.
    """
    if history is None:
        history = watch_history
    if titles is None:
        titles = titles_df['title']

    # Build the title -> row-position lookup once instead of scanning the
    # whole catalogue twice per step. setdefault keeps the first occurrence.
    row_of_title = {}
    for position, title in enumerate(titles):
        row_of_title.setdefault(title, position)

    target_ranks = []
    scores = np.zeros(similarity_scores.shape[0])

    for step in range(1, len(history)):
        target_row = row_of_title[history[step]]
        prev_row = row_of_title[history[step - 1]]

        # Get recommendation based on the similarity
        if consider_history:
            scores = 0.5 * scores + 0.5 * similarity_scores[prev_row]
        else:
            scores = np.asarray(similarity_scores[prev_row])

        # Rank of the target in a stable descending sort, computed in O(n):
        # everything strictly better, plus equal-scored rows that come first.
        target_score = scores[target_row]
        better = np.count_nonzero(scores > target_score)
        tied_before = np.count_nonzero(scores[:target_row] == target_score)
        target_ranks.append(int(better + tied_before))

    # Avoid numpy's empty-mean RuntimeWarning when there is nothing to evaluate.
    average_rank = np.mean(target_ranks) if target_ranks else float('nan')
    print('Average rank:', average_rank)
    print('Successful recommendations:', np.sum(np.array(target_ranks) <= top_k))

In [22]:
# Description-only similarity, scoring from the previously watched title alone.
evaluate(descriptions_similarity_scores, consider_history=False)

Average rank: 3193.1696428571427
Successful recommendations: 1


In [23]:
# Description-only similarity, blending scores over the watch history.
evaluate(descriptions_similarity_scores, consider_history=True)

Average rank: 2809.8303571428573
Successful recommendations: 1


In [24]:
# Build one short text per title from its structured fields, skipping any
# field that is empty/falsy. The result is aligned with the rows of titles_df.
# NOTE(review): the rendered output shows double spaces where list items carry
# a leading space; the separators are kept as-is to leave the text unchanged.
metadata = []

# itertuples over only the needed columns avoids building a Series per row.
for row in titles_df[['director', 'cast', 'country', 'release_year', 'listed_in']].itertuples(index=False):
    text = ''
    if row.director:
        text += f"the director is {','.join(row.director)}. "
    if row.cast:
        text += f"the leading actors are {','.join(row.cast)}. "
    if row.country:
        text += f"the movie is from {','.join(row.country)}. "
    if row.release_year:
        text += f"the movie is released in {row.release_year}. "
    if row.listed_in:
        text += f"the movie falls within the genre of {','.join(row.listed_in)}. "

    metadata.append(text)

In [25]:
# Encode every metadata text with the same model, requesting a tensor result;
# row i belongs to metadata[i].
metadata_embeddings = model.encode(metadata, convert_to_tensor=True)

In [26]:
# Sanity check: shape of the metadata embedding tensor.
print(metadata_embeddings.shape)

torch.Size([8254, 384])


In [27]:
# Pairwise dot products between all metadata embeddings, copied to host memory
# as a NumPy array.
metadata_similarity_scores = (metadata_embeddings @ metadata_embeddings.T).cpu().numpy()

In [28]:
# Metadata-only similarity, scoring from the previously watched title alone.
evaluate(metadata_similarity_scores, consider_history=False)

Average rank: 3192.625
Successful recommendations: 1


In [29]:
# Metadata-only similarity, blending scores over the watch history.
evaluate(metadata_similarity_scores, consider_history=True)

Average rank: 2935.7053571428573
Successful recommendations: 1


In [30]:
# Sum of description and metadata similarity, scoring from the previous title alone.
evaluate(
    descriptions_similarity_scores + metadata_similarity_scores,
    consider_history=False,
)

Average rank: 3052.910714285714
Successful recommendations: 3


In [31]:
# Sum of description and metadata similarity, blending scores over the watch history.
evaluate(
    descriptions_similarity_scores + metadata_similarity_scores,
    consider_history=True,
)

Average rank: 2581.910714285714
Successful recommendations: 3


In [32]:
# Persist the title table and the combined (description + metadata) similarity
# matrix. `with` guarantees each file is flushed and closed, even on error.
with open('../data/movie_list.pkl', 'wb') as movie_list_file:
    pickle.dump(titles_df, movie_list_file)
with open('../data/similarity_bert.pkl', 'wb') as similarity_file:
    pickle.dump(descriptions_similarity_scores + metadata_similarity_scores, similarity_file)

In [33]:
# Persist both embedding matrices as NumPy arrays (moved to the CPU first).
# `with` guarantees each file is flushed and closed, even on error.
with open('../data/descriptions_embeddings.pkl', 'wb') as descriptions_file:
    pickle.dump(descriptions_embeddings.cpu().numpy(), descriptions_file)
with open('../data/metadata_embeddings.pkl', 'wb') as metadata_file:
    pickle.dump(metadata_embeddings.cpu().numpy(), metadata_file)

In [34]:
# Render the generated metadata texts for inspection.
metadata

['the director is kirsten johnson. the movie is from united states. the movie is released in 2020. the movie falls within the genre of documentaries. ',
 'the leading actors are  thabang molaba, cindy mahlangu, patrick mofokeng. the movie is from south africa. the movie is released in 2021. the movie falls within the genre of international tv shows, tv dramas, tv mysteries. ',
 'the director is julien leclercq. the leading actors are sami bouajila, samuel jouy, sofia lesaffre. the movie is released in 2021. the movie falls within the genre of crime tv shows, international tv shows, tv action & adventure. ',
 'the movie is released in 2021. the movie falls within the genre of docuseries, reality tv. ',
 'the leading actors are  alam khan, ahsaas channa, revathi pillai. the movie is from india. the movie is released in 2021. the movie falls within the genre of international tv shows, romantic tv shows, tv comedies. ',
 'the director is mike flanagan. the leading actors are  henry thomas,