In [1]:
# import libs 
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, util
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# path of csv files
netflix_csv_path = "../data/netflix_titles.csv"
amazon_csv_path = "../data/amazon_titles.csv"

# Read CSV files from List
df = pd.concat(map(pd.read_csv, [netflix_csv_path, amazon_csv_path]))
data = df[['id','title', 'type', 'genres', 'description']]

# removing missing movies/shows without descriptions
data = data[~data['description'].isnull()].reset_index()

# Concat genres and description
data['desc_n_genres'] = data['genres'].str.cat(data['description'], sep=', ')

# df infos
data.info()

# df first 5 lines
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15584 entries, 0 to 15583
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   index          15584 non-null  int64 
 1   id             15584 non-null  object
 2   title          15584 non-null  object
 3   type           15584 non-null  object
 4   genres         15584 non-null  object
 5   description    15584 non-null  object
 6   desc_n_genres  15584 non-null  object
dtypes: int64(1), object(6)
memory usage: 852.4+ KB


Unnamed: 0,index,id,title,type,genres,description,desc_n_genres
0,0,ts300399,Five Came Back: The Reference Films,SHOW,['documentation'],This collection includes 12 World War II-era p...,"['documentation'], This collection includes 12..."
1,1,tm84618,Taxi Driver,MOVIE,"['drama', 'crime']",A mentally unstable Vietnam War veteran works ...,"['drama', 'crime'], A mentally unstable Vietna..."
2,2,tm154986,Deliverance,MOVIE,"['drama', 'action', 'thriller', 'european']",Intent on seeing the Cahulawassee River before...,"['drama', 'action', 'thriller', 'european'], I..."
3,3,tm127384,Monty Python and the Holy Grail,MOVIE,"['fantasy', 'action', 'comedy']","King Arthur, accompanied by his squire, recrui...","['fantasy', 'action', 'comedy'], King Arthur, ..."
4,4,tm120801,The Dirty Dozen,MOVIE,"['war', 'action']",12 American military prisoners in World War II...,"['war', 'action'], 12 American military prison..."


In [3]:
# model = SentenceTransformer('paraphrase-distilroberta-base-v1')
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')


In [42]:
## Get embeddings
# descriptions = data['desc_n_genres'].tolist()

# des_embeddings = []
# for i,des in enumerate(descriptions):
#     des_embeddings.append(model.encode(des))

# np.save('../input/descriptions-multiLM-embeddings/des_embeddings.npy', des_embeddings)

In [4]:
## Choosing description embeddings  
# des_embeddings = np.load('../input/descriptions-roberta-embeddings/des_embeddings.npy')
des_embeddings = np.load('../input/descriptions-multiLM-embeddings/des_embeddings.npy')


In [5]:
# Function to get top 5 matches similar with all embeddings
def recommend(query):
    #Compute cosine-similarities with all embeddings 
    query_embedd = model.encode(query)
    cosine_scores = util.pytorch_cos_sim(query_embedd, des_embeddings)
    top5_matches = torch.argsort(cosine_scores, dim=-1, descending=True).tolist()[0][1:6]
    return top5_matches

In [6]:
recommendded_results = recommend("An action film/series featuring spaceship battles in a futuristic era")
recommendded_results

[366, 13387, 7195, 10463, 9575]

In [16]:
df = data.iloc[recommendded_results,:][['title','desc_n_genres']]
df.index = np.arange(1, len(df) + 1)
df

Unnamed: 0,title,desc_n_genres
1,Mobile Suit Gundam Unicorn,"['scifi', 'animation', 'action', 'drama', 'war..."
2,Cosmoball,"['scifi', 'sport', 'fantasy', 'action'], Cosmo..."
3,Riders to the Stars,"['drama', 'scifi'], Three men gamble their liv..."
4,Scavengers,"['action', 'scifi', 'drama', 'thriller', 'come..."
5,Shadow Company,"['documentation'], Documentary about the merce..."
