In [8]:
from sentence_transformers import SentenceTransformer, util
import pandas as pd
import torch

In [9]:
movies_df = pd.read_csv("movie_dataset.csv")
movies_df.head(2)

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director
0,0,237000000,Action Adventure Fantasy Science Fiction,http://www.avatarmovie.com/,19995,culture clash future space war space colony so...,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Sam Worthington Zoe Saldana Sigourney Weaver S...,"[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...",James Cameron
1,1,300000000,Adventure Fantasy Action,http://disney.go.com/disneypictures/pirates/,285,ocean drug abuse exotic island east india trad...,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Johnny Depp Orlando Bloom Keira Knightley Stel...,"[{'name': 'Dariusz Wolski', 'gender': 2, 'depa...",Gore Verbinski


In [11]:
features = ['keywords', 'cast', 'genres', 'director']
for feature in features:
    movies_df[feature] = movies_df[feature].fillna('')

In [12]:
movies_df[['title', 'cast', 'director', 'keywords', 'genres']].head()

Unnamed: 0,title,cast,director,keywords,genres
0,Avatar,Sam Worthington Zoe Saldana Sigourney Weaver S...,James Cameron,culture clash future space war space colony so...,Action Adventure Fantasy Science Fiction
1,Pirates of the Caribbean: At World's End,Johnny Depp Orlando Bloom Keira Knightley Stel...,Gore Verbinski,ocean drug abuse exotic island east india trad...,Adventure Fantasy Action
2,Spectre,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,Sam Mendes,spy based on novel secret agent sequel mi6,Action Adventure Crime
3,The Dark Knight Rises,Christian Bale Michael Caine Gary Oldman Anne ...,Christopher Nolan,dc comics crime fighter terrorist secret ident...,Action Crime Drama Thriller
4,John Carter,Taylor Kitsch Lynn Collins Samantha Morton Wil...,Andrew Stanton,based on novel mars medallion space travel pri...,Action Adventure Science Fiction


In [13]:
movies_df["combined_features"] = movies_df['keywords']+" "+movies_df['cast']+" "+movies_df['genres']+" "+movies_df['director']
movies_df["combined_features"]

0       culture clash future space war space colony so...
1       ocean drug abuse exotic island east india trad...
2       spy based on novel secret agent sequel mi6 Dan...
3       dc comics crime fighter terrorist secret ident...
4       based on novel mars medallion space travel pri...
                              ...                        
4798    united states\u2013mexico barrier legs arms pa...
4799     Edward Burns Kerry Bish\u00e9 Marsha Dietlein...
4800    date love at first sight narration investigati...
4801     Daniel Henney Eliza Coupe Bill Paxton Alan Ru...
4802    obsession camcorder crush dream girl Drew Barr...
Name: combined_features, Length: 4803, dtype: object

In [None]:
movies_db = movies_df["combined_features"].tolist()
movies_db

In [16]:
model = SentenceTransformer('all-MiniLM-L6-v1') 

In [17]:
movies_db_embd = model.encode(movies_db, convert_to_tensor=True, show_progress_bar=True, device='cuda')

Batches:   0%|          | 0/151 [00:00<?, ?it/s]

In [21]:
movies_db_embd.shape

torch.Size([4803, 384])

In [23]:
cos_scores = util.pytorch_cos_sim(movies_db_embd, movies_db_embd)
cos_scores.shape

torch.Size([4803, 4803])

In [24]:
movie_user_likes = "The Avengers"

In [25]:
movie_index  = movies_df[movies_df['title'] == movie_user_likes]["index"].values[0]
movie_index

16

In [27]:
top_k = min(10, len(movies_db))
top_results = torch.topk(cos_scores[movie_index], k=top_k)
top_results

torch.return_types.topk(
values=tensor([1.0000, 0.8171, 0.7636, 0.7227, 0.6921, 0.6535, 0.6330, 0.6109, 0.5999,
        0.5981], device='cuda:0'),
indices=tensor([ 16,   7,  85,  26, 126,  79, 129,  10, 182,  68], device='cuda:0'))

In [30]:
print("\n\n======================\n\n")
print("Movie user likes:", movie_user_likes)
print("\nTop 10 movies recommendation for move",movie_user_likes)

for score, idx in zip(top_results[0][1:], top_results[1][1:]):
    print(movies_df.iloc[int(idx)]['title'], "(Score: {:.4f})".format(score))





Movie user likes: The Avengers

Top 10 movies recommendation for move The Avengers
Avengers: Age of Ultron (Score: 0.8171)
Captain America: The Winter Soldier (Score: 0.7636)
Captain America: Civil War (Score: 0.7227)
Thor: The Dark World (Score: 0.6921)
Iron Man 2 (Score: 0.6535)
Thor (Score: 0.6330)
Superman Returns (Score: 0.6109)
Ant-Man (Score: 0.5999)
Iron Man (Score: 0.5981)


In [None]:
movies_df.iloc[top_results[1]][['title','keywords', 'cast', 'genres', 'director', 'combined_features']]