In [1]:
# These are my default settings
import warnings
warnings.filterwarnings("ignore")

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import joblib

plt.rcParams["figure.figsize"] = (12, 6)
sns.set()
pd.set_option("display.max_columns", None)

In [2]:
df = pd.read_csv("movies.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Unnamed: 0     10000 non-null  int64  
 1   title          10000 non-null  object 
 2   poster         10000 non-null  object 
 3   imdb_link      10000 non-null  object 
 4   genre          10000 non-null  object 
 5   year           10000 non-null  int64  
 6   certificate    10000 non-null  object 
 7   plot           10000 non-null  object 
 8   directors      10000 non-null  object 
 9   director_1     10000 non-null  object 
 10  director_2     10000 non-null  object 
 11  actors         10000 non-null  object 
 12  actor_1        10000 non-null  object 
 13  actor_2        10000 non-null  object 
 14  actor_3        10000 non-null  object 
 15  time_minute    10000 non-null  int64  
 16  imdb_rating    10000 non-null  float64
 17  metascore      6016 non-null   float64
 18  vote   

In [3]:
df.columns

Index(['Unnamed: 0', 'title', 'poster', 'imdb_link', 'genre', 'year',
       'certificate', 'plot', 'directors', 'director_1', 'director_2',
       'actors', 'actor_1', 'actor_2', 'actor_3', 'time_minute', 'imdb_rating',
       'metascore', 'vote', 'gross_earning'],
      dtype='object')

In [4]:
df.head(5)

Unnamed: 0.1,Unnamed: 0,title,poster,imdb_link,genre,year,certificate,plot,directors,director_1,director_2,actors,actor_1,actor_2,actor_3,time_minute,imdb_rating,metascore,vote,gross_earning
0,0,The Shawshank Redemption,https://m.media-amazon.com/images/S/sash/4Fyxw...,https://www.imdb.com/title/tt0111161/,['Drama'],1994,R,Two imprisoned men bond over a number of years...,['Frank Darabont'],['Frank Darabont'],-,"['Tim Robbins', 'Morgan Freeman', 'Bob Gunton']",Tim Robbins,Morgan Freeman,Bob Gunton,142,9.3,80.0,2517941,28.34
1,1,The Dark Knight,https://m.media-amazon.com/images/S/sash/4Fyxw...,https://www.imdb.com/title/tt0468569/,"['Action', 'Crime', 'Drama']",2008,PG-13,When the menace known as the Joker wreaks havo...,['Christopher Nolan'],['Christopher Nolan'],-,"['Christian Bale', 'Heath Ledger', 'Aaron Eckh...",Christian Bale,Heath Ledger,Aaron Eckhart,152,9.0,84.0,2468188,534.86
2,2,Inception,https://m.media-amazon.com/images/S/sash/4Fyxw...,https://www.imdb.com/title/tt1375666/,"['Action', 'Adventure', 'Sci-Fi']",2010,PG-13,A thief who steals corporate secrets through t...,['Christopher Nolan'],['Christopher Nolan'],-,"['Leonardo DiCaprio', 'Joseph Gordon-Levitt', ...",Leonardo DiCaprio,Joseph Gordon-Levitt,Elliot Page,148,8.8,74.0,2213396,292.58
3,3,Fight Club,https://m.media-amazon.com/images/S/sash/4Fyxw...,https://www.imdb.com/title/tt0137523/,['Drama'],1999,R,An insomniac office worker and a devil-may-car...,['David Fincher'],['David Fincher'],-,"['Brad Pitt', 'Edward Norton', 'Meat Loaf']",Brad Pitt,Edward Norton,Meat Loaf,139,8.8,66.0,1980219,37.03
4,4,Forrest Gump,https://m.media-amazon.com/images/S/sash/4Fyxw...,https://www.imdb.com/title/tt0109830/,"['Drama', 'Romance']",1994,PG-13,"The presidencies of Kennedy and Johnson, the V...",['Robert Zemeckis'],['Robert Zemeckis'],-,"['Tom Hanks', 'Robin Wright', 'Gary Sinise']",Tom Hanks,Robin Wright,Gary Sinise,142,8.8,82.0,1943101,330.25


# Bert Hugging face

## Preprocessing

In [5]:
# Import necessary libraries
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

# Select relevant features
features = ["plot", "genre", "directors", "actors"]

In [6]:
# Fill any missing values
for feature in features:
    df[feature] = df[feature].fillna("")

df[feature].info()

<class 'pandas.core.series.Series'>
RangeIndex: 10000 entries, 0 to 9999
Series name: actors
Non-Null Count  Dtype 
--------------  ----- 
10000 non-null  object
dtypes: object(1)
memory usage: 78.2+ KB


## Emb

In [8]:
import torch

# Specify the GPU index
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")

# Use the BERT model from HuggingFace for embeddings
model = SentenceTransformer("all-MiniLM-L6-v2")

# Move the model to the GPU
model.to(device)


SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
)

In [11]:
from tensorflow.python.client import device_lib 
device_lib.list_local_devices()

ModuleNotFoundError: No module named 'tensorflow'

In [9]:
# Combine the features into one string
df["combined_features"] = df[features].apply(lambda row: " ".join(row.values.astype(str)), axis=1)

In [10]:
# Compute embeddings for each movie
embeddings = model.encode(df["combined_features"].tolist(), convert_to_tensor=True)

# Compute the cosine similarity matrix from the embeddings
cosine_sim_matrix = cosine_similarity(embeddings)

KeyboardInterrupt: 

In [15]:
def recommend_movies(title):
    # Get the index of the movie from its title
    idx = df[df["title"] == title].index[0]

    # Get a list of tuples in the format (movie index, similarity score)
    similarity_scores = list(enumerate(cosine_sim_matrix[idx]))

    # Sort the list of tuples by the similarity scores
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)

    # Get the indices of the top 10 most similar movies
    top_10_movies_indices = [i[0] for i in similarity_scores[1:11]]

    # Return the titles of the top 10 most similar movies
    return df["title"].iloc[top_10_movies_indices]

(…)f3d3c277d6e90027e55de9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

(…)7d6e90027e55de9125/1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

(…)e2f80f3d3c277d6e90027e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

(…)f80f3d3c277d6e90027e55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

(…)de9125/config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

(…)d3c277d6e90027e55de9125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

(…)90027e55de9125/sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

(…)6e90027e55de9125/special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

(…)f3d3c277d6e90027e55de9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

(…)7d6e90027e55de9125/tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

(…)3d3c277d6e90027e55de9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

(…)e2f80f3d3c277d6e90027e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

(…)80f3d3c277d6e90027e55de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

KeyboardInterrupt: 