In [35]:
# importing dependencies
import pandas as pd
from transformers import BertTokenizer, TFBertModel
import torch




In [3]:
# reading the csv file for rotten tomatoes movies
movies_df = pd.read_csv('Resources/rotten_tomatoes_movies.csv')
movies_df.columns

Index(['rotten_tomatoes_link', 'movie_title', 'movie_info',
       'critics_consensus', 'content_rating', 'genres', 'directors', 'authors',
       'actors', 'original_release_date', 'streaming_release_date', 'runtime',
       'production_company', 'tomatometer_status', 'tomatometer_rating',
       'tomatometer_count', 'audience_status', 'audience_rating',
       'audience_count', 'tomatometer_top_critics_count',
       'tomatometer_fresh_critics_count', 'tomatometer_rotten_critics_count'],
      dtype='object')

In [4]:
# reading the csv file for rotten tomatoes movie reviews
movie_reviews_df = pd.read_csv('Resources/rotten_tomatoes_critic_reviews.csv')
movie_reviews_df.columns

Index(['rotten_tomatoes_link', 'critic_name', 'top_critic', 'publisher_name',
       'review_type', 'review_score', 'review_date', 'review_content'],
      dtype='object')

In [5]:
# merging the two dataframes on rotten_tomatoes_link
df_movies = pd.merge(movies_df, movie_reviews_df, on='rotten_tomatoes_link', how='inner')
pd.set_option('display.max_columns', None)
df_movies.head()

Unnamed: 0,rotten_tomatoes_link,movie_title,movie_info,critics_consensus,content_rating,genres,directors,authors,actors,original_release_date,streaming_release_date,runtime,production_company,tomatometer_status,tomatometer_rating,tomatometer_count,audience_status,audience_rating,audience_count,tomatometer_top_critics_count,tomatometer_fresh_critics_count,tomatometer_rotten_critics_count,critic_name,top_critic,publisher_name,review_type,review_score,review_date,review_content
0,m/0814255,Percy Jackson & the Olympians: The Lightning T...,"Always trouble-prone, the life of teenager Per...",Though it may seem like just another Harry Pot...,PG,"Action & Adventure, Comedy, Drama, Science Fic...",Chris Columbus,"Craig Titley, Chris Columbus, Rick Riordan","Logan Lerman, Brandon T. Jackson, Alexandra Da...",2010-02-12,2015-11-25,119.0,20th Century Fox,Rotten,49.0,149.0,Spilled,53.0,254421.0,43,73,76,Andrew L. Urban,False,Urban Cinefile,Fresh,,2010-02-06,A fantasy adventure that fuses Greek mythology...
1,m/0814255,Percy Jackson & the Olympians: The Lightning T...,"Always trouble-prone, the life of teenager Per...",Though it may seem like just another Harry Pot...,PG,"Action & Adventure, Comedy, Drama, Science Fic...",Chris Columbus,"Craig Titley, Chris Columbus, Rick Riordan","Logan Lerman, Brandon T. Jackson, Alexandra Da...",2010-02-12,2015-11-25,119.0,20th Century Fox,Rotten,49.0,149.0,Spilled,53.0,254421.0,43,73,76,Louise Keller,False,Urban Cinefile,Fresh,,2010-02-06,"Uma Thurman as Medusa, the gorgon with a coiff..."
2,m/0814255,Percy Jackson & the Olympians: The Lightning T...,"Always trouble-prone, the life of teenager Per...",Though it may seem like just another Harry Pot...,PG,"Action & Adventure, Comedy, Drama, Science Fic...",Chris Columbus,"Craig Titley, Chris Columbus, Rick Riordan","Logan Lerman, Brandon T. Jackson, Alexandra Da...",2010-02-12,2015-11-25,119.0,20th Century Fox,Rotten,49.0,149.0,Spilled,53.0,254421.0,43,73,76,,False,FILMINK (Australia),Fresh,,2010-02-09,With a top-notch cast and dazzling special eff...
3,m/0814255,Percy Jackson & the Olympians: The Lightning T...,"Always trouble-prone, the life of teenager Per...",Though it may seem like just another Harry Pot...,PG,"Action & Adventure, Comedy, Drama, Science Fic...",Chris Columbus,"Craig Titley, Chris Columbus, Rick Riordan","Logan Lerman, Brandon T. Jackson, Alexandra Da...",2010-02-12,2015-11-25,119.0,20th Century Fox,Rotten,49.0,149.0,Spilled,53.0,254421.0,43,73,76,Ben McEachen,False,Sunday Mail (Australia),Fresh,3.5/5,2010-02-09,Whether audiences will get behind The Lightnin...
4,m/0814255,Percy Jackson & the Olympians: The Lightning T...,"Always trouble-prone, the life of teenager Per...",Though it may seem like just another Harry Pot...,PG,"Action & Adventure, Comedy, Drama, Science Fic...",Chris Columbus,"Craig Titley, Chris Columbus, Rick Riordan","Logan Lerman, Brandon T. Jackson, Alexandra Da...",2010-02-12,2015-11-25,119.0,20th Century Fox,Rotten,49.0,149.0,Spilled,53.0,254421.0,43,73,76,Ethan Alter,True,Hollywood Reporter,Rotten,,2010-02-10,What's really lacking in The Lightning Thief i...


In [6]:
# dropping columns that are not needed
columns_to_drop = ['authors', 'streaming_release_date', 'tomatometer_status', 'audience_status', 'tomatometer_top_critics_count', 'tomatometer_fresh_critics_count', 'tomatometer_rotten_critics_count', 'critic_name', 'top_critic', 'publisher_name', 'review_type', 'review_score', 'review_date']
df_movies.drop(columns=columns_to_drop, inplace=True)

In [7]:
# showing the first 5 rows of the new dataframe
df_movies.head()

Unnamed: 0,rotten_tomatoes_link,movie_title,movie_info,critics_consensus,content_rating,genres,directors,actors,original_release_date,runtime,production_company,tomatometer_rating,tomatometer_count,audience_rating,audience_count,review_content
0,m/0814255,Percy Jackson & the Olympians: The Lightning T...,"Always trouble-prone, the life of teenager Per...",Though it may seem like just another Harry Pot...,PG,"Action & Adventure, Comedy, Drama, Science Fic...",Chris Columbus,"Logan Lerman, Brandon T. Jackson, Alexandra Da...",2010-02-12,119.0,20th Century Fox,49.0,149.0,53.0,254421.0,A fantasy adventure that fuses Greek mythology...
1,m/0814255,Percy Jackson & the Olympians: The Lightning T...,"Always trouble-prone, the life of teenager Per...",Though it may seem like just another Harry Pot...,PG,"Action & Adventure, Comedy, Drama, Science Fic...",Chris Columbus,"Logan Lerman, Brandon T. Jackson, Alexandra Da...",2010-02-12,119.0,20th Century Fox,49.0,149.0,53.0,254421.0,"Uma Thurman as Medusa, the gorgon with a coiff..."
2,m/0814255,Percy Jackson & the Olympians: The Lightning T...,"Always trouble-prone, the life of teenager Per...",Though it may seem like just another Harry Pot...,PG,"Action & Adventure, Comedy, Drama, Science Fic...",Chris Columbus,"Logan Lerman, Brandon T. Jackson, Alexandra Da...",2010-02-12,119.0,20th Century Fox,49.0,149.0,53.0,254421.0,With a top-notch cast and dazzling special eff...
3,m/0814255,Percy Jackson & the Olympians: The Lightning T...,"Always trouble-prone, the life of teenager Per...",Though it may seem like just another Harry Pot...,PG,"Action & Adventure, Comedy, Drama, Science Fic...",Chris Columbus,"Logan Lerman, Brandon T. Jackson, Alexandra Da...",2010-02-12,119.0,20th Century Fox,49.0,149.0,53.0,254421.0,Whether audiences will get behind The Lightnin...
4,m/0814255,Percy Jackson & the Olympians: The Lightning T...,"Always trouble-prone, the life of teenager Per...",Though it may seem like just another Harry Pot...,PG,"Action & Adventure, Comedy, Drama, Science Fic...",Chris Columbus,"Logan Lerman, Brandon T. Jackson, Alexandra Da...",2010-02-12,119.0,20th Century Fox,49.0,149.0,53.0,254421.0,What's really lacking in The Lightning Thief i...


In [12]:
df_movies.to_csv("Resources/df_movies.csv")

In [8]:
# checking the data types of the columns
df_movies.dtypes

rotten_tomatoes_link      object
movie_title               object
movie_info                object
critics_consensus         object
content_rating            object
genres                    object
directors                 object
actors                    object
original_release_date     object
runtime                  float64
production_company        object
tomatometer_rating       float64
tomatometer_count        float64
audience_rating          float64
audience_count           float64
review_content            object
dtype: object

In [10]:
# checking the shape of the dataframe
df_movies.shape

(1129887, 16)

In [13]:
# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [15]:
# Define a function to tokenize text
def tokenize_text(text):
    return tokenizer.tokenize(text)

In [16]:
# Apply the tokenization function to the 'movie_title' column
df_movies['tokenized_movie_title'] = df_movies['movie_title'].apply(tokenize_text)

In [22]:
# Convert the 'review_content' column to a string ... even though it already was ... but was getting an error
df_movies['review_content'] = df_movies['review_content'].astype(str)

In [23]:
# Apply the tokenization function to the 'review_content' column
df_movies['tokenized_review_content'] = df_movies['review_content'].apply(tokenize_text)

In [24]:
print(df_movies)

        rotten_tomatoes_link  \
0                  m/0814255   
1                  m/0814255   
2                  m/0814255   
3                  m/0814255   
4                  m/0814255   
...                      ...   
1129882          m/zulu_dawn   
1129883          m/zulu_dawn   
1129884          m/zulu_dawn   
1129885          m/zulu_dawn   
1129886          m/zulu_dawn   

                                               movie_title  \
0        Percy Jackson & the Olympians: The Lightning T...   
1        Percy Jackson & the Olympians: The Lightning T...   
2        Percy Jackson & the Olympians: The Lightning T...   
3        Percy Jackson & the Olympians: The Lightning T...   
4        Percy Jackson & the Olympians: The Lightning T...   
...                                                    ...   
1129882                                          Zulu Dawn   
1129883                                          Zulu Dawn   
1129884                                          Zulu Daw

In [40]:
def get_recommendations(queries, movies_df, bert_model, tokenizer):
    # Tokenize movie descriptions
    # encoded_inputs = tokenizer(df_movies['tokenized_review_content'].tolist(), padding=True, truncation=True, return_tensors='pt')
    
    # Encode movie descriptions
    with torch.no_grad():
        movie_embeddings = bert_model(df_movies['tokenized_review_content']).last_hidden_state.mean(dim=1)
    
    recommendations = []
    for query in queries:
        # Tokenize user query
        encoded_query = tokenizer(query, padding=True, truncation=True, return_tensors='pt')
        
        # Encode user query
        with torch.no_grad():
            query_embedding = bert_model(encoded_query).last_hidden_state.mean(dim=1)
        
        # Calculate similarities
        similarities = torch.nn.functional.cosine_similarity(movie_embeddings, query_embedding)
        
        # Get top recommendations
        top_k_indices = similarities.argsort(descending=True)[:10]
        recommended_movies = movies_df.iloc[top_k_indices][['movie_title', 'review_content']]
        recommendations.append(recommended_movies)
    
    return recommendations
    

In [36]:
# Load tokenizer and BERT model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = TFBertModel.from_pretrained('bert-base-uncased')


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]




Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [41]:
# Example queries
queries = ["action-packed thriller", "heartwarming family drama", "sci-fi adventure"]
recommendations = get_recommendations(queries, movies_df, bert_model, tokenizer)

ValueError: Exception encountered when calling layer 'tf_bert_model' (type TFBertModel).

Data of type <class 'pandas.core.series.Series'> is not allowed only (<class 'tensorflow.python.framework.tensor.Tensor'>, <class 'bool'>, <class 'int'>, <class 'transformers.utils.generic.ModelOutput'>, <class 'tuple'>, <class 'list'>, <class 'dict'>, <class 'numpy.ndarray'>) is accepted for input_ids.

Call arguments received by layer 'tf_bert_model' (type TFBertModel):
  • input_ids=0          [a, fantasy, adventure, that, fuse, ##s, greek...
1          [um, ##a, th, ##ur, ##man, as, med, ##usa, ,, ...
2          [with, a, top, -, notch, cast, and, dazzling, ...
3          [whether, audiences, will, get, behind, the, l...
4          [what, ', s, really, lacking, in, the, lightni...
                                 ...                        
1129882                                                [nan]
1129883    [seen, today, ,, it, ', s, not, only, a, start...
1129884    [a, ro, ##using, visual, spectacle, that, ', s...
1129885    [a, simple, two, -, act, story, :, prelude, to...
1129886    [rides, the, line, between, being, a, pure, ar...
Name: tokenized_review_content, Length: 1129887, dtype: object
  • attention_mask=None
  • token_type_ids=None
  • position_ids=None
  • head_mask=None
  • inputs_embeds=None
  • encoder_hidden_states=None
  • encoder_attention_mask=None
  • past_key_values=None
  • use_cache=None
  • output_attentions=None
  • output_hidden_states=None
  • return_dict=None
  • training=False