# Transformers-Powered Movie Recommendation System (IMDb Web Scraping)

Register the Virtual Environment as a Jupyter Kernel:
Register the activated virtual environment as a kernel that Jupyter Notebook can recognize:

In [9]:
import sys
!{sys.executable} -m ipykernel install --user --name=venv --display-name "My Venv"

zsh:1: no such file or directory: /Users/lilswapnil/GitHub


### Install Requirements

In [10]:
!pip install -r ../requirements.txt



### Imports Requirements

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import torch
import re
from transformers import BertTokenizer, TFBertModel
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity

## Reade File

In [None]:
import scrapy
file_path = "./imdb_top_1000.csv" 
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152 min,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444
3,https://m.media-amazon.com/images/M/MV5BMWMwMG...,The Godfather: Part II,1974,A,202 min,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000
4,https://m.media-amazon.com/images/M/MV5BMWU4N2...,12 Angry Men,1957,U,96 min,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000


## 1. Cleaning DataSet

In [None]:
df = df[['Genre','Series_Title','IMDB_Rating','Overview']]
MOVIES = df
MOVIES.head()

Unnamed: 0,Genre,Series_Title,IMDB_Rating,Overview
0,Drama,The Shawshank Redemption,9.3,Two imprisoned men bond over a number of years...
1,"Crime, Drama",The Godfather,9.2,An organized crime dynasty's aging patriarch t...
2,"Action, Crime, Drama",The Dark Knight,9.0,When the menace known as the Joker wreaks havo...
3,"Crime, Drama",The Godfather: Part II,9.0,The early life and career of Vito Corleone in ...
4,"Crime, Drama",12 Angry Men,9.0,A jury holdout attempts to prevent a miscarria...


## 2. Load BERT tokenizer and model

In [None]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")

## 3. Function to get BERT embeddings for a given text

In [None]:
def get_bert_embedding(text):
    # Encode the input text and create attention mask
    inputs = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        truncation=True,
        padding='max_length',
        max_length=512,
        return_tensors="pt",
        return_attention_mask=True
    )
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]

    with torch.no_grad():
        output = model(input_ids, attention_mask=attention_mask)

    embeddings = output.last_hidden_state[:, 0, :].numpy()  # Squeezing to get 1D embeddings
    return embeddings

## 4. Preprocess the input for better tokenization

In [None]:
def preprocess_user_input(user_input):
    # Preprocess the input for better tokenization
    user_input = user_input.lower()
    user_input = re.sub(r'[^\w\s]', ' ', user_input)  # Remove punctuation
    user_input = re.sub(r'\s+', ' ', user_input)  # Normalize whitespace
    return user_input

## 5. Extract user-specified movie names within double inverted commas

In [None]:
def extract_movie_names(user_input):
    movie_names = re.findall(r'"([^"]*)"', user_input)
    return movie_names

## 6. Extract user-specified imdb rating mentioned after imdb:

In [None]:
def extract_imdb_rating(user_input):
    
    imdb_rating = re.findall(r'imdb:(\d+\.\d+)', user_input)       #imdb:8.2
    if len(imdb_rating) == 0:
         imdb_rating = re.findall(r'imdb: (\d+\.\d+)', user_input) #imdb: 8.2
    if len(imdb_rating) == 0 :
         imdb_rating = re.findall(r'imdb:(\d+)', user_input)       #imdb:8
    if len(imdb_rating) == 0 :
        imdb_rating = re.findall(r'imdb: (\d+)', user_input)       #imdb: 8
    return imdb_rating

## 7. Function to get movie embeddings based on user-specified movie names

In [None]:
def get_movie_embeddings(movie_names):
    movie_embeddings = []
    
    for name in movie_names:
        matching_movie = MOVIES[MOVIES['Series_Title'] == name]
        if not matching_movie.empty:
            movie_index = MOVIES[MOVIES['Series_Title'] == name].index[0]
            movie_embedding = movie_ef[movie_index]
            movie_embeddings.append(movie_embedding)
    return np.array(movie_embeddings)

## 8. Calculate similarity using weighted cosine similarity

In [None]:
def calculate_weighted_similarity(user_embedding, user_movie_embeddings, movie_embeddings):
    user_similarities = cosine_similarity(user_embedding.reshape(1, -1), movie_embeddings) #reshaping the user_embeddings to 2D
    if not user_movie_embeddings.size == 0: 
        movie_similarities = cosine_similarity(user_movie_embeddings, movie_embeddings)
        weighted_similarities = 0.5 * user_similarities + 0.5 * movie_similarities
    else: 
        weighted_similarities = 1 * user_similarities
    return weighted_similarities

## 9. Function to recommend top 15 movies based on user input

In [None]:
def recommend_movies(user_input):
    user_input = user_input 
    user_inputs = preprocess_user_input(user_input)
    
    # Tokenize the user input and get attention mask
    inputs = tokenizer(user_inputs, return_tensors="pt")
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]

    # Use the model to get embeddings for the user input
    with torch.no_grad():
        output = model(input_ids, attention_mask=attention_mask)

    user_embedding = output.last_hidden_state[:, 0, :].numpy()  
 

    movies = MOVIES.copy()

## 10. Filter the movies based on IMDb score

In [None]:
imdb_score = extract_imdb_rating(user_input) 
for x in imdb_score:
    imdb_score = float(x) 
if imdb_score:
        movies = MOVIES[MOVIES['IMDB_Rating'] >= imdb_score]

filtered_indices = movies.index
filtered_embeddings = movie_ef[filtered_indices]

# Calculate similarity using weighted cosine similarity
user_movie_names = extract_movie_names(user_input)
user_movie_embeddings = get_movie_embeddings(user_movie_names)
weighted_similarity_scores = calculate_weighted_similarity(user_embedding, user_movie_embeddings, filtered_embeddings)

# Get the indices of the top 15 similar movies
movie_indices = np.argsort(weighted_similarity_scores[0])[-15:]

# Get the recommended movies based on the indices
recommended_movies = movies.iloc[movie_indices]['Series_Title'].tolist()
return recommended_movies

NameError: name 'movie_ef' is not defined

## Example usage

In [None]:
if __name__ == "__main__":
    print(" 1. If you are mentioning any movie than mention the full name of the movie between double inverted commas, example: \"Mission: Impossible\"")
    print(" 2. Metion the IMDB rating more than or equal to which you want your movie to be as imdb:8")
    user_input = input("Enter your prompt: ")
    print()
    print("Prompt: ",user_input)
    recommended_movies = recommend_movies(user_input)
    print("\nRecommended Movies:")
    for i, movie in enumerate(recommended_movies, 1):
        print(f"{i}. {movie}")