# Transformers-Powered Movie Recommendation System (IMDb Web Scraping)

### Register the Virtual Environment as a Jupyter Kernel

Register the activated virtual environment so that Jupyter Notebook can offer it as a kernel:

In [1]:
import sys
!{sys.executable} -m ipykernel install --user --name=venv --display-name "My Venv"

zsh:1: no such file or directory: /Users/lilswapnil/GitHub


### Install Requirements

In [None]:
!pip install -r ./requirements.txt



### Import Dependencies

In [3]:
import numpy as np
import pandas as pd
import torch
import re
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm


## Load & Prepare Data

In [None]:
# Read the IMDb top-1000 dataset from disk
file_path = "./data/imdb_top_1000.csv"  # Adjust path relative to notebook location
df = pd.read_csv(file_path)

# Work on an independent copy restricted to the columns the recommender uses
MOVIES = df.loc[:, ['Genre', 'Series_Title', 'IMDB_Rating', 'Overview']].copy()
print(f"Loaded {len(MOVIES)} movies")
MOVIES.head()

Loaded 1000 movies


Unnamed: 0,Genre,Series_Title,IMDB_Rating,Overview
0,Drama,The Shawshank Redemption,9.3,Two imprisoned men bond over a number of years...
1,"Crime, Drama",The Godfather,9.2,An organized crime dynasty's aging patriarch t...
2,"Action, Crime, Drama",The Dark Knight,9.0,When the menace known as the Joker wreaks havo...
3,"Crime, Drama",The Godfather: Part II,9.0,The early life and career of Vito Corleone in ...
4,"Crime, Drama",12 Angry Men,9.0,A jury holdout attempts to prevent a miscarria...


## Initialize BERT Model

In [5]:
# Tokenizer and encoder must come from the same pretrained checkpoint
BERT_CHECKPOINT = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertModel.from_pretrained(BERT_CHECKPOINT)
print("BERT model loaded successfully")

BERT model loaded successfully


## Define Helper Functions

In [6]:
def get_bert_embedding(text):
    """Return the BERT [CLS] embedding of a single text.

    The text is tokenized with the module-level ``tokenizer`` (truncated to at
    most 512 tokens) and passed through the module-level ``model`` with
    gradient tracking disabled.

    Args:
        text: The string to embed.

    Returns:
        A 1-D NumPy array holding the last-layer hidden state of the first
        token ([CLS]) -- 768 values for ``bert-base-uncased``.
    """
    # A single sequence needs no padding. Padding every text to 512 tokens
    # makes each forward pass process hundreds of masked positions for nothing.
    inputs = tokenizer(
        text,
        add_special_tokens=True,
        truncation=True,
        max_length=512,
        return_tensors="pt",
        return_attention_mask=True,
    )

    with torch.no_grad():
        output = model(inputs["input_ids"], attention_mask=inputs["attention_mask"])

    # Batch item 0, token 0 is the [CLS] position.
    return output.last_hidden_state[0, 0, :].numpy()

def preprocess_user_input(user_input):
    """Normalize a raw prompt before it is tokenized.

    Lowercases the text, turns every character that is neither a word
    character nor whitespace into a space, and collapses each run of
    whitespace into a single space. Leading/trailing spaces are not stripped.
    """
    lowered = user_input.lower()
    without_punctuation = re.sub(r'[^\w\s]', ' ', lowered)
    return re.sub(r'\s+', ' ', without_punctuation)

def extract_movie_names(user_input):
    """Return every substring enclosed in a pair of double quotes, in order."""
    return [quoted.group(1) for quoted in re.finditer(r'"([^"]*)"', user_input)]

def extract_imdb_rating(user_input):
    """Extract the minimum IMDb rating requested in the user input.

    Recognizes ``imdb:<number>`` in any letter case, with optional whitespace
    after the colon and an optional decimal part (``imdb:8``, ``IMDb: 8.5``).

    Args:
        user_input: Raw prompt text.

    Returns:
        The first rating found, as a float, or None when the prompt contains
        no rating.
    """
    # One case-insensitive pattern covers integer/decimal values and any
    # amount of spacing after the colon.
    match = re.search(r'imdb:\s*(\d+(?:\.\d+)?)', user_input, flags=re.IGNORECASE)
    return float(match.group(1)) if match else None

## Generate Movie Embeddings

In [7]:
# Embed every overview with BERT, one text at a time (this may take a few minutes)
print("Generating BERT embeddings for all movies...")
movie_ef = np.array(list(map(get_bert_embedding, MOVIES['Overview'])))
print(f"Generated embeddings shape: {movie_ef.shape}")

Generating BERT embeddings for all movies...
Generated embeddings shape: (1000, 768)


## Define Recommendation Functions

In [8]:
def get_movie_embeddings(movie_names):
    """Look up the precomputed embeddings of the movies the user named.

    Titles are compared against ``MOVIES['Series_Title']`` ignoring letter
    case and surrounding whitespace, so ``"inception"`` finds ``Inception``.
    When several rows share a title, the first one is used.

    Args:
        movie_names: Iterable of movie titles taken from the prompt.

    Returns:
        A NumPy array with one row of ``movie_ef`` per matched title, in the
        order given. The array is empty when nothing matched.
    """
    # Normalize the catalogue titles once instead of once per requested name.
    catalogue_titles = MOVIES['Series_Title'].astype(str).str.strip().str.casefold().to_numpy()

    movie_embeddings = []
    for name in movie_names:
        # Row positions (not index labels) are what line up with movie_ef.
        positions = np.flatnonzero(catalogue_titles == name.strip().casefold())
        if positions.size:
            movie_embeddings.append(movie_ef[positions[0]])
        else:
            # Tell the user instead of silently dropping the title.
            print(f'No movie titled "{name}" was found; ignoring it.')
    return np.array(movie_embeddings)

def calculate_weighted_similarity(user_embedding, user_movie_embeddings, movie_embeddings, movie_weight=0.5):
    """Score every candidate movie against the prompt and the liked movies.

    Args:
        user_embedding: Embedding of the prompt text; reshaped to one row.
        user_movie_embeddings: Embeddings of the movies the user named, one
            per row (a single 1-D embedding is also accepted). May be empty.
        movie_embeddings: Embeddings of the candidate movies, one per row.
        movie_weight: Share of the final score that comes from similarity to
            the named movies; the prompt similarity gets ``1 - movie_weight``.
            The default of 0.5 weighs both equally.

    Returns:
        An array of shape ``(1, n_candidates)``. With no named movies this is
        the plain prompt similarity; otherwise it is the weighted blend of the
        prompt similarity and the mean similarity to the named movies.

    Raises:
        ValueError: If ``movie_weight`` is outside ``[0, 1]``.
    """
    if not 0.0 <= movie_weight <= 1.0:
        raise ValueError("movie_weight must be between 0 and 1")

    user_similarities = cosine_similarity(user_embedding.reshape(1, -1), movie_embeddings)

    # Nothing to blend in when the user named no (known) movies.
    if user_movie_embeddings.size == 0:
        return user_similarities

    movie_similarities = cosine_similarity(np.atleast_2d(user_movie_embeddings), movie_embeddings)
    # Average over the named movies, keeping the (1, n_candidates) shape.
    mean_movie_similarities = movie_similarities.mean(axis=0, keepdims=True)
    return (1.0 - movie_weight) * user_similarities + movie_weight * mean_movie_similarities

def recommend_movies(user_input, top_n=15):
    """Recommend the movies most similar to the user's prompt.

    The prompt may contain movie titles in double quotes and a minimum rating
    written as ``imdb:<number>``. Candidates are ranked by
    ``calculate_weighted_similarity`` and the best ``top_n`` are returned.

    Args:
        user_input: Raw prompt text.
        top_n: Maximum number of recommendations to return (default 15).

    Returns:
        A DataFrame with columns ``Series_Title``, ``IMDB_Rating`` and
        ``Genre``, ordered from most to least similar. It has fewer than
        ``top_n`` rows when fewer candidates pass the rating filter, and no
        rows when none do.

    Raises:
        ValueError: If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    result_columns = ['Series_Title', 'IMDB_Rating', 'Genre']

    # Embed the prompt the same way the movie overviews were embedded.
    processed_input = preprocess_user_input(user_input)
    user_embedding = get_bert_embedding(processed_input)

    # Filter movies by IMDb rating if specified. A boolean mask selects rows
    # by position, which keeps `movies` and the rows of `movie_ef` aligned.
    imdb_score = extract_imdb_rating(user_input)
    if imdb_score is not None:
        keep = (MOVIES['IMDB_Rating'] >= imdb_score).to_numpy()
        movies = MOVIES[keep]
        print(f"Filtered to {len(movies)} movies with IMDb rating >= {imdb_score}")
    else:
        keep = np.ones(len(MOVIES), dtype=bool)
        movies = MOVIES
    filtered_embeddings = movie_ef[keep]

    # With no candidates left there is nothing to score.
    if movies.empty:
        return movies[result_columns].reset_index(drop=True)

    # Get user-specified movie embeddings
    user_movie_names = extract_movie_names(user_input)
    user_movie_embeddings = get_movie_embeddings(user_movie_names)

    # Calculate similarity scores
    similarity_scores = calculate_weighted_similarity(user_embedding, user_movie_embeddings, filtered_embeddings)

    # Sort descending, then keep the best top_n positions.
    top_indices = np.argsort(similarity_scores[0])[::-1][:top_n]
    return movies.iloc[top_indices][result_columns].reset_index(drop=True)

## Interactive Recommendation

In [9]:
# Explain the prompt syntax to the user
print("Movie Recommendation System")
print("=" * 50)
for instruction in (
    "1. Mention movie names in double quotes: \"Inception\"",
    "2. Specify minimum IMDb rating: imdb:8.0",
    "3. Example: \"The Dark Knight\" action thriller imdb:8.5",
):
    print(instruction)
print()

# Read the prompt and echo it back
user_input = input("Enter your prompt: ")
print(f"\nPrompt: {user_input}")

# Rank the movies for this prompt
recommended_movies = recommend_movies(user_input)

# Print the ranked list, numbering from 1
print(f"\nTop {len(recommended_movies)} Recommended Movies:")
print("=" * 60)
for rank, movie in enumerate(recommended_movies.itertuples(index=False), start=1):
    print(f"{rank:2d}. {movie.Series_Title} (IMDb: {movie.IMDB_Rating}) - {movie.Genre}")

Movie Recommendation System
1. Mention movie names in double quotes: "Inception"
2. Specify minimum IMDb rating: imdb:8.0
3. Example: "The Dark Knight" action thriller imdb:8.5


Prompt: the great gatsby

Top 15 Recommended Movies:
 1. Das Boot (IMDb: 8.3) - Adventure, Drama, Thriller
 2. The Last Emperor (IMDb: 7.7) - Biography, Drama, History
 3. A Man for All Seasons (IMDb: 7.7) - Biography, Drama, History
 4. The Perks of Being a Wallflower (IMDb: 8.0) - Drama, Romance
 5. The Longest Day (IMDb: 7.8) - Action, Drama, History
 6. The Lord of the Rings: The Fellowship of the Ring (IMDb: 8.8) - Action, Adventure, Drama
 7. The World's Fastest Indian (IMDb: 7.8) - Biography, Drama, Sport
 8. Magnolia (IMDb: 8.0) - Drama
 9. Fantasia (IMDb: 7.7) - Animation, Family, Fantasy
10. M.S. Dhoni: The Untold Story (IMDb: 7.8) - Biography, Drama, Sport
11. Spotlight (IMDb: 8.1) - Biography, Crime, Drama
12. The King's Speech (IMDb: 8.0) - Biography, Drama, History
13. Dazed and Confused (IMDb: 7