In [None]:
import ast
import getpass
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from scipy.sparse import hstack, csr_matrix
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
movie_title = input("Enter Movie Title: ").title()
# The TMDB key is a credential, so it must not be committed with the notebook:
# read it from the TMDB_API_KEY environment variable, and fall back to a hidden
# prompt when the variable is unset. Any key that was previously hardcoded here
# should be treated as leaked and rotated.
API_KEY = os.environ.get('TMDB_API_KEY') or getpass.getpass("Enter TMDB API key: ")
BASE_URL = 'https://api.themoviedb.org/3'

def get_movie_details(movie_name, timeout=10):
    """Look up a movie on TMDB and collect the fields used to build its text soup.

    The first hit of the TMDB search endpoint is taken as the match. Title,
    overview and genres come from the JSON details endpoint; the remaining
    fields are scraped from the movie's public themoviedb.org HTML page.

    Args:
        movie_name: Title to search for.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        A dict with the keys 'title', 'director', 'overview', 'genre_names',
        'actors' and 'keywords', or the string "No movie found." when the
        search yields no results.

    Raises:
        requests.HTTPError: If the search or details API call returns an
            error status.
        requests.Timeout: If any request exceeds ``timeout``.
    """
    api_params = {'api_key': API_KEY}

    # Search for the movie
    search_response = requests.get(
        f"{BASE_URL}/search/movie",
        params={**api_params, 'query': movie_name},
        timeout=timeout,
    )
    # An error payload (bad key, rate limit) has no 'results'; raise instead of
    # misreporting it as "No movie found."
    search_response.raise_for_status()
    results = search_response.json().get('results')

    if not results:
        return "No movie found."

    # Get the ID of the first result
    movie_id = results[0]['id']

    # Fetch full details using the movie ID
    detail_response = requests.get(
        f"{BASE_URL}/movie/{movie_id}",
        params=api_params,
        timeout=timeout,
    )
    detail_response.raise_for_status()
    movie_details = detail_response.json()

    # The HTML scrape stays best-effort: a failed page simply yields empty lists.
    website = requests.get(f"https://www.themoviedb.org/movie/{movie_id}", timeout=timeout)
    soup = BeautifulSoup(website.text, 'html.parser')

    # Each 'card' list item holds several lines of text; only the first line is kept.
    cast_cards = soup.find_all('li', class_='card')
    actors_with_roles = [card.get_text().strip() for card in cast_cards]
    actors_without_roles = [entry.split('\n')[0] for entry in actors_with_roles]

    # Drops the first 11 characters of the section text -- presumably the section
    # heading; TODO confirm against the live page markup.
    keyword_sections = soup.find_all('section', class_='keywords right_column')
    keywords_cleaned = [
        section.get_text()[11:].strip().replace('\n', ', ')
        for section in keyword_sections
    ]

    # Takes the third line of each 'people no_image' list.
    # NOTE(review): presumably the first credited name, which may not always be
    # the director -- confirm. Lists that are too short are skipped instead of
    # raising IndexError.
    director_lists = soup.find_all('ol', class_='people no_image')
    director_text_cleaned = []
    for people_list in director_lists:
        text_lines = people_list.get_text().split('\n')
        if len(text_lines) > 2:
            director_text_cleaned.append(text_lines[2])

    genre_names = [genre['name'] for genre in movie_details.get('genres') or []]

    return {
        'title': movie_details.get('title'),
        'director': director_text_cleaned,
        'overview': movie_details.get('overview'),
        'genre_names': genre_names,
        'actors': actors_without_roles,
        'keywords': keywords_cleaned,
    }

movie_info = get_movie_details(movie_title)
# get_movie_details returns a plain string when the search has no results.
# Stop here with a clear message rather than failing in a later cell when the
# result is used as a dict.
if not isinstance(movie_info, dict):
    raise ValueError(f"Could not load TMDB details for {movie_title!r}: {movie_info}")

Enter Movie Title: American Psycho


In [None]:
def create_soup_from_scraped_dict(movie):
    """Flatten a scraped movie dict into one lowercase text "soup" string.

    Director, genre and actor names are lowercased with their internal spaces
    removed so each name becomes a single token. Keywords are split on commas
    and normalised the same way. The overview is only lowercased.

    NOTE(review): unlike ``create_soup``, which repeats director/keywords/genres
    5/4/3 times, every field appears once here -- confirm that the query side
    is meant to be unweighted.

    Args:
        movie: Mapping with optional keys 'director', 'genre_names', 'actors'
            (lists), 'keywords' (list of strings or one comma-separated string)
            and 'overview' (string). Missing or ``None`` values count as empty.

    Returns:
        The fields joined by single spaces, in the order director, genres,
        keywords, actors, overview.
    """
    def clean_and_join(items):
        # Collapse each entry into one lowercase token; non-lists yield no tokens.
        if isinstance(items, list):
            return [str(i).replace(" ", "").lower() for i in items]
        return []

    # `or ''` also covers a key that is present but None, which .get's default does not.
    raw_keywords = movie.get('keywords') or ''
    if isinstance(raw_keywords, list):
        raw_keywords = ", ".join(str(k) for k in raw_keywords)
    # "car race, villain" -> ['carrace', 'villain']; empty fragments are dropped.
    normalised_keywords = (k.strip().replace(" ", "").lower() for k in raw_keywords.split(','))
    keywords_list = [token for token in normalised_keywords if token]

    director = clean_and_join(movie.get('director', []))
    genres = clean_and_join(movie.get('genre_names', []))
    actors = clean_and_join(movie.get('actors', []))

    # The overview stays natural language; the scraper may hand back None for it.
    overview = (movie.get('overview') or '').lower()

    soup = " ".join([
        " ".join(director),
        " ".join(genres),
        " ".join(keywords_list),
        " ".join(actors),
        overview,
    ])

    return soup

In [None]:
# Load the movie catalogue from the CSV file next to the notebook
data = pd.read_csv(filepath_or_buffer='movies.csv')

# 1. Normalise one cell of a token-like column (cast, director, genres, keywords)
def joinall(x):
    """Lowercase a cell value and strip its spaces.

    Args:
        x: A list of names, a single value, or a missing value (None/NaN).

    Returns:
        For a list, a list with each item lowercased and its spaces removed.
        For a missing value, the empty string. Otherwise ``str(x)`` lowercased
        with all spaces removed.
    """
    if isinstance(x, list):
        return [str(i).replace(" ", "").lower() for i in x]
    # Without this check str(NaN) becomes the literal token "nan", which a later
    # fillna('') can no longer detect.
    if pd.api.types.is_scalar(x) and pd.isna(x):
        return ''
    return str(x).replace(" ", "").lower()

# Run joinall over each token-like column of `data`, replacing the column in place
for token_column in ('cast', 'director', 'genres', 'keywords'):
    data[token_column] = data[token_column].apply(joinall)

# 2. Copy out only the columns used downstream and blank any missing values
relevant = ['genres', 'keywords', 'overview', 'title', 'cast', 'director', 'vote_average']
needed = data[relevant].copy().fillna('')

# 3. Build the weighted text soup for one catalogue row
def create_soup(x):
    """Concatenate a row's text fields into one space-separated string.

    The director field is repeated 5 times, keywords 4 times and genres 3 times;
    cast and overview each appear once, in that order.
    """
    weighted_fields = (
        [x['director']] * 5
        + [x['keywords']] * 4
        + [x['genres']] * 3
        + [x['cast'], x['overview']]
    )
    return ' '.join(weighted_fields)

# Attach the weighted soup string to every catalogue row
needed['soup'] = needed.apply(create_soup, axis='columns')

# 4. Fit TF-IDF on the catalogue soups, map the scraped movie into the same
#    vocabulary, and score it against every catalogue row
tfidf = TfidfVectorizer(stop_words='english')
csv_matrix = tfidf.fit_transform(needed['soup'])
scraped_matrix = tfidf.transform(
    [create_soup_from_scraped_dict(movie_info)]
)
cosine_sim = cosine_similarity(X=scraped_matrix, Y=csv_matrix)

# 5. Rebuild the index of `data` as 0..n-1 so row positions line up with the
#    columns of cosine_sim
data = data.reset_index(drop=True)

# Rank catalogue movies for a scraped query movie
def get_recommendations_for_scraped_movie(scraped_movie_similarity_scores, data_df, input_title=None,
                                          min_rating=6.5, top_n=10):
    """Return the most similar catalogue movies that clear a rating threshold.

    Args:
        scraped_movie_similarity_scores: One similarity score per row of
            ``data_df``, in row order.
        data_df: DataFrame with 'title' and 'vote_average' columns.
        input_title: Title of the query movie; rows whose title matches it
            (case-insensitively) are skipped. When ``None``, the title of the
            module-level ``movie_info`` dict is used if one exists. It is
            looked up at call time, so the filter follows the current query.
        min_rating: Only rows with ``vote_average`` strictly above this value
            are kept.
        top_n: Maximum number of recommendations to return.

    Returns:
        DataFrame with columns 'Movie Title', 'Similarity Score' and 'Rating',
        ordered by descending similarity.
    """
    if input_title is None:
        current_movie = globals().get('movie_info')
        if isinstance(current_movie, dict):
            input_title = current_movie.get('title')
    excluded_title = input_title.lower() if isinstance(input_title, str) else None

    # Pull the two columns out once instead of materialising a row per iteration.
    titles = data_df['title'].to_numpy()
    ratings = data_df['vote_average'].to_numpy()

    ranked = sorted(enumerate(scraped_movie_similarity_scores), key=lambda pair: pair[1], reverse=True)

    qualified_movies_indices = []
    qualified_movies_scores = []

    for position, score in ranked:
        # Stop once we have enough recommendations
        if len(qualified_movies_indices) >= top_n:
            break

        current_title = titles[position]
        # Titles can be NaN in the raw catalogue, so only strings are compared.
        if (excluded_title is not None and isinstance(current_title, str)
                and current_title.lower() == excluded_title):
            continue

        if ratings[position] > min_rating:
            qualified_movies_indices.append(position)
            qualified_movies_scores.append(score)

    recommendations = pd.DataFrame({
        'Movie Title': data_df['title'].iloc[qualified_movies_indices].values,
        'Similarity Score': qualified_movies_scores,
        'Rating': data_df['vote_average'].iloc[qualified_movies_indices].values
    })
    return recommendations
# Rank the catalogue against the single query row of cosine_sim and print the result
print(get_recommendations_for_scraped_movie(
    scraped_movie_similarity_scores=cosine_sim[0],
    data_df=data,
))

                    Movie Title  Similarity Score  Rating
0                  Midnight Run          0.111047     7.2
1                  Billy Elliot          0.061251     7.4
2                      Salvador          0.042236     7.0
3       The Wolf of Wall Street          0.038474     7.9
4           The Lives of Others          0.037236     7.9
5  The Texas Chain Saw Massacre          0.037023     7.2
6                   Margin Call          0.035948     6.7
7                 The Godfather          0.035062     8.4
8                     Eden Lake          0.033429     6.7
9      Night of the Living Dead          0.032429     7.5
