In [62]:
import pandas as pd

# Load the movie table from movies.csv (path is relative to the working
# directory). Later cells read its 'title', 'genres', 'keywords', 'overview',
# 'cast' and 'director' columns.
data = pd.read_csv("movies.csv")

In [63]:
import os
from getpass import getpass

import requests
from bs4 import BeautifulSoup

# Title typed by the user; it is only used as the TMDB search query.
movie_title = input("Enter Movie Title: ").title()
# Credentials must not live in the notebook: take the TMDB key from the
# TMDB_API_KEY environment variable, and fall back to a no-echo prompt so the
# notebook still runs interactively when the variable is not set.
API_KEY = os.environ.get('TMDB_API_KEY') or getpass("Enter TMDB API key: ")
BASE_URL = 'https://api.themoviedb.org/3'

def get_movie_details(movie_name, timeout=10):
    """Look up a movie on TMDB and collect the fields used by the recommender.

    The first result of the TMDB search for ``movie_name`` is used. Title,
    overview and genres come from the JSON API; cast, keywords and director
    are scraped from the movie's page on www.themoviedb.org.

    Parameters
    ----------
    movie_name : str
        Text sent as the ``query`` of the TMDB movie search.
    timeout : float, optional
        Seconds to wait on each HTTP request before giving up (default 10).

    Returns
    -------
    dict or str
        A dict with the keys 'title', 'director', 'overview', 'genres',
        'cast', 'keywords' and 'original_title' (multi-valued fields are
        joined with ', '), or the string "No movie found." when the search
        returns no results.

    Raises
    ------
    requests.HTTPError
        If the search or detail API call answers with an error status.
    requests.Timeout
        If a request does not complete within ``timeout`` seconds.
    """
    search_url = f"{BASE_URL}/search/movie"
    params = {
        'api_key': API_KEY,
        'query': movie_name
    }

    response = requests.get(search_url, params=params, timeout=timeout)
    # An error payload has no 'results' key; fail loudly instead of
    # misreporting an API problem (bad key, rate limit) as "No movie found.".
    response.raise_for_status()
    results = response.json().get('results')

    if not results:
        return "No movie found."

    movie_id = results[0]['id']

    detail_url = f"{BASE_URL}/movie/{movie_id}"
    detail_params = {'api_key': API_KEY}
    actual_url = f"https://www.themoviedb.org/movie/{movie_id}"

    detail_response = requests.get(detail_url, params=detail_params, timeout=timeout)
    detail_response.raise_for_status()
    movie_details = detail_response.json()

    # The page scrape stays best-effort: a failed or changed page yields empty
    # cast/keywords/director strings rather than an exception.
    website = requests.get(actual_url, timeout=timeout)
    soup = BeautifulSoup(website.text, 'html.parser')

    # Each cast card's text is stripped and only its first line is kept.
    cast_cards = soup.find_all('li', class_='card')
    actors_without_roles = [
        card.get_text().strip().split('\n')[0] for card in cast_cards
    ]

    # The first 11 characters of the section text are dropped -- presumably the
    # section heading; TODO confirm against the live page markup.
    keyword_cards = soup.find_all('section', class_='keywords right_column')
    keywords_cleaned = [
        section.get_text()[11:].strip().replace('\n', ', ')
        for section in keyword_cards
    ]

    # The third newline-separated piece of each crew list is taken as the
    # director's name; lists too short to have one are skipped instead of
    # raising IndexError.
    director_card = soup.find_all('ol', class_='people no_image')
    director = []
    for crew_list in director_card:
        pieces = crew_list.get_text().split('\n')
        if len(pieces) > 2:
            director.append(pieces[2])

    title = movie_details.get('title')
    overview = movie_details.get('overview')
    genre_name = [genre['name'] for genre in movie_details.get('genres') or []]

    return {
        'title': title,
        'director': ', '.join(director),
        'overview': overview,
        'genres': ', '.join(genre_name),
        'cast': ', '.join(actors_without_roles),
        'keywords': ', '.join(keywords_cleaned),
        # Use the API's own original-language title, falling back to the
        # display title when the field is absent or empty.
        'original_title': movie_details.get('original_title') or title,
    }

movie_info = get_movie_details(movie_title)

# get_movie_details reports "no search results" by returning a message string
# instead of a dict. Stop here with that message rather than failing on
# movie_info['title'] with an unrelated TypeError.
if isinstance(movie_info, str):
    raise ValueError(f"{movie_info} (searched for {movie_title!r})")

# Append the looked-up movie as a new row unless its title is already present.
# len(data) is used as the new row label.
if movie_info['title'] not in data['title'].values:
    data.loc[len(data)] = movie_info

# Row(s) whose title matches the looked-up movie; read by the ranking cell.
chosen_movie = data.loc[data['title'] == movie_info['title']]
# Blank out missing values so the text columns can be concatenated later.
data = data.fillna('')

Enter Movie Title: cars


In [64]:
def create_soup(x):
    """Build the weighted text blob for one movie row.

    Each of 'genres', 'keywords', 'cast' and 'director' is followed by a
    space and repeated 4, 2, 3 and 4 times respectively; the 'overview' text
    is appended once at the end, with nothing after it.
    """
    repeats = (('genres', 4), ('keywords', 2), ('cast', 3), ('director', 4))
    chunks = [(x[field] + ' ') * times for field, times in repeats]
    chunks.append(x['overview'])
    return ''.join(chunks)

# Columns carried into the working frame: the five text fields that
# create_soup reads, plus 'title'.
relevant = ['genres', 'keywords', 'overview', 'title', 'cast', 'director']

# Work on a separate frame with only those columns and add the per-row
# 'soup' text built by create_soup.
needed = data[relevant].copy().assign(
    soup=lambda frame: frame.apply(create_soup, axis=1)
)

In [65]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Count unigrams and bigrams in every soup string, ignoring English stop words.
cv = CountVectorizer(stop_words='english', ngram_range=(1, 2))
matrix = cv.fit_transform(needed['soup'])
# Cosine similarity between every pair of rows of the count matrix.
# NOTE(review): the ranking cell reads only the chosen movie's row of this
# result, so the full pairwise matrix is more than that cell needs.
cosine_sim = cosine_similarity(matrix)

In [68]:
# Number of recommendations to print. The heading is derived from the list
# actually shown, so the heading and the loop can no longer disagree.
TOP_N = 10

# Row label of the looked-up movie; used as a position into cosine_sim.
chosen_idx = chosen_movie.index[0]

# (row position, similarity to the chosen movie) for every row, best first.
movie_similarities = list(enumerate(cosine_sim[chosen_idx]))
distance = sorted(movie_similarities, reverse=True, key=lambda x: x[1])

# Drop the chosen movie by index rather than assuming it sorts first: another
# row with an equal score could otherwise push it into its own recommendations.
# Slicing also copes with datasets that have fewer than TOP_N other rows.
recommendations = [pair for pair in distance if pair[0] != chosen_idx][:TOP_N]

print(f"Top {len(recommendations)} similar movies to '{movie_info['title']}':")
for rank, (similar_movie_idx, similarity_score) in enumerate(recommendations, start=1):
    similar_movie_title = needed.loc[similar_movie_idx, 'title']
    print(f"  {rank}. {similar_movie_title} (Similarity: {similarity_score:.2f})")

Top 5 similar movies to 'Cars':
  1. Cars 2 (Similarity: 0.49)
  2. Penguins of Madagascar (Similarity: 0.27)
  3. The SpongeBob Movie: Sponge Out of Water (Similarity: 0.27)
  4. Toy Story (Similarity: 0.26)
  5. Toy Story 2 (Similarity: 0.25)
  6. Rio (Similarity: 0.24)
  7. Minions (Similarity: 0.24)
  8. Rio 2 (Similarity: 0.23)
  9. The Book of Life (Similarity: 0.23)
  10. Bee Movie (Similarity: 0.22)
