In [36]:
import os

import pandas as pd
import requests
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Local movie catalogue; its 'title', 'genres', 'keywords', 'overview', 'cast'
# and 'director' columns are read further down.
data = pd.read_csv("movies.csv")

# Title-cased, whitespace-trimmed user input; used as the TMDB search query.
movie_title = input("Enter Movie Title: ").strip().title()

# SECURITY: an API key committed to source is effectively public. Prefer
# supplying it through the TMDB_API_KEY environment variable; the literal is
# kept only as a backward-compatible fallback and should be rotated/removed.
API_KEY = os.environ.get('TMDB_API_KEY') or '40aef168ab16c2f7c59380272ba1b17e'
BASE_URL = 'https://api.themoviedb.org/3'

def get_movie_details(movie_name):
    """Fetch metadata for the first TMDB search result matching ``movie_name``.

    The title, overview and genres come from the TMDB REST API (a search
    request followed by a movie-detail request). Cast, keywords and director
    are scraped from the movie's public themoviedb.org HTML page.

    Args:
        movie_name: Free-text title sent as the search query.

    Returns:
        A dict with the keys ``title``, ``director``, ``overview``,
        ``genres``, ``cast``, ``keywords`` and ``original_title``, or the
        string ``"No movie found."`` when the search yields no results.

    Raises:
        requests.RequestException: If a request times out or fails at the
            network level, or if a TMDB API call returns an HTTP error
            status (for example an invalid API key).
    """
    # Without a timeout, requests waits indefinitely on a stalled connection.
    timeout = 10  # seconds

    search_url = f"{BASE_URL}/search/movie"
    params = {
        'api_key': API_KEY,
        'query': movie_name
    }

    response = requests.get(search_url, params=params, timeout=timeout)
    # Surface API errors explicitly instead of reporting them as "no results".
    response.raise_for_status()
    results = response.json().get('results')

    if not results:
        return "No movie found."

    # Only the first search hit is considered.
    movie_id = results[0]['id']

    detail_url = f"{BASE_URL}/movie/{movie_id}"
    detail_response = requests.get(
        detail_url, params={'api_key': API_KEY}, timeout=timeout
    )
    detail_response.raise_for_status()
    movie_details = detail_response.json()

    # The HTML page is parsed best-effort: whatever markup comes back is
    # searched, and anything not found simply yields empty fields.
    actual_url = f"https://www.themoviedb.org/movie/{movie_id}"
    website = requests.get(actual_url, timeout=timeout)
    soup = BeautifulSoup(website.text, 'html.parser')

    # Each cast card's text is trimmed and only its first line is kept
    # (presumably the actor name, with the role on a later line).
    cast_cards = soup.find_all('li', class_='card')
    actors_without_roles = [
        card.get_text().strip().split('\n')[0] for card in cast_cards
    ]

    # The first 11 characters of the section text are dropped (presumably the
    # "Keywords" heading and surrounding whitespace - verify against the page).
    keyword_sections = soup.find_all('section', class_='keywords right_column')
    keywords_cleaned = [
        section.get_text()[11:].strip().replace('\n', ', ')
        for section in keyword_sections
    ]

    # The director is taken from the third line of each crew list; lists
    # with fewer lines are skipped rather than raising IndexError.
    director = []
    for crew_list in soup.find_all('ol', class_='people no_image'):
        crew_lines = crew_list.get_text().split('\n')
        if len(crew_lines) > 2:
            director.append(crew_lines[2])

    title = movie_details.get('title')
    overview = movie_details.get('overview')
    genre_name = [genre['name'] for genre in movie_details.get('genres') or []]

    return {
        'title': title,
        'director': ', '.join(director),
        'overview': overview,
        'genres': ' '.join(genre_name),
        'cast': ' '.join(actors_without_roles),
        'keywords': ' '.join(keywords_cleaned),
        # Populated from the display title, not TMDB's own original_title.
        'original_title': title,
    }

movie_info = get_movie_details(movie_title)

# get_movie_details returns a plain message string when the search has no
# results; stop with that message instead of failing on movie_info['title'].
if isinstance(movie_info, str):
    raise SystemExit(movie_info)

# Append the fetched movie as a new row when the catalogue does not have it.
# Keys missing from movie_info are left as NaN and blanked by fillna below.
if movie_info['title'] not in data['title'].values:
    data.loc[len(data)] = movie_info

# Blank out missing values first so the selected row never shows NaN fields
# (e.g. a missing director printed as "nan").
data = data.fillna('')
chosen_movie = data.loc[data['title'] == movie_info['title']]

def clean_data(x):
    """Fuse consecutive word pairs of ``x`` into lowercase tokens.

    The whitespace-separated words are taken two at a time and concatenated
    without a space (e.g. first name + surname); a trailing unpaired word is
    kept on its own. Non-string or blank input yields an empty string.
    """
    if not isinstance(x, str) or not x.strip():
        return ''

    tokens = x.split()
    fused = [
        ''.join(tokens[start:start + 2]).lower()
        for start in range(0, len(tokens), 2)
    ]
    return ' '.join(fused)

def simple_clean(x):
    """Return ``x`` lowercased, or an empty string when ``x`` is not a str."""
    return x.lower() if isinstance(x, str) else ''

# Work on a copy restricted to the columns that feed the similarity text.
relevant = ['genres', 'keywords', 'overview', 'title', 'cast', 'director']
needed = data[relevant].copy()

# Person-name columns get word pairs fused by clean_data; tag-like columns
# are only lowercased by simple_clean.
column_cleaners = {
    'cast': clean_data,
    'director': clean_data,
    'genres': simple_clean,
    'keywords': simple_clean,
}
for col, cleaner in column_cleaners.items():
    needed[col] = needed[col].apply(cleaner)

def create_soup(x):
    """Build the weighted text blob used to compare movies.

    Each field of the row ``x`` is repeated (space-terminated) according to
    its weight - genres x5, keywords x4, director x5, cast x3 - and the
    overview is appended once at the end.
    """
    weighted_fields = (
        ('genres', 5),
        ('keywords', 4),
        ('director', 5),
        ('cast', 3),
    )
    pieces = [(x[field] + ' ') * weight for field, weight in weighted_fields]
    pieces.append(x['overview'])
    return ''.join(pieces)

needed['soup'] = needed.apply(create_soup, axis=1)

# Count unigrams and bigrams of every movie's soup (English stop words
# removed) and compare all movies pairwise by cosine similarity.
cv = CountVectorizer(stop_words='english', ngram_range=(1, 2))
matrix = cv.fit_transform(needed['soup'])
cosine_sim = cosine_similarity(matrix)

# Row labels double as matrix positions: `data` carries the default integer
# index from read_csv, and any appended row was labelled len(data).
chosen_idx = chosen_movie.index[0]
movie_similarities = list(enumerate(cosine_sim[chosen_idx]))

distance = sorted(movie_similarities, reverse=True, key=lambda x: x[1])

# Exclude the chosen movie by index rather than assuming it sorts first (a
# tie at similarity 1.0 could otherwise list it as its own recommendation),
# and cap the list so small catalogues do not raise IndexError.
top_n = 10
recommendations = [pair for pair in distance if pair[0] != chosen_idx][:top_n]

print(f"\nMovie Title: {chosen_movie['title'].iloc[0]}")
print(f"Directed By: {chosen_movie['director'].iloc[0]}\n")

# The header now reports the number of rows actually printed; it previously
# said "Top 5" while ten rows were listed.
print(f"Top {len(recommendations)} similar movies to '{movie_info['title']}':")
for i, (similar_movie_idx, similarity_score) in enumerate(recommendations, start=1):
    similar_movie_title = needed.loc[similar_movie_idx, 'title']
    print(f"  {i}. {similar_movie_title} (Similarity: {similarity_score:.2f})")

Enter Movie Title: shutter island

Movie Title: Shutter Island
Directed By: Martin Scorsese

Top 5 similar movies to 'Shutter Island':
  1. The Sisterhood of Night (Similarity: 0.30)
  2. The Girl with the Dragon Tattoo (Similarity: 0.30)
  3. Pi (Similarity: 0.27)
  4. Arlington Road (Similarity: 0.27)
  5. Amnesiac (Similarity: 0.27)
  6. Dream House (Similarity: 0.27)
  7. Goddess of Love (Similarity: 0.27)
  8. Slow Burn (Similarity: 0.26)
  9. All the King's Men (Similarity: 0.26)
  10. Chloe (Similarity: 0.26)
