In [None]:
!pip install nltk



In [53]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem.porter import PorterStemmer

data = pd.read_csv("movies.csv")
movie_title = input("Enter Movie Title: ").title()
API_KEY = '40aef168ab16c2f7c59380272ba1b17e'
BASE_URL = 'https://api.themoviedb.org/3'

def get_movie_details(movie_name):
    search_url = f"{BASE_URL}/search/movie"
    params = {
        'api_key': API_KEY,
        'query': movie_name
    }

    response = requests.get(search_url, params=params)
    results = response.json().get('results')

    if not results:
        return "No movie found."

    first_movie = results[0]
    movie_id = first_movie['id']

    detail_url = f"{BASE_URL}/movie/{movie_id}"
    detail_params = {'api_key': API_KEY}
    actual_url = f"https://www.themoviedb.org/movie/{movie_id}"

    movie_details = requests.get(detail_url, params=detail_params).json()
    website = requests.get(actual_url)
    soup = BeautifulSoup(website.text, 'html.parser')

    keyword_response = requests.get(f"{detail_url}/keywords", params=detail_params)
    keyword_response_data = keyword_response.json()
    if 'keywords' in keyword_response_data:
        keyword_names = [kw['name'] for kw in keyword_response_data['keywords']]
    else:
        keyword_names = []

    director_response = requests.get(f"{detail_url}/credits", params=detail_params)
    director_response_data = director_response.json()
    director_names_list = []
    if 'crew' in director_response_data:
        for member in director_response_data['crew']:
            if member.get('job') == 'Director':
                director_names_list.append(member.get('name'))

    actor_response = requests.get(f"{detail_url}/credits", params=detail_params)
    actor_response_data = actor_response.json()
    actor_names = []
    actor_list = actor_response_data['cast']
    for actor in actor_list[:]:
            actor_names.append(actor['name'])

    title = movie_details.get('title')
    overview = movie_details.get('overview')
    genre_name = [genre['name'] for genre in movie_details['genres']]

    return {
        'title': title,
        'director': ', '.join(director_names_list),
        'overview': overview,
        'genres': ' '.join(genre_name),
        'cast': ' '.join(actor_names),
        'keywords': ' '.join(keyword_names),
        'original_title': title,
    }

movie_info = get_movie_details(movie_title)

if movie_info['title'] not in data['title'].values:
  data.loc[len(data)] = movie_info

chosen_movie = data.loc[data['title'] == movie_info['title']]
data = data.fillna('')

def clean_data(x):
    if isinstance(x, str) and x.strip():
        words = x.split()
        cleaned_items = []
        i = 0
        while i < len(words):
            if i + 1 < len(words):
                combined = (words[i] + words[i+1]).lower()
                cleaned_items.append(combined)
                i += 2
            else:
                cleaned_items.append(words[i].lower())
                i += 1

        return " ".join(cleaned_items)
    return ''

def simple_clean(x):
    if isinstance(x, str):
        return x.lower()
    return ''

relevant = ['genres', 'keywords', 'overview', 'title', 'cast', 'director']
needed = data[relevant].copy()

for col in ['cast', 'director']:
    needed[col] = needed[col].apply(clean_data)

for col in ['genres', 'keywords']:
  needed[col] = needed[col].apply(simple_clean)

def create_soup(x):
    return (x['genres'] + ' ') * 4 + \
           (x['keywords'] + ' ') * 3 + \
           (x['director'] + ' ') * 4 + \
           (x['cast'] + ' ') * 2

ps = PorterStemmer()

def stem_text(text):
    y = []
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y)

needed['soup'] = needed.apply(create_soup, axis=1).apply(stem_text)
data['overview_stemmed'] = data['overview'].fillna('').apply(stem_text)

cv = CountVectorizer(stop_words='english', ngram_range=(1, 2))
matrix = cv.fit_transform(needed['soup'])
cosine_sim = cosine_similarity(matrix)

td = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
tfidf_matrix = td.fit_transform(data['overview_stemmed'])
tdif_sim = cosine_similarity(tfidf_matrix)

final_sim = (0.6 * cosine_sim) + (0.4 * tdif_sim)

movie_similarities = list(enumerate(final_sim[chosen_movie.index[0]]))

distance = sorted(movie_similarities, reverse=True, key=lambda x: x[1])

print(f"\nMovie Title: {str(chosen_movie['title'].iloc[0])}")
print(f"Directed By: {str(chosen_movie['director'].iloc[0])}\n")

print(f"Top 5 similar movies to '{movie_info['title']}':")
for i in range(1, 11):
    similar_movie_idx = distance[i][0]
    similarity_score = distance[i][1]
    similar_movie_title = needed.loc[similar_movie_idx, 'title']
    print(f"  {i}. {similar_movie_title} (Similarity: {similarity_score:.2f})")


Enter Movie Title: transformers

Movie Title: Transformers
Directed By: Michael Bay

Top 5 similar movies to 'Transformers':
  1. Transformers: Revenge of the Fallen (Similarity: 0.40)
  2. Transformers: Age of Extinction (Similarity: 0.36)
  3. Transformers: Dark of the Moon (Similarity: 0.29)
  4. Independence Day: Resurgence (Similarity: 0.28)
  5. Fantastic Four (Similarity: 0.28)
  6. Six-String Samurai (Similarity: 0.27)
  7. After Earth (Similarity: 0.27)
  8. U.F.O. (Similarity: 0.26)
  9. Mad Max Beyond Thunderdome (Similarity: 0.25)
  10. Ender's Game (Similarity: 0.25)


In [54]:
print(needed)

                                        genres  \
0     action adventure fantasy science fiction   
1                     adventure fantasy action   
2                       action adventure crime   
3                  action crime drama thriller   
4             action adventure science fiction   
...                                        ...   
4798                     action crime thriller   
4799                            comedy romance   
4800             comedy drama romance tv movie   
4801                                             
4802                               documentary   

                                               keywords  \
0     culture clash future space war space colony so...   
1     ocean drug abuse exotic island east india trad...   
2            spy based on novel secret agent sequel mi6   
3     dc comics crime fighter terrorist secret ident...   
4     based on novel mars medallion space travel pri...   
...                                          