In [1]:
import numpy as np
import pandas as pd
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import ast

# Load the data
movies = pd.read_csv('tmdb_5000_movies.csv')
credit = pd.read_csv('tmdb_5000_credits.csv')

# Merge the datasets
movies = movies.merge(credit, on='title')

# Select relevant columns
movies = movies[['id_x', 'title', 'overview_x', 'genres_x', 'keywords_x']]
movies.dropna(inplace=True)

# Helper function to process genres and keywords
def convert(obj):
    L = []
    for i in ast.literal_eval(obj):
        L.append(i['name'])
    return L

movies['genres_x'] = movies['genres_x'].apply(convert)
movies['keywords_x'] = movies['keywords_x'].apply(convert)

# Preprocess overview, genres, and keywords
movies['overview_x'] = movies['overview_x'].apply(lambda x: x.split())
movies['genres_x'] = movies['genres_x'].apply(lambda x: [i.replace(" ", "") for i in x])
movies['keywords_x'] = movies['keywords_x'].apply(lambda x: [i.replace(" ", "") for i in x])

# Combine tags
movies['tags'] = movies['overview_x'] + movies['genres_x'] + movies['keywords_x']
movies['tags'] = movies['tags'].apply(lambda x: " ".join(x).lower())

# Initialize stemmer
ps = PorterStemmer()

def stem(text):
    return " ".join(ps.stem(word) for word in text.split())

movies['tags'] = movies['tags'].apply(stem)

# Create feature vectors
cv = CountVectorizer(max_features=5000, stop_words='english')
vectors = cv.fit_transform(movies['tags']).toarray()

# Calculate cosine similarity
similarity = cosine_similarity(vectors)

# Recommendation function
def recommend(movie):
    try:
        movie_index = movies[movies['title'] == movie].index[0]
        distances = similarity[movie_index]
        movie_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])[1:6]
        for i in movie_list:
            print(movies.iloc[i[0]].title)
    except IndexError:
        print("Movie not found!")



KeyError: "['id_x', 'overview_x', 'genres_x', 'keywords_x'] not in index"