In [None]:
import numpy as np
import pandas as pd
import pickle

In [None]:
# Load the two raw CSV inputs; both paths are relative to the working directory.
credits_df = pd.read_csv('credits.csv')
movies_df = pd.read_csv('movies.csv')

In [None]:
# Notebook-wide pandas display settings:
#   - display.max_columns = None -> never truncate the list of columns
#   - display.max_colwidth = 50  -> cap the printed width of each cell
#   - display.max_rows = 25      -> cap the number of rows printed per frame
display_options = {
    'display.max_columns': None,
    'display.max_colwidth': 50,
    'display.max_rows': 25,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

In [None]:
# Display the credits table (printed rows are capped by display.max_rows).
credits_df

In [None]:
# Display the movies table.
movies_df

In [None]:
# First rows of the credits table.
credits_df.head()

In [None]:
# Last rows of the movies table.
movies_df.tail()

In [None]:
# Join the movies and credits tables on their shared 'title' column
# (default inner join), then list the columns of the result.
# NOTE(review): if titles are not unique, joining on them pairs every movie
# with every same-titled credits row — confirm whether an id key should be used.
movies_df = pd.merge(movies_df, credits_df, on='title')
movies_df.columns

In [None]:
# Preview the merged frame.
movies_df.head()

In [None]:
# (rows, columns) of the merged frame.
movies_df.shape

In [None]:
# Column dtypes and non-null counts of the merged frame.
movies_df.info()

In [None]:
# Keep only the identifier, the title and the columns that later feed the tags.
selected_columns = ['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']
movies_df = movies_df[selected_columns]

In [None]:
# Preview the reduced frame.
movies_df.head()

In [None]:
# Dtypes and non-null counts after the column selection.
movies_df.info()

In [None]:
# Number of missing values per column.
movies_df.isnull().sum()

In [None]:
# Drop rows that have any missing value and renumber the index 0..n-1.
# - Reassigning (instead of inplace=True) avoids mutating a frame that was
#   itself derived by column selection, and keeps the cell chainable.
# - Resetting the index matters later: recommend() takes an index *label* and
#   uses it as a *position* into the similarity matrix, so labels must equal
#   positions once any row has been dropped here.
movies_df = movies_df.dropna().reset_index(drop=True)

In [None]:
# Count fully duplicated rows.
movies_df.duplicated().sum()

In [None]:
# Inspect the 'genres' value of the first row (by position).
movies_df.iloc[0].genres

In [None]:
import ast

In [None]:
def convert(obj):
    """Parse a stringified list of dicts and return every entry's 'name'.

    Args:
        obj: String holding a Python-literal list of dicts, each with a
            'name' key.

    Returns:
        List of the 'name' values, in their original order.
    """
    return [entry['name'] for entry in ast.literal_eval(obj)]

In [None]:
# Replace the raw string in each of these columns with the parsed list of names.
for list_column in ('genres', 'keywords'):
    movies_df[list_column] = movies_df[list_column].apply(convert)

In [None]:
# Preview the frame after 'genres' and 'keywords' were converted to name lists.
movies_df.head()

In [None]:
def convert3(obj, limit=3):
    """Return the 'name' of the first ``limit`` entries of a stringified list.

    Args:
        obj: String holding a Python-literal list of dicts, each with a
            'name' key.
        limit: Maximum number of names to return. Defaults to 3, which is the
            original hard-coded behaviour. ``None`` returns every name.

    Returns:
        List with at most ``limit`` names, in their original order.

    Raises:
        ValueError: If ``limit`` is negative.
    """
    if limit is not None and limit < 0:
        raise ValueError("limit must be non-negative or None")
    entries = ast.literal_eval(obj)
    # Slicing replaces the manual counter/break; [:None] keeps everything.
    return [entry['name'] for entry in entries[:limit]]

In [None]:
# 'cast' value of the row labelled 0 (label-based lookup), shown before conversion.
movies_df['cast'][0]

In [None]:
# Replace each 'cast' string with the list returned by convert3.
movies_df['cast'] = movies_df['cast'].apply(convert3)

In [None]:
# Preview the frame after the 'cast' conversion.
movies_df.head()

In [None]:
def fetch_director(obj):
    """Return the names of all crew entries whose job is 'Director'.

    Args:
        obj: String holding a Python-literal list of dicts, each with 'job'
            and 'name' keys.

    Returns:
        List of director names in their original order (empty if none match).
    """
    crew_members = ast.literal_eval(obj)
    return [member['name'] for member in crew_members if member['job'] == 'Director']

In [None]:
# Replace each 'crew' string with the list of director names from fetch_director.
movies_df['crew'] = movies_df['crew'].apply(fetch_director)

In [None]:
# Preview the frame after the 'crew' conversion.
movies_df.head()

In [None]:
# 'overview' value of the row labelled 0.
movies_df['overview'][0]

In [None]:
# NOTE: this binds a second name to the same DataFrame object (no copy is made),
# so changes made through `movies` are also visible through `movies_df`.
movies = movies_df

In [None]:
movies.head()

In [None]:
# Turn each overview string into a list of whitespace-separated words.
movies['overview'] = movies['overview'].map(str.split)

In [None]:
# First rows of the tokenised 'overview' column.
movies['overview'].head()

In [None]:
def _strip_spaces(names):
    """Return ``names`` with every space removed from each item."""
    return [name.replace(" ", "") for name in names]


# Remove the spaces inside each name in the list-valued columns.
for list_column in ('genres', 'keywords', 'crew'):
    movies[list_column] = movies[list_column].apply(_strip_spaces)

# 'cast' alone tolerates None entries and passes them through unchanged.
movies['cast'] = movies['cast'].apply(
    lambda names: None if names is None else _strip_spaces(names)
)

In [None]:
# Count the rows whose 'cast' value is None (each row contributes 1 or 0).
none_count = movies['cast'].apply(lambda x: 1 if x is None else 0).sum()
none_count

In [None]:
movies.head()

In [None]:
# Element-wise `+` across the five columns. Earlier cells turned each of them
# into Python lists, so this yields one combined token list per row.
movies['tags'] = movies['overview']+movies['genres']+movies['keywords']+movies['cast']+movies['crew']
# Combined token list of the row labelled 0.
movies['tags'][0]

In [None]:
# Keep only the columns the recommender needs. The explicit .copy() makes
# new_df an independent frame, so the column assignment below does not write
# into a selection of `movies` (which triggers SettingWithCopyWarning).
new_df = movies[['movie_id', 'title', 'tags']].copy()
# Collapse each row's token list into a single space-separated string.
new_df['tags'] = new_df['tags'].apply(' '.join)

In [None]:
# Display the reduced frame (movie_id, title, tags).
new_df

In [None]:
# Joined tag string of the row labelled 0.
new_df['tags'][0]

In [None]:
# Lower-case every tag string.
new_df['tags'] = new_df['tags'].str.lower()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Matrix based on count vectorizer and tfidf vectorizer
# `cv` is capped at 5000 features; `tfidf` has no feature cap.
# Both drop English stop words.
cv = CountVectorizer(max_features=5000, stop_words='english')
tfidf = TfidfVectorizer(stop_words='english')

In [None]:
# Fit the count vectorizer only to inspect the shape of the dense matrix;
# the matrix itself is not stored.
cv.fit_transform(new_df['tags']).toarray().shape

In [None]:
# vectors = cv.fit_transform(new_df['tags']).toarray()
# The dense TF-IDF matrix is what the similarity cells below consume.
vectors = tfidf.fit_transform(new_df['tags']).toarray()
vectors[0]

In [None]:
# Number of features learned by the count vectorizer fitted above.
len(cv.get_feature_names_out())

In [None]:
from nltk.stem.porter import PorterStemmer

# Single shared stemmer instance; stem() below reads it as a module-level name.
ps = PorterStemmer()

In [None]:
def stem(text):
    """Stem every whitespace-separated word of ``text`` with ``ps``.

    Args:
        text: String of whitespace-separated words.

    Returns:
        The stemmed words joined by single spaces.
    """
    return " ".join(map(ps.stem, text.split()))

In [None]:
# Stem the tags, then rebuild the TF-IDF matrix from the stemmed text.
# `vectors` is first built in an earlier cell from the unstemmed tags, so
# without this refit the stemming would have no effect on the similarity
# scores computed in the following cells.
new_df['tags'] = new_df['tags'].apply(stem)
vectors = tfidf.fit_transform(new_df['tags']).toarray()

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Pairwise cosine similarity between all rows of `vectors`.
cosine_similarity(vectors)

In [None]:
# NOTE: recomputes the full matrix just to read its shape.
cosine_similarity(vectors).shape

In [None]:
# Keep the similarity matrix; recommend() reads it as a module-level name.
similarity = cosine_similarity(vectors)

In [None]:
# Similarity scores of row 0 against every row.
similarity[0]

In [None]:
similarity[0].shape

In [None]:
# (row position, score) pairs for row 0, sorted by descending score;
# the slice skips the first entry and keeps the next five.
sorted(list(enumerate(similarity[0])), reverse=True, key=lambda x:x[1])[1:6]

In [None]:
def recommend(movie, top_n=5):
    """Print the titles of the movies most similar to ``movie``.

    The title is matched case-insensitively against ``new_df['title']``. If
    several rows share the title, the first one is used.

    Args:
        movie: Title to look up.
        top_n: Number of recommendations to print (default 5).

    Returns:
        None. Titles are printed one per line; if the title is unknown a
        "not found" message is printed instead of raising.
    """
    # Resolve the title to a row *position*. `similarity` is a plain array
    # indexed by position, so using the DataFrame's index label here would
    # pick the wrong row (or fail) whenever labels and positions differ.
    title_matches = (new_df['title'].str.lower() == movie.lower()).to_numpy()
    positions = np.flatnonzero(title_matches)
    if positions.size == 0:
        print(f"Movie '{movie}' not found in the dataset.")
        return

    movie_index = int(positions[0])
    distances = similarity[movie_index]
    ranked = sorted(enumerate(distances), reverse=True, key=lambda pair: pair[1])
    # Exclude the query row by position rather than assuming it sorts first;
    # a row with identical tags can tie with it at the top.
    movie_list = [pair for pair in ranked if pair[0] != movie_index][:top_n]

    for position, _score in movie_list:
        print(new_df.iloc[position]['title'])

In [None]:
# Example lookups.
recommend('Iron Man')

In [None]:
recommend('Avatar')

In [None]:
recommend('Independence Day')

In [None]:
# Mixed-case input: recommend() compares titles case-insensitively.
recommend('Liar liar')

In [None]:
# Persist the (movie_id, title, tags) frame. The context manager closes the
# file handle even if pickling fails.
with open('movies_list.pkl', 'wb') as movies_file:
    pickle.dump(new_df, movies_file)

In [None]:
# NOTE(review): this cell writes the same object to the same file as the
# previous cell, and `similarity` is never saved anywhere in this notebook —
# presumably this was meant to dump the similarity matrix; confirm.
# The context manager closes the file handle even if pickling fails.
with open('movies_list.pkl', 'wb') as movies_file:
    pickle.dump(new_df, movies_file)

In [None]:
import os



In [None]:
# Sanity check: confirm the pickle written above can be read back.
file_path = 'movies_list.pkl'
if os.path.exists(file_path):
    # Only unpickle files this notebook produced: pickle.load can execute
    # arbitrary code from an untrusted file.
    with open(file_path, 'rb') as movies_file:
        # Bind the result so it can be inspected (it was previously discarded).
        loaded_movies = pickle.load(movies_file)
else:
    print(f"File '{file_path}' does not exist.")