In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the two raw CSV tables (movie metadata and credits) from the working directory.
movies, credits = (
    pd.read_csv(csv_path)
    for csv_path in ('tmdb_5000_movies.csv', 'tmdb_5000_credits.csv')
)

In [None]:
# Row/column counts of both raw tables before they are merged.
print(movies.shape, credits.shape)

In [None]:
# Preview the first row of the movies table to see which columns are available.
movies.head(1)

In [None]:
# Preview the first row of the credits table.
credits.head(1)

In [None]:
# Join credits onto movies using the shared 'title' column (default inner join).
# NOTE(review): if two different films share a title, joining on 'title' alone
# pairs every such movie row with every such credits row and inflates the row
# count - compare the shapes before/after and confirm this is acceptable.
movies = pd.merge(movies, credits, on='title')

In [None]:
# Dimensions of the merged frame.
movies.shape

In [None]:
# First row of the merged frame (movie columns followed by the credits columns).
movies.head(1)

## Data Preprocessing

In [None]:
# List every column of the merged frame to decide which ones to keep.
movies.columns

In [None]:
# Columns that are not used when building the recommendation tags.
# Each name is listed exactly once (the original list repeated 'tagline').
column_list = [
    'budget', 'original_language', 'homepage', 'original_title',
    'production_companies', 'production_countries', 'revenue',
    'spoken_languages', 'status', 'tagline', 'vote_average', 'vote_count',
    'id', 'release_date', 'runtime', 'popularity',
]
# Removes the columns from `movies` itself; re-running this cell on the
# already-trimmed frame raises KeyError because the columns are gone.
movies.drop(columns=column_list, inplace=True)

In [None]:
# Dimensions after removing the unused columns.
movies.shape

In [None]:
# First row with only the retained columns.
movies.head(1)

In [None]:
# Number of missing values in each remaining column.
movies.isnull().sum()

In [None]:
# Remove every row that has a missing value in any column.
movies.dropna(inplace=True, axis=0)
# Renumber the rows 0..n-1. Without this, dropped rows leave gaps in the index
# labels, so a label no longer equals the row's position - and the count
# vectors / similarity matrix built later are addressed purely by position.
movies.reset_index(drop=True, inplace=True)

In [None]:
# Re-check after dropna: every per-column count should now be 0.
movies.isnull().sum()

In [None]:
# Count rows that are exact duplicates of an earlier row.
# NOTE(review): this only reports the count; nothing in the notebook removes duplicates.
movies.duplicated().sum()

In [None]:
# Dimensions after dropping rows with missing values.
movies.shape

In [None]:
# Inspect the raw 'genres' value of the first row (still a string at this point).
movies.iloc[0].genres

In [None]:
#columns like genre and keywords are strings and need to be converted to lists for preprocessing
import ast
def convert(obj):
    """Return the 'name' of every entry in a stringified list of dicts.

    Parameters
    ----------
    obj : str
        Text that `ast.literal_eval` can parse into an iterable of dicts,
        each containing a 'name' key.

    Returns
    -------
    list
        The 'name' values in their original order.
    """
    return [entry['name'] for entry in ast.literal_eval(obj)]

In [None]:
# Replace the stringified genres/keywords columns with plain lists of names.
# Not re-runnable: once the columns hold lists, `convert` can no longer parse them.
movies = movies.assign(
    genres=movies['genres'].apply(convert),
    keywords=movies['keywords'].apply(convert),
)

In [None]:
# 'genres' and 'keywords' should now display as lists of names.
movies.head()

In [None]:
# Inspect the raw 'crew' value of the first row before it is parsed.
movies.iloc[0]['crew']

In [None]:
#Taking only the top 5 members from the cast
def convert_cast(obj):
    """Return the names of at most the first five entries of a stringified cast list.

    Parameters
    ----------
    obj : str
        Text that `ast.literal_eval` parses into a list of dicts, each with
        a 'name' key.

    Returns
    -------
    list
        Up to five 'name' values, in the order they appear in the input.
    """
    leading_members = ast.literal_eval(obj)[:5]
    return [member['name'] for member in leading_members]
        

In [None]:
#We only require director's name from the crew data
def convert_crew(obj):
    """Return the names of all crew entries whose 'job' is exactly 'Director'.

    Parameters
    ----------
    obj : str
        Text that `ast.literal_eval` parses into an iterable of dicts, each
        with 'job' and 'name' keys.

    Returns
    -------
    list
        Director names in input order; empty when no entry has that job.
    """
    return [
        member['name']
        for member in ast.literal_eval(obj)
        if member['job'] == 'Director'
    ]
        

In [None]:
# Reduce 'cast' to (at most) five names and 'crew' to the director name(s).
# Not re-runnable: the converters expect the original string values.
movies = movies.assign(
    cast=movies['cast'].apply(convert_cast),
    crew=movies['crew'].apply(convert_crew),
)

In [None]:
# Turn each overview string into a list of whitespace-separated words so it can
# be concatenated with the other list-valued columns.
movies['overview'] = movies['overview'].apply(str.split)

In [None]:
# Remove the spaces inside every multi-word entry so that each entry becomes a
# single token in the tags text (e.g. a two-word name turns into one word).
movies = movies.assign(**{
    column: movies[column].apply(
        lambda entries: [entry.replace(" ", "") for entry in entries]
    )
    for column in ('genres', 'keywords', 'cast', 'crew')
})


In [None]:
# Entries in genres/keywords/cast/crew should no longer contain spaces.
movies.head()

In [None]:
# Build a 'tags' column: one list per movie that concatenates the overview
# words, genres, keywords, cast and director entries, in that order.
movies = movies.assign(
    tags=(
        movies['overview']
        + movies['genres']
        + movies['keywords']
        + movies['cast']
        + movies['crew']
    )
)

In [None]:
# Check the newly added 'tags' column.
movies.head()

In [None]:
# The individual source columns are now folded into 'tags', so remove them.
movies = movies.drop(
    columns=['overview', 'genres', 'keywords', 'cast', 'crew']
)

In [None]:
# Only the columns that were not dropped, plus 'tags', remain.
movies.head()

In [None]:
# Collapse each list of tag words into one space-separated, lower-cased string.
movies['tags'] = movies['tags'].apply(
    lambda words: " ".join(words).lower()
)

In [None]:
#applying lemmatizer to the data so that words like activity and activities are treated as same

#[activity, activities]
#[activity, activity]

#Stemming can be used too

In [None]:
import nltk
from nltk.stem.porter import PorterStemmer
# Porter stemmer instance used by `stem` below. In this notebook `stem` is only
# called as a demo; the tags themselves are processed with `lemmatize`.
ps=PorterStemmer()

In [None]:
def stem(text):
    """Stem every whitespace-separated word of `text` with the notebook-level stemmer `ps`.

    Parameters
    ----------
    text : str
        Input text; it is split on whitespace.

    Returns
    -------
    str
        The stemmed words joined by single spaces.
    """
    return " ".join(ps.stem(word) for word in text.split())

In [None]:
# Quick demo of the stemmer on a plural word.
stem('activities')

In [None]:
from nltk.stem import WordNetLemmatizer
 
# Lemmatizer instance shared by the `lemmatize` helper below.
# NOTE(review): WordNetLemmatizer presumably needs the WordNet corpus to be
# available locally - if the first call raises LookupError on a fresh
# environment, run nltk.download('wordnet') once. Confirm for your setup.
lemmatizer = WordNetLemmatizer()

In [None]:
# Quick demo of the lemmatizer on a single word.
lemmatizer.lemmatize("to")

In [None]:
def lemmatize(text):
    """Lemmatize every whitespace-separated word of `text` with the notebook-level `lemmatizer`.

    Parameters
    ----------
    text : str
        Input text; it is split on whitespace.

    Returns
    -------
    str
        The lemmatized words joined by single spaces.
    """
    return " ".join(lemmatizer.lemmatize(word) for word in text.split())

In [None]:
# Normalise the tags text so that inflected forms of a word share one token.
movies = movies.assign(tags=movies['tags'].apply(lemmatize))

In [None]:
# Inspect the lemmatized tags.
movies.head()

## Vectorisation

Text vectorisation: converting each movie's tags text into a numeric vector of word counts.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Keep only the 5000 most frequent words (max_features=5000) and ignore
# English stop words.
cv= CountVectorizer(max_features= 5000, stop_words='english')

In [None]:
# Learn the vocabulary from the tags and count each word per movie; the sparse
# result is expanded into a dense NumPy array (one row per movie).
vectors = (
    cv.fit_transform(movies['tags'])
    .toarray()
)

In [None]:
# Dense count matrix (NumPy truncates the display of large arrays).
vectors

In [None]:
# (number of movies, vocabulary size)
vectors.shape   

In [None]:
print(cv.get_feature_names_out())  # the vocabulary words the vectorizer kept (one per column of `vectors`)

## Calculating cosine similarity between vectors

We use cosine similarity (and the corresponding cosine distance) rather than Euclidean distance, because Euclidean distance is not a good measure in high-dimensional spaces.

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Pairwise cosine similarity between every pair of movie vectors.
# Row i / column j of `similarity` refers to the movies at row positions i and j
# of `vectors` (and therefore of `movies`).
similarity = cosine_similarity(vectors)
# Similarity of the first movie to every movie, itself included.
print(similarity[0])

In [None]:
# Square matrix: one row and one column per movie.
similarity.shape

In [None]:
# Rank every movie by its similarity to the first movie, highest first.
# enumerate keeps each score paired with its row position; the slice skips the
# top entry and shows the next five.
sorted(
    enumerate(similarity[0]),
    key=lambda position_score: position_score[1],
    reverse=True,
)[1:6]

## Function to Recommend Movies

In [None]:
def recommend(movie):
    """Print the seven movies most similar to `movie`, each with its similarity score.

    Uses the notebook-level `movies` frame and `similarity` matrix.

    Parameters
    ----------
    movie : str
        Exact title to look up in ``movies['title']``. When several rows share
        the title, the first one is used.

    Raises
    ------
    IndexError
        If no row has that title.
    """
    # `similarity` is addressed by row position, so look up the *position* of
    # the title rather than its index label: labels stop matching positions as
    # soon as rows have been dropped without resetting the index.
    matches = np.flatnonzero((movies['title'] == movie).to_numpy())
    if matches.size == 0:
        raise IndexError(f"no movie titled {movie!r} in the dataset")
    movie_pos = int(matches[0])

    distances = similarity[movie_pos]
    ranked = sorted(enumerate(distances), key=lambda pair: pair[1], reverse=True)

    # Exclude the queried movie explicitly instead of assuming it sorts first:
    # another row with an equal score could otherwise take its place.
    neighbours = [pair for pair in ranked if pair[0] != movie_pos][:7]
    for position, score in neighbours:
        print(movies.iloc[position]['title'], score)

In [None]:
def recommend_genre(genre, movies_df):
    """Print and return up to eight randomly sampled movies tagged with `genre`.

    Parameters
    ----------
    genre : str
        Genre to look for. It is lower-cased and its spaces are removed before
        matching, mirroring how the tags text is prepared in this notebook.
    movies_df : pandas.DataFrame
        Frame with 'tags' (space-separated string), 'title' and 'movie_id'
        columns.

    Returns
    -------
    list
        Titles of the printed movies, in print order (empty if nothing matches).
    """
    genre_token = genre.replace(" ", "").lower()

    # Match whole words only: a plain substring test would also accept e.g.
    # 'reaction' or 'transaction' when looking for 'action'.
    has_genre = movies_df['tags'].apply(lambda tags: genre_token in tags.split())
    filtered_movies = movies_df[has_genre]
    if len(filtered_movies) == 0:
        return []

    # Fixed seed keeps the sample reproducible. Nine rows are drawn and the
    # first eight are used, matching the original selection.
    sampled_movies = filtered_movies.sample(n=min(9, len(filtered_movies)), random_state=42)

    recommended = []
    for _, row in sampled_movies.head(8).iterrows():
        print(row['title'], row['movie_id'])
        recommended.append(row['title'])
    return recommended
    

In [None]:
# Demo: sample movies whose tags contain the lower-case genre 'action'.
print("Recommendations based on Genre \n")
recommend_genre('action', movies)

In [None]:
# Demo: print the most similar movies (with scores) for one title.
print("Recommendations and Similarities\n")
recommend('Avatar')    # quick sanity check on one example title

## Saving the Data

In [None]:
import joblib

# Persist the processed movie table (as a plain dict) and the similarity matrix
# to files in the working directory.
# NOTE(review): to_dict() keeps the frame's index labels as keys, while
# `similarity` is positional - whatever loads these files must make sure the
# two line up. Confirm against the consumer.
joblib.dump(movies.to_dict(), 'movies.pkl')
joblib.dump(similarity, 'similarity.pkl')

In [None]:
# Record the library versions used to produce the saved files.
print(pd.__version__)
print(np.__version__)
print(joblib.__version__)