<a href="https://colab.research.google.com/github/lijin-durairaj-code-mode/machine-learning/blob/main/movie_recommender_system.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import re

#nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

#sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

#config
# Remove pandas' row/column display limits so notebook output is not elided,
# and raise the per-cell text width limit to 1,000,000 characters.
pd.set_option('display.max_rows',None)
pd.set_option('display.max_columns',None)
pd.set_option('display.max_colwidth', 1000000)

#objects
# Single shared Porter stemmer; movie_recommender.edit_sentences uses it to stem tokens.
stemming=PorterStemmer()

In [None]:
class movie_recommender:
    """Content-based movie recommender built from the TMDB 5000 CSV files.

    The pipeline merges the movies and credits tables, builds one free-text
    document per movie (overview, tagline, title, rating, selected crew,
    genres and keywords), normalises that text, vectorises it with a
    bag-of-words model and computes pairwise cosine similarity.
    """

    # Directory that holds the two CSV files when no other one is given.
    _DEFAULT_DATA_DIR = 'tmdb_5000_movie_rating'

    # Crew departments whose members are kept as text features.
    _CREW_DEPARTMENTS = ('Directing', 'Editing', 'Camera')

    # Columns concatenated (in this order) into the per-movie text document.
    _TEXT_COLUMNS = (
        'overview', 'tagline', 'title_movies_dup', 'vote_average',
        'extracted_crew', 'genres_extracted_word', 'keywords_extracted_word',
    )

    # Lazily-built cache of NLTK's English stop words (shared by instances).
    _english_stopwords = None

    def pull_data(self, data_dir=_DEFAULT_DATA_DIR):
        """Load the movies and credits CSV files.

        Args:
            data_dir: Directory containing ``tmdb_5000_movies.csv`` and
                ``tmdb_5000_credits.csv``.

        Returns:
            Tuple ``(movies_data, credits_data)`` of DataFrames.
        """
        # Forward slashes work on both Windows and POSIX; the previous
        # backslash-separated literal only resolved on Windows.
        movies_data = pd.read_csv('{0}/tmdb_5000_movies.csv'.format(data_dir))
        credits_data = pd.read_csv('{0}/tmdb_5000_credits.csv'.format(data_dir))
        return movies_data, credits_data

    def extract_crew(self, _crew):
        """Return the names of directing, editing and camera crew members.

        Args:
            _crew: JSON string encoding a list of objects that each have
                ``department`` and ``name`` keys.

        Returns:
            The selected names joined by ``', '`` (empty string if none).
        """
        return ', '.join(
            member['name']
            for member in json.loads(_crew)
            if member['department'] in self._CREW_DEPARTMENTS
        )

    def get_keywords(self, _list):
        """Return the ``name`` of every entry in a JSON-encoded list.

        Args:
            _list: JSON string encoding a list of objects with a ``name`` key.

        Returns:
            The names joined by ``', '`` (empty string for an empty list).
        """
        return ', '.join(entry['name'] for entry in json.loads(_list))

    def _combine_text(self, row):
        """Join a row's text features with spaces, skipping missing values."""
        parts = []
        for column in self._TEXT_COLUMNS:
            value = row[column]
            # Missing overview/tagline values are NaN; str(NaN) would inject
            # the bogus token "nan" into the document.
            if pd.isna(value):
                continue
            text = str(value).strip()
            if text:
                parts.append(text)
        # A separator is required so the last word of one field is not fused
        # with the first word of the next.
        return ' '.join(parts)

    def pipeline_action(self, movies_data, credits_data):
        """Merge both tables and build the per-movie text document.

        Args:
            movies_data: Movies DataFrame (needs ``id``, ``genres``,
                ``keywords`` and the columns dropped below).
            credits_data: Credits DataFrame (needs ``movie_id``, ``title``,
                ``cast`` and ``crew``).

        Returns:
            DataFrame keeping the columns that are not dropped below, with
            ``keywords`` replaced by the combined free-text document.
        """
        merged = movies_data.merge(
            credits_data,
            left_on='id',
            right_on='movie_id',
            suffixes=('_movies_dup', '_credits_dup'),
        )
        merged['genres_extracted_word'] = merged['genres'].apply(self.get_keywords)
        merged['keywords_extracted_word'] = merged['keywords'].apply(self.get_keywords)
        merged['extracted_crew'] = merged['crew'].apply(self.extract_crew)
        # Built as a list so an empty merge result is handled as well.
        merged['keywords'] = [self._combine_text(row) for _, row in merged.iterrows()]
        return merged.drop(columns=[
            'budget', 'homepage', 'original_language',
            'original_title', 'overview', 'popularity', 'production_companies',
            'production_countries', 'release_date', 'revenue', 'runtime',
            'spoken_languages', 'status', 'tagline',
            'vote_average', 'vote_count', 'movie_id', 'title_credits_dup', 'cast',
            'crew', 'genres_extracted_word', 'keywords_extracted_word',
            'extracted_crew',
        ])

    @classmethod
    def _get_stopwords(cls):
        """Return NLTK's English stop words, loading them on first use."""
        if cls._english_stopwords is None:
            # NOTE(review): needs the NLTK "stopwords" corpus to be
            # downloaded; confirm the notebook environment provides it.
            cls._english_stopwords = frozenset(stopwords.words('english'))
        return cls._english_stopwords

    def edit_sentences(self, _text):
        """Normalise a document: drop stop words and punctuation, then stem.

        Args:
            _text: Raw text of one movie document.

        Returns:
            Space-joined, lower-cased, stemmed tokens.
        """
        stop_words = self._get_stopwords()
        punctuation = set(string.punctuation)
        cleaned = []
        for token in word_tokenize(_text):
            lowered = token.lower()
            # Compare in lower case so capitalised stop words ("The") go too.
            if lowered in stop_words:
                continue
            # Drop tokens made up solely of punctuation ("...", "--", "``").
            if all(char in punctuation for char in token):
                continue
            cleaned.append(stemming.stem(lowered).lower())
        return ' '.join(cleaned)

    def apply_vectorization(self, _corpus):
        """Fit a bag-of-words model on the corpus.

        Args:
            _corpus: Iterable of text documents.

        Returns:
            Tuple ``(transformed_data, count_vectorizer)``: the document-term
            matrix and the fitted ``CountVectorizer``.
        """
        # min_df=5 ignores terms that occur in fewer than five documents.
        count_vectorizer = CountVectorizer(stop_words='english', lowercase=True, min_df=5)
        transformed_data = count_vectorizer.fit_transform(_corpus)
        return transformed_data, count_vectorizer

    def main(self, data_dir=_DEFAULT_DATA_DIR):
        """Run the whole pipeline from CSV files to the similarity matrix.

        Args:
            data_dir: Directory containing the two CSV files.

        Returns:
            Tuple ``(_cosine_similarity, count_vectorizer, _data)``: the
            pairwise similarity matrix, the fitted vectorizer and the
            processed DataFrame (with an added ``edited_words`` column).
        """
        movies_data, credits_data = self.pull_data(data_dir)
        _data = self.pipeline_action(movies_data, credits_data)
        _data['edited_words'] = _data['keywords'].apply(self.edit_sentences)
        transformed_data, count_vectorizer = self.apply_vectorization(_data['edited_words'])
        _cosine_similarity = cosine_similarity(transformed_data)
        return _cosine_similarity, count_vectorizer, _data



# Create the recommender; the class defines no constructor, so this does no work yet.
movies=movie_recommender()
# Uncomment to run the full pipeline. suggest_movies reads the module-level
# `_data` assigned here, so this line must run before suggest_movies is called.
# _cosine_similarity,count_vectorizer,_data=movies.main()

In [None]:
def suggest_movies(_movie, _cosine_similarity, data=None, top_n=5):
    """Print the movies most similar to ``_movie`` together with their genres.

    Args:
        _movie: Exact title to look up in the ``title_movies_dup`` column.
        _cosine_similarity: Similarity matrix in which row/column ``i`` is
            expected to correspond to the ``i``-th row of the movie table.
        data: Movie table with ``title_movies_dup`` and JSON-encoded
            ``genres`` columns. Defaults to the module-level ``_data``.
        top_n: Number of recommendations to print.

    Raises:
        IndexError: If no row has the title ``_movie``.
    """
    if data is None:
        data = _data  # module-level table produced by movie_recommender.main()

    # Positional lookup: the similarity matrix is indexed by row position,
    # not by whatever index labels the table happens to carry.
    matches = np.flatnonzero((data['title_movies_dup'] == _movie).to_numpy())
    if matches.size == 0:
        raise IndexError('movie {0!r} not found in title_movies_dup'.format(_movie))
    movie_index = int(matches[0])

    # Exclude the queried movie explicitly rather than assuming it sorts
    # first; ties or an all-zero vector can place it elsewhere.
    candidates = [
        (position, score)
        for position, score in enumerate(_cosine_similarity[movie_index])
        if position != movie_index
    ]
    ranked = sorted(candidates, key=lambda pair: pair[1], reverse=True)[:top_n]

    for position, _score in ranked:
        row = data.iloc[position]
        # Built fresh per movie so genres do not leak between printed lines.
        genre_names = [genre['name'] for genre in json.loads(row['genres'])]
        print('movie name - {0} ---- genres {1}'.format(row['title_movies_dup'], ', '.join(genre_names)))
