## <center>RECOMMENDER SYSTEM</center>

### Requirements (Libraries and Dataset)

In [None]:
import numpy as np
import pandas as pd
import difflib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from ast import literal_eval

from google.colab import drive

# Expose Google Drive under /content/drive so the dataset file can be read.
drive.mount('/content/drive')

# Location of the movies dataset inside the mounted Drive.
movies_csv_path = '/content/drive/MyDrive/Portfolio/1. Capstone Projects/Recommender System/movies.csv'
movies_data = pd.read_csv(movies_csv_path)

Mounted at /content/drive


## Method 1 (Based on genre, production, direction, and cast)

### Data Editing

In [None]:
def _names_as_text(raw):
    """Parse a stringified list of dicts and join its 'name' values with spaces.

    Anything that does not parse to a list yields an empty string.
    """
    parsed = literal_eval(raw)
    if not isinstance(parsed, list):
        return ''
    return ' '.join(str(entry['name']) for entry in parsed)


# new text column -> source column holding the stringified list of dicts
_name_columns = {
    'production_company': 'production_companies',
    'production_country': 'production_countries',
    'crews': 'crew',
}
for target, source in _name_columns.items():
    # missing cells are treated as an empty list before parsing
    movies_data[target] = movies_data[source].fillna('[]').apply(_names_as_text)

### Building the model

In [None]:
# number of recommendations to print
TOP_N = 30

# features selection
selected_features = ['genres','keywords','tagline','cast','director','production_company','production_country','crews']

# null value treatment: a single NaN would turn the whole concatenated sentence into NaN
movies_data[selected_features] = movies_data[selected_features].fillna('')

# combining column values into one space-separated sentence per movie,
# driven by selected_features so the feature list is written only once
combined_features = movies_data[selected_features[0]]
for feature in selected_features[1:]:
    combined_features = combined_features + ' ' + movies_data[feature]

# building vector space model
vectorizer = TfidfVectorizer()
feature_vectors = vectorizer.fit_transform(combined_features)
similarity = cosine_similarity(feature_vectors)

# getting inputs
movie_name = input(' Enter your favourite movie name : ')

# missing titles are dropped because difflib can only compare strings
list_of_all_titles = movies_data['title'].dropna().tolist()

# comparing input movies with list of available titles
find_close_match = difflib.get_close_matches(movie_name, list_of_all_titles)

if not find_close_match:
    # difflib returns an empty list when nothing is similar enough;
    # report it instead of failing with an IndexError
    print('No movie title close to', repr(movie_name), 'was found.')
else:
    close_match = find_close_match[0]

    # rows of `similarity` are positional, so look the movie up by row position
    # rather than through a separate 'index' column of the CSV
    index_of_the_movie = np.flatnonzero((movies_data['title'] == close_match).to_numpy())[0]

    similarity_score = list(enumerate(similarity[index_of_the_movie]))

    sorted_similar_movies = sorted(similarity_score, key = lambda x:x[1], reverse = True)

    print('Movies suggested for you : \n')

    # only the best TOP_N entries are visited; the query movie itself is
    # normally the first one because it is identical to itself
    for i, (index, _score) in enumerate(sorted_similar_movies[:TOP_N], start=1):
        title_from_index = movies_data['title'].iloc[index]
        print(i, '.',title_from_index)

 Enter your favourite movie name : Superman
Movies suggested for you : 

1 . Superman
2 . Superman II
3 . Superman III
4 . Superman IV: The Quest for Peace
5 . The Godfather
6 . Ladyhawke
7 . Conspiracy Theory
8 . Man of Steel
9 . Radio Flyer
10 . Lethal Weapon 3
11 . Superman Returns
12 . The Godfather: Part II
13 . Scrooged
14 . Assassins
15 . Lethal Weapon 4
16 . The Omen
17 . Crimson Tide
18 . Batman
19 . The Helix... Loaded
20 . Batman: The Dark Knight Returns, Part 2
21 . Dick Tracy
22 . The Dark Knight
23 . Teenage Mutant Ninja Turtles II: The Secret of the Ooze
24 . Battlefield Earth
25 . Bulworth
26 . Batman Returns
27 . Timeline
28 . The Thief and the Cobbler
29 . Suicide Squad
30 . Batman v Superman: Dawn of Justice


-------------------------------------------------------------------------------

## Method 2 (Based on overview and tagline)

### Data Editing

In [None]:
# Fill missing text first: NaN + str is NaN, which would discard the other field too.
movies_data['tagline'] = movies_data['tagline'].fillna('')

# Join overview and tagline with a space so the last word of the overview and the
# first word of the tagline stay separate tokens; strip the padding left behind
# when one of the two fields is empty.
movies_data['description'] = (
    movies_data['overview'].fillna('') + ' ' + movies_data['tagline']
).str.strip()

### Building the model

In [None]:
# vector modeling: unigrams and bigrams with English stop words removed.
# min_df=1 keeps every term that occurs in at least one description, which is the
# same vocabulary as the previous min_df=0; recent scikit-learn versions validate
# parameters and reject an integer min_df of 0.
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(movies_data['description'])

# finding similarity index (every movie against every movie)
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

titles = movies_data['title']

# title -> row lookup; when several movies share a title only the first is kept,
# so that indices[title] is always a single row number and never a Series
indices = pd.Series(movies_data.index, index=movies_data['title'])
indices = indices[~indices.index.duplicated(keep='first')]

def get_recommendations(title, top_n=30):
    """Return the titles of the movies whose descriptions are most similar to ``title``.

    Parameters
    ----------
    title : str
        Exact movie title, looked up in the module-level ``indices`` Series.
    top_n : int, default 30
        Number of recommendations to return.

    Returns
    -------
    pandas.Series
        Up to ``top_n`` titles taken from ``titles``, most similar first.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    idx = indices[title]
    # A title shared by several movies makes the lookup return a Series;
    # fall back to the first matching movie.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Leave the query movie out explicitly instead of assuming it sorts first:
    # with an empty description or an exact tie it is not guaranteed to be at position 0.
    sim_scores = [(i, score) for i, score in enumerate(cosine_sim[idx]) if i != idx]
    sim_scores.sort(key=lambda x: x[1], reverse=True)

    movie_indices = [i for i, _score in sim_scores[:top_n]]
    return titles.iloc[movie_indices]

# input() already returns a string; the title must match a dataset title exactly
get_title = input()
get_recommendations(get_title)

Superman


2433    Superman IV: The Quest for Peace
870                          Superman II
10                      Superman Returns
14                          Man of Steel
30                          Spider-Man 2
3542                                R100
914                 Central Intelligence
4659                          Horse Camp
1368                      Identity Thief
4664                             Bronson
2218                      Death Sentence
916                         Daddy's Home
5                           Spider-Man 3
1296                        Superman III
1237                  The Out-of-Towners
303                             Catwoman
819                           Date Night
3109                              Edmond
45                           World War Z
413       Nutty Professor II: The Klumps
2527         National Lampoon's Vacation
2015                          Spaceballs
788                             Deadpool
477                        Thirteen Days
1777            

## Method 3 (Based on Overview only)

In [None]:
# data set: reloading the CSV discards the columns added to movies_data earlier in the notebook
movies_data = pd.read_csv('/content/drive/MyDrive/Portfolio/1. Capstone Projects/Recommender System/movies.csv')

# null value treatment
movies_data['overview'] = movies_data['overview'].fillna("")

## Model building
# removing stop words such as is,the,are from the overview
tfidf = TfidfVectorizer(stop_words="english")

# building vector space model
tfidf_matrix = tfidf.fit_transform(movies_data['overview'])

# developing cosine similarity with the linear kernel
from sklearn.metrics.pairwise import linear_kernel
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# developing indices as per the title.
# Series.drop_duplicates() compares the values (the row numbers, which are all
# unique), so it never removed repeated titles; de-duplicate on the index instead
# and keep the first movie for each title.
indices = pd.Series(movies_data.index, index=movies_data['original_title'])
indices = indices[~indices.index.duplicated(keep='first')]

# final function
def get_recommendation(title, cosine_sim = cosine_sim, top_n = 30):
    """Print the original titles of the movies whose overviews are most similar to ``title``.

    Parameters
    ----------
    title : str
        Exact original title, looked up in the module-level ``indices`` Series.
    cosine_sim : array-like, optional
        Pairwise similarity matrix; defaults to the module-level ``cosine_sim``.
    top_n : int, default 30
        Number of recommendations to print.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    idx = indices[title]
    # A title shared by several movies makes the lookup return a Series;
    # fall back to the first matching movie.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Leave the query movie out explicitly instead of assuming it sorts first:
    # with an empty overview or an exact tie it is not guaranteed to be at position 0.
    sim_scores = [(i, score) for i, score in enumerate(cosine_sim[idx]) if i != idx]
    sim_scores.sort(key=lambda x: x[1], reverse=True)

    sim_index = [i for i, _score in sim_scores[:top_n]]
    print(movies_data["original_title"].iloc[sim_index])
    
# input() already returns a string; the title must match an original title exactly
get_title = input()
get_recommendation(get_title)

Superman
870                          Superman II
10                      Superman Returns
2433    Superman IV: The Quest for Peace
914                 Central Intelligence
4659                          Horse Camp
2527         National Lampoon's Vacation
3542                                R100
45                           World War Z
1368                      Identity Thief
1237                  The Out-of-Towners
30                          Spider-Man 2
3958                             Demonic
2218                      Death Sentence
4664                             Bronson
770                        Event Horizon
916                         Daddy's Home
4021                          Brick Lane
5                           Spider-Man 3
819                           Date Night
1296                        Superman III
303                             Catwoman
2015                          Spaceballs
1518                                  디워
413       Nutty Professor II: The Klumps
3109   

---------------------------------------------------------------------------------------------------

### <font color="green">Overall, ranking the methods by the quality of their recommendations gives Method 1 > Method 3 > Method 2. Hence, the Method 1 model performed the best.</font>