# Movie Recommendation System   

## Importing the libraries

In [85]:
import numpy as np
import pandas as pd

np.set_printoptions(threshold=np.inf)

## Importing the dataset

In [57]:
movies = pd.read_csv("tmdb_5000_movies.csv")
credits = pd.read_csv("tmdb_5000_credits.csv")

In [58]:
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [59]:
credits.head(1)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


## Data Preprocessing

### Merging both datasets based on id column

In [60]:
movies_processed = movies.merge(credits, left_on="id", right_on="movie_id")
movies_processed = movies_processed.drop(columns=["title_y", "movie_id"])
movies_processed = movies_processed.rename(columns={"title_x": "title"})

In [61]:
movies_processed.shape

(4803, 22)

In [62]:
movies_processed.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


### Selecting features relevant to the workflow

In [63]:
movies_processed = movies_processed[["id", "title", "overview", "genres", "keywords", "cast", "crew"]]

In [64]:
movies_processed.shape

(4803, 7)

In [65]:
movies_processed.head(1)

Unnamed: 0,id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


### Taking care of missing data

In [66]:
movies_processed.dropna(inplace=True)

In [67]:
movies_processed.isnull().sum()

id          0
title       0
overview    0
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [68]:
movies_processed.duplicated().sum()

0

### Merging "overview", "genres", "keywords", "cast" and "crew" into single "tag" column

#### Destructuring the column formats to Lists

In [69]:
from ast import literal_eval


def destructure(obj):
    obj = literal_eval(obj)
    return [i["name"].replace(" ", "") for i in obj]


def destructure3(obj):
    obj = literal_eval(obj)
    return [i["name"].replace(" ", "") for i in obj[:3]]


def destructure_director(obj):
    obj = literal_eval(obj)
    return [next((i["name"].replace(" ", "") for i in obj if i["department"] == "Directing"), "")]

In [70]:
movies_processed["overview"] = movies_processed["overview"].apply(lambda x: x.split(" "))
movies_processed["genres"] = movies_processed["genres"].apply(destructure)
movies_processed["keywords"] = movies_processed["keywords"].apply(destructure)
movies_processed["cast"] = movies_processed["cast"].apply(destructure3)
movies_processed["crew"] = movies_processed["crew"].apply(destructure_director)

In [71]:
movies.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124


In [72]:
movies_processed['tags'] = movies_processed["overview"] + movies_processed['keywords'] + movies_processed["genres"] + movies_processed["cast"] + movies_processed["crew"]

In [73]:
movies_processed['tags'] = movies_processed['tags'].apply(lambda x: " ".join(x).lower())

In [74]:
final_df = movies_processed[["id", "title", "tags"]]
final_df.head(5)

Unnamed: 0,id,title,tags
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha..."
2,206647,Spectre,a cryptic message from bond’s past sends him o...
3,49026,The Dark Knight Rises,following the death of district attorney harve...
4,49529,John Carter,"john carter is a war-weary, former military ca..."


### Reducing words to root words

In [75]:
from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()


def stem(text):
    return " ".join([ps.stem(word=word) for word in text.split()])

In [76]:
final_df["tags"].apply(stem)

0       in the 22nd century, a parapleg marin is dispa...
1       captain barbossa, long believ to be dead, ha c...
2       a cryptic messag from bond’ past send him on a...
3       follow the death of district attorney harvey d...
4       john carter is a war-weary, former militari ca...
5       the seemingli invinc spider-man goe up against...
6       when the kingdom' most wanted-and most charmin...
7       when toni stark tri to jumpstart a dormant pea...
8       as harri begin hi sixth year at hogwarts, he d...
9       fear the action of a god-lik super hero left u...
10      superman return to discov hi 5-year absenc ha ...
11      quantum of solac continu the adventur of jame ...
12      captain jack sparrow work hi way out of a bloo...
13      the texa ranger chase down a gang of outlaw le...
14      a young boy learn that he ha extraordinari pow...
15      one year after their incred adventur in the li...
16      when an unexpect enemi emerg and threaten glob...
17      captai

## Vectorizing the tags

In [77]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features=5000, stop_words="english")

In [78]:
vectors = cv.fit_transform(final_df["tags"]).toarray()

In [79]:
cv.get_feature_names_out()

array(['000', '007', '10', '100', '11', '12', '13', '14', '15', '16',
       '17', '18', '18th', '19', '1930s', '1940s', '1950', '1950s',
       '1960s', '1970s', '1971', '1974', '1976', '1980', '1980s', '1985',
       '1990s', '1999', '19th', '19thcentury', '20', '200', '2009',
       '20th', '24', '25', '30', '300', '3d', '40', '50', '500', '60',
       '60s', '70', 'aaron', 'aaroneckhart', 'abandoned', 'abducted',
       'abigailbreslin', 'abilities', 'ability', 'able', 'aboard',
       'abuse', 'abusive', 'academic', 'academy', 'accept', 'accepted',
       'accepts', 'access', 'accident', 'accidental', 'accidentally',
       'accompanied', 'accomplish', 'account', 'accountant', 'accused',
       'ace', 'achieve', 'act', 'acting', 'action', 'actionhero',
       'actions', 'activist', 'activities', 'activity', 'actor', 'actors',
       'actress', 'acts', 'actual', 'actually', 'adam', 'adams',
       'adamsandler', 'adamshankman', 'adaptation', 'adapted', 'addict',
       'addicted', 

## Finding similarity between the text vectors

In [80]:
from sklearn.metrics.pairwise import cosine_similarity

similarity = cosine_similarity(vectors)
similarity[0]

array([1.        , 0.08980265, 0.05892557, 0.03955939, 0.18002057,
       0.11669001, 0.02006431, 0.1739164 , 0.06189845, 0.07624929,
       0.11605177, 0.08006408, 0.09622504, 0.04714045, 0.11433239,
       0.05083286, 0.08111071, 0.14816681, 0.11135885, 0.08535792,
       0.05735393, 0.09449112, 0.06666667, 0.09622504, 0.05479966,
       0.03798686, 0.16903085, 0.19448666, 0.12379689, 0.070014  ,
       0.07216878, 0.16037507, 0.08989331, 0.10425721, 0.        ,
       0.1118034 , 0.19444444, 0.08333333, 0.06201737, 0.08703883,
       0.05555556, 0.14433757, 0.        , 0.09622504, 0.0404226 ,
       0.08989331, 0.17407766, 0.2133948 , 0.06681531, 0.06189845,
       0.12171612, 0.08535792, 0.17149859, 0.05083286, 0.0347524 ,
       0.07802743, 0.14213381, 0.0285831 , 0.046676  , 0.07911878,
       0.0285831 , 0.2236068 , 0.08084521, 0.09072184, 0.03094922,
       0.02083333, 0.02457366, 0.1796053 , 0.13333333, 0.05076731,
       0.08574929, 0.08111071, 0.12171612, 0.0372678 , 0.22116

## Suggesting Top 5 movie based on similarity

In [81]:
def recommend(id):
    index = final_df[final_df["id"] == id].index[0]
    distances = similarity[index]
    indexed_distances = list(enumerate(distances))
    indexed_distances = sorted(indexed_distances, key=lambda x: x[1], reverse=True)

    return indexed_distances[:6]

In [90]:
movies.iloc[[item[0] for item in recommend(857)]]["title"].tolist()

['Saving Private Ryan',
 'The Monuments Men',
 'The Great Raid',
 "One Man's Hero",
 'The Young Unknowns',
 'Fury']

In [89]:
movies[movies["title"] == "Saving Private Ryan"]

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
628,70000000,"[{""id"": 18, ""name"": ""Drama""}, {""id"": 36, ""name...",,857,"[{""id"": 1327, ""name"": ""war crimes""}, {""id"": 14...",en,Saving Private Ryan,"As U.S. troops storm the beaches of Normandy, ...",76.041867,"[{""name"": ""Paramount Pictures"", ""id"": 4}, {""na...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",1998-07-24,481840909,169.0,"[{""iso_639_1"": ""cs"", ""name"": ""\u010cesk\u00fd""...",Released,The mission is a man.,Saving Private Ryan,7.9,5048
