### Libraries:

In [57]:
import numpy as np
import pandas as pd
import ast

#### Data Loading:

In [58]:
movies = pd.read_csv("Datasets/tmdb_5000_movies.csv")
credits = pd.read_csv("Datasets/tmdb_5000_credits.csv")

In [59]:
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [60]:
credits.head(1)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [61]:
movies.shape

(4803, 20)

In [62]:
credits.shape

(4803, 4)

#### Merging:

In [63]:
movies = movies.merge(credits,on="title")
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [64]:
movies.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count', 'movie_id', 'cast', 'crew'],
      dtype='object')

#### Selecting Main Features:

In [65]:
movies = movies [["movie_id","title","overview","genres","keywords","cast","crew"]]
movies.shape

(4809, 7)

#### Removing Null Values:

In [66]:
movies.isnull().sum()

movie_id    0
title       0
overview    3
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [67]:
movies.dropna(inplace=True)
movies.shape

(4806, 7)

#### Checking Duplicate:

In [68]:
movies.duplicated().sum()

0

#### Extracting Values:

In [69]:
def convert(text):
    l=[]
    for i in ast.literal_eval(text):
        l.append(i["name"])
    return l

#### Applying this function

In [70]:
movies["genres"] = movies["genres"].apply(convert)

In [71]:
movies["keywords"] = movies["keywords"].apply(convert)

#### Extracting Cast:

In [72]:
def convert_cast(text):
    l=[]
    counter=0
    for i in ast.literal_eval(text):
        if counter<3:
            l.append(i["name"])
        else:
            break
        counter+=1
    return l

Applying Function to cast column.
Also making cast dataframe for future use

In [73]:
movies["cast"] = movies["cast"].apply(convert_cast)
cast_df=movies [["movie_id","title","cast"]]
cast_df

Unnamed: 0,movie_id,title,cast
0,19995,Avatar,"[Sam Worthington, Zoe Saldana, Sigourney Weaver]"
1,285,Pirates of the Caribbean: At World's End,"[Johnny Depp, Orlando Bloom, Keira Knightley]"
2,206647,Spectre,"[Daniel Craig, Christoph Waltz, Léa Seydoux]"
3,49026,The Dark Knight Rises,"[Christian Bale, Michael Caine, Gary Oldman]"
4,49529,John Carter,"[Taylor Kitsch, Lynn Collins, Samantha Morton]"
...,...,...,...
4804,9367,El Mariachi,"[Carlos Gallardo, Jaime de Hoyos, Peter Marqua..."
4805,72766,Newlyweds,"[Edward Burns, Kerry Bishé, Marsha Dietlein]"
4806,231617,"Signed, Sealed, Delivered","[Eric Mabius, Kristin Booth, Crystal Lowe]"
4807,126186,Shanghai Calling,"[Daniel Henney, Eliza Coupe, Bill Paxton]"


In [74]:
movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


#### Function to fetch director from crew data

In [75]:
def fetch_director(text):
    l=[]
    for i in ast.literal_eval(text):
        if i["job"] == "Director":
            l.append(i["name"])
            break
        
    return l

Applying it to crew column. Making Director for future use

In [76]:
movies["crew"] = movies["crew"].apply(fetch_director)
director=movies [["movie_id","title","crew"]]

In [77]:
Genres=movies[["movie_id","title","genres"]]

#### spliting each word and converting into list

In [78]:
movies["overview"] = movies["overview"].apply(lambda x:x.split())

In [79]:
movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]


#### Function to remove space between words

In [80]:
def remove_space(word):
    l=[]
    for i in word:
        l.append(i.replace(" ",""))
        
    return l

#### Applying it to all lists

In [81]:
# Removing space because in vectorization each word is treated differently
movies["cast"] = movies["cast"].apply(remove_space)
movies["crew"] = movies["crew"].apply(remove_space)
movies["genres"] = movies["genres"].apply(remove_space)
movies["keywords"] = movies["keywords"].apply(remove_space)

In [82]:
movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron]


#### Combining all columns into single column

In [83]:
movies["tags"]=movies["overview"]+movies["genres"]+movies["keywords"]+movies["cast"]+movies["crew"]

In [84]:
new_df = movies[["movie_id","title","tags"]]
new_df.head(1)

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin..."


#### Joining all elements of list with space and then changing it to lower casing.

In [85]:
new_df.loc[:,"tags"] = new_df["tags"].apply(lambda x: " ".join(x)).apply(lambda x:x.lower())
new_df.iloc[0]["tags"]

'in the 22nd century, a paraplegic marine is dispatched to the moon pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. action adventure fantasy sciencefiction cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d samworthington zoesaldana sigourneyweaver jamescameron'

#### Function to change word into its basic form.

In [86]:
import nltk
from nltk.stem import PorterStemmer

ps=PorterStemmer()

def stems(text):
    l=[]
    for i in text.split():
        l.append(ps.stem(i))
    
    return " ".join(l)


#### Applying it to tags

In [87]:
new_df.loc[:,"tags"]=new_df["tags"].apply(stems)

In [88]:
new_df.iloc[0]["tags"]

'in the 22nd century, a parapleg marin is dispatch to the moon pandora on a uniqu mission, but becom torn between follow order and protect an alien civilization. action adventur fantasi sciencefict cultureclash futur spacewar spacecoloni societi spacetravel futurist romanc space alien tribe alienplanet cgi marin soldier battl loveaffair antiwar powerrel mindandsoul 3d samworthington zoesaldana sigourneyweav jamescameron'

#### Implementing CountVectorization

In [89]:
from sklearn.feature_extraction.text import CountVectorizer

cv=CountVectorizer(max_features=5000,stop_words="english")

In [90]:
vector=cv.fit_transform(new_df["tags"]).toarray()
vector.shape

(4806, 5000)

In [91]:
vector[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [92]:
from sklearn.metrics.pairwise import cosine_similarity

similarity = cosine_similarity(vector)
similarity.shape

(4806, 4806)

In [93]:
new_df[new_df["title"] == "Spider-Man"].index[0]

159

In [94]:
similarity[0]

array([1.        , 0.08346223, 0.0860309 , ..., 0.04499213, 0.        ,
       0.        ])

#### Function to recommend movie on the given input movie

In [95]:
def recommend(movie):
    index=new_df[new_df["title"] == movie].index[0]
    distance=sorted(list(enumerate(similarity[index])),reverse=True ,key=lambda x:x[1])
    for i in distance[1:6]:
        print(new_df.iloc[i[0]].title)

In [96]:
recommend("Spider-Man")

Spider-Man 3
Spider-Man 2
The Amazing Spider-Man 2
Arachnophobia
Kick-Ass


#### Function to recommend other movies of this character

In [97]:
def recommend_with_cast(movie):
    actor=cast_df[cast_df["title"]==movie]["cast"].iloc[0][0]
    same_actor=[]
    for index,row in cast_df.iterrows():
        if actor in row["cast"]:
            print(row["title"])
            same_actor.append(row["title"])
    print(actor)
    return same_actor

In [98]:
lst=recommend_with_cast("Spider-Man")
import random
random.sample(lst,5)

Spider-Man 3
Spider-Man 2
The Great Gatsby
Spider-Man
Wonder Boys
Ride with the Devil
The Good German
Brothers
Fear and Loathing in Las Vegas
Tobey Maguire


['Wonder Boys',
 'Spider-Man 2',
 'Spider-Man',
 'Spider-Man 3',
 'Fear and Loathing in Las Vegas']

In [99]:
def recommend_Dir(movie):
    index = director[director['title'] == movie].index[0]
    L=[]
    for i in director.index:
        if director.loc[i, "crew"] == director.loc[index,"crew"] and director.loc[i, "title"] != director.loc[index,"title"] : 
            print(director.iloc[i].title)
            L.append(director.iloc[i])   
    if len(L)==0:
        L.append(director.iloc[index])
    return L 

In [100]:
recommend_Dir("Pirates of the Caribbean: At World's End")

Pirates of the Caribbean: Dead Man's Chest
The Lone Ranger
Rango
Pirates of the Caribbean: The Curse of the Black Pearl
The Mexican
The Weather Man


[movie_id                                            58
 title       Pirates of the Caribbean: Dead Man's Chest
 crew                                  [Gore Verbinski]
 Name: 12, dtype: object,
 movie_id               57201
 title        The Lone Ranger
 crew        [Gore Verbinski]
 Name: 13, dtype: object,
 movie_id               44896
 title                  Rango
 crew        [Gore Verbinski]
 Name: 178, dtype: object,
 movie_id                                                   22
 title       Pirates of the Caribbean: The Curse of the Bla...
 crew                                         [Gore Verbinski]
 Name: 199, dtype: object,
 movie_id                6073
 title            The Mexican
 crew        [Gore Verbinski]
 Name: 1186, dtype: object,
 movie_id                6963
 title        The Weather Man
 crew        [Gore Verbinski]
 Name: 2210, dtype: object]

In [101]:
Genres['genres'] = Genres['genres'].apply(lambda x: " ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Genres['genres'] = Genres['genres'].apply(lambda x: " ".join(x))


In [102]:
from sklearn.feature_extraction.text import CountVectorizer
cv2 = CountVectorizer(max_features=13,stop_words='english')

In [103]:
vector2 = cv2.fit_transform(Genres['genres']).toarray()

In [104]:
from sklearn.metrics.pairwise import cosine_similarity
similarity2 = cosine_similarity(vector2)

In [105]:
def recommend_genres(movie):
    index = Genres[new_df['title'] == movie].index[0]
    distances = sorted(list(enumerate(similarity2[index])),reverse=True,key = lambda x: x[1])
    for i in distances[1:6]:
        print(Genres.iloc[i[0]].title)

In [106]:
recommend_genres("Drillbit Taylor")

Gulliver's Travels
The Campaign
Happy Feet
The Hangover Part II
Grown Ups 2


In [107]:
new_df

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"in the 22nd century, a parapleg marin is dispa..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believ to be dead, ha c..."
2,206647,Spectre,a cryptic messag from bond’ past send him on a...
3,49026,The Dark Knight Rises,follow the death of district attorney harvey d...
4,49529,John Carter,"john carter is a war-weary, former militari ca..."
...,...,...,...
4804,9367,El Mariachi,el mariachi just want to play hi guitar and ca...
4805,72766,Newlyweds,a newlyw couple' honeymoon is upend by the arr...
4806,231617,"Signed, Sealed, Delivered","""signed, sealed, delivered"" introduc a dedic q..."
4807,126186,Shanghai Calling,when ambiti new york attorney sam is sent to s...


In [108]:
import pickle
pickle.dump(new_df,open("Pickle\movie_list.pkl","wb"))
pickle.dump(similarity,open("Pickle\similarity.pkl","wb"))
pickle.dump(movies,open("Pickle\movies.pkl","wb"))
pickle.dump(director,open("Pickle\director.pkl","wb"))
pickle.dump(cast_df,open("Pickle\Actor.pkl","wb"))
pickle.dump(Genres,open("Pickle\Genres.pkl","wb"))
pickle.dump(similarity2,open("Pickle\similarity2.pkl","wb"))


### Checking Unique director names

In [109]:
director


Unnamed: 0,movie_id,title,crew
0,19995,Avatar,[James Cameron]
1,285,Pirates of the Caribbean: At World's End,[Gore Verbinski]
2,206647,Spectre,[Sam Mendes]
3,49026,The Dark Knight Rises,[Christopher Nolan]
4,49529,John Carter,[Andrew Stanton]
...,...,...,...
4804,9367,El Mariachi,[Robert Rodriguez]
4805,72766,Newlyweds,[Edward Burns]
4806,231617,"Signed, Sealed, Delivered",[Scott Smith]
4807,126186,Shanghai Calling,[Daniel Hsia]


In [110]:
temp=director[:]
temp['crew']=temp['crew'].apply(lambda x: ' '.join(x))

In [111]:
list=temp['crew'].unique()
counted_values = pd.Series(list).value_counts()

print(counted_values)

James Cameron         1
Émile Gaudreault      1
Perry Lang            1
Jake Goldberger       1
William Kaufman       1
                     ..
Michael Cohn          1
Alan Shapiro          1
Fernando Meirelles    1
Michael Hoffman       1
Brian Herzlinger      1
Name: count, Length: 2347, dtype: int64


In [112]:
temp[temp['crew']=='Michael Cohn']

Unnamed: 0,movie_id,title,crew
1810,9092,Snow White: A Tale of Terror,Michael Cohn


In [113]:
new_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4806 entries, 0 to 4808
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  4806 non-null   int64 
 1   title     4806 non-null   object
 2   tags      4806 non-null   object
dtypes: int64(1), object(2)
memory usage: 279.2+ KB
