In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
from datetime import datetime
from pandas import Series
from numpy import log

#!pip install plotly
import plotly.express as px # high level interface 
import plotly.graph_objects as go # lower level interface

In [3]:
movies=pd.read_csv('movies.csv')
credits=pd.read_csv('credits.csv')

In [4]:
print('movie shape : ',movies.shape)
print('credits shape : ',credits.shape)

movie shape :  (4803, 20)
credits shape :  (4803, 4)


In [5]:
print('movie columns : ',movies.columns)
print()
print('credits columns : ',credits.columns)

movie columns :  Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count'],
      dtype='object')

credits columns :  Index(['movie_id', 'title', 'cast', 'crew'], dtype='object')


In [6]:
print('null values in movie dataset :\n', movies.isnull().sum())
print()
print('null values in credits dataset :\n', credits.isnull().sum())

null values in movie dataset :
 budget                     0
genres                     0
homepage                3091
id                         0
keywords                   0
original_language          0
original_title             0
overview                   3
popularity                 0
production_companies       0
production_countries       0
release_date               1
revenue                    0
runtime                    2
spoken_languages           0
status                     0
tagline                  844
title                      0
vote_average               0
vote_count                 0
dtype: int64

null values in credits dataset :
 movie_id    0
title       0
cast        0
crew        0
dtype: int64


In [7]:
# So we need to merge both tables with some unique field and that is title. But before that we need to see unique values for title in both of 
# the table are same or not

In [8]:
print('unique value for title in movie dataset : ',movies['title'].nunique())
print('unique value for title in credits dataset : ',credits['title'].nunique())


unique value for title in movie dataset :  4800
unique value for title in credits dataset :  4800


In [9]:
movie_final=movies.merge(credits, on='title')

In [10]:
movie_final.shape

(4809, 23)

In [11]:
movie_final.duplicated().sum()

0

In [12]:
movie_final.isnull().sum()/len(movie_final)*100

budget                   0.000000
genres                   0.000000
homepage                64.379289
id                       0.000000
keywords                 0.000000
original_language        0.000000
original_title           0.000000
overview                 0.062383
popularity               0.000000
production_companies     0.000000
production_countries     0.000000
release_date             0.020794
revenue                  0.000000
runtime                  0.041589
spoken_languages         0.000000
status                   0.000000
tagline                 17.550426
title                    0.000000
vote_average             0.000000
vote_count               0.000000
movie_id                 0.000000
cast                     0.000000
crew                     0.000000
dtype: float64

In [13]:
## Analysing and droping non significant features
## Here homepage and tagline having null values so will directly drop them 
## Movie id and id both are same, so we can drop either of them 

In [14]:
movie_final.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count', 'movie_id', 'cast', 'crew'],
      dtype='object')

In [15]:
movie_final['original_language'].value_counts()/len(movie_final)*100  
## Here 93% movie having english langauge only so we majority is of english movie langauge only so we will delete this column

original_language
en    93.782491
fr     1.455604
es     0.665419
zh     0.561447
de     0.561447
hi     0.395093
ja     0.332710
it     0.291121
ko     0.249532
cn     0.249532
ru     0.228738
pt     0.187149
da     0.145560
sv     0.103972
nl     0.083177
fa     0.083177
th     0.062383
he     0.062383
ta     0.041589
cs     0.041589
ro     0.041589
id     0.041589
ar     0.041589
vi     0.020794
sl     0.020794
ps     0.020794
no     0.020794
ky     0.020794
hu     0.020794
pl     0.020794
af     0.020794
nb     0.020794
tr     0.020794
is     0.020794
xx     0.020794
te     0.020794
el     0.020794
Name: count, dtype: float64

In [16]:
movie_final['release_date'].value_counts()

release_date
2006-01-01    10
2002-01-01     8
2014-12-25     7
2013-07-18     7
2004-09-03     7
              ..
2002-08-20     1
1987-11-05     1
2004-11-11     1
1984-05-23     1
2012-05-03     1
Name: count, Length: 3280, dtype: int64

In [17]:
movie_final['status'].value_counts()

status
Released           4801
Rumored               5
Post Production       3
Name: count, dtype: int64

In [18]:
movie_final.head(3)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,285,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...",...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,206647,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."


In [19]:
# We are building content based recommendation system so we will consider below features only (Note: we are considering Movie id because whe we deploy
# our model on aws, azure then at that time we will need this id)
movie1=movie_final[['genres','keywords','overview', 'title', 'movie_id', 'cast', 'crew']]

In [20]:
movie1.shape

(4809, 7)

In [21]:
movie1.head(2)

Unnamed: 0,genres,keywords,overview,title,movie_id,cast,crew
0,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","In the 22nd century, a paraplegic Marine is di...",Avatar,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","Captain Barbossa, long believed to be dead, ha...",Pirates of the Caribbean: At World's End,285,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


In [22]:
import ast
def convert(text):
    L = []
    for i in ast.literal_eval(text):
        L.append(i["name"])
    return L

In [23]:
movie1['genres'] = movie1['genres'].apply(convert)

In [24]:
movie1['keywords'] = movie1['keywords'].apply(convert)

In [25]:
movie1.head(2)

Unnamed: 0,genres,keywords,overview,title,movie_id,cast,crew
0,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","In the 22nd century, a paraplegic Marine is di...",Avatar,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,"[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","Captain Barbossa, long believed to be dead, ha...",Pirates of the Caribbean: At World's End,285,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


In [26]:
def convert1(text):
    L = []
    counter = 0 
    for i in ast.literal_eval(text):
        if counter < 3: # we are considering main 3 person(character) only
            L.append(i["name"])
        counter +=1
    return L

In [27]:
movie1['cast'] = movie1['cast'].apply(convert1)

In [28]:
movie1.head(2)

Unnamed: 0,genres,keywords,overview,title,movie_id,cast,crew
0,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","In the 22nd century, a paraplegic Marine is di...",Avatar,19995,"[Sam Worthington, Zoe Saldana, Sigourney Weaver]","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,"[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","Captain Barbossa, long believed to be dead, ha...",Pirates of the Caribbean: At World's End,285,"[Johnny Depp, Orlando Bloom, Keira Knightley]","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


In [29]:
def fetch_director(text):
    L = []
    for i in ast.literal_eval(text):
        if i['job'] == 'Director':
            L.append(i['name'])
    return L

In [30]:
movie1['crew'] = movie1['crew'].apply(fetch_director)

In [31]:
movie1.head(2)

Unnamed: 0,genres,keywords,overview,title,movie_id,cast,crew
0,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","In the 22nd century, a paraplegic Marine is di...",Avatar,19995,"[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]
1,"[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","Captain Barbossa, long believed to be dead, ha...",Pirates of the Caribbean: At World's End,285,"[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski]


In [32]:
movie1.isnull().sum()

genres      0
keywords    0
overview    3
title       0
movie_id    0
cast        0
crew        0
dtype: int64

In [33]:
movie1.dropna(inplace=True)

In [34]:
movie1.shape

(4806, 7)

In [35]:
## We are creating one function that will remove the space between words ex. Sam Worthington is one person only. But when we will convert text to vector
# two vectors will be created for it that is one for sam and other for Worthington. So to avoid it and consider as one vector only and avoid confussion .
# In short for 1 person i want only one token(vector value) but because of sapce two token will get create

def collapse(L):
    L1=[]
    for i in L:
        L1.append(i.replace(" ",""))
    return L1

In [36]:
movie1['genres'] = movie1['genres'].apply(collapse)
movie1['keywords'] = movie1['keywords'].apply(collapse)
movie1['cast'] = movie1['cast'].apply(collapse)
movie1['crew'] = movie1['crew'].apply(collapse)

In [37]:
movie1.head(2)

Unnamed: 0,genres,keywords,overview,title,movie_id,cast,crew
0,"[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","In the 22nd century, a paraplegic Marine is di...",Avatar,19995,"[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron]
1,"[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","Captain Barbossa, long believed to be dead, ha...",Pirates of the Caribbean: At World's End,285,"[JohnnyDepp, OrlandoBloom, KeiraKnightley]",[GoreVerbinski]


In [38]:
movie1['overview'] = movie1['overview'].apply(lambda x : x.split())

In [39]:
movie1['overview'][0]

['In',
 'the',
 '22nd',
 'century,',
 'a',
 'paraplegic',
 'Marine',
 'is',
 'dispatched',
 'to',
 'the',
 'moon',
 'Pandora',
 'on',
 'a',
 'unique',
 'mission,',
 'but',
 'becomes',
 'torn',
 'between',
 'following',
 'orders',
 'and',
 'protecting',
 'an',
 'alien',
 'civilization.']

In [40]:
## Next step is to merge geners, keywords, overview, cast and crew. Because we will be building recommendation system based on context which include all this things 
movie1['context']=movie1['genres'] + movie1['keywords'] + movie1['overview'] + movie1['cast'] + movie1['crew']

In [41]:
movie1.head(2)

Unnamed: 0,genres,keywords,overview,title,movie_id,cast,crew,context
0,"[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[In, the, 22nd, century,, a, paraplegic, Marin...",Avatar,19995,"[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron],"[Action, Adventure, Fantasy, ScienceFiction, c..."
1,"[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[Captain, Barbossa,, long, believed, to, be, d...",Pirates of the Caribbean: At World's End,285,"[JohnnyDepp, OrlandoBloom, KeiraKnightley]",[GoreVerbinski],"[Adventure, Fantasy, Action, ocean, drugabuse,..."


In [42]:
movie1['context'][0]

['Action',
 'Adventure',
 'Fantasy',
 'ScienceFiction',
 'cultureclash',
 'future',
 'spacewar',
 'spacecolony',
 'society',
 'spacetravel',
 'futuristic',
 'romance',
 'space',
 'alien',
 'tribe',
 'alienplanet',
 'cgi',
 'marine',
 'soldier',
 'battle',
 'loveaffair',
 'antiwar',
 'powerrelations',
 'mindandsoul',
 '3d',
 'In',
 'the',
 '22nd',
 'century,',
 'a',
 'paraplegic',
 'Marine',
 'is',
 'dispatched',
 'to',
 'the',
 'moon',
 'Pandora',
 'on',
 'a',
 'unique',
 'mission,',
 'but',
 'becomes',
 'torn',
 'between',
 'following',
 'orders',
 'and',
 'protecting',
 'an',
 'alien',
 'civilization.',
 'SamWorthington',
 'ZoeSaldana',
 'SigourneyWeaver',
 'JamesCameron']

In [43]:
df=movie1[['title','movie_id','context']]

In [44]:
df.head()

Unnamed: 0,title,movie_id,context
0,Avatar,19995,"[Action, Adventure, Fantasy, ScienceFiction, c..."
1,Pirates of the Caribbean: At World's End,285,"[Adventure, Fantasy, Action, ocean, drugabuse,..."
2,Spectre,206647,"[Action, Adventure, Crime, spy, basedonnovel, ..."
3,The Dark Knight Rises,49026,"[Action, Crime, Drama, Thriller, dccomics, cri..."
4,John Carter,49529,"[Action, Adventure, ScienceFiction, basedonnov..."


In [45]:
# Now we want to create a one single sentence from context so that whenever the same pattern comes my machine will understand it quickly
df['context'] =  df['context'].apply(lambda x : " ".join(x))

In [46]:
df.head()

Unnamed: 0,title,movie_id,context
0,Avatar,19995,Action Adventure Fantasy ScienceFiction cultur...
1,Pirates of the Caribbean: At World's End,285,Adventure Fantasy Action ocean drugabuse exoti...
2,Spectre,206647,Action Adventure Crime spy basedonnovel secret...
3,The Dark Knight Rises,49026,Action Crime Drama Thriller dccomics crimefigh...
4,John Carter,49529,Action Adventure ScienceFiction basedonnovel m...


In [47]:
df['context'][0]

'Action Adventure Fantasy ScienceFiction cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. SamWorthington ZoeSaldana SigourneyWeaver JamesCameron'

## Bag of words - Feature extraction

In [48]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000, stop_words='english')

In [49]:
vector = cv.fit_transform(df['context']).toarray()

In [50]:
vector.shape

(4806, 5000)

In [51]:
vector

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

## Building Recommendation System basis similarity matrix

In [52]:
from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(vector)

In [53]:
similarity

array([[1.        , 0.08964215, 0.06071767, ..., 0.02519763, 0.0277885 ,
        0.        ],
       [0.08964215, 1.        , 0.06350006, ..., 0.02635231, 0.        ,
        0.        ],
       [0.06071767, 0.06350006, 1.        , ..., 0.02677398, 0.        ,
        0.        ],
       ...,
       [0.02519763, 0.02635231, 0.02677398, ..., 1.        , 0.07352146,
        0.04774099],
       [0.0277885 , 0.        , 0.        , ..., 0.07352146, 1.        ,
        0.05264981],
       [0.        , 0.        , 0.        , ..., 0.04774099, 0.05264981,
        1.        ]])

In [54]:
df.head()

Unnamed: 0,title,movie_id,context
0,Avatar,19995,Action Adventure Fantasy ScienceFiction cultur...
1,Pirates of the Caribbean: At World's End,285,Adventure Fantasy Action ocean drugabuse exoti...
2,Spectre,206647,Action Adventure Crime spy basedonnovel secret...
3,The Dark Knight Rises,49026,Action Crime Drama Thriller dccomics crimefigh...
4,John Carter,49529,Action Adventure ScienceFiction basedonnovel m...


In [55]:
df[df['title']=='Avatar'].index[0]

0

In [68]:
def recommendation(movies):
    index = df[df['title']==movies].index[0]
    distances = sorted(list(enumerate(similarity[index])), reverse=True)
    for i in distances[1:6]:
        print(df.iloc[i[0]].title)

In [69]:
recommendation('Avatar')

Shanghai Calling
Signed, Sealed, Delivered
Newlyweds
El Mariachi
Cavite


In [70]:
recommendation('Avatar')

Shanghai Calling
Signed, Sealed, Delivered
Newlyweds
El Mariachi
Cavite


In [71]:
import pickle

In [72]:
pickle.dump(df,open('movies.pkl','wb'))

In [73]:
pickle.dump(similarity,open('similarity.pkl','wb'))