In [3]:
import numpy as np
import pandas as pd

In [4]:
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

In [5]:
movies.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count'],
      dtype='object')

In [6]:
credits.head()

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [7]:
# merging 2 datasets into 1 for better understanding

data = movies.merge(credits, on= 'title')
print(data.shape)
data.info()

(4809, 23)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4809 entries, 0 to 4808
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4809 non-null   int64  
 1   genres                4809 non-null   object 
 2   homepage              1713 non-null   object 
 3   id                    4809 non-null   int64  
 4   keywords              4809 non-null   object 
 5   original_language     4809 non-null   object 
 6   original_title        4809 non-null   object 
 7   overview              4806 non-null   object 
 8   popularity            4809 non-null   float64
 9   production_companies  4809 non-null   object 
 10  production_countries  4809 non-null   object 
 11  release_date          4808 non-null   object 
 12  revenue               4809 non-null   int64  
 13  runtime               4807 non-null   float64
 14  spoken_languages      4809 non-null   object 
 15  status    

In [8]:
# keeping only necessary columns (genres, id, keywords, title, overview, cast, crew)

data = data[['id','title', 'overview', 'genres', 'keywords', 'cast', 'crew']]
data.head(1)

Unnamed: 0,id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [9]:
# dropping the rows/records with null values
data.dropna(inplace= True)
data.duplicated().sum()

np.int64(0)

In [10]:
data.isnull().sum()

id          0
title       0
overview    0
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [11]:
import ast # convets string to list

In [12]:
# takes only value associated with 'name' key
def convert_genre(genre):
    ls = []
    for i in ast.literal_eval(genre):
        ls.append(i["name"])
    return ls

In [13]:
data['genres'] = data['genres'].apply(convert_genre)

In [14]:
data['keywords'] = data['keywords'].apply(convert_genre)

In [15]:
def convert_cast(genre):
    ls = []
    count = 0
    for i in ast.literal_eval(genre):
        if count != 3:
            ls.append(i["name"])
            count += 1
        else:
            break
    return ls

In [16]:
data['cast'] = data['cast'].apply(convert_cast)

In [17]:
## fetching director's names from the crew column

In [18]:
def convert_crew(genre):
    director = ""
    for i in ast.literal_eval(genre):
        if i['job'] == 'Director':
            director = i['name']
            break
    return [director]

In [19]:
data['crew'] = data['crew'].apply(convert_crew)

In [20]:
data.head()

Unnamed: 0,id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski]
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...","[Daniel Craig, Christoph Waltz, Léa Seydoux]",[Sam Mendes]
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...","[Christian Bale, Michael Caine, Gary Oldman]",[Christopher Nolan]
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...","[Taylor Kitsch, Lynn Collins, Samantha Morton]",[Andrew Stanton]


In [21]:
data['genres'] = data['genres'].apply(lambda x: [i.replace(" ", "") for i in x])
data['keywords'] = data['keywords'].apply(lambda x: [i.replace(" ", "") for i in x])
data['cast'] = data['cast'].apply(lambda x: [i.replace(" ", "") for i in x])
data['crew'] = data['crew'].apply(lambda x: [i.replace(" ", "") for i in x])
data['overview'] = data['overview'].apply(lambda x: x.split())

In [22]:
data['tags'] = data['overview'] + data['genres'] + data['keywords'] + data['cast'] + data['crew']
new_data = data[['id', 'title', 'tags']]
new_data.head()

Unnamed: 0,id,title,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin..."
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d..."
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send..."
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney..."
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili..."


In [23]:
new_data['tags'] = new_data['tags'].apply(lambda x: ' '.join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_data['tags'] = new_data['tags'].apply(lambda x: ' '.join(x))


In [24]:
# import nltk for stemming
import nltk
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [25]:
# function for stemming the words
def stem_text(text):
    temp = []
    for i in text.split():
        temp.append(ps.stem(i))
    return " ".join(temp)

In [26]:
new_data['tags'] = new_data['tags'].apply(stem_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_data['tags'] = new_data['tags'].apply(stem_text)


In [27]:
# taking top 5000 most frequent words after stemming
from sklearn.feature_extraction.text import CountVectorizer
C_vec = CountVectorizer(stop_words= 'english', max_features= 5000)

vectors = C_vec.fit_transform(new_data['tags']).toarray()

In [28]:
new_data['tags'][0]

'in the 22nd century, a parapleg marin is dispatch to the moon pandora on a uniqu mission, but becom torn between follow order and protect an alien civilization. action adventur fantasi sciencefict cultureclash futur spacewar spacecoloni societi spacetravel futurist romanc space alien tribe alienplanet cgi marin soldier battl loveaffair antiwar powerrel mindandsoul 3d samworthington zoesaldana sigourneyweav jamescameron'

In [29]:
vectors.shape

(4806, 5000)

In [30]:
# calculating cosine distance between every pair of tag vectors
from sklearn.metrics.pairwise import cosine_similarity

similarity = cosine_similarity(vectors)

In [33]:
np.save('similarity.npy', similarity)

In [None]:
def recommend(movie):
    index = new_data[new_data['title'] == movie].index[0]
    distances = similarity[index]
    top_5_movies = sorted(list(enumerate(distances)), reverse= True, key= lambda x: x[1])[1:6]
    
    for i in top_5_movies:
        print(new_data.iloc[i[0]].title)

In [None]:
recommend('Avatar')

Aliens vs Predator: Requiem
Aliens
Falcon Rising
Independence Day
Titan A.E.


In [34]:
import pickle
pickle.dump(new_data, open('movies.pkl', 'wb'))