In [1]:
import numpy as np
import pandas as pd

In [2]:
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

In [3]:
movies = movies.merge(credits,on='title')

In [4]:
# need to only keep the columns that are helpful for a content based recommender

In [5]:
# id
# title
# cast
# crew
# keywords
# overview
# genres
movies = movies[['id', 'title', 'cast', 'crew', 'keywords', 'overview', 'genres']]

In [8]:
movies.isnull().sum()

id          0
title       0
cast        0
crew        0
keywords    0
overview    0
genres      0
dtype: int64

In [7]:
movies.dropna(inplace=True)

In [9]:
movies.iloc[0].genres

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [10]:
# genres should be preprocessed to look like this
# ['Action', 'Adventure', 'ScienceFiction', 'Thriller']
import ast

In [11]:
def convert(obj):
    L = []
    for i in ast.literal_eval(obj):
        L.append(i["name"])
    return L

In [12]:
movies['genres'] = movies['genres'].apply(convert)
movies['keywords'] = movies['keywords'].apply(convert)

In [13]:
movies.head()

Unnamed: 0,id,title,cast,crew,keywords,overview,genres
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de...","[culture clash, future, space war, space colon...","In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]"
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de...","[ocean, drug abuse, exotic island, east india ...","Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]"
2,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de...","[spy, based on novel, secret agent, sequel, mi...",A cryptic message from Bond’s past sends him o...,"[Action, Adventure, Crime]"
3,49026,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de...","[dc comics, crime fighter, terrorist, secret i...",Following the death of District Attorney Harve...,"[Action, Crime, Drama, Thriller]"
4,49529,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de...","[based on novel, mars, medallion, space travel...","John Carter is a war-weary, former military ca...","[Action, Adventure, Science Fiction]"


In [14]:
def convert3(obj):
    L = []
    a = 0
    for i in ast.literal_eval(obj):
        if a != 3:
            L.append(i["name"])
            a += 1
        else:
            break
    return L

In [15]:
movies['cast'] = movies['cast'].apply(convert3)

In [16]:
def director_name(obj):
    L = []
    for i in ast.literal_eval(obj):
        if i["job"] == "Director":
            L.append(i["name"])
            break
    return L

In [17]:
movies['crew'] = movies['crew'].apply(director_name)

In [18]:
# overview needs to be split into keywords as well as split() is used
movies['overview'] = movies['overview'].apply(lambda x: x.split())

In [19]:
# need to remove spaces in all categories
# ex. chris pratt and chris hemsworth would both be "related" 
# in the current format but if they are written as
# chrispratt and chrishemsworth that issue isn't there
movies['genres'] = movies['genres'].apply(lambda x:[i.replace(" ", "") for i in x])
movies['cast'] = movies['cast'].apply(lambda x:[i.replace(" ", "") for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x:[i.replace(" ", "") for i in x])
movies['crew'] = movies['crew'].apply(lambda x:[i.replace(" ", "") for i in x])

In [20]:
movies.head()

Unnamed: 0,id,title,cast,crew,keywords,overview,genres
0,19995,Avatar,"[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron],"[cultureclash, future, spacewar, spacecolony, ...","[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]"
1,285,Pirates of the Caribbean: At World's End,"[JohnnyDepp, OrlandoBloom, KeiraKnightley]",[GoreVerbinski],"[ocean, drugabuse, exoticisland, eastindiatrad...","[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]"
2,206647,Spectre,"[DanielCraig, ChristophWaltz, LéaSeydoux]",[SamMendes],"[spy, basedonnovel, secretagent, sequel, mi6, ...","[A, cryptic, message, from, Bond’s, past, send...","[Action, Adventure, Crime]"
3,49026,The Dark Knight Rises,"[ChristianBale, MichaelCaine, GaryOldman]",[ChristopherNolan],"[dccomics, crimefighter, terrorist, secretiden...","[Following, the, death, of, District, Attorney...","[Action, Crime, Drama, Thriller]"
4,49529,John Carter,"[TaylorKitsch, LynnCollins, SamanthaMorton]",[AndrewStanton],"[basedonnovel, mars, medallion, spacetravel, p...","[John, Carter, is, a, war-weary,, former, mili...","[Action, Adventure, ScienceFiction]"


In [21]:
movies['tags'] = movies['overview'] + movies['keywords'] + movies['genres'] + movies['cast'] + movies['crew']

In [22]:
movie_data = movies[['id', 'title', 'tags']]

In [23]:
movie_data

Unnamed: 0,id,title,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin..."
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d..."
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send..."
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney..."
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili..."
...,...,...,...
4804,9367,El Mariachi,"[El, Mariachi, just, wants, to, play, his, gui..."
4805,72766,Newlyweds,"[A, newlywed, couple's, honeymoon, is, upended..."
4806,231617,"Signed, Sealed, Delivered","[""Signed,, Sealed,, Delivered"", introduces, a,..."
4807,126186,Shanghai Calling,"[When, ambitious, New, York, attorney, Sam, is..."


In [24]:
# list in tags should be a string
movie_data['tags'] = movie_data['tags'].apply(lambda x:" ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie_data['tags'] = movie_data['tags'].apply(lambda x:" ".join(x))


In [25]:
movie_data.head()

Unnamed: 0,id,title,tags
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...
4,49529,John Carter,"John Carter is a war-weary, former military ca..."


In [26]:
# all words in tags should be lowercase
movie_data['tags'] = movie_data['tags'].apply(lambda x: x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie_data['tags'] = movie_data['tags'].apply(lambda x: x.lower())


In [27]:
# stop_words gets rid of and, or, in, etc
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 10000, stop_words = "english")

In [28]:
vectors = cv.fit_transform(movie_data['tags']).toarray()

In [29]:
vectors

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [30]:
cv.get_feature_names()



['000',
 '007',
 '10',
 '100',
 '10th',
 '11',
 '12',
 '12th',
 '13',
 '14',
 '15',
 '150',
 '15th',
 '16',
 '16th',
 '17',
 '17th',
 '18',
 '1863',
 '1890',
 '18th',
 '18thcentury',
 '19',
 '1910',
 '1910s',
 '1920',
 '1920s',
 '1927',
 '1930s',
 '1937',
 '1940s',
 '1941',
 '1944',
 '1945',
 '1950',
 '1950s',
 '1955',
 '1959',
 '1960',
 '1960s',
 '1962',
 '1964',
 '1965',
 '1967',
 '1969',
 '1970s',
 '1971',
 '1972',
 '1973',
 '1974',
 '1976',
 '1977',
 '1979',
 '1980',
 '1980s',
 '1984',
 '1985',
 '1986',
 '1987',
 '1990',
 '1990s',
 '1994',
 '1995',
 '1996',
 '1997',
 '1999',
 '19th',
 '19thcentury',
 '20',
 '200',
 '2000',
 '2001',
 '2002',
 '2003',
 '2004',
 '2007',
 '2008',
 '2009',
 '2011',
 '2012',
 '20th',
 '21st',
 '21stcentury',
 '22nd',
 '23',
 '24',
 '25',
 '27',
 '28',
 '29',
 '30',
 '300',
 '35',
 '3d',
 '40',
 '400',
 '47',
 '50',
 '500',
 '51',
 '60',
 '60s',
 '70',
 '70s',
 '7th',
 '80',
 'aaron',
 'aaroneckhart',
 'aarontaylor',
 'abandoned',
 'abandonment',
 'abando

In [31]:
# words like action and actions are counted as two different words
# this is not helpful so I need to get root words
# this can be done with stemming which gets the root word
# stemming should be applied directly on tags

In [32]:
!pip install nltk



In [33]:
from sklearn.metrics.pairwise import cosine_similarity

In [34]:
similarity = cosine_similarity(vectors)

In [35]:
similarity

array([[1.        , 0.07644708, 0.05096472, ..., 0.02111002, 0.02272727,
        0.        ],
       [0.07644708, 1.        , 0.05714286, ..., 0.02366905, 0.        ,
        0.        ],
       [0.05096472, 0.05714286, 1.        , ..., 0.02366905, 0.        ,
        0.        ],
       ...,
       [0.02111002, 0.02366905, 0.02366905, ..., 1.        , 0.06333005,
        0.04174829],
       [0.02272727, 0.        , 0.        , ..., 0.06333005, 1.        ,
        0.04494666],
       [0.        , 0.        , 0.        , ..., 0.04174829, 0.04494666,
        1.        ]])

In [36]:
def recommend(movie):
    # find the index of the movie given and access similarity[movie_index]
    # sort similarity[i] from highest to lowest similarity
    # need to keep index position even if it's sorted by using enumerate
    # lambda is used so that the sorting is based on the similarity not the index
    
    movie_index = movie_data[movie_data['title'] == movie].index[0]
    distances = similarity[movie_index]
    movies_list = sorted(list(enumerate(distances)),reverse=True,key=lambda x:x[1])[1:16]
    
    for i in movies_list:
        print(movie_data.iloc[i[0]].title)

In [37]:
recommend("Batman Begins")

The Dark Knight
The Dark Knight Rises
Batman
Batman
Batman & Robin
Amidst the Devil's Wings
Batman v Superman: Dawn of Justice
Batman Forever
Defendor
Dead Man Down
Batman Returns
Mi America
Teenage Mutant Ninja Turtles
Nine Queens
10th & Wolf


In [38]:
import pickle

In [39]:
pickle.dump(movie_data, open("movies.pkl", "wb"))

In [40]:
movie_data['title'].values

array(['Avatar', "Pirates of the Caribbean: At World's End", 'Spectre',
       ..., 'Signed, Sealed, Delivered', 'Shanghai Calling',
       'My Date with Drew'], dtype=object)

In [41]:
pickle.dump(movie_data.to_dict(), open("movie_dict.pkl", "wb"))

In [42]:
pickle.dump(similarity, open("similarity.pkl", "wb"))