In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import ast
import pyarrow
import pickle

# Part 1: Preprocessing

In [2]:
movies = pd.read_csv('data/tmdb_5000_movies.csv')
credits = pd.read_csv('data/tmdb_5000_credits.csv')

Check columns and just one entry to get general understanding (for both dataframes)

In [3]:
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [4]:
movies.shape

(4803, 20)

In [5]:
credits.head(1)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [6]:
credits.shape

(4803, 4)

Merge the two DF's

In [7]:
movies = pd.merge(movies, credits, left_on='id', right_on='movie_id', how='inner').drop(['movie_id'],axis=1)   # Drop the redundant column
movies.head(2)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title_x,vote_average,vote_count,title_y,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


In [8]:
movies.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title_x', 'vote_average',
       'vote_count', 'title_y', 'cast', 'crew'],
      dtype='object')

Only keep necessary columns

In [9]:
movies = movies[['id', 'title_x', 'overview', 'genres', 'keywords', 'cast', 'crew']]
movies.rename(columns={'title_x': 'title'}, inplace=True)          # we have a 'title_x' and 'title_y' column in the merged df, I believe because of both being present as 'title' in both df's prior to merging
movies.head(1)

Unnamed: 0,id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [10]:
movies.isna().sum()

id          0
title       0
overview    3
genres      0
keywords    0
cast        0
crew        0
dtype: int64

Only 3 NaN's, can be dropped

In [11]:
movies.dropna(inplace=True)

In [12]:
movies.shape

(4800, 7)

Write a function to extract and keep only value of "name" keys from the list elements in genres and keywords

Also write function to keep only first 3 members of cast, and keep only the director in crew

In [13]:
# what does one entry look like?
movies.iloc[0]['genres']

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [14]:
# ast.literal_eval(x) turns the string x into a list (if it is at all possible, which it is in this case)

movies['genres'] = movies['genres'].apply(lambda x: [dict_element['name'] for dict_element in ast.literal_eval(x)])
movies['keywords'] = movies['keywords'].apply(lambda x: [dict_element['name'] for dict_element in ast.literal_eval(x)])
movies['cast'] = movies['cast'].apply(lambda x: [dict_element['name'] for dict_element in ast.literal_eval(x)[:3]])    # we only want to keep first 3 cast members

# Now we use a generator comprehension because we want to stop iterating over the dictionary elements once our director is found
# In the code below, the next() function is designed to stop further iteration once it finds the first instance of dict_element['job']=='Director'
# It is necessary to return None in case no director is found, because in case no director is found, you will get a stop iteration error which is when the gen comprehension gets exhausted but you're still telling it to keep going

movies['crew'] = movies['crew'].apply(lambda x: next(([dict_element['name']] for dict_element in ast.literal_eval(x) if dict_element['job']=='Director'), None))

movies.head(5)

Unnamed: 0,id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski]
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...","[Daniel Craig, Christoph Waltz, Léa Seydoux]",[Sam Mendes]
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...","[Christian Bale, Michael Caine, Gary Oldman]",[Christopher Nolan]
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...","[Taylor Kitsch, Lynn Collins, Samantha Morton]",[Andrew Stanton]


In [15]:
movies.isna().sum()

id           0
title        0
overview     0
genres       0
keywords     0
cast         0
crew        30
dtype: int64

There 30 movies whose director field is None

Now we do a bit of NLP

Remove spaces from all people's names in both columns

In [16]:
movies['cast'] = movies['cast'].apply(lambda x: [element.replace(" ","") for element in ast.literal_eval(str(x))])
movies['genres'] = movies['genres'].apply(lambda x: [element.replace(" ","") for element in ast.literal_eval(str(x))])
movies['keywords'] = movies['keywords'].apply(lambda x: [element.replace(" ","") for element in ast.literal_eval(str(x))])
movies['crew'] = movies['crew'].apply(lambda x: [element.replace(" ","") for element in x] if x is not None else [])   # Has to be dealt with separately because of the 30 NaN's

movies.head(5)

Unnamed: 0,id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron]
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[JohnnyDepp, OrlandoBloom, KeiraKnightley]",[GoreVerbinski]
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[Action, Adventure, Crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...","[DanielCraig, ChristophWaltz, LéaSeydoux]",[SamMendes]
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[Action, Crime, Drama, Thriller]","[dccomics, crimefighter, terrorist, secretiden...","[ChristianBale, MichaelCaine, GaryOldman]",[ChristopherNolan]
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[Action, Adventure, ScienceFiction]","[basedonnovel, mars, medallion, spacetravel, p...","[TaylorKitsch, LynnCollins, SamanthaMorton]",[AndrewStanton]


The code above can be modified to fit into just two lines by using the Dataframe.map function. Please feel free to PR with this change if you're able to do it

Convert all entries in `overview` into list of words

In [17]:
movies['overview'] = movies['overview'].apply(lambda x: x.split())
movies.head(2)

Unnamed: 0,id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron]
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[JohnnyDepp, OrlandoBloom, KeiraKnightley]",[GoreVerbinski]


Consolidate all columns into one column (except id and title)

In [18]:
movies['tags'] = movies[['overview', 'genres', 'keywords', 'cast', 'crew']].sum(axis=1)
movies.drop(['overview', 'genres', 'keywords', 'cast', 'crew'], axis=1, inplace=True)
movies.head(3)

Unnamed: 0,id,title,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin..."
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d..."
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send..."


In [19]:
movies.iloc[2]['tags']

['A',
 'cryptic',
 'message',
 'from',
 'Bond’s',
 'past',
 'sends',
 'him',
 'on',
 'a',
 'trail',
 'to',
 'uncover',
 'a',
 'sinister',
 'organization.',
 'While',
 'M',
 'battles',
 'political',
 'forces',
 'to',
 'keep',
 'the',
 'secret',
 'service',
 'alive,',
 'Bond',
 'peels',
 'back',
 'the',
 'layers',
 'of',
 'deceit',
 'to',
 'reveal',
 'the',
 'terrible',
 'truth',
 'behind',
 'SPECTRE.',
 'Action',
 'Adventure',
 'Crime',
 'spy',
 'basedonnovel',
 'secretagent',
 'sequel',
 'mi6',
 'britishsecretservice',
 'unitedkingdom',
 'DanielCraig',
 'ChristophWaltz',
 'LéaSeydoux',
 'SamMendes']

Convert to str, and then convert to lowercase

In [20]:
# Convert to one long string
movies['tags'] = movies['tags'].apply(lambda x: " ".join(x))

# Convert to lowercase
movies['tags'] = movies['tags'].apply(lambda x: x.lower())

movies.head(3)

Unnamed: 0,id,title,tags
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha..."
2,206647,Spectre,a cryptic message from bond’s past sends him o...


In [21]:
# Check one entry just to be sure
movies.iloc[0]['tags']

'in the 22nd century, a paraplegic marine is dispatched to the moon pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. action adventure fantasy sciencefiction cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d samworthington zoesaldana sigourneyweaver jamescameron'

Checkpoint - Part 1

In [22]:
movies.to_feather('part_one_checkpoint.feather')

# Part 2: Core NLP

In [3]:
movies = pd.read_feather('part_one_checkpoint.feather')
movies.head(3)

Unnamed: 0,id,title,tags
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha..."
2,206647,Spectre,a cryptic message from bond’s past sends him o...


In [4]:
movies.shape

(4800, 3)

In [5]:
import nltk
from nltk.stem import PorterStemmer

Stem all words

In [6]:
ps = PorterStemmer()

In [7]:
movies['tags'] = movies['tags'].apply(lambda sent: " ".join([ps.stem(word) for word in sent.split()]))

movies.iloc[0]['tags']

'in the 22nd century, a parapleg marin is dispatch to the moon pandora on a uniqu mission, but becom torn between follow order and protect an alien civilization. action adventur fantasi sciencefict cultureclash futur spacewar spacecoloni societi spacetravel futurist romanc space alien tribe alienplanet cgi marin soldier battl loveaffair antiwar powerrel mindandsoul 3d samworthington zoesaldana sigourneyweav jamescameron'

That worked just as expected

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000, stop_words='english')   # max_features ensures that the vectorizer only considers the first 5000 unique words (features)

In [9]:
vector = cv.fit_transform(movies['tags']).toarray()
vector

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [10]:
vector.shape

(4800, 5000)

In [11]:
vector[0][100:200]

array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [12]:
from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(vector)
similarity

array([[1.        , 0.08346223, 0.0860309 , ..., 0.04499213, 0.        ,
        0.        ],
       [0.08346223, 1.        , 0.06063391, ..., 0.02378257, 0.        ,
        0.02615329],
       [0.0860309 , 0.06063391, 1.        , ..., 0.02451452, 0.        ,
        0.        ],
       ...,
       [0.04499213, 0.02378257, 0.02451452, ..., 1.        , 0.03962144,
        0.04229549],
       [0.        , 0.        , 0.        , ..., 0.03962144, 1.        ,
        0.08714204],
       [0.        , 0.02615329, 0.        , ..., 0.04229549, 0.08714204,
        1.        ]])

This is a square matrix btw, all diagonal elements are 1 because the cosim of a word with itself is 1

In [13]:
similarity.shape

(4800, 4800)

In [19]:
similarity[0]

array([1.        , 0.08346223, 0.0860309 , ..., 0.04499213, 0.        ,
       0.        ])

Build a recommendation function, the star of our project

In [14]:
'''
Code below is the recommender fucntion without using a function. So it is easier to debug and understand

You will be recommended movies like 'movie_name'
'''
# movie_name = 'Transformers'
# idx = movies[movies['title']==movie_name].index[0]
# # got index of Batman

# distances = sorted(list(enumerate(similarity[idx])), reverse=True, key=lambda x: x[1])
# for i in distances[1:6]:
#     print(movies.iloc[i[0]]['title'])

"\nCode below is the recommender fucntion without using a function. So it is easier to debug and understand\n\nYou will be recommended movies like 'movie_name'\n"

In [15]:
def recommend_similar(movie_name):
    reco_list = []
    
    original_idx_in_movies = movies[movies['title']==movie_name].index[0]
    distances = sorted(enumerate(similarity[original_idx_in_movies]), reverse=True, key=lambda x: x[1])
    print(distances)

    for pair in distances[1:6]:
        reco_idx = pair[0]
        reco_list.append(movies.iloc[reco_idx]['title'])

    return reco_list

In [16]:
recommend_similar('Tangled')

[(6, 1.0000000000000007), (2309, 0.20640627484613444), (269, 0.20380986614602725), (255, 0.19138975058773822), (1676, 0.18605210188381271), (42, 0.17902871850985827), (1695, 0.1771924779845835), (1594, 0.16545563033895141), (67, 0.16537964611894462), (251, 0.16012815380508716), (506, 0.15866576560438353), (391, 0.15860619520032995), (896, 0.15626907697949846), (1984, 0.1554857684028483), (124, 0.15480470613460084), (917, 0.1534851533787301), (1426, 0.15191090506255), (55, 0.1515980089423659), (2396, 0.14890247043403096), (194, 0.14884168150705018), (465, 0.14617633655117157), (4049, 0.14617633655117157), (130, 0.14470719035407653), (429, 0.14470719035407653), (182, 0.14322297480788662), (3447, 0.14322297480788662), (3881, 0.14322297480788662), (34, 0.1432229748078866), (899, 0.1427382933000825), (254, 0.14175398238766682), (3900, 0.14175398238766682), (390, 0.14084763640246842), (812, 0.13953907641285954), (704, 0.13894669485837496), (1587, 0.13846153846153847), (358, 0.138196031911463

['Out of Inferno',
 'The Princess and the Frog',
 'Home on the Range',
 'Animals United',
 'Toy Story 3']

In [17]:
pickle.dump(movies, open('artifacts/movies.pkl', 'wb'))
pickle.dump(similarity, open('artifacts/similarity.pkl', 'wb'))