In [1]:
# importing libraries
import numpy as np
import pandas as pd

In [2]:
# reading datasets
credits_df = pd.read_csv("credits.csv")
movies_df = pd.read_csv("movies.csv")

# combine both datasets together
movies_df = movies_df.merge(credits_df, on="title")

# dropping columns
movies_df = movies_df[["movie_id", "title", "overview", "genres", "keywords", "cast", "crew"]]

In [3]:
# remove na rows
movies_df.dropna(inplace = True)

In [4]:
# checking the genre column
movies_df.iloc[0].genres

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [5]:
# checking the keywords column
movies_df.iloc[0].keywords

'[{"id": 1463, "name": "culture clash"}, {"id": 2964, "name": "future"}, {"id": 3386, "name": "space war"}, {"id": 3388, "name": "space colony"}, {"id": 3679, "name": "society"}, {"id": 3801, "name": "space travel"}, {"id": 9685, "name": "futuristic"}, {"id": 9840, "name": "romance"}, {"id": 9882, "name": "space"}, {"id": 9951, "name": "alien"}, {"id": 10148, "name": "tribe"}, {"id": 10158, "name": "alien planet"}, {"id": 10987, "name": "cgi"}, {"id": 11399, "name": "marine"}, {"id": 13065, "name": "soldier"}, {"id": 14643, "name": "battle"}, {"id": 14720, "name": "love affair"}, {"id": 165431, "name": "anti war"}, {"id": 193554, "name": "power relations"}, {"id": 206690, "name": "mind and soul"}, {"id": 209714, "name": "3d"}]'

In [6]:
# abstract syntax tree
import ast

In [7]:
# define function to convert the genres and keyword columns to only contain the name, in a string/list form, not dict
def convert(obj):
    list = []
    for i in ast.literal_eval(obj):
        list.append(i["name"])
    return list

In [8]:
# converting the columns
movies_df["genres"] = movies_df["genres"].apply(convert)
movies_df["keywords"] = movies_df["keywords"].apply(convert)

In [9]:
# define function to convert the cast columns : only display the top 3 casts
def convert_cast(obj):
    list = []
    counter = 0
    for i in ast.literal_eval(obj):
        if counter != 3:
            list.append(i["name"])
        else:
            break
            
    return list

In [10]:
# convert the cast column
movies_df["cast"] = movies_df["cast"].apply(convert_cast)

In [11]:
# define function to convert crew column : we only want to keep the name of the crew who is a director
def convert_crew(obj):
    list = []
    for i in ast.literal_eval(obj):
        if i['job'] == "Director":
            list.append(i["name"])
            break
            
    return list

In [12]:
# convert the crew column
movies_df["crew"] = movies_df["crew"].apply(convert_crew)

In [13]:
movies_df

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weave...",[James Cameron]
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley, ...",[Gore Verbinski]
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...","[Daniel Craig, Christoph Waltz, Léa Seydoux, R...",[Sam Mendes]
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...","[Christian Bale, Michael Caine, Gary Oldman, A...",[Christopher Nolan]
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...","[Taylor Kitsch, Lynn Collins, Samantha Morton,...",[Andrew Stanton]
...,...,...,...,...,...,...,...
4803,9367,El Mariachi,El Mariachi just wants to play his guitar and ...,"[Action, Crime, Thriller]","[united states–mexico barrier, legs, arms, pap...","[Carlos Gallardo, Jaime de Hoyos, Peter Marqua...",[Robert Rodriguez]
4804,72766,Newlyweds,A newlywed couple's honeymoon is upended by th...,"[Comedy, Romance]",[],"[Edward Burns, Kerry Bishé, Marsha Dietlein, C...",[Edward Burns]
4805,231617,"Signed, Sealed, Delivered","""Signed, Sealed, Delivered"" introduces a dedic...","[Comedy, Drama, Romance, TV Movie]","[date, love at first sight, narration, investi...","[Eric Mabius, Kristin Booth, Crystal Lowe, Geo...",[Scott Smith]
4806,126186,Shanghai Calling,When ambitious New York attorney Sam is sent t...,[],[],"[Daniel Henney, Eliza Coupe, Bill Paxton, Alan...",[Daniel Hsia]


In [14]:
# separate the overview sentences into elements in a list
movies_df['overview'] = movies_df['overview'].apply(lambda x : x.split())

In [15]:
# remove spacing between words in the same element
movies_df['genres'] = movies_df['genres'].apply(lambda x :[i.replace(" ", "") for i in x])
movies_df['keywords'] = movies_df['keywords'].apply(lambda x :[i.replace(" ", "") for i in x])
movies_df['cast'] = movies_df['cast'].apply(lambda x :[i.replace(" ", "") for i in x])
movies_df['crew'] = movies_df['crew'].apply(lambda x :[i.replace(" ", "") for i in x])

In [16]:
# create new column by adding the columns together
movies_df['tags'] = movies_df['overview']+movies_df['genres']+movies_df['keywords']+movies_df['cast']+movies_df['crew']
movies_df['tags'] = movies_df['tags'].apply(lambda x : " ".join(x))
movies_df['tags'] = movies_df['tags'].apply(lambda x : x.lower())

In [17]:
# dropping all the duplicated columns
new_df = movies_df[['movie_id', 'title', 'tags']]

In [18]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 5000, stop_words = "english")
# transforming the tags column to an array recognisable by machine 
vectors = cv.fit_transform(new_df['tags']).toarray()

In [19]:
import nltk
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [20]:
# define the stem method to reduce all the words in the tag column into their root word, using the stem method from ps
def stem(text):
    y = []
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y)

In [21]:
new_df['tags'] = new_df['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(stem)


In [22]:
new_df

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"in the 22nd century, a parapleg marin is dispa..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believ to be dead, ha c..."
2,206647,Spectre,a cryptic messag from bond’ past send him on a...
3,49026,The Dark Knight Rises,follow the death of district attorney harvey d...
4,49529,John Carter,"john carter is a war-weary, former militari ca..."
...,...,...,...
4803,9367,El Mariachi,el mariachi just want to play hi guitar and ca...
4804,72766,Newlyweds,a newlyw couple' honeymoon is upend by the arr...
4805,231617,"Signed, Sealed, Delivered","""signed, sealed, delivered"" introduc a dedic q..."
4806,126186,Shanghai Calling,when ambiti new york attorney sam is sent to s...


In [23]:
from sklearn.metrics.pairwise import cosine_similarity

In [24]:
similarity = cosine_similarity(vectors)

In [25]:
# create the recommendation system
def recommend(movie):
    movie_index = new_df[new_df['title']==movie].index[0]
    distances = similarity[movie_index]
    movies_list = sorted(list(enumerate(distances)), reverse = True, key = lambda x : x[1])[1:6]
    
    for i in movies_list:
        return(new_df.iloc[i[0]].title)

In [35]:
recommend("Thor")

Thor: The Dark World
Avengers: Age of Ultron
The Avengers
Captain America: The First Avenger
Iron Man


539

array([1.        , 0.07142857, 0.05143445, ..., 0.02326211, 0.02571722,
       0.        ])