In [44]:
import numpy as np
import pandas as pd
import ast
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

# Load datasets
movie = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv("/content/tmbd_5000_credits.csv")

# Merge both datasets on title
movies = movie.merge(credits, on='title')

# Keep important columns
movies = movies[['id', 'title', 'overview', 'keywords', 'genres', 'cast', 'crew']]

# Drop rows with null values
movies.dropna(inplace=True)

movies.head()

  credits = pd.read_csv("/content/tmbd_5000_credits.csv")


Unnamed: 0,id,title,overview,keywords,genres,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...","[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...","[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[{""id"": 818, ""name"": ""based on novel""}, {""id"":...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [45]:
# Process 'overview' column

print (movies["overview"][0])

#Split string into list of separate words, remove

movies["overview"] = movies["overview"].apply(lambda x: x.split())

print (movies["overview"][0])



In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization.
['In', 'the', '22nd', 'century,', 'a', 'paraplegic', 'Marine', 'is', 'dispatched', 'to', 'the', 'moon', 'Pandora', 'on', 'a', 'unique', 'mission,', 'but', 'becomes', 'torn', 'between', 'following', 'orders', 'and', 'protecting', 'an', 'alien', 'civilization.']


In [46]:
#Process keywords column

def convert(obj):
    L = []
    for i in ast.literal_eval(obj):
        L.append(i['name'])
    return L

movies["keywords"] = movies["keywords"].apply(convert)
print (movies["keywords"])

0       [culture clash, future, space war, space colon...
1       [ocean, drug abuse, exotic island, east india ...
2       [spy, based on novel, secret agent, sequel, mi...
3       [dc comics, crime fighter, terrorist, secret i...
4       [based on novel, mars, medallion, space travel...
                              ...                        
1487    [restaurant, sport, coach, athlete, newcastle ...
1488                                     [broken trachea]
1489    [competition, airport, air controller, adversary]
1490    [river, mobile phone, bravery, archer, daughte...
1491    [submarine, dc comics, missile, shark attack, ...
Name: keywords, Length: 1489, dtype: object


In [47]:
#Process genres column

movies["genres"] = movies["genres"].apply(convert)
print (movies["genres"])


0           [Action, Adventure, Fantasy, Science Fiction]
1                            [Adventure, Fantasy, Action]
2                              [Action, Adventure, Crime]
3                        [Action, Crime, Drama, Thriller]
4                    [Action, Adventure, Science Fiction]
                              ...                        
1487                                              [Drama]
1488                            [Action, Crime, Thriller]
1489                                      [Comedy, Drama]
1490                     [Horror, Drama, Science Fiction]
1491    [Family, Adventure, Comedy, Science Fiction, C...
Name: genres, Length: 1489, dtype: object


In [48]:
#Process cast column, keep only top 3 actors from each

def convert3(obj):
  L=[]
  for i in ast.literal_eval(obj):
    L.append(i['name'])
  return L[:3]

movies["cast"] = movies["cast"].apply(convert3)
print (movies["cast"])

0        [Sam Worthington, Zoe Saldana, Sigourney Weaver]
1           [Johnny Depp, Orlando Bloom, Keira Knightley]
2            [Daniel Craig, Christoph Waltz, Léa Seydoux]
3            [Christian Bale, Michael Caine, Gary Oldman]
4          [Taylor Kitsch, Lynn Collins, Samantha Morton]
                              ...                        
1487         [Kuno Becker, Alessandro Nivola, Anna Friel]
1488          [Jason Statham, Chris Sarandon, James Hong]
1489    [John Cusack, Billy Bob Thornton, Cate Blanchett]
1490             [Saoirse Ronan, Diane Kruger, Jake Abel]
1491       [Jack Nicholson, Michael Keaton, Kim Basinger]
Name: cast, Length: 1489, dtype: object


In [49]:
print (movies["crew"][0])


[{"credit_id": "52fe48009251416c750aca23", "department": "Editing", "gender": 0, "id": 1721, "job": "Editor", "name": "Stephen E. Rivkin"}, {"credit_id": "539c47ecc3a36810e3001f87", "department": "Art", "gender": 2, "id": 496, "job": "Production Design", "name": "Rick Carter"}, {"credit_id": "54491c89c3a3680fb4001cf7", "department": "Sound", "gender": 0, "id": 900, "job": "Sound Designer", "name": "Christopher Boyes"}, {"credit_id": "54491cb70e0a267480001bd0", "department": "Sound", "gender": 0, "id": 900, "job": "Supervising Sound Editor", "name": "Christopher Boyes"}, {"credit_id": "539c4a4cc3a36810c9002101", "department": "Production", "gender": 1, "id": 1262, "job": "Casting", "name": "Mali Finn"}, {"credit_id": "5544ee3b925141499f0008fc", "department": "Sound", "gender": 2, "id": 1729, "job": "Original Music Composer", "name": "James Horner"}, {"credit_id": "52fe48009251416c750ac9c3", "department": "Directing", "gender": 2, "id": 2710, "job": "Director", "name": "James Cameron"}, 

In [50]:
#Clean
#Check entries for unterminated strings

for row in movies["crew"]:
  if row[-1] != "]":

    print (row)
#Looks like some of the data got cut off, try to remove last botched entry and close off

corrected_rows=[]
for row in movies["crew"]:
  if row[-1] != "]":
    y = 0
    while row[-y] != "{":
      y+=1
    print (y)
    row = row[:(-y-2)] + "]"
    corrected_rows.append(row)
  else:
    corrected_rows.append(row)

movies["crew"] = corrected_rows



[{"credit_id": "52fe4bf7c3a368484e1a0683", "department": "Production", "gender": 2, "id": 488, "job": "Executive Producer", "name": "Steven Spielberg"}, {"credit_id": "554918b892514104ca000244", "department": "Production", "gender": 0, "id": 561, "job": "Casting", "name": "John Papsidera"}, {"credit_id": "52fe4bf7c3a368484e1a0689", "department": "Production", "gender": 2, "id": 664, "job": "Producer", "name": "Frank Marshall"}, {"credit_id": "554918a492514104ba0002bc", "department": "Camera", "gender": 2, "id": 892, "job": "Director of Photography", "name": "John Schwartzman"}, {"credit_id": "554924a9c3a36841b600032b", "department": "Sound", "gender": 0, "id": 900, "job": "Sound Re-Recording Mixer", "name": "Christopher Boyes"}, {"credit_id": "5549258092514104c700038b", "department": "Sound", "gender": 2, "id": 2216, "job": "Sound Designer", "name": "Gary Rydstrom"}, {"credit_id": "5549190292514104c4000279", "department": "Art", "gender": 2, "id": 2529, "job": "Set Decoration", "name":

In [51]:
print (movies["crew"][28])

[{"credit_id": "52fe4bf7c3a368484e1a0683", "department": "Production", "gender": 2, "id": 488, "job": "Executive Producer", "name": "Steven Spielberg"}, {"credit_id": "554918b892514104ca000244", "department": "Production", "gender": 0, "id": 561, "job": "Casting", "name": "John Papsidera"}, {"credit_id": "52fe4bf7c3a368484e1a0689", "department": "Production", "gender": 2, "id": 664, "job": "Producer", "name": "Frank Marshall"}, {"credit_id": "554918a492514104ba0002bc", "department": "Camera", "gender": 2, "id": 892, "job": "Director of Photography", "name": "John Schwartzman"}, {"credit_id": "554924a9c3a36841b600032b", "department": "Sound", "gender": 0, "id": 900, "job": "Sound Re-Recording Mixer", "name": "Christopher Boyes"}, {"credit_id": "5549258092514104c700038b", "department": "Sound", "gender": 2, "id": 2216, "job": "Sound Designer", "name": "Gary Rydstrom"}, {"credit_id": "5549190292514104c4000279", "department": "Art", "gender": 2, "id": 2529, "job": "Set Decoration", "name":

In [52]:
from os import lockf
#Change crew column into director column

def get_director(obj):
  L = []
  loc = 0
  for i in ast.literal_eval(obj):
    if i["job"]=="Director":
      L.append(i["name"])
      break
    loc += 1
  return (L)

movies["crew"] = movies["crew"].apply(get_director)
print (movies["crew"])

0           [James Cameron]
1          [Gore Verbinski]
2              [Sam Mendes]
3       [Christopher Nolan]
4          [Andrew Stanton]
               ...         
1487         [Danny Cannon]
1488           [Boaz Yakin]
1489          [Mike Newell]
1490        [Andrew Niccol]
1491           [Tim Burton]
Name: crew, Length: 1489, dtype: object


In [53]:
# Create 'tags' column by combining overview + keywords + genres + cast + crew
movies['tags'] = movies['overview'] + movies['cast'] + movies['crew'] + movies['keywords']

# Final dataset with relevant columns
movies = movies[['id', 'title', 'tags']]

In [54]:
# Remove spaces from tags
movies['tags'] = movies['tags'].apply(lambda x: [i.replace(" ", "") for i in x])

# Stemming
ps = PorterStemmer()

def stemming(text):
    l = []
    for i in text:
        l.append(ps.stem(i))
    return " ".join(l)

movies['tags'] = movies['tags'].apply(stemming)

In [55]:
# Vectorization
vectorizer = CountVectorizer(max_features=500, stop_words='english')
vectors = vectorizer.fit_transform(movies['tags']).toarray()

# Cosine similarity
similarity = cosine_similarity(vectors)

In [56]:
def Recommendation_system(movie_title):
    movie_index = movies[movies['title'] == movie_title].index[0]
    distances = sorted(list(enumerate(similarity[movie_index])), reverse=True, key=lambda x: x[1])

    for i in distances[1:20]:
        print(movies.iloc[i[0]].title)

In [57]:
pickle.dump(movies, open('model.pkl', 'wb'))
pickle.dump(similarity, open('similarity.pkl', 'wb'))

In [58]:
#Test model

Recommendation_system("Avatar")

Independence Day
Aliens vs Predator: Requiem
Beowulf
The Thing
Titan A.E.
Treasure Planet
Edge of Tomorrow
Meet Dave
Aliens in the Attic
Predators
Tears of the Sun
Galaxy Quest
Battleship
Home
The Watch
Prometheus
We Were Soldiers
The 5th Wave
Mission to Mars
