In [84]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import ast
import pickle

In [85]:
#Load Data

credits = pd.read_csv("../data_sets/tmdb_5000_credits.csv")
movies = pd.read_csv("../data_sets/tmdb_5000_movies.csv")

In [86]:
credits.head(2)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


In [87]:
movies.head(2)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500


In [88]:
movies = movies.merge(credits, left_on="title",right_on="title")


In [89]:
movies.shape

(4809, 23)

In [90]:
movies.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count', 'movie_id', 'cast', 'crew'],
      dtype='object')

In [91]:
movies = movies[["movie_id","title","overview","genres","keywords","cast","crew"]]

In [92]:
movies.head(2)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


In [93]:
def convert(obj):
    keep = []
    for i in ast.literal_eval(obj):
        keep.append(i["name"])
    return keep  

In [94]:
movies["genres"] = movies["genres"].apply(convert)

In [95]:
movies['genres'][1]

['Adventure', 'Fantasy', 'Action']

In [96]:
movies["keywords"] = movies["keywords"].apply(convert)

In [97]:
movies.iloc[0]['keywords']

['culture clash',
 'future',
 'space war',
 'space colony',
 'society',
 'space travel',
 'futuristic',
 'romance',
 'space',
 'alien',
 'tribe',
 'alien planet',
 'cgi',
 'marine',
 'soldier',
 'battle',
 'love affair',
 'anti war',
 'power relations',
 'mind and soul',
 '3d']

In [98]:
movies['cast'] = movies['cast'].apply(lambda x: [i['name'] for i in ast.literal_eval(x)[:3]]) # only stop 3 actor solves["cast"] = movies["cast"].apply(lambda x: [i['name'] for i in ast.literal_eval(x) [:3]]

In [99]:
movies["cast"][0]

['Sam Worthington', 'Zoe Saldana', 'Sigourney Weaver']

In [100]:
movies['crew'] = movies['crew'].apply(lambda x: [i ["name"] for i in ast.literal_eval(x) if i['job']=="Director"])

In [101]:
movies["crew"][0]

['James Cameron']

In [102]:
# collection of the keywords,genres,cast,crew into a tags
movies['tags'] = movies["genres"]+ movies["keywords"]+ movies["cast"] + movies["crew"]

In [103]:
print(movies["tags"][0], end=" ")

['Action', 'Adventure', 'Fantasy', 'Science Fiction', 'culture clash', 'future', 'space war', 'space colony', 'society', 'space travel', 'futuristic', 'romance', 'space', 'alien', 'tribe', 'alien planet', 'cgi', 'marine', 'soldier', 'battle', 'love affair', 'anti war', 'power relations', 'mind and soul', '3d', 'Sam Worthington', 'Zoe Saldana', 'Sigourney Weaver', 'James Cameron'] 

In [104]:
# Merge id, title and overview with the new created tag
movies = movies[["movie_id","title","overview","tags"]]

In [105]:
movies.head(2)

Unnamed: 0,movie_id,title,overview,tags
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction, ..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action, ocean, drug abuse..."


In [106]:
# Applying a function to remove spaces and convert to lowercase
movies['tags'] = movies['tags'].apply(lambda tags: [tag.replace(' ', '').lower() for tag in tags])

In [107]:
#tag status
print(movies['tags'][0:1], end=" ")

0    [action, adventure, fantasy, sciencefiction, c...
Name: tags, dtype: object 

In [108]:
movies.head(2)

Unnamed: 0,movie_id,title,overview,tags
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[action, adventure, fantasy, sciencefiction, c..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[adventure, fantasy, action, ocean, drugabuse,..."


In [109]:
# Combine the list of tags into a single string for each movie
movies['tags_combined'] = movies['tags'].apply(lambda x: ' '.join(x))

# Now use TfidfVectorizer on the combined tags
vectorized_data = TfidfVectorizer(stop_words='english')
vectorized_data_metrix = vectorized_data.fit_transform(movies['tags_combined'])


In [110]:
cosine_sim = cosine_similarity(vectorized_data_metrix,vectorized_data_metrix)

In [111]:
def get_movie_recommendation(title,cosine_sim=cosine_sim):
     index = movies[movies["title"]== title].index[0]
     sim_score = list(enumerate(cosine_sim[index]))
     sim_score = sim_score[1:11] # get top 10 similar movies
     movie_indicies = [i[0] for i in sim_score ]
     return movies['title'].iloc[movie_indicies]

In [112]:
get_movie_recommendation("Avatar")

1     Pirates of the Caribbean: At World's End
2                                      Spectre
3                        The Dark Knight Rises
4                                  John Carter
5                                 Spider-Man 3
6                                      Tangled
7                      Avengers: Age of Ultron
8       Harry Potter and the Half-Blood Prince
9           Batman v Superman: Dawn of Justice
10                            Superman Returns
Name: title, dtype: object

In [None]:
with open('../model/movies_data.pkl','wb') as file:
     pickle.dump((movies, cosine_sim), file)