In [100]:
import ast
import json
import re
from collections import deque

import numpy as np
import pandas as pd

## Process dataset

In [2]:
# Folder holding the dataset CSV files. The trailing slash matters because the
# file names below are appended to it by plain string concatenation.
base_folder = "../movies-dataset/"
# File names inside base_folder.
movies_metadata_fn = "movies_metadata.csv"
credits_fn = "credits.csv"
links_fn = "links.csv"  # not read anywhere in the visible part of this notebook

## Process movies_metadata data structure/schema

In [3]:
# low_memory=False has pandas infer each column's dtype from the whole file
# rather than chunk by chunk. The saved output of this cell ends with the tail
# of a warning, presumably the mixed-type DtypeWarning that chunked inference
# raises; reading in one pass avoids it.
metadata = pd.read_csv(base_folder + movies_metadata_fn, low_memory=False)
metadata.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


## Cast id to int64 and drop any NaN values

In [4]:
# Parse the id column as numbers; values that cannot be parsed become NaN.
metadata['id'] = pd.to_numeric(metadata['id'], errors='coerce', downcast='signed')

In [5]:
# Keep only the rows that have a non-missing id.
has_valid_id = metadata['id'].notna()
metadata = metadata[has_valid_id]

In [6]:
list(metadata.columns.values)

['adult',
 'belongs_to_collection',
 'budget',
 'genres',
 'homepage',
 'id',
 'imdb_id',
 'original_language',
 'original_title',
 'overview',
 'popularity',
 'poster_path',
 'production_companies',
 'production_countries',
 'release_date',
 'revenue',
 'runtime',
 'spoken_languages',
 'status',
 'tagline',
 'title',
 'video',
 'vote_average',
 'vote_count']

In [35]:
def CustomParser(data):
    """Decode a JSON document given as text and return the resulting Python object."""
    return json.loads(data)

We probably only need the `id` and `title` columns from this dataframe.

## Process credits data structure/schema

In [41]:
# Load the credits table, then parse its id column as a signed integer
# (downcast to the smallest integer type that fits); unparseable ids become NaN.
credits_path = base_folder + credits_fn
credits = pd.read_csv(credits_path)
credits['id'] = pd.to_numeric(credits['id'], errors='coerce', downcast='signed')
credits.head()

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


In [8]:
# cast id to int64 for later join
# Give both frames the same int64 id dtype ahead of the join on that column.
for frame in (metadata, credits):
    frame['id'] = frame['id'].astype(np.int64)

In [9]:
metadata.dtypes

adult                     object
belongs_to_collection     object
budget                    object
genres                    object
homepage                  object
id                         int64
imdb_id                   object
original_language         object
original_title            object
overview                  object
popularity                object
poster_path               object
production_companies      object
production_countries      object
release_date              object
revenue                  float64
runtime                  float64
spoken_languages          object
status                    object
tagline                   object
title                     object
video                     object
vote_average             float64
vote_count               float64
dtype: object

In [46]:
credits.dtypes

cast    object
crew    object
id       int32
dtype: object

In [11]:
metadata.head(3)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0


In [12]:
credits.head(3)

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602


## Let's join the two datasets based on movie id

We start with one example movie, `Toy Story`, which has id = 862 in the metadata dataset.

In [13]:
# Join metadata with credits on the shared movie id (inner join, the pandas default).
merged = metadata.merge(credits, on='id')

In [14]:
merged.head(3)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,cast,crew
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de..."
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de..."
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de..."


In [15]:
toy_story_id = 862
# Boolean mask selecting the merged rows that belong to Toy Story.
is_toy_story = merged['id'] == toy_story_id
merged[is_toy_story]

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,cast,crew
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de..."


## Examine the crew/cast JSON data schema for Toy Story

In [49]:
# Select the Toy Story rows once, then pull the two credit columns from them.
toy_story_rows = merged.loc[merged['id'] == toy_story_id]
cast = toy_story_rows['cast']
crew = toy_story_rows['crew']

In [50]:
cast

0    [{'cast_id': 14, 'character': 'Woody (voice)',...
Name: cast, dtype: object

## Find all movies Tom Hanks has acted in

In [86]:
def has_played(actor_name, cast_data):
    """Report whether ``actor_name`` appears in a movie's cast list.

    Names are compared case-insensitively. On the first match, the matching
    entry's name, id, cast_id and credit_id are printed and True is returned.

    Args:
        actor_name: Name to look for.
        cast_data: Iterable of dicts, one per cast member. A 'name' entry is
            what gets compared; 'id', 'cast_id' and 'credit_id' are only read
            for the matching entry and are printed as None when absent.

    Returns:
        True if some entry's name equals ``actor_name`` ignoring case,
        otherwise False.
    """
    wanted = actor_name.lower()  # same for every entry, so computed once
    for member in cast_data:
        name = member.get('name')
        # An entry without a text name can never match; skip it instead of
        # failing on .lower() and aborting the search through the rest.
        if not isinstance(name, str) or name.lower() != wanted:
            continue
        # The id fields are looked up only here, so a non-matching entry that
        # lacks one of them no longer raises KeyError.
        print("name: {}, id: {}, cast_id: {}, credit_id: {}".format(
            name, member.get('id'), member.get('cast_id'), member.get('credit_id')))
        return True
    return False

## Set up data structures

In [106]:
# a map from movie id to a list of actor id's
movie_actor_adj_list = {}
# a map from actor id to a list of movie id's
actor_movie_adj_list = {}
# a map from movies id to their title
movies_map = {}
# a map from actors id to their name
actors_map = {}

In [136]:
# Build the id -> name lookup tables and both adjacency lists from `merged`.
#
# The 'cast' column holds the text of a Python literal (single-quoted strings,
# None), not JSON, so it is parsed with ast.literal_eval. Rewriting it into
# JSON with regular expressions is fragile once a value contains a quote
# character, and every row it fails on is lost to the graph.

# Start from empty containers so that re-running this cell does not append
# every edge a second time. clear() keeps the existing dict objects.
for table in (movie_actor_adj_list, actor_movie_adj_list, movies_map, actors_map):
    table.clear()

cnt, errors = 0, 0
failed_movies = {}  # movie id -> True for rows whose cast text could not be parsed
for movie_id, movie_title, raw_cast in zip(merged['id'], merged['title'], merged['cast']):
    cnt += 1
    # The first title seen for an id is kept.
    movies_map.setdefault(movie_id, movie_title)
    try:
        cast_data = ast.literal_eval(raw_cast)
    except (ValueError, SyntaxError):
        # Not a valid literal (this also covers a missing, non-text cell).
        failed_movies[movie_id] = True
        errors += 1
        continue
    for member in cast_data:
        actor_id = member['id']
        actors_map.setdefault(actor_id, member['name'])
        # One edge per cast entry, recorded in both directions.
        movie_actor_adj_list.setdefault(movie_id, []).append(actor_id)
        actor_movie_adj_list.setdefault(actor_id, []).append(movie_id)
print("Parsed credits: {}, errors: {}".format(cnt, errors))

name: Tom Hanks, id: 31, cast_id: 14, credit_id: 52fe4284c3a36847f8024f95
Movie id: 862, title: Toy Story
name: Tom Hanks, id: 31, cast_id: 21, credit_id: 52fe452fc3a36847f80c1111
Movie id: 9800, title: Philadelphia
name: Tom Hanks, id: 31, cast_id: 13, credit_id: 52fe4283c3a36847f8024bd9
Movie id: 858, title: Sleepless in Seattle
name: Tom Hanks, id: 31, cast_id: 8, credit_id: 554f734e92514162f2001889
Movie id: 32562, title: The Celluloid Closet
name: Tom Hanks, id: 31, cast_id: 10, credit_id: 52fe4283c3a36847f8024aef
Movie id: 857, title: Saving Private Ryan
name: Tom Hanks, id: 31, cast_id: 8, credit_id: 52fe44ad9251416c7503d27b
Movie id: 11974, title: The 'Burbs
name: Tom Hanks, id: 31, cast_id: 1, credit_id: 52fe4360c3a36847f804faaf
Movie id: 2619, title: Splash
name: Tom Hanks, id: 31, cast_id: 2, credit_id: 52fe4608c3a368484e07ceef
Movie id: 29968, title: Nothing in Common
name: Tom Hanks, id: 31, cast_id: 1, credit_id: 52fe44fec3a36847f80b660d
Movie id: 9489, title: You've Got 

In [108]:
movie_actor_adj_list[862]

[31,
 12898,
 7167,
 12899,
 12900,
 7907,
 8873,
 1116442,
 12901,
 12133,
 8655,
 12903,
 37221]

In [157]:
# Reverse lookups: actor name -> actor id and movie title -> movie id.
# When several ids share one name or title, the id seen last is the one kept.
inv_actors_map = dict(zip(actors_map.values(), actors_map.keys()))
inv_movies_map = dict(zip(movies_map.values(), movies_map.keys()))

In [152]:
# Look up the actor id stored under the name 'Kevin Bacon'; the breadth-first
# search further down starts from this id.
kevin_id = inv_actors_map['Kevin Bacon']
print(kevin_id)

4724


In [149]:
DEBUG = False


def bfs_actor_movie_graph(start_actor_id, actor_to_movies, movie_to_actors, debug=False):
    """Breadth-first search over the actor/movie graph from one actor.

    Actor ids and movie ids are both plain integers taken from different
    fields, so the same number can name an actor and an unrelated movie.
    Distances are therefore kept in two dicts, one per node type, and every
    queue entry carries its node type. A single dict keyed by the bare id
    would treat such a pair as one node.

    Args:
        start_actor_id: Id of the actor the search starts from.
        actor_to_movies: Dict mapping an actor id to a list of movie ids.
        movie_to_actors: Dict mapping a movie id to a list of actor ids.
        debug: When true, print each node as it is expanded.

    Returns:
        A pair ``(actor_dist, movie_dist)`` of dicts mapping each reachable
        actor id / movie id to its number of edges from the start actor.
        Actors end up at even distances and movies at odd ones.

    Raises:
        KeyError: If an expanded node has no entry in its adjacency dict.
    """
    actor_dist = {start_actor_id: 0}
    movie_dist = {}
    frontier = deque([(True, start_actor_id)])
    while frontier:
        is_actor, node = frontier.popleft()
        if is_actor:
            here, neighbors, other_side = actor_dist[node], actor_to_movies[node], movie_dist
        else:
            here, neighbors, other_side = movie_dist[node], movie_to_actors[node], actor_dist
        if debug:
            print("u: {}".format(node))
            print("degree(u): {}".format(here))
            print("{} type, neighbors: {}".format("actor" if is_actor else "movie", neighbors))
        for neighbor in neighbors:
            # Record the distance when a node is first discovered: the queue is
            # processed in order of distance, so that first sighting is along a
            # shortest path. Enqueuing only then also puts each node on the
            # queue exactly once.
            if neighbor not in other_side:
                other_side[neighbor] = here + 1
                frontier.append((not is_actor, neighbor))
    return actor_dist, movie_dist


# bacon_degrees maps an actor id to its edge count from Kevin Bacon in the
# actor-movie graph. Sharing one movie is two edges (actor -> movie -> actor),
# so a direct co-star has degree 2. movie_degrees is the same for movie ids.
bacon_degrees, movie_degrees = bfs_actor_movie_graph(
    kevin_id, actor_movie_adj_list, movie_actor_adj_list, debug=DEBUG)

In [150]:
bacon_degrees[kevin_id]

0

In [145]:
actors_map[2224]

'Christian Slater'

In [135]:
movies_map[9413]

'Picture Perfect'

In [155]:
actor_id = inv_actors_map['Tom Hanks']
bacon_degrees[actor_id]

In [160]:
actor_id = inv_actors_map['Tom Cruise']
bacon_degrees[actor_id]

2

In [158]:
movie_id = inv_movies_map['Apollo 13']
failed_movies[movie_id]

568

In [163]:
actor_id = inv_actors_map['Tom Cruise']
tom_cruise_movies = actor_movie_adj_list[actor_id]

In [162]:
actor_id = inv_actors_map['Kevin Bacon']
kevin_bacon_movies = actor_movie_adj_list[actor_id]

In [164]:
set(tom_cruise_movies).intersection(set(kevin_bacon_movies))

{881}

In [165]:
movies_map[881]

'A Few Good Men'