In [1]:
%matplotlib inline

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import json
import datetime

In [2]:
df = pd.read_csv('tmdb/tmdb_5000_credits.csv')
df = df[['movie_id','title','cast']]

In [3]:
df['cast'] = df['cast'].apply(json.loads)

In [4]:
df['cast'][0][0]['name']

'Sam Worthington'

In [5]:
df

Unnamed: 0,movie_id,title,cast
0,19995,Avatar,"[{'cast_id': 242, 'character': 'Jake Sully', '..."
1,285,Pirates of the Caribbean: At World's End,"[{'cast_id': 4, 'character': 'Captain Jack Spa..."
2,206647,Spectre,"[{'cast_id': 1, 'character': 'James Bond', 'cr..."
3,49026,The Dark Knight Rises,"[{'cast_id': 2, 'character': 'Bruce Wayne / Ba..."
4,49529,John Carter,"[{'cast_id': 5, 'character': 'John Carter', 'c..."
5,559,Spider-Man 3,"[{'cast_id': 30, 'character': 'Peter Parker / ..."
6,38757,Tangled,"[{'cast_id': 34, 'character': 'Flynn Rider (vo..."
7,99861,Avengers: Age of Ultron,"[{'cast_id': 76, 'character': 'Tony Stark / Ir..."
8,767,Harry Potter and the Half-Blood Prince,"[{'cast_id': 3, 'character': 'Harry Potter', '..."
9,209112,Batman v Superman: Dawn of Justice,"[{'cast_id': 18, 'character': 'Bruce Wayne / B..."


In [6]:
def getActorMovieDict(df):
    actor_dict = {}
    for i, j in df.iterrows():
        try:
            actor_list = []
            for i in range(3): 
                actor = j['cast'][i]['name']
                actor_list.append(actor)
            actor_dict[j['movie_id']] = actor_list
        except:
            pass
    return actor_dict

In [7]:
actor_dict = getActorMovieDict(df)
actor_dict

{19995: ['Sam Worthington', 'Zoe Saldana', 'Sigourney Weaver'],
 285: ['Johnny Depp', 'Orlando Bloom', 'Keira Knightley'],
 206647: ['Daniel Craig', 'Christoph Waltz', 'Léa Seydoux'],
 49026: ['Christian Bale', 'Michael Caine', 'Gary Oldman'],
 49529: ['Taylor Kitsch', 'Lynn Collins', 'Samantha Morton'],
 559: ['Tobey Maguire', 'Kirsten Dunst', 'James Franco'],
 38757: ['Zachary Levi', 'Mandy Moore', 'Donna Murphy'],
 99861: ['Robert Downey Jr.', 'Chris Hemsworth', 'Mark Ruffalo'],
 767: ['Daniel Radcliffe', 'Rupert Grint', 'Emma Watson'],
 209112: ['Ben Affleck', 'Henry Cavill', 'Gal Gadot'],
 1452: ['Brandon Routh', 'Kevin Spacey', 'Kate Bosworth'],
 10764: ['Daniel Craig', 'Olga Kurylenko', 'Mathieu Amalric'],
 58: ['Johnny Depp', 'Orlando Bloom', 'Keira Knightley'],
 57201: ['Johnny Depp', 'Armie Hammer', 'William Fichtner'],
 49521: ['Henry Cavill', 'Amy Adams', 'Michael Shannon'],
 2454: ['Ben Barnes', 'William Moseley', 'Anna Popplewell'],
 24428: ['Robert Downey Jr.', 'Chris Ev

In [8]:
def getUniqueActors(actor_dict):
    actors = set()
    for value in actor_dict.values():
        for actor in value:
            actors.add(actor)
    return list(actors)
getUniqueActors(actor_dict)

['Kiana Tom',
 'Synnøve Macody Lund',
 'Alyssa Milano',
 'Jenna Fischer',
 'Ritesh Deshmukh',
 'Tony Curtis',
 'Daniel Zovatto',
 'Connie Young',
 'Ellen Block',
 'Nicole Smolen',
 'Tom Kenny',
 'Syamsul Arifin',
 'Ian Hart',
 'Felicity Jones',
 'Steve Harvey',
 'Graham Chapman',
 'Claudia Cardinale',
 'Dana Kimmell',
 'Mark Herrier',
 'Jim Cummings',
 'Astro',
 'Miriam Shor',
 'Henry Cavill',
 'Chris Chatman',
 'Ed Oxenbould',
 'Stian Smestad',
 'Iko Uwais',
 'Emily Osment',
 'Harrison Ford',
 'Cyril Raffaelli',
 'Carole Bouquet',
 'Mireille Enos',
 'Ursula Andress',
 "Maryam d'Abo",
 'Boris Kodjoe',
 'Keri Russell',
 'Jenny Slate',
 'Anthony Mackie',
 'DJ Qualls',
 'Sherri Shepherd',
 'Cecilia Narova',
 'Fabio Testi',
 'T.J. McGibbon',
 'Sandra Bullock',
 'Sean Brosnan',
 'Rupert Grint',
 'Melody Anderson',
 'Chyler Leigh',
 'Duane Martin',
 'Jeff Bennett',
 'Glen-Paul Waru',
 'Mike Judge',
 'Meg Ryan',
 'Giancarlo Esposito',
 'Chuck Norris',
 'Ron Moody',
 'Allison Egan',
 'Taylor S

In [9]:
df_filtered = pd.DataFrame.copy(df)
df_filtered.head()

Unnamed: 0,movie_id,title,cast
0,19995,Avatar,"[{'cast_id': 242, 'character': 'Jake Sully', '..."
1,285,Pirates of the Caribbean: At World's End,"[{'cast_id': 4, 'character': 'Captain Jack Spa..."
2,206647,Spectre,"[{'cast_id': 1, 'character': 'James Bond', 'cr..."
3,49026,The Dark Knight Rises,"[{'cast_id': 2, 'character': 'Bruce Wayne / Ba..."
4,49529,John Carter,"[{'cast_id': 5, 'character': 'John Carter', 'c..."


In [10]:
df_movies = pd.read_csv('tmdb/tmdb_5000_movies.csv', parse_dates=['release_date'])
df_movies = df_movies[['id', 'release_date']]
df_movies.rename(columns={'id':'movie_id'}, inplace=True)

In [11]:
df_movies.head()

Unnamed: 0,movie_id,release_date
0,5,1995-12-09
1,11,1977-05-25
2,12,2003-05-30
3,13,1994-07-06
4,14,1999-09-15


In [12]:
merged = df_filtered.merge(df_movies, how='inner', on='movie_id')
merged.head()

Unnamed: 0,movie_id,title,cast,release_date
0,19995,Avatar,"[{'cast_id': 242, 'character': 'Jake Sully', '...",2009-12-10
1,285,Pirates of the Caribbean: At World's End,"[{'cast_id': 4, 'character': 'Captain Jack Spa...",2007-05-19
2,206647,Spectre,"[{'cast_id': 1, 'character': 'James Bond', 'cr...",2015-10-26
3,49026,The Dark Knight Rises,"[{'cast_id': 2, 'character': 'Bruce Wayne / Ba...",2012-07-16
4,49529,John Carter,"[{'cast_id': 5, 'character': 'John Carter', 'c...",2012-03-07


In [15]:
# date range from https://www.cnbc.com/2015/11/17/why-movies-are-sometimes-here-and-gone-in-theaters.html
def getActorList(row):
    actor_list = []
    try:
        for i in range(3): 
            actor = row[i]['name']
            actor_list.append(actor)
    except:
        pass
    return actor_list

In [29]:
merged['actors'] = merged['cast'].apply(getActorList)
merged

Unnamed: 0,movie_id,title,cast,release_date,actors
0,19995,Avatar,"[{'cast_id': 242, 'character': 'Jake Sully', '...",2009-12-10,"[Sam Worthington, Zoe Saldana, Sigourney Weaver]"
1,285,Pirates of the Caribbean: At World's End,"[{'cast_id': 4, 'character': 'Captain Jack Spa...",2007-05-19,"[Johnny Depp, Orlando Bloom, Keira Knightley]"
2,206647,Spectre,"[{'cast_id': 1, 'character': 'James Bond', 'cr...",2015-10-26,"[Daniel Craig, Christoph Waltz, Léa Seydoux]"
3,49026,The Dark Knight Rises,"[{'cast_id': 2, 'character': 'Bruce Wayne / Ba...",2012-07-16,"[Christian Bale, Michael Caine, Gary Oldman]"
4,49529,John Carter,"[{'cast_id': 5, 'character': 'John Carter', 'c...",2012-03-07,"[Taylor Kitsch, Lynn Collins, Samantha Morton]"
5,559,Spider-Man 3,"[{'cast_id': 30, 'character': 'Peter Parker / ...",2007-05-01,"[Tobey Maguire, Kirsten Dunst, James Franco]"
6,38757,Tangled,"[{'cast_id': 34, 'character': 'Flynn Rider (vo...",2010-11-24,"[Zachary Levi, Mandy Moore, Donna Murphy]"
7,99861,Avengers: Age of Ultron,"[{'cast_id': 76, 'character': 'Tony Stark / Ir...",2015-04-22,"[Robert Downey Jr., Chris Hemsworth, Mark Ruff..."
8,767,Harry Potter and the Half-Blood Prince,"[{'cast_id': 3, 'character': 'Harry Potter', '...",2009-07-07,"[Daniel Radcliffe, Rupert Grint, Emma Watson]"
9,209112,Batman v Superman: Dawn of Justice,"[{'cast_id': 18, 'character': 'Bruce Wayne / B...",2016-03-23,"[Ben Affleck, Henry Cavill, Gal Gadot]"


In [35]:
def getEndDate(release_date):
    return release_date + datetime.timedelta(days=31)

In [37]:
merged['end_date'] = merged['release_date'].apply(getEndDate)
merged = merged[['movie_id', 'title', 'release_date', 'end_date', 'actors']]
merged.head()

Unnamed: 0,movie_id,title,release_date,end_date,actors
0,19995,Avatar,2009-12-10,2010-01-10,"[Sam Worthington, Zoe Saldana, Sigourney Weaver]"
1,285,Pirates of the Caribbean: At World's End,2007-05-19,2007-06-19,"[Johnny Depp, Orlando Bloom, Keira Knightley]"
2,206647,Spectre,2015-10-26,2015-11-26,"[Daniel Craig, Christoph Waltz, Léa Seydoux]"
3,49026,The Dark Knight Rises,2012-07-16,2012-08-16,"[Christian Bale, Michael Caine, Gary Oldman]"
4,49529,John Carter,2012-03-07,2012-04-07,"[Taylor Kitsch, Lynn Collins, Samantha Morton]"


In [39]:
export = merged.to_json(path_or_buf=r'.\intermediates\actors.json', orient='records')