In [1]:
%matplotlib inline

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import json
import datetime

In [2]:
# Read the TMDB credits file and keep only the identifier, title and cast columns.
df = pd.read_csv('tmdb/tmdb_5000_credits.csv')[['movie_id', 'title', 'cast']]

In [3]:
# Decode the JSON-encoded ``cast`` strings into Python objects.
# Only str/bytes values are handed to json.loads: values that are already
# decoded (e.g. when this cell is run a second time) and missing values pass
# through unchanged instead of making json.loads raise TypeError.
df['cast'] = df['cast'].apply(
    lambda raw: json.loads(raw) if isinstance(raw, (str, bytes, bytearray)) else raw
)

In [4]:
# Sanity check: name of the first listed cast member of the row labelled 0.
df.loc[0, 'cast'][0]['name']

'Sam Worthington'

In [5]:
# Preview the first rows to check the selected columns and the decoded cast.
df.head()

Unnamed: 0,movie_id,title,cast
0,19995,Avatar,"[{'cast_id': 242, 'character': 'Jake Sully', '..."
1,285,Pirates of the Caribbean: At World's End,"[{'cast_id': 4, 'character': 'Captain Jack Spa..."
2,206647,Spectre,"[{'cast_id': 1, 'character': 'James Bond', 'cr..."
3,49026,The Dark Knight Rises,"[{'cast_id': 2, 'character': 'Bruce Wayne / Ba..."
4,49529,John Carter,"[{'cast_id': 5, 'character': 'John Carter', 'c..."


In [6]:
# Work on an independent copy so later steps leave ``df`` untouched.
df_filtered = df.copy()
df_filtered.head()

Unnamed: 0,movie_id,title,cast
0,19995,Avatar,"[{'cast_id': 242, 'character': 'Jake Sully', '..."
1,285,Pirates of the Caribbean: At World's End,"[{'cast_id': 4, 'character': 'Captain Jack Spa..."
2,206647,Spectre,"[{'cast_id': 1, 'character': 'James Bond', 'cr..."
3,49026,The Dark Knight Rises,"[{'cast_id': 2, 'character': 'Bruce Wayne / Ba..."
4,49529,John Carter,"[{'cast_id': 5, 'character': 'John Carter', 'c..."


In [7]:
# Load the release date of every movie and rename ``id`` to ``movie_id`` so it
# shares a key name with the credits frame.
# The rename is chained (not ``inplace=True``) so it never mutates a frame
# derived from a column selection, which can trigger SettingWithCopyWarning.
df_movies = (
    pd.read_csv('tmdb/tmdb_5000_movies.csv', parse_dates=['release_date'])
    .loc[:, ['id', 'release_date']]
    .rename(columns={'id': 'movie_id'})
)

In [8]:
# Preview the first rows to check the renamed key column and the parsed dates.
df_movies.head()

Unnamed: 0,movie_id,release_date
0,5,1995-12-09
1,11,1977-05-25
2,12,2003-05-30
3,13,1994-07-06
4,14,1999-09-15


In [9]:
# Attach each movie's release date to its credits row; the inner join keeps
# only the movie_id values present in both frames.
merged = pd.merge(df_filtered, df_movies, on='movie_id', how='inner')
merged.head()

Unnamed: 0,movie_id,title,cast,release_date
0,19995,Avatar,"[{'cast_id': 242, 'character': 'Jake Sully', '...",2009-12-10
1,285,Pirates of the Caribbean: At World's End,"[{'cast_id': 4, 'character': 'Captain Jack Spa...",2007-05-19
2,206647,Spectre,"[{'cast_id': 1, 'character': 'James Bond', 'cr...",2015-10-26
3,49026,The Dark Knight Rises,"[{'cast_id': 2, 'character': 'Bruce Wayne / Ba...",2012-07-16
4,49529,John Carter,"[{'cast_id': 5, 'character': 'John Carter', 'c...",2012-03-07


In [10]:
def getActorList(row, max_actors=3):
    """Return the names of the first ``max_actors`` cast entries in ``row``.

    Args:
        row: A list (or tuple) of cast entries, each a mapping with a
            ``'name'`` key. Any other value (e.g. ``None`` or NaN) is treated
            as an empty cast.
        max_actors: Maximum number of names to return. Defaults to 3, the
            value that used to be hard-coded. Negative values are treated as 0.

    Returns:
        A list of at most ``max_actors`` names, in cast order. Collection
        stops at the first entry that has no usable ``'name'``, so the names
        gathered before it are still returned.
    """
    actor_list = []
    if not isinstance(row, (list, tuple)):
        return actor_list
    # Slicing copes with casts shorter than ``max_actors`` without relying on
    # an IndexError being swallowed.
    for member in row[:max(max_actors, 0)]:
        try:
            actor_list.append(member['name'])
        except (KeyError, TypeError):
            # Malformed entry: keep the best-effort behaviour of returning
            # what was collected so far, without hiding unrelated errors.
            break
    return actor_list

In [11]:
# Derive the list of leading actor names for every movie from its cast.
merged['actors'] = merged['cast'].map(getActorList)
merged.head()

Unnamed: 0,movie_id,title,cast,release_date,actors
0,19995,Avatar,"[{'cast_id': 242, 'character': 'Jake Sully', '...",2009-12-10,"[Sam Worthington, Zoe Saldana, Sigourney Weaver]"
1,285,Pirates of the Caribbean: At World's End,"[{'cast_id': 4, 'character': 'Captain Jack Spa...",2007-05-19,"[Johnny Depp, Orlando Bloom, Keira Knightley]"
2,206647,Spectre,"[{'cast_id': 1, 'character': 'James Bond', 'cr...",2015-10-26,"[Daniel Craig, Christoph Waltz, Léa Seydoux]"
3,49026,The Dark Knight Rises,"[{'cast_id': 2, 'character': 'Bruce Wayne / Ba...",2012-07-16,"[Christian Bale, Michael Caine, Gary Oldman]"
4,49529,John Carter,"[{'cast_id': 5, 'character': 'John Carter', 'c...",2012-03-07,"[Taylor Kitsch, Lynn Collins, Samantha Morton]"


In [12]:
# date range from https://www.cnbc.com/2015/11/17/why-movies-are-sometimes-here-and-gone-in-theaters.html
def getEndDate(release_date, days=31):
    """Return the date ``days`` days after ``release_date``.

    Args:
        release_date: A date-like value that supports adding a
            ``datetime.timedelta``.
        days: Length of the window after release. Defaults to 31, the value
            that used to be hard-coded.

    Returns:
        ``release_date`` shifted forward by ``days`` days.
    """
    return release_date + datetime.timedelta(days=days)
def getStartDate(release_date, days=31):
    """Return the date ``days`` days before ``release_date``.

    Args:
        release_date: A date-like value that supports subtracting a
            ``datetime.timedelta``.
        days: Length of the window before release. Defaults to 31, the value
            that used to be hard-coded.

    Returns:
        ``release_date`` shifted backward by ``days`` days.
    """
    return release_date - datetime.timedelta(days=days)

In [13]:
# Add the window boundaries around each release date, then keep only the
# columns needed downstream, in their final order.
merged = merged.assign(
    end_date=merged['release_date'].apply(getEndDate),
    start_date=merged['release_date'].apply(getStartDate),
)[['movie_id', 'title', 'start_date', 'release_date', 'end_date', 'actors']]
merged.head()

Unnamed: 0,movie_id,title,start_date,release_date,end_date,actors
0,19995,Avatar,2009-11-09,2009-12-10,2010-01-10,"[Sam Worthington, Zoe Saldana, Sigourney Weaver]"
1,285,Pirates of the Caribbean: At World's End,2007-04-18,2007-05-19,2007-06-19,"[Johnny Depp, Orlando Bloom, Keira Knightley]"
2,206647,Spectre,2015-09-25,2015-10-26,2015-11-26,"[Daniel Craig, Christoph Waltz, Léa Seydoux]"
3,49026,The Dark Knight Rises,2012-06-15,2012-07-16,2012-08-16,"[Christian Bale, Michael Caine, Gary Oldman]"
4,49529,John Carter,2012-02-05,2012-03-07,2012-04-07,"[Taylor Kitsch, Lynn Collins, Samantha Morton]"


In [14]:
def returnStringDate(date):
    """Format ``date`` as a ``YYYY-MM-DD`` string using its ``strftime`` method."""
    day_format = '%Y-%m-%d'
    return date.strftime(day_format)
# Drop rows with any missing value, then turn the three date columns into
# plain strings.
merged = merged.dropna()
for date_column in ('start_date', 'release_date', 'end_date'):
    merged[date_column] = merged[date_column].apply(returnStringDate)

In [15]:
# Write one JSON record per movie to the intermediates folder.
# Forward slashes are used because they resolve on both Windows and POSIX; the
# previous backslash path only pointed into the subdirectory on Windows.
# NOTE(review): to_json returns None when a path is given, so ``export`` holds
# no data; the name is kept only in case later cells reference it.
export = merged.to_json(path_or_buf='./intermediates/actors.json', orient='records')