## Data Collection

This notebook is set up to collect data from an unofficial IMDb API (theimdbapi.org) as well as the MovieDB API (api.themoviedb.org) based on a list of Hollywood abusers that I have manually collected.

The broad steps are as follows:
0. Create CSV of abusers and all of the films they have been involved in with columns:
    * Name
    * Role (only looking at "Actor", "Actress", "Director", "Producer", "Writer")
    * IMDb ID
    * Title
    * Type (only "Film" is included)
    * URL
    * Year
0. Create CSV of all films touched by abusers and main cast and crew with columns:
    * MovieDB ID
    * Title
    * Name
    * Role (MovieDB API provides more information than IMDb unofficial API)
    * Gender
0. Create CSV of each person and all their projects:
    * IMDb Person ID
    * Name
    * IMDb Movie ID
    * Title
    * Type (only "Film" is included)
    * URL
    * Year

## 1. Abuser Data

In [1]:
# Set up Python

import pandas as pd
import numpy as np
import csv
import json
import urllib
import requests
import re

In [None]:
# Import CSV of abusers

abusers = pd.read_csv('../data/hollywood_abusers_list.csv')
abusers.head()

In [None]:
# Create list of abusers from 'name' column in `abusers`

abuser_names = abusers['name'].tolist()
# debugging: print(abuser_names)

# Create a dictionary of abusers from 'name' and 'imdb_id' in `abusers` when name search in API doesn't work

abuser_ids = abusers.set_index('name')['imdb_id'].to_dict()
# debugging: abuser_ids

# Create a list of roles for this dataset

roles = ['actor', 'actress', 'director', 'producer', 'writer']

# debugging: print(roles)

In [None]:
# Loop through `abuser_ids` to access unofficial IMDb API and store in dictionary

imdb_person_id = "http://www.theimdbapi.org/api/person?"
imdb_person_name = "http://www.theimdbapi.org/api/find/person?"

abuser_urls = {}
for a in abuser_names:
    result_encoded = imdb_person_name + urllib.parse.urlencode({'name': a})
    result = result_encoded.replace('%2B', '+')
    
    # Create `abuser_urls` dictionary
    
    abuser = a
    urls = result
    abuser_urls[abuser] = urls
    
# debugging: print(abuser_urls)

In [None]:
# Create dataframe with placeholder data that will be filtered out later

placeholder_abusers = {'imdb_id': 'tt0000000',
               'title': 'test',
               'type': 'test',
               'url': 'http://www.test.com/',
               'year': '2017',
               'name': 'test',
               'role': 'test'}

df_abusers = pd.DataFrame(placeholder_abusers, index = [0])

# debugging: df_abusers

#### Note: cell below takes a long time to run as it needs to spit out probably 100k rows.

In [None]:
# Loop through `abuser_urls` dictionary and `roles` list to append API data to `df_abusers`

for abuser, url in abuser_urls.items():
    abuser_json = requests.get(url).json()
    for r in roles:
        try:
            df_abusers_temp = pd.DataFrame(abuser_json[0]['filmography'][r]).assign(name = lambda x: abuser, role = lambda x: r) 
            df_abusers = df_abusers.append(df_abusers_temp, ignore_index = True)
        except KeyError:
            pass

df_abusers.info()

In [None]:
# Create new dataframe from `df_abusers` where type is "Film" (also deletes test row)

df_abusers_clean = df_abusers[df_abusers.type == 'Film']

# Drop duplicate rows in case there are any

df_abusers_clean.drop_duplicates(['imdb_id', 'name', 'role'], keep = 'first')

# Drop rows where year is NaN as those films are on hold or have not yet been released

df_abusers_clean.dropna(subset=['year'])

# Remove extraneous characters from 'year'

df_abusers_clean['year'] = df_abusers_clean['year'].astype(str).apply(lambda x: x[:4])

# Turn empty columns that aren't NaN into NaN and then drop rows where 'year' is NaN

df_abusers_clean = df_abusers_clean.apply(lambda x: x.str.strip()).replace('', np.nan)
df_abusers_clean = df_abusers_clean[df_abusers_clean.year != 'nan']
df_abusers_clean = df_abusers_clean.dropna(subset = ['year'])

df_abusers_clean.info()

In [None]:
# Create new CSV from `df_abusers` dataframe

df_abusers_clean.to_csv('../data/abusers_filmography.csv', index = False)

Step 1 complete!

## 2. Film Data

In [None]:
# Import CSV of abusers' filmography

df_filmography = pd.read_csv('../data/abusers_filmography.csv')
df_filmography.head()

In [None]:
# Drop all columns except for 'imdb_id'

df_filmography.drop(['name', 'role', 'title', 'type', 'url', 'year'], axis = 1)

# Drop duplicate films

df_filmography.drop_duplicates(keep = 'first', inplace = True)

In [None]:
# Create list of film IDs from 'imdb_id' column in `df_films`

film_imdb_ids = df_filmography['imdb_id'].tolist()
film_imdb_ids

In [None]:
# Loop through `film_imdb_ids` to create dictionary for corresponding MovieDB API URLs

moviedb_id = "https://api.themoviedb.org/3/movie/"
moviedb_key = '21acf1b7274aa72fe97cd9acc7eb382a'

film_imdb_moviedb_ids = {}
for f in film_imdb_ids:
    result = moviedb_id + f + "?" + urllib.parse.urlencode({'api_key': moviedb_key}) + "&external_source=imdb_id"
    
    # Create `film_imdb_moviedb_ids` dictionary
    
    imdb = f
    moviedb = result
    film_imdb_moviedb_ids[imdb] = moviedb

film_imdb_moviedb_ids

#### Cells below strictly for CSV export to create IMDb to MovieDB lookup table

In [None]:
# Loop through `film_imdb_moviedb_ids` in the MovieDB API to access corresponding MovieDB ID for IMDb ID lookup

film_imdb_moviedb_id_lookup = {}
for i, m in film_imdb_moviedb_ids.items():
    moviedb_json = requests.get(m).json()
    
    # Create `film_moviedb_ids` dictionary
    
    try:
        moviedb_id = moviedb_json['id']
        film_imdb_moviedb_id_lookup[i] = moviedb_id
    except KeyError:
        pass

film_imdb_moviedb_id_lookup

In [None]:
# Convert `film_imdb_moviedb_id_lookup` dictionary to dataframe

df_film_imdb_moviedb_id_lookup = pd.DataFrame(film_imdb_moviedb_id_lookup, index = [0])
df_film_imdb_moviedb_id_lookup = df_film_imdb_moviedb_id_lookup.transpose().reset_index()
df_film_imdb_moviedb_id_lookup.columns = ['imdb_id', 'moviedb_id']

# Create new CSV

df_film_imdb_moviedb_id_lookup.to_csv('../data/imdb_moviedb_id_lookup.csv', index = False)

Return to creating dataframes

#### Note: cell below takes a long time to run.

In [None]:
# Loop through `film_imdb_moviedb_ids` in the MovieDB API to access corresponding IDs and Title

film_moviedb_ids = {}
for i, m in film_imdb_moviedb_ids.items():
    moviedb_json = requests.get(m).json()
    
    # Create `film_moviedb_ids` dictionary
    
    try:
        ids = moviedb_json['id']
        titles = moviedb_json['title']
        film_moviedb_ids[ids] = titles
    except KeyError:
        pass

# debugging: film_moviedb_ids

In [None]:
# Create dataframe with placeholder data that will be filtered out later

placeholder_credits = {'moviedb_id': '0000',
                       'title': 'test',
                       'name': 'test',
                       'gender': '0',
                       'role': 'test'}
    
df_credits = pd.DataFrame(placeholder_credits, index = [0])

# debugging: df_credits

In [None]:
# Loop through `film_moviedb_ids` to pull full credits for each film

for id, title in film_moviedb_ids.items():
    credits_url = moviedb_id + str(id) + "/credits?" + urllib.parse.urlencode({'api_key': moviedb_key})
    credits_json = requests.get(credits_url).json()
    
    # Parse JSON and set to DataFrame
    
    try:
        cast = pd.DataFrame(credits_json['cast']).assign(moviedb_id = lambda x: id,\
                                                         title = lambda x: title, role = lambda x: 'Actor')
        df_credits = df_credits.append(cast, ignore_index = True)
        crew = pd.DataFrame(credits_json['crew']).assign(moviedb_id = lambda x: id,\
                                                         title = lambda x: title)
        crew = crew.rename(columns = {'job': 'role'})
        df_credits = df_credits.append(crew, ignore_index = True)
    except KeyError:
        pass

df_credits.info()

In [None]:
# Create new dataframe from `df_credits` to only keep rows with relevant credits

df_credits_clean = df_credits[(df_credits.role == 'Actor') | (df_credits.role == 'Director') |\
                             (df_credits.role == 'Executive Producer') | (df_credits.role == 'Producer') |\
                             (df_credits.role == 'Screenplay')]

# Drop extraneous columns

df_credits_clean = df_credits_clean.drop(['cast_id', 'character', 'credit_id',\
                                        'department', 'order', 'profile_path'], axis = 1)

# Rename 'id' to 'person_id'

df_credits_clean = df_credits_clean.rename(columns = {'id': 'person_id'})

# Drop duplicate rows in case there are any

df_credits_clean.drop_duplicates(['moviedb_id', 'person_id', 'role' ], keep = 'first')

# Convert 'person_id' to int64

df_credits_clean['person_id'] = df_credits_clean['person_id'].astype('int64')

# Reorder columns

df_credits_clean = df_credits_clean[['moviedb_id', 'title', 'person_id', 'name', 'role', 'gender']]

df_credits_clean.info()

In [None]:
# Create new CSV from `df_credits` dataframe

df_credits_clean.to_csv('../data/abusers_film_credits.csv', index = False)

Step 2 complete!

## 3. Person Table

In [34]:
# Import CSV of film credits

df_person_credits = pd.read_csv('../data/abusers_film_credits.csv')
df_person_credits.head()

Unnamed: 0,moviedb_id,title,person_id,name,role,gender
0,14462,The Manchurian Candidate,5292,Denzel Washington,Actor,2
1,14462,The Manchurian Candidate,5064,Meryl Streep,Actor,1
2,14462,The Manchurian Candidate,23626,Liev Schreiber,Actor,2
3,14462,The Manchurian Candidate,10127,Jon Voight,Actor,2
4,14462,The Manchurian Candidate,55314,Kimberly Elise,Actor,1


In [35]:
# Create new dataframe with all columns except 'person_id' dropped

df_person_id = df_person_credits.drop(['moviedb_id', 'title', 'name', 'role', 'gender'], axis = 1)

# Drop duplicate films

df_person_id.drop_duplicates(keep = 'first', inplace = True)

# Convert 'person_id' column to object so it's iterable

df_person_id['person_id'] = df_person_id['person_id'].astype('object')

# debugging: df_person_id

In [36]:
# Create list of person IDs from 'person_id' column in `df_person_ids`

person_id = df_person_id['person_id'].tolist()
# debugging: person_id

In [38]:
# Loop through `person_ids` to create dictionary for corresponding MovieDB API URLs

moviedb_person_id = "https://api.themoviedb.org/3/person/"
moviedb_key = '21acf1b7274aa72fe97cd9acc7eb382a'

person_moviedb_urls = {}
for p in person_id:
    result = moviedb_person_id + str(p) + "?" + urllib.parse.urlencode({'api_key': moviedb_key})
    
    # Create `person_moviedb_urls` dictionary
    
    person_id = p
    moviedb_url = result
    person_moviedb_urls[person_id] = moviedb_url

# debugging: person_moviedb_urls

In [39]:
person_moviedb_urls

{5292: 'https://api.themoviedb.org/3/person/5292?api_key=21acf1b7274aa72fe97cd9acc7eb382a',
 5064: 'https://api.themoviedb.org/3/person/5064?api_key=21acf1b7274aa72fe97cd9acc7eb382a',
 23626: 'https://api.themoviedb.org/3/person/23626?api_key=21acf1b7274aa72fe97cd9acc7eb382a',
 10127: 'https://api.themoviedb.org/3/person/10127?api_key=21acf1b7274aa72fe97cd9acc7eb382a',
 55314: 'https://api.themoviedb.org/3/person/55314?api_key=21acf1b7274aa72fe97cd9acc7eb382a',
 21657: 'https://api.themoviedb.org/3/person/21657?api_key=21acf1b7274aa72fe97cd9acc7eb382a',
 2954: 'https://api.themoviedb.org/3/person/2954?api_key=21acf1b7274aa72fe97cd9acc7eb382a',
 16358: 'https://api.themoviedb.org/3/person/16358?api_key=21acf1b7274aa72fe97cd9acc7eb382a',
 2310: 'https://api.themoviedb.org/3/person/2310?api_key=21acf1b7274aa72fe97cd9acc7eb382a',
 43366: 'https://api.themoviedb.org/3/person/43366?api_key=21acf1b7274aa72fe97cd9acc7eb382a',
 15854: 'https://api.themoviedb.org/3/person/15854?api_key=21acf1b72

#### Note: cell below takes a long time to run as it runs through API and spits out more than 10k rows.

In [41]:
# Loop through `person_moviedb_urls` in the MovieDB API to access corresponding IMDB IDs

person_imdb_ids = {}
for i, url in person_moviedb_urls.items():
    person_json = requests.get(url).json()
    
    # Create `person_imdb_ids`dictionary
    
    try:
        imdb_id = person_json['imdb_id']
    except KeyError:
        pass
    moviedb_id = i
    person_imdb_ids[moviedb_id] = imdb_id

# debugging: person_imdb_ids

KeyboardInterrupt: 

In [44]:
# Convert `person_imdb_ids` dictionary to dataframe

df_person_imdb_ids = pd.DataFrame(person_imdb_ids, index = [0])
df_person_imdb_ids_clean = df_person_imdb_ids.transpose().reset_index()
df_person_imdb_ids_clean.columns = ["moviedb_id", "imdb_id"]

# Create new CSV

df_person_imdb_ids_clean.to_csv('../data/person_api_imdb_id.csv', index = False)

#### Note to self: if kernel needs to be restarted, start here for step 3.

In [2]:
# Import `person_api_imdb_id.csv` as kernel restarts

df_person_api_imdb_id = pd.read_csv('../data/person_api_imdb_id.csv')

# Drop rows where 'imdb_id' is NaN

df_person_api_imdb_id = df_person_api_imdb_id.dropna(subset=['imdb_id'])

df_person_api_imdb_id.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11588 entries, 0 to 13859
Data columns (total 2 columns):
moviedb_url    11588 non-null object
imdb_id        11588 non-null object
dtypes: object(2)
memory usage: 271.6+ KB


In [3]:
# Create list of 'imdb_id' from `person_imdb_ids` dataframe

person_imdb_ids = df_person_api_imdb_id['imdb_id'].tolist()
# debugging: person_imdb_ids

In [9]:
# Loop through `person_imdb_ids` to access unofficial IMDb API and store in dictionary

imdb_person_id = "http://www.theimdbapi.org/api/person?"

imdb_person_url = {}
for i in person_imdb_ids:
    urls = imdb_person_id + urllib.parse.urlencode({'person_id': i})
    
    # Create `imdb_person_url` dictionary
    
    imdb_person_url[i] = urls
    
# debugging: imdb_person_url

In [7]:
# Create dataframe with placeholder data that will be filtered out later

placeholder_person = {'person_id': 'nm0000000',
                      'name': 'test',
                      'imdb_id': 'tt0000000',
                      'title': 'test', 
                      'type': 'test',
                      'url': 'http://www.test.com/',
                      'year': '2017',
                      'name': 'test',
                      'role': 'test'}
    
df_person_filmography = pd.DataFrame(placeholder_person, index = [0])
# debugging: df_person_filmography

In [11]:
# Repeat list of rows

roles = ['actor', 'actress', 'director', 'producer', 'writer']

#### Note to self: next cell takes forever to run as it needs to loop through API pages to pull the filmography data for 30-50k people who have worked with known abusers. End result is hundreds of thousands of rows large if not over one million. Run sparingly.

In [22]:
# Loop through `df_person_filmography` and `roles` list to append API data to `df_person_filmography`

for i, url in imdb_person_url.items():
    person_filmography_json = [requests.get(url).json()]
    #print(person_filmography_json)
    for r in roles:
        try:
            df_person_temp = pd.DataFrame(person_filmography_json[0]['filmography'][r]).assign(name = lambda x: person_filmography_json[0]['title'], person_id = lambda x: person_filmography_json[0]['person_id'], role = lambda x: r)
            df_person_filmography = df_person_filmography.append(df_person_temp, ignore_index = True)
        except KeyError:
            pass
        except TypeError:
            pass
        except JSONDecodeError:
            pass

df_person_filmography.info()

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

#### NOTE: CELL ABOVE TAKES FOREVER AND KEEPS BREAKING

Despite the JSONDecodeError exception, code breaks with "JSONDecodeError: Expecting value: line 1 column 1 (char 0)" and dataframe ends around 100k and only loops through 2-3k people.

#### Need to fix for final deliverable

In [23]:
df_person_filmography.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100260 entries, 0 to 100259
Data columns (total 8 columns):
imdb_id      100260 non-null object
name         100260 non-null object
person_id    100260 non-null object
role         100260 non-null object
title        100260 non-null object
type         100260 non-null object
url          100260 non-null object
year         100260 non-null object
dtypes: object(8)
memory usage: 6.1+ MB


In [24]:
df_person_filmography.head()

Unnamed: 0,imdb_id,name,person_id,role,title,type,url,year
0,tt0000000,test,nm0000000,test,test,test,http://www.test.com/,2017
1,tt0017419,Carol Dempster,nm0218781,actress,The Sorrows of Satan,Film,http://www.imdb.com/title/tt0017419/,1926
2,tt0016420,Carol Dempster,nm0218781,actress,That Royle Girl,Film,http://www.imdb.com/title/tt0016420/,1925
3,tt0016308,Carol Dempster,nm0218781,actress,Sally of the Sawdust,Film,http://www.imdb.com/title/tt0016308/,1925
4,tt0015018,Carol Dempster,nm0218781,actress,Isn't Life Wonderful,Film,http://www.imdb.com/title/tt0015018/,1924


In [25]:
# Create new dataframe from `df_person_filmography` where type is "Film" (also deletes test row)

df_person_filmography_clean = df_person_filmography[df_person_filmography.type == 'Film']

# Drop duplicate rows in case there are any

df_person_filmography_clean.drop_duplicates(['imdb_id', 'name', 'person_id', 'role'], keep = 'first')

# Drop rows where year is NaN as those films are on hold or have not yet been released

df_person_filmography_clean.dropna(subset=['year'])

# Remove extraneous characters from 'year'

df_person_filmography_clean['year'] = df_person_filmography_clean['year'].astype(str).apply(lambda x: x[:4])

# Turn empty columns that aren't NaN into NaN and then drop rows where 'year' is NaN

df_person_filmography_clean = df_person_filmography_clean.apply(lambda x: x.str.strip()).replace('', np.nan)
df_person_filmography_clean = df_person_filmography_clean[df_person_filmography_clean.year != 'nan']
df_person_filmography_clean = df_person_filmography_clean.dropna(subset = ['year'])

df_person_filmography_clean.info()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


<class 'pandas.core.frame.DataFrame'>
Int64Index: 57889 entries, 1 to 100253
Data columns (total 8 columns):
imdb_id      57889 non-null object
name         57889 non-null object
person_id    57889 non-null object
role         57889 non-null object
title        57889 non-null object
type         57889 non-null object
url          57889 non-null object
year         57889 non-null object
dtypes: object(8)
memory usage: 4.0+ MB


In [26]:
# Create new CSV from `df_person_filmography_clean` dataframe

df_person_filmography_clean.to_csv('../data/person_filmography.csv', index = False)

Step 3 complete!

## Ingore everything below

In [None]:
# EXAMPLE HERE TO DELETE

for i, u in test5.items():
    test_json = [requests.get(u).json()]
    #print(test_json)
    #name = test_json[0]['title']
    #person_id = test_json[0]['person_id']
    for r in roles:
        try:
            df_temp = pd.DataFrame(test_json[0]['filmography'][r]).assign(name = lambda x: test_json[0]['title'] , person_id = lambda x: test_json[0]['person_id'], role = lambda x: r) 
            test9 = test9.append(df_temp, ignore_index = True)
        except KeyError:
            pass
test9

In [14]:
# TEST

test3 = {53930: 'https://api.themoviedb.org/3/person/53930?api_key=21acf1b7274aa72fe97cd9acc7eb382a',
         42003: 'https://api.themoviedb.org/3/person/42003?api_key=21acf1b7274aa72fe97cd9acc7eb382a',
         62547: 'https://api.themoviedb.org/3/person/62547?api_key=21acf1b7274aa72fe97cd9acc7eb382a', 
         109: 'https://api.themoviedb.org/3/person/109?api_key=21acf1b7274aa72fe97cd9acc7eb382a' }

test4 = {}
for i, u in test3.items():
    test_json = requests.get(u).json()
    
    # dictionary
    
    imdb_id = test_json['imdb_id']
    test4[u] = imdb_id

test4

{'https://api.themoviedb.org/3/person/109?api_key=21acf1b7274aa72fe97cd9acc7eb382a': 'nm0000704',
 'https://api.themoviedb.org/3/person/42003?api_key=21acf1b7274aa72fe97cd9acc7eb382a': 'nm0834373',
 'https://api.themoviedb.org/3/person/53930?api_key=21acf1b7274aa72fe97cd9acc7eb382a': 'nm0001499',
 'https://api.themoviedb.org/3/person/62547?api_key=21acf1b7274aa72fe97cd9acc7eb382a': 'nm0860749'}

In [15]:
# TEST

imdb_person_id = "http://www.theimdbapi.org/api/person?"

test5 = {}
for u, t in test4.items():
    urls = imdb_person_id + urllib.parse.urlencode({'person_id': t})
    
    # dictionary
    
    test5[t] = urls
    
test5

{'nm0000704': 'http://www.theimdbapi.org/api/person?person_id=nm0000704',
 'nm0001499': 'http://www.theimdbapi.org/api/person?person_id=nm0001499',
 'nm0834373': 'http://www.theimdbapi.org/api/person?person_id=nm0834373',
 'nm0860749': 'http://www.theimdbapi.org/api/person?person_id=nm0860749'}

In [16]:
# TEST

test6 = {'imdb_id': 'nm0000000',
         'name': 'test',
         'actor': 'test', 
         'director': 'test', 
         'producer': 'test', 
         'writer': 'test'}
    
test7 = pd.DataFrame(test6, index = [0])

test7

test8 = {'imdb_id': 'tt0000000',
         'title': 'test',
         'type': 'test',
         'url': 'http://www.test.com/',
         'year': '2017',
         'name': 'test',
         'role': 'test'}

test9 = pd.DataFrame(test8, index = [0])

test9

Unnamed: 0,imdb_id,name,role,title,type,url,year
0,tt0000000,test,test,test,test,http://www.test.com/,2017


In [17]:
# TEST

# Repeat list of rows

roles = ['actor', 'actress', 'director', 'producer', 'writer']

for i, u in test5.items():
    test_json = [requests.get(u).json()]
    #print(test_json)
    #name = test_json[0]['title']
    #person_id = test_json[0]['person_id']
    for r in roles:
        try:
            df_temp = pd.DataFrame(test_json[0]['filmography'][r]).assign(name = lambda x: test_json[0]['title'] , person_id = lambda x: test_json[0]['person_id'], role = lambda x: r) 
            test9 = test9.append(df_temp, ignore_index = True)
        except KeyError:
            pass
test9

Unnamed: 0,imdb_id,name,person_id,role,title,type,url,year
0,tt0000000,test,,test,test,test,http://www.test.com/,2017
1,tt3514324,Tina Majorino,nm0001499,actress,ScorpionGo with the Flo(rence),TV Series,http://www.imdb.com/title/tt3514324/,2017
2,tt4265778,Tina Majorino,nm0001499,actress,If the Days of the Week Were People,Video short,http://www.imdb.com/title/tt4265778/,2014
3,tt2402137,Tina Majorino,nm0001499,actress,LegendsWilderness of MirrorsIdentityIconoclast...,TV Series,http://www.imdb.com/title/tt2402137/,2014
4,tt2771372,Tina Majorino,nm0001499,actress,Veronica Mars,Film,http://www.imdb.com/title/tt2771372/,2014
5,tt3186456,Tina Majorino,nm0001499,actress,You Me & Her,Short Film,http://www.imdb.com/title/tt3186456/,2014
6,tt0413573,Tina Majorino,nm0001499,actress,Grey's AnatomyI Want You with MeSeal Our FateP...,TV Series,http://www.imdb.com/title/tt0413573/,2012-2013
7,tt0844441,Tina Majorino,nm0001499,actress,"True BloodGone, Gone, GoneEverybody Wants to R...",TV Series,http://www.imdb.com/title/tt0844441/,2012
8,tt1717210,Tina Majorino,nm0001499,actress,Should've Been Romeo,Film,http://www.imdb.com/title/tt1717210/,2012
9,tt0460627,Tina Majorino,nm0001499,actress,BonesThe Bump in the RoadThe Male in the MailT...,TV Series,http://www.imdb.com/title/tt0460627/,2011-2012


In [18]:
test9.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 295 entries, 0 to 294
Data columns (total 8 columns):
imdb_id      295 non-null object
name         295 non-null object
person_id    294 non-null object
role         295 non-null object
title        295 non-null object
type         295 non-null object
url          295 non-null object
year         295 non-null object
dtypes: object(8)
memory usage: 18.5+ KB


In [19]:
test9['role'].value_counts()

actress     181
actor        98
producer     15
test          1
Name: role, dtype: int64

In [20]:
test9['name'].value_counts()

Elijah Wood        113
Gail Strickland     94
Susanna Thompso     53
Tina Majorino       34
test                 1
Name: name, dtype: int64

In [21]:
test9 = test9[test9.type == 'Film']
test9

Unnamed: 0,imdb_id,name,person_id,role,title,type,url,year
4,tt2771372,Tina Majorino,nm0001499,actress,Veronica Mars,Film,http://www.imdb.com/title/tt2771372/,2014
8,tt1717210,Tina Majorino,nm0001499,actress,Should've Been Romeo,Film,http://www.imdb.com/title/tt1717210/,2012
14,tt1600200,Tina Majorino,nm0001499,actress,In Security,Film,http://www.imdb.com/title/tt1600200/,2010
16,tt0384683,Tina Majorino,nm0001499,actress,What We Do Is Secret,Film,http://www.imdb.com/title/tt0384683/,2007
19,tt0403598,Tina Majorino,nm0001499,actress,Think Tank,Film,http://www.imdb.com/title/tt0403598/,2006
20,tt0780624,Tina Majorino,nm0001499,actress,Testing Bob,Film,http://www.imdb.com/title/tt0780624/,2005
22,tt0374900,Tina Majorino,nm0001499,actress,Napoleon Dynamite,Film,http://www.imdb.com/title/tt0374900/,2004
23,tt0164993,Tina Majorino,nm0001499,actress,Alice in Wonderland,Film,http://www.imdb.com/title/tt0164993/,1999
24,tt0136399,Tina Majorino,nm0001499,actress,"Merry Christmas, George Bailey",Film,http://www.imdb.com/title/tt0136399/,1997
25,tt0128072,Tina Majorino,nm0001499,actress,Before Women Had Wings,Film,http://www.imdb.com/title/tt0128072/,1997


In [None]:
tina1 = requests.get('http://www.theimdbapi.org/api/person?person_id=nm0001499').json()
tina1

In [None]:
tina2 = requests.get('http://www.theimdbapi.org/api/find/person?name=Mindy%20Kaling').json()
tina2

In [None]:
tina3 = [tina1]
tina3

In [None]:
# TEST

moviedb_imdb_id = "https://api.themoviedb.org/3/movie/"
moviedb_key = '21acf1b7274aa72fe97cd9acc7eb382a'

test_ids = ['tt0120737', # Lord of the Rings
            'tt0133093', # The Matrix
            'tt1935089'] # Blind

test_moviedb_ids = {}
for t in test_ids:
    result = moviedb_imdb_id + t + "?" + urllib.parse.urlencode({'api_key': moviedb_key}) + "&external_source=imdb_id"
    
    imdb = t
    moviedb = result
    test_moviedb_ids[imdb] = moviedb

test_moviedb_ids

In [None]:
# TEST

test1_ids = {}
for t, m in test_moviedb_ids.items():
    test1_json = requests.get(m).json()
    
    #print(test1_json)
    
    ids = test1_json['id']
    titles = test1_json['title']
    test1_ids[ids] = titles
    #test1_ids.append(test1_ids)
    
test1_ids

In [None]:
# TEST

test_credits = {'moviedb_id': '0000',
               'title': 'test',
               'name': 'test',
               'role': 'test'}
    
df_credits_test = pd.DataFrame(test_credits, index = [0])

df_credits_test

In [None]:
# TEST

df_credits_test
for i, t in test1_ids.items():
    credits_url = moviedb_imdb_id + str(i) + "/credits?" + urllib.parse.urlencode({'api_key': moviedb_key})
    credits_json = requests.get(credits_url).json()
    
    try:
        cast = pd.DataFrame(credits_json['cast']).assign(moviedb_id = lambda x: i, title = lambda x: t, role = lambda x: 'Actor')
        df_credits_test = df_credits_test.append(cast, ignore_index = True)
        crew = pd.DataFrame(credits_json['crew']).assign(moviedb_id = lambda x: i, title = lambda x: t)
        crew = crew.rename(columns = {'job': 'role'})
        df_credits_test = df_credits_test.append(crew, ignore_index = True)
    except KeyError:
        pass

df_credits_test.head()

In [None]:
# TEST

df_credits_test = df_credits_test.drop(['cast_id', 'character', 'credit_id', 'department', 'id', 'order', 'profile_path'], axis = 1)

df_credits_test.head()

In [None]:
# Movie DB call with IMDb ID 
# https://api.themoviedb.org/3/find/tt1935089?api_key=21acf1b7274aa72fe97cd9acc7eb382a&external_source=imdb_id

# Movie DB call with Movie DB ID
#https://api.themoviedb.org/3/movie/364733?api_key=21acf1b7274aa72fe97cd9acc7eb382a

# Movie DB call for cast
# https://api.themoviedb.org/3/movie/364733/credits?api_key=21acf1b7274aa72fe97cd9acc7eb382a

In [None]:
# Test

dict = {'tt1935089': 'http://www.theimdbapi.org/api/movie?movie_id=tt1935089', 
        'tt4975302': 'http://www.theimdbapi.org/api/movie?movie_id=tt4975302', 
        'tt2452352': 'http://www.theimdbapi.org/api/movie?movie_id=tt2452352'}

test_hold = {'imdb_id': 'tt0000000',
               'title': 'test',
               'url': 'http://www.test.com/',
               'year': '2017',
               'name': 'test',
               'role': 'test'}
    
df_test = pd.DataFrame(test_hold, index = [0])

df_test

In [None]:
for a, b in dict.items():
    test_json = requests.get(b).json()
    print(test_json['title'])
    print(test_json['url']['url'])
    print(test_json['director'])
    print(test_json['writers'])
    print(test_json['stars'])

# df_test.info()


#try:
#    df_temp = pd.DataFrame(abuser_json[0]['filmography'][r]).assign(name = lambda x: abuser , role = lambda x: r) 
#    df_abusers = df_abusers.append(df_temp, ignore_index = True)
#except KeyError:
#    pass


In [None]:
# Loop through `person_moviedb_urls` in the MovieDB API to access corresponding IMDB IDs

person_imdb_ids = {}
for i, u in person_moviedb_urls.items():
    person_json = requests.get(u).json
    
    # Create `person_imdb_ids` dictionary
    
    try:
        ids = i
        imdb_ids = person_json['imdb_id']
        person_imdb_ids[ids] = imdb_ids
    except KeyError:
        pass

# debugging: person_imdb_ids