# Data Preprocessing

In [1]:
import pandas as pd
import numpy as np
import requests
import os

In [2]:
# Get five-thirty-eight bechdel data
def get_five_thirty_eight_bechdel_data(source="https://raw.githubusercontent.com/fivethirtyeight/data/master/bechdel/movies.csv"):
    """Load the FiveThirtyEight Bechdel dataset and keep the columns used here.

    Parameters
    ----------
    source : str or file-like, optional
        Anything ``pandas.read_csv`` accepts (URL, local path, open buffer).
        Defaults to the ``movies.csv`` file in the FiveThirtyEight GitHub
        repository, so existing zero-argument calls behave as before.

    Returns
    -------
    pandas.DataFrame
        A frame holding only the columns ``year``, ``imdb``, ``title``,
        ``clean_test``, ``binary``, ``budget_2013$``, ``domgross_2013$`` and
        ``intgross_2013$``, in that order.

    Raises
    ------
    KeyError
        If ``source`` does not contain every one of the columns listed above.
    """
    wanted_columns = [
        'year', 'imdb', 'title', 'clean_test', 'binary',
        'budget_2013$', 'domgross_2013$', 'intgross_2013$',
    ]
    data = pd.read_csv(source)
    # Return an explicit copy so columns inserted later are added to an
    # independent frame rather than to a selection of the raw data.
    return data[wanted_columns].copy()

In [3]:
# Print correlation and covariance matrices
def print_corr_cov_matrices(data):
    """Print the correlation matrix, then the covariance matrix, of ``data``.

    Only numeric columns are considered. Text columns (titles, ids, ...) are
    dropped first because ``DataFrame.corr``/``DataFrame.cov`` raise on
    non-numeric data in pandas >= 2.0 instead of silently skipping it.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose numeric columns should be compared.

    Returns
    -------
    None
        The matrices are written to standard output.
    """
    numeric_data = data.select_dtypes(include="number")
    print(numeric_data.corr())
    print(numeric_data.cov())

In [4]:
# Check for duplicates: True if duplicates were found, False if all data is unique
def check_duplicates(data, column='imdb'):
    """Report whether ``data[column]`` contains any repeated value.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect. The argument itself is examined, not a global.
    column : str, optional
        Name of the column that should be unique. Defaults to ``'imdb'``.

    Returns
    -------
    bool
        True if at least one value occurs more than once, False if every
        value is unique (an empty column counts as unique).

    Raises
    ------
    KeyError
        If ``column`` is not present in ``data``.
    """
    # bool() turns the numpy boolean into a plain Python bool.
    return bool(data[column].duplicated().any())

In [5]:
# Add column to dataframe
def add_column(df, col_name, new_list):
    """Append ``new_list`` to ``df`` as a new right-most column.

    The column label is ``str(col_name)``. The frame is modified in place
    through ``DataFrame.insert`` and that same object is returned.
    """
    label = str(col_name)
    end_position = len(df.columns)
    df.insert(end_position, label, new_list)
    return df

In [6]:
# Returns the fraction (0.0 to 1.0) of women in the given crew role (e.g. Producer, Director), or None if nobody holds that role
def get_perc_women_role(crew_response, job_type):
    """Return the share of crew members with job ``job_type`` whose gender code is 1.

    ``crew_response`` must expose a ``'crew'`` list of dicts; each entry is
    read through its ``'job'`` key and, for matching jobs, its ``'gender'``
    key. Gender code 1 is the value this notebook counts as female.

    Returns a float between 0 and 1 (a fraction, not a 0-100 percentage), or
    None when no crew member has the requested job.
    """
    # Gender codes of everyone holding the requested job, in crew order.
    role_genders = [
        member['gender']
        for member in crew_response['crew']
        if member['job'] == job_type
    ]
    if not role_genders:
        return None
    women = sum(1 for code in role_genders if code == 1)
    return women / len(role_genders)

In [7]:
# Populate data
data_file_name = "bechdel_analysis_data.csv"
data_file_path = "./bechdel_analysis_data.csv"

# Gather TMDB data - setup
# NOTE(review): an API key is committed in this notebook. Prefer supplying it
# through the TMDB_API_KEY environment variable; the literal is kept only as a
# fallback so existing runs keep working, and should be rotated and removed.
tmdb_api_key = os.environ.get("TMDB_API_KEY", "55d7071c3daf17bcf8cc0f4a6f688a24")

if os.path.isfile(data_file_path):
    # Cache hit: the saved file already contains the TMDB-derived columns, so
    # the API is not queried again and no columns are re-inserted.
    # Note: values saved as empty strings are read back from CSV as NaN.
    select_columns = pd.read_csv(data_file_path)
else:
    select_columns = get_five_thirty_eight_bechdel_data()
    movie_ids = select_columns['imdb']

    # One slot per movie; a slot stays None when its lookup fails.
    adult = [None] * len(movie_ids)
    genres = [None] * len(movie_ids)
    prod_comps = [None] * len(movie_ids)
    overview = [None] * len(movie_ids)
    perc_women_producers = [None] * len(movie_ids)
    perc_women_directors = [None] * len(movie_ids)

    # Query the API for data
    query_params = {"api_key": str(tmdb_api_key), "language": "en-US"}
    for i, movie_id in enumerate(movie_ids):
        movie_url = "https://api.themoviedb.org/3/movie/" + str(movie_id)
        try:
            # A timeout keeps one stalled request from hanging the whole cell.
            response = requests.get(movie_url, params=query_params, timeout=30).json()
            crew_response = requests.get(movie_url + "/credits", params=query_params, timeout=30).json()
            adult[i] = response['adult']
            genres[i] = ','.join([genre['name'] for genre in response['genres']])
            prod_comps[i] = ','.join([company['name'] for company in response['production_companies']])
            overview[i] = response['overview']
            perc_women_producers[i] = get_perc_women_role(crew_response, 'Producer')
            perc_women_directors[i] = get_perc_women_role(crew_response, 'Director')
        except Exception as err:
            # Best effort: report the failure and move on, leaving the
            # remaining slots for this movie as None.
            print(f"Unexpected {err=}, {type(err)=}")

    # Add desired columns to select_columns
    desired_cols = {'adult': adult, 'genres': genres, 'prod_comps': prod_comps, 'overview': overview,
                    'perc_women_producers': perc_women_producers,
                    'perc_women_directors': perc_women_directors}
    for key, values in desired_cols.items():
        select_columns = add_column(select_columns, key, values)

    # Save to csv so we don't have to requery the API every time
    select_columns.to_csv(data_file_path, encoding='utf-8', index=False)

Unexpected err=KeyError('adult'), type(err)=<class 'KeyError'>
Unexpected err=KeyError('adult'), type(err)=<class 'KeyError'>
Unexpected err=KeyError('adult'), type(err)=<class 'KeyError'>
Unexpected err=KeyError('adult'), type(err)=<class 'KeyError'>
Unexpected err=KeyError('adult'), type(err)=<class 'KeyError'>
Unexpected err=KeyError('adult'), type(err)=<class 'KeyError'>


In [8]:
# Check for null values: return the names of the columns that contain at least one null value
def null_values_in_columns(data):
    """Return the labels of the columns in ``data`` that hold at least one null.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    list
        Column labels, in column order, for which ``isnull().any()`` is True.
        Empty when no column contains a null value.
    """
    has_null_by_column = data.isnull().any()
    # Pair labels with flags positionally. Indexing the flag Series with an
    # integer would be a label lookup, which is wrong for integer column
    # labels and deprecated as positional access in recent pandas.
    return [
        column
        for column, has_null in zip(data.columns, has_null_by_column)
        if has_null
    ]

# Names of the columns in select_columns that currently contain at least one
# null value; printed for inspection and kept for the row-removal step.
cols_with_nulls = null_values_in_columns(select_columns)
print(cols_with_nulls)

['domgross_2013$', 'intgross_2013$', 'adult', 'genres', 'prod_comps', 'overview', 'perc_women_producers', 'perc_women_directors']


In [9]:
# Non-mutating helper: returns a new dataframe without the rows where the given column is null
def get_non_null_rows(df, col_with_nulls):
    """Return the rows of ``df`` whose ``col_with_nulls`` value is not null.

    The input frame is left untouched; the filtered selection is returned.
    """
    has_value = df[col_with_nulls].notnull()
    return df[has_value]

In [10]:
# Keep only the rows that have a value in every column listed in
# cols_with_nulls (same effect as filtering on those columns one at a time).
select_columns = select_columns.dropna(subset=cols_with_nulls)

# Final expression of the cell: rendered as the cell output, not assigned.
select_columns.dropna()

Unnamed: 0,year,imdb,title,clean_test,binary,budget_2013$,domgross_2013$,intgross_2013$,adult,genres,prod_comps,overview,perc_women_producers,perc_women_directors
0,2013,tt1711425,21 &amp; Over,notalk,FAIL,13000000,25682380.0,42195766.0,False,Comedy,"Mandeville Films,Relativity Media,SkyLand Ente...",Brilliant student Jeff Chang has the most impo...,0.00,0.0
1,2012,tt1343727,Dredd 3D,ok,PASS,45658735,13611086.0,41467257.0,False,"Action,Science Fiction","Rena Film,DNA Films,Reliance Big Entertainment...","In the future, America is a dystopian wastelan...",0.00,0.0
2,2013,tt2024544,12 Years a Slave,notalk,FAIL,20000000,53107035.0,158607035.0,False,"Drama,History","New Regency Pictures,Plan B Entertainment,Rive...","In the pre-Civil War United States, Solomon No...",0.25,0.0
3,2013,tt1272878,2 Guns,notalk,FAIL,61000000,75612460.0,132493015.0,False,"Action,Comedy,Crime","Universal Pictures,Marc Platt Productions,EFO ...",A DEA agent and an undercover Naval Intelligen...,0.00,0.0
4,2013,tt0453562,42,men,FAIL,40000000,95020213.0,95020213.0,False,Drama,"Warner Bros. Pictures,Legendary Pictures","The powerful story of Jackie Robinson, the leg...",0.00,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1789,1971,tt0067741,Shaft,notalk,FAIL,305063707,404702718.0,616827003.0,False,"Action,Crime,Thriller","Shaft Productions,Metro-Goldwyn-Mayer",Cool black private eye John Shaft is hired by ...,0.00,0.0
1790,1971,tt0067800,Straw Dogs,notalk,FAIL,143862856,59412143.0,64760273.0,False,"Crime,Drama,Thriller",ABC Pictures,"David Sumner, a mild-mannered academic from th...",0.00,0.0
1791,1971,tt0067116,The French Connection,notalk,FAIL,12659931,236848653.0,236848653.0,False,"Action,Crime,Thriller","D'Antoni Productions,Schine-Moore Productions,...",Tough narcotics detective 'Popeye' Doyle is in...,0.00,0.0
1792,1971,tt0067992,Willy Wonka &amp; the Chocolate Factory,men,FAIL,17263543,23018057.0,23018057.0,False,"Family,Fantasy,Comedy",David L. Wolper Productions,When eccentric candy man Willy Wonka promises ...,0.00,0.0


In [11]:
# Sanity check after cleaning: the list of columns with nulls, the per-column
# null counts, and the frame itself, each separated by one blank line.
remaining_null_cols = null_values_in_columns(select_columns)
null_counts = select_columns.isnull().sum()
print(remaining_null_cols, null_counts, select_columns, sep="\n\n")

[]

year                    0
imdb                    0
title                   0
clean_test              0
binary                  0
budget_2013$            0
domgross_2013$          0
intgross_2013$          0
adult                   0
genres                  0
prod_comps              0
overview                0
perc_women_producers    0
perc_women_directors    0
dtype: int64

      year       imdb                                    title clean_test  \
0     2013  tt1711425                            21 &amp; Over     notalk   
1     2012  tt1343727                                 Dredd 3D         ok   
2     2013  tt2024544                         12 Years a Slave     notalk   
3     2013  tt1272878                                   2 Guns     notalk   
4     2013  tt0453562                                       42        men   
...    ...        ...                                      ...        ...   
1789  1971  tt0067741                                    Shaft     notalk   
17

In [12]:
# Write the current select_columns frame to its own CSV file (UTF-8, without
# the index column), separate from the bechdel_analysis_data.csv file.
select_columns.to_csv("bechdel_analysis_data_cleaned.csv", encoding='utf-8', index=False)