# Data Preprocessing

In [1]:
import pandas as pd
import numpy as np
import requests
import os

In [2]:
# Download the FiveThirtyEight Bechdel dataset and keep only the analysis columns
def get_five_thirty_eight_bechdel_data():
    """Fetch the FiveThirtyEight Bechdel movies CSV and return selected columns.

    The CSV is read over HTTP from the FiveThirtyEight GitHub repository on
    every call.

    Returns:
        A DataFrame restricted to the columns named in ``wanted``.
    """
    source_url = "https://raw.githubusercontent.com/fivethirtyeight/data/master/bechdel/movies.csv"
    wanted = [
        'year', 'imdb', 'title', 'clean_test', 'binary',
        'budget_2013$', 'domgross_2013$', 'intgross_2013$',
    ]
    movies = pd.read_csv(source_url)
    return movies[wanted]

In [3]:
# Print correlation and covariance matrices
def print_corr_cov_matrices(data):
    """Print the correlation matrix, then the covariance matrix, of ``data``.

    Only numeric and boolean columns take part in the computation. The frame
    is narrowed explicitly because recent pandas releases raise on text
    columns (such as titles) instead of silently skipping them.

    Args:
        data: DataFrame to summarise; it may contain non-numeric columns.

    Returns:
        None. Both matrices are written to standard output.
    """
    numeric = data.select_dtypes(include=["number", "bool"])
    print(numeric.corr())
    print(numeric.cov())

In [4]:
# Check for duplicates: True if duplicates were found, False if all data is unique
def check_duplicates(data, column='imdb'):
    """Report whether ``column`` of ``data`` contains any repeated value.

    Args:
        data: DataFrame to inspect. The check runs on this argument, not on
            any module-level frame.
        column: Label of the column to test for uniqueness; defaults to the
            ``'imdb'`` identifier column.

    Returns:
        True if at least one value occurs more than once, False otherwise
        (including for an empty column).

    Raises:
        KeyError: If ``column`` is not present in ``data``.
    """
    return not data[column].is_unique

In [5]:
# Append a new column at the right-hand edge of the dataframe
def add_column(df, col_name, new_list):
    """Insert ``new_list`` as the last column of ``df`` and return ``df``.

    The frame is modified in place; the returned object is the same frame
    that was passed in.

    Args:
        df: DataFrame to extend.
        col_name: Label for the new column; it is converted with ``str``.
        new_list: Values for the new column.

    Returns:
        The same DataFrame, now carrying the extra column.
    """
    label = str(col_name)
    end_position = df.shape[1]
    df.insert(end_position, label, new_list)
    return df

In [6]:
# Populate data
data_file_name = "bechdel_analysis_data.csv"
data_file_path = "./bechdel_analysis_data.csv"

# See if data has already been populated in the current directory
if os.path.isfile(data_file_path):
    select_columns = pd.read_csv(data_file_path)
    print(select_columns)
else:
    select_columns = get_five_thirty_eight_bechdel_data()

    # Gather TMDB data - setup
    # NOTE(review): an API key is committed to this file. Prefer supplying it
    # through the TMDB_API_KEY environment variable; the literal is kept only
    # as a fallback so existing runs keep working, and should be rotated.
    tmdb_api_key = os.environ.get("TMDB_API_KEY", "55d7071c3daf17bcf8cc0f4a6f688a24")
    movie_ids = select_columns['imdb']

    # Initialize empty arrays; an entry stays None when its lookup fails
    adult = [None] * len(movie_ids)
    genres = [None] * len(movie_ids)
    prod_comps = [None] * len(movie_ids)
    overview = [None] * len(movie_ids)
    tagline = [None] * len(movie_ids)

    # Query the API for data, one request per movie id
    for i, movie_id in enumerate(movie_ids):
        try:
            reply = requests.get(
                "https://api.themoviedb.org/3/movie/" + str(movie_id),
                params={"api_key": str(tmdb_api_key), "language": "en-US"},
                timeout=10,  # without a timeout a stalled connection blocks forever
            )
            # Surface HTTP errors directly instead of as a KeyError further down
            reply.raise_for_status()
            response = reply.json()
            adult[i] = response['adult']
            genres[i] = ','.join([genre['name'] for genre in response['genres']])
            prod_comps[i] = ','.join([company['name'] for company in response['production_companies']])
            overview[i] = response['overview']
            tagline[i] = response['tagline']
        except (requests.RequestException, KeyError, TypeError, ValueError) as err:
            # Best effort: report the failure and move on to the next movie
            print(f"Unexpected {err=}, {type(err)=}")
            continue

    # Add desired columns to select_columns
    desired_cols = {'adult': adult, 'genres': genres, 'prod_comps': prod_comps, 'overview': overview, 'tagline': tagline}
    for key, values in desired_cols.items():
        select_columns = add_column(select_columns, key, values)

    # Save to csv so we don't have to requery the API every time; write to the
    # same path that the existence check above reads from
    select_columns.to_csv(data_file_path, encoding='utf-8', index=False)

      year       imdb                                    title clean_test  \
0     2013  tt1711425                            21 &amp; Over     notalk   
1     2012  tt1343727                                 Dredd 3D         ok   
2     2013  tt2024544                         12 Years a Slave     notalk   
3     2013  tt1272878                                   2 Guns     notalk   
4     2013  tt0453562                                       42        men   
...    ...        ...                                      ...        ...   
1771  1971  tt0067741                                    Shaft     notalk   
1772  1971  tt0067800                               Straw Dogs     notalk   
1773  1971  tt0067116                    The French Connection     notalk   
1774  1971  tt0067992  Willy Wonka &amp; the Chocolate Factory        men   
1775  1970  tt0065466           Beyond the Valley of the Dolls         ok   

     binary  budget_2013$  domgross_2013$  intgross_2013$  adult  \
0      

In [7]:
# Check for null values: return the names of the columns that contain at least one null
def null_values_in_columns(data):
    """List the columns of ``data`` that contain at least one null value.

    Args:
        data: DataFrame to inspect.

    Returns:
        A list of column labels, in the frame's column order, for which any
        entry is null. The list is empty when no column has nulls.
    """
    # Iterate (label, flag) pairs so the lookup never mixes positional and
    # label-based indexing, which misreports frames with integer labels.
    has_null = data.isnull().any()
    return [label for label, flag in has_null.items() if flag]

# Record which columns of the populated frame contain nulls, then display them
cols_with_nulls = null_values_in_columns(select_columns)
print(cols_with_nulls)

['adult', 'genres', 'prod_comps', 'overview', 'tagline']


In [8]:
# Row filter: keep only the rows that have a value in the given column
def get_non_null_rows(df, col_with_nulls):
    """Return the rows of ``df`` whose ``col_with_nulls`` entry is not null.

    The frame passed in is left untouched; a filtered frame is returned.

    Args:
        df: DataFrame to filter.
        col_with_nulls: Label of the column whose nulls decide which rows go.

    Returns:
        A DataFrame containing only the rows with a non-null value in that
        column, with their original index labels.
    """
    has_value = df[col_with_nulls].notnull()
    return df[has_value]

In [9]:
# Discard every row that is missing a value in any of the null-bearing columns
select_columns = select_columns.dropna(subset=cols_with_nulls)

In [10]:
# Verify the cleanup: list columns still holding nulls, show per-column null
# counts, then show the cleaned frame (a blank line separates each section)
print(null_values_in_columns(select_columns), end="\n\n")
print(select_columns.isnull().sum(), end="\n\n")
print(select_columns)

[]

year              0
imdb              0
title             0
clean_test        0
binary            0
budget_2013$      0
domgross_2013$    0
intgross_2013$    0
adult             0
genres            0
prod_comps        0
overview          0
tagline           0
dtype: int64

      year       imdb                                    title clean_test  \
0     2013  tt1711425                            21 &amp; Over     notalk   
1     2012  tt1343727                                 Dredd 3D         ok   
2     2013  tt2024544                         12 Years a Slave     notalk   
3     2013  tt1272878                                   2 Guns     notalk   
4     2013  tt0453562                                       42        men   
...    ...        ...                                      ...        ...   
1771  1971  tt0067741                                    Shaft     notalk   
1772  1971  tt0067800                               Straw Dogs     notalk   
1773  1971  tt0067116        