In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib
from matplotlib import pyplot as plt
import nltk

# Set working directory.
# The project folder can be overridden through the BECHDEL_DATA_DIR
# environment variable so the notebook is not tied to one machine; when the
# variable is unset the original hard-coded location is used.
os.chdir(os.environ.get("BECHDEL_DATA_DIR", "/Users/Sam Edds/Desktop/Stats_551"))

In [None]:
#Pull down all movies from the Bechdel Test API

import json
import requests
url = "http://bechdeltest.com/api/v1/getAllMovies"
# Bound the wait with a timeout so the cell cannot hang forever, and stop on
# an HTTP error status instead of parsing an error page as movie data.
r = requests.get(url, timeout=60)
r.raise_for_status()
all_movies = r.json()

# Put into pandas df and output text file

df_movies = pd.DataFrame(all_movies)
df_movies.to_csv("all_bechdel.txt")

In [2]:
# Load the merged file supplied by Katherine, then list its columns,
# non-null counts and dtypes
merged_csv = "bechdel_full.csv"
bechdel_full = pd.read_csv(merged_csv)
bechdel_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8071 entries, 0 to 8070
Data columns (total 13 columns):
tconst            8071 non-null object
V1                8071 non-null int64
id                8071 non-null int64
imdbid            8071 non-null object
rating            8071 non-null int64
title             8071 non-null object
year              8071 non-null int64
primaryTitle      8071 non-null object
isAdult           8071 non-null int64
startYear         8071 non-null object
runtimeMinutes    8071 non-null object
genres            8071 non-null object
titleType         8071 non-null object
dtypes: int64(5), object(8)
memory usage: 819.8+ KB


In [3]:
# Remove anything that is not movies (7,521 total obs).
# .copy() makes the filtered frame independent of the frame it was cut from,
# so adding or overwriting columns on it in later cells does not raise
# SettingWithCopyWarning.
bechdel_full = bechdel_full[bechdel_full['titleType'] == 'movie'].copy()
len(bechdel_full)

7521

In [4]:
# Show every column when a frame is displayed (no column truncation)
pd.set_option('display.max_columns', None)

# Inspect the filtered data set for missingness and variable types
bechdel_full.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7521 entries, 21 to 8070
Data columns (total 13 columns):
tconst            7521 non-null object
V1                7521 non-null int64
id                7521 non-null int64
imdbid            7521 non-null object
rating            7521 non-null int64
title             7521 non-null object
year              7521 non-null int64
primaryTitle      7521 non-null object
isAdult           7521 non-null int64
startYear         7521 non-null object
runtimeMinutes    7521 non-null object
genres            7521 non-null object
titleType         7521 non-null object
dtypes: int64(5), object(8)
memory usage: 822.6+ KB


In [5]:
# Flag whether `title` and `primaryTitle` agree; the flag is stored as the
# strings "True" / "False"
titles_agree = bechdel_full['title'] == bechdel_full['primaryTitle']
bechdel_full['title_match'] = titles_agree.astype('str')
# Show the rows where the two titles differ
bechdel_full[bechdel_full['title_match'] == "False"]



Unnamed: 0,tconst,V1,id,imdbid,rating,title,year,primaryTitle,isAdult,startYear,runtimeMinutes,genres,titleType,title_match
21,tt0000574,44,1349,tt0000574,1,"Story of the Kelly Gang, The",1906,The Story of the Kelly Gang,0,1906,70,"Biography,Crime,Drama",movie,False
28,tt0004972,64,1258,tt0004972,2,"Birth of a Nation, The",1915,The Birth of a Nation,0,1915,195,"Drama,History,War",movie,False
30,tt0006864,68,5944,tt0006864,0,Intolerance: Love&#39;s Struggle Throughout th...,1916,Intolerance: Love's Struggle Throughout the Ages,0,1916,163,"Drama,History",movie,False
33,tt0008443,72,1994,tt0008443,3,"Poor Little Rich Girl, The",1917,The Poor Little Rich Girl,0,1917,65,"Comedy,Drama,Family",movie,False
41,tt0009968,80,2588,tt0009968,0,Broken Blossoms or The Yellow Man and the Girl,1919,Broken Blossoms,0,1919,90,"Drama,Romance",movie,False
42,tt0010281,77,7108,tt0010281,3,Ich mochte kein Mann sein,1918,I Don't Want to Be a Man,0,1918,45,"Comedy,Romance",movie,False
43,tt0010323,81,1306,tt0010323,0,"Cabinet of Dr. Caligari, The",1920,The Cabinet of Dr. Caligari,0,1920,76,"Fantasy,Horror,Mystery",movie,False
47,tt0011979,86,4204,tt0011979,3,"Blot, The",1921,The Blot,0,1921,91,Drama,movie,False
48,tt0012349,84,3945,tt0012349,0,"Kid, The",1921,The Kid,0,1921,68,"Comedy,Drama,Family",movie,False
49,tt0012364,85,5898,tt0012364,2,K&ouml;rkarlen (The Phantom Carriage),1921,The Phantom Carriage,0,1921,100,"Drama,Fantasy,Horror",movie,False


In [6]:
# Summarize 'isAdult'.
# Only the final expression of a cell is rendered by the notebook, so the
# two inspection results are printed explicitly; as bare expressions in the
# middle of the cell they were computed and then discarded.
print(bechdel_full.groupby('isAdult').agg(['count']))
print(bechdel_full[bechdel_full['isAdult'] == 1])
# Remove from dataset; .copy() keeps later column assignments on the
# filtered frame from warning about writing to a slice.
bechdel_full = bechdel_full[bechdel_full['isAdult'] != 1].copy()

In [7]:
# Cast both year columns to strings so they can be compared directly
for year_col in ('startYear', 'year'):
    bechdel_full[year_col] = bechdel_full[year_col].astype('str')

# List the rows where startYear and year disagree
bechdel_full.loc[bechdel_full['year'].ne(bechdel_full['startYear'])]

Unnamed: 0,tconst,V1,id,imdbid,rating,title,year,primaryTitle,isAdult,startYear,runtimeMinutes,genres,titleType,title_match
126,tt0020642,193,1327,tt0020642,3,Anna Christie,1931,Anna Christie,0,1930,85,Drama,movie,True
130,tt0021015,159,7287,tt0021015,3,Juno and the Paycock [The Shame of Mary Boyle],1929,Juno and the Paycock,0,1930,85,"Comedy,Drama",movie,False
327,tt0032339,368,1975,tt0032339,0,A Chump at Oxford,1940,A Chump at Oxford,0,1939,62,Comedy,movie,True
388,tt0035279,716,5938,tt0035279,1,Saboteur,1956,Saboteur,0,1942,109,"Thriller,War",movie,True
406,tt0036244,440,6310,tt0036244,1,"Ox-Bow Incident, The",1943,The Ox-Bow Incident,0,1942,75,"Drama,Western",movie,False
434,tt0037365,477,6848,tt0037365,2,Thin Man Goes Home,1945,The Thin Man Goes Home,0,1944,100,"Comedy,Crime,Mystery",movie,False
519,tt0041088,542,4944,tt0041088,2,Act of Violence,1948,Act of Violence,0,1949,82,"Drama,Film-Noir,Thriller",movie,True
542,tt0042369,591,5350,tt0042369,1,D.O.A.,1950,D.O.A.,0,1949,83,"Drama,Film-Noir,Mystery",movie,True
566,tt0043769,623,1243,tt0043769,2,"Magic Box, The",1952,The Magic Box,0,1951,118,"Biography,Drama",movie,False
630,tt0046969,687,3158,tt0046969,3,"Fast and the Furious, The",1955,The Fast and the Furious,0,1954,73,"Crime,Drama,Mystery",movie,False


In [8]:
# Remove a few with missing genre (stored as the literal string '\N')
bechdel_full = bechdel_full[bechdel_full['genres'] != '\\N']

# Remove dupes: keep the first row for each imdbid. .copy() detaches the
# result so the rating assignment below writes to this frame without a
# SettingWithCopyWarning.
bechdel_full = bechdel_full.drop_duplicates(['imdbid'], keep = 'first').copy()

# Make sure 'Terms of Endearment' is a 3.
# The film is selected by its imdbid (the same key used for the check
# displayed below) rather than by title, so another film that happens to
# share the title cannot have its rating overwritten.
is_terms_of_endearment = bechdel_full['imdbid'] == 'tt0086425'
bechdel_full.loc[is_terms_of_endearment, 'rating'] = 3
bechdel_full[is_terms_of_endearment]


Unnamed: 0,tconst,V1,id,imdbid,rating,title,year,primaryTitle,isAdult,startYear,runtimeMinutes,genres,titleType,title_match
1641,tt0086425,1670,4448,tt0086425,3,Terms of Endearment,1983,Terms of Endearment,0,1983,132,"Comedy,Drama",movie,True


In [9]:
# Parse genre into a wide dataframe of variables

# Renumber the rows 0..n-1 first; the previous index is kept as an 'index'
# column (drop=False is the default, written out here for clarity)
bechdel_full = bechdel_full.reset_index(drop=False)

# Tokenize genres and build one count column per token
def tokenize(df):
    """Build a wide table of per-row token counts from ``df['genres']``.

    Each genre string is split with ``nltk.word_tokenize`` and the resulting
    tokens are tallied per row. Every distinct token returned by the
    tokenizer becomes a column of the result; a token that does not occur in
    a given row is recorded as 0 for that row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``genres`` column of strings.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``df`` (in the same order, with a fresh default
        index) and one column per distinct token.
    """
    per_movie_counts = []
    for genre_string in df['genres'].tolist():
        # Fresh tally for this movie
        counts = {}
        for token in nltk.word_tokenize(genre_string):
            counts[token] = counts.get(token, 0) + 1
        per_movie_counts.append(counts)

    # Tokens absent from a movie come out of the constructor as NaN; make them 0
    return pd.DataFrame(per_movie_counts).fillna(0)

# Build the wide genre-count table for the cleaned movie frame
genre_df = tokenize(df=bechdel_full)


In [17]:
# Attach the genre columns to the movie rows, matching on the row index
# (a left join, which is DataFrame.join's default)
bechdel_full1 = bechdel_full.join(genre_df, how='left')
# Check number of obs
print(bechdel_full1.shape[0])

7510


In [18]:
# Add decade variable: the first three characters of startYear followed by '0'
decade = bechdel_full1['startYear'].str[:3] + '0'
# Pool the '1900', '1910' and '1920' decades into a single '1900-1920' label;
# every other decade keeps its own label
early_decades = ['1900', '1910', '1920']
bechdel_full1['decade'] = np.where(decade.isin(early_decades), '1900-1920', decade)

In [19]:
# Rename the awkwardly named ',' column (the comma token that comes out of
# tokenizing the comma-separated genre strings) to 'n_genre'.
# The column is selected by label because its position depends on how the
# installed pandas version orders columns built from a list of dicts; if no
# ',' column exists, fall back to the original positional choice (15).
comma_col = ',' if ',' in bechdel_full1.columns else bechdel_full1.columns[15]
bechdel_full1 = bechdel_full1.rename(columns = {comma_col : 'n_genre'})

In [20]:
# Drop the variables that are not used in the rest of the notebook
unused_columns = [
    'tconst', 'V1', 'id', 'title', 'year',
    'isAdult', 'genres', 'titleType', 'n_genre',
    'title_match', 'News',
]
bechdel_full1 = bechdel_full1.drop(unused_columns, axis = 1)

In [21]:
# Keep only rows with a start year (a missing one is stored as the literal
# string '\N'), then rename the column to 'year'
has_start_year = bechdel_full1['startYear'] != '\\N'
bechdel_full1 = bechdel_full1[has_start_year].rename(columns = {'startYear' : 'year'})

In [22]:
# Binary pass / not-pass indicators: 'pass' is 1 when rating equals 3,
# 'notpass' is its complement
rating_is_3 = bechdel_full1['rating'] == 3
bechdel_full1['pass'] = np.where(rating_is_3, 1, 0)
bechdel_full1['notpass'] = np.where(rating_is_3, 0, 1)

In [23]:
# Clean up runtimeMinutes: drop rows where it is the literal string '\N',
# then convert the remaining values to integers
has_runtime = bechdel_full1['runtimeMinutes'].ne('\\N')
bechdel_full1 = bechdel_full1[has_runtime]
bechdel_full1['runtimeMinutes'] = bechdel_full1['runtimeMinutes'].astype(int)

In [22]:
# Write the cleaned data set to disk (the row index is written as well)
cleaned_csv = "bechdel_cleaned.csv"
bechdel_full1.to_csv(cleaned_csv)

In [23]:
# Want genre proportions information

# Columns left out of the totals; what remains are the genre count columns
# plus 'pass' and 'notpass'
excluded_cols = ['index','imdbid', 'year', 'runtimeMinutes', 'decade',
                 'rating', 'primaryTitle']

# All movies: one row per remaining variable after transposing, summed across movies
bechdel_full2 = bechdel_full1.drop(excluded_cols, axis = 1).transpose()
total = bechdel_full2.sum(axis = 1).to_frame()

# Same totals restricted to the movies with pass == 1
passed = bechdel_full1[bechdel_full1['pass'] == 1].drop(excluded_cols, axis = 1).transpose()
total_passed = passed.sum(axis = 1).to_frame()

# Put the two totals side by side (both columns are labelled 0) and output for Olivia
passed_data = pd.concat([total, total_passed], axis = 1)
passed_data.to_csv("genre_passed.csv")
