In [1]:
import pandas as pd
# Columns that are forced to float while the master list is read.
float_columns = ('budget', 'revenue', 'vote_count', 'Bechdel_Test')
df = pd.read_csv(
    "BECHDEL___masterList.csv",
    dtype={column: float for column in float_columns},
)
df.head(3)

FileNotFoundError: [Errno 2] File b'BECHDEL___masterList.csv' does not exist: b'BECHDEL___masterList.csv'

In [None]:
# Remove the financial, popularity and vote columns from the working frame.
dropped_columns = ['revenue', 'popularity', 'budget', 'vote_average', 'vote_count']
df = df.drop(columns=dropped_columns)
df.head(3)

In [None]:
###### Pre-OneHot Encoding Plots ######
#######################################
import seaborn as sns
import matplotlib.pyplot as plt

# Tally the films per Bechdel outcome (0 = failed, 1 = passed).
failed_count = len(df[df['Bechdel_Test'] == 0])
passed_count = len(df[df['Bechdel_Test'] == 1])
print("Count of Failed Bechdel Tests Films: ", failed_count)
print("\nCount of Passed Bechdel Tests Films: ", passed_count)

# Bar chart comparing how many films pass the Bechdel Test versus fail it.
derp = sns.countplot(data=df, x='Bechdel_Test')

In [None]:
# Heatmap Correlation Analysis
# Pairwise correlations between runtime, the 'winner' flag and the Bechdel result.
corr_matrix = df[['runtime', 'winner', 'Bechdel_Test']].corr()
g = sns.heatmap(corr_matrix, cmap='RdYlGn', annot=True)

# Report the strongest correlate of 'winner' from the matrix that is actually
# plotted. The previous hard-coded message talked about revenue, budget,
# popularity and vote_count, none of which are part of this heatmap.
winner_corr = corr_matrix['winner'].drop('winner')
strongest = winner_corr.abs().idxmax()
print(f"The strongest correlating attribute of Winning = {strongest} "
      f"(r = {winner_corr[strongest]:.2f})")

In [None]:
# Correlation heatmap restricted to the 'winner' flag and the Bechdel result.
winner_vs_bechdel = df[['winner', 'Bechdel_Test']].corr()
g = sns.heatmap(winner_vs_bechdel, annot=True, cmap='RdYlGn')

print("There is small ~10% correlation to Winning/Passed-Bechdel")

In [None]:
# Density Curve: Bechdel Test result, split by the 'winner' flag
bTest0 = df.loc[df['winner'] == 0, 'Bechdel_Test']
bTest1 = df.loc[df['winner'] == 1, 'Bechdel_Test']

# Both curves share the same options apart from colour and legend label.
kde_options = dict(legend=True, shade=True)
g = sns.kdeplot(bTest0, color='r', label='non-nominated', **kde_options)
g = sns.kdeplot(bTest1, color='b', label='nominated', **kde_options)

print("X Axis:\n\n---0.0 = Failed Bechdel Test\n---1.0 = Passed Bechdel Test")
# TODO: change the x ticks to show Failed/Passed instead of 0.0/1.0

In [None]:
import json
import numpy as np

# Per-genre film tallies, split by Bechdel outcome (0 = failed, 1 = passed).
genres_name0 = {}
genres_name1 = {}
genres_set = set()

# Every 'genres' cell is a JSON list of objects carrying a 'name' key. For each
# genre attached to a film, bump the tally that matches the film's Bechdel_Test
# value. Iterating the two columns together avoids relying on a 0..n-1 index.
for genres_json, bechdel_result in zip(df['genres'], df['Bechdel_Test']):
    for genre in json.loads(genres_json):
        genre_name = genre['name']
        genres_set.add(genre_name)
        if bechdel_result == 0:
            genres_name0[genre_name] = genres_name0.get(genre_name, 0) + 1
        elif bechdel_result == 1:
            genres_name1[genre_name] = genres_name1.get(genre_name, 0) + 1

# Fix one genre order so both tally rows and the column labels line up, and so
# the column order is the same on every run (plain set order is not guaranteed).
genre_names = sorted(genres_set)

# Binning of Genre names: a genre never seen for an outcome counts as 0.
genres_array0 = [genres_name0.get(name, 0) for name in genre_names]
genres_array1 = [genres_name1.get(name, 0) for name in genre_names]

# Share of each genre within its outcome group, used by the stacked bar plot.
genres_all = [
    np.array(genres_array0) / sum(genres_array0),
    np.array(genres_array1) / sum(genres_array1),
]

# New DF holding the by-genre shares: row 0 = failed, row 1 = passed.
df_genres = pd.DataFrame(genres_all, columns=genre_names)
# Display the frame itself. The previous cell ended by selecting 'budget',
# 'popularity', 'revenue', ... from df_genres, but its columns are genre names,
# so that selection raised a KeyError.
df_genres

In [None]:
# Stacked Bars: Bechdel Test % / Movie Genres
N = len(df_genres.columns)
ind = np.arange(N)
width = 0.5

# Row 0 of df_genres is drawn as the bottom series, row 1 is stacked above it.
bottom_series = df_genres.loc[0].values
top_series = df_genres.loc[1].values

plt.figure(figsize=(20, 10))
p1 = plt.bar(ind, bottom_series, width, color='#d62728')
p2 = plt.bar(ind, top_series, width, bottom=bottom_series)

plt.ylabel('percentage (#genres/#movies)')
plt.title('Percentage by genres and Bechel Test pass')
plt.xticks(ind, df_genres.columns)
plt.legend((p1[0], p2[0]), ('failed-Bechdel', 'passed-Bechdel'))

In [None]:
###### One-Hot Encoding --- ML Preparation ######
#################################################

# Many columns (e.g. "crew", "cast") store their information as JSON strings, so we
# extract the useful fields from them and then perform one-hot encoding.
# This transforms the dataset into an all-numeric matrix that we can feed into
# our machine learning model.
# To look at the structure of such a column (e.g. "cast"), use: df.loc[0, 'cast']

def feature_engineering(column_name, df, json_name, min_count=2):
    """One-hot encode the values stored in a JSON-list column.

    Every cell of ``df[column_name]`` is parsed with ``json.loads`` and must
    yield a list of objects that each carry a ``json_name`` key. Each distinct
    value seen at least ``min_count`` times across the whole column becomes a
    new 0/1 indicator column.

    Args:
        column_name: name of the column in ``df`` that holds the JSON strings.
        df: dataframe to extend; it is not modified.
        json_name: key to read from every object in the parsed lists.
        min_count: minimum number of occurrences (counted over the whole
            column) a value needs in order to get its own indicator column.
            The default of 2 keeps the original "appears more than once" rule.

    Returns:
        A new dataframe: the columns of ``df`` followed by one float indicator
        column per retained value, ordered by first appearance.
    """
    # Parse each cell once and reuse the result for both counting and encoding.
    parsed_rows = [json.loads(cell) for cell in df[column_name]]

    counts = {}
    for entries in parsed_rows:
        for entry in entries:
            value = entry[json_name]
            counts[value] = counts.get(value, 0) + 1

    # Map every retained value to its column position.
    final = {}
    for value, seen in counts.items():
        if seen >= min_count:
            final[value] = len(final)

    np_item = np.zeros((len(df), len(final)))
    for row, entries in enumerate(parsed_rows):
        for entry in entries:
            position = final.get(entry[json_name])
            if position is not None:
                np_item[row][position] = 1

    # Reuse df's index so the concat below pairs the rows one-to-one. With a
    # default RangeIndex the concat would misalign (and add NaN-filled rows)
    # whenever df's index is not 0..n-1, e.g. after filtering or dropna.
    df_item = pd.DataFrame(np_item, columns=list(final.keys()), index=df.index)
    df_output = pd.concat([df, df_item], axis=1)

    return df_output

In [None]:
# DF2 expands all the json data of cast and makes new columns for each one before deleting
# the original cast column.
# One-hot encode the cast names, then discard the raw JSON 'cast' column.
df2 = feature_engineering("cast", df, "name").drop(columns='cast')
df2.head(3)

In [None]:
# DF3 expands all the json data of crew and makes new columns for each one before deleting
# the original crew column.
# Every 'crew' cell is a JSON list of objects with 'job' and 'name' keys. Only
# entries whose job is exactly 'Director' are used; collect them per film once
# so the JSON is not parsed twice.
directors_per_film = [
    [member['name'] for member in json.loads(item) if member['job'] == 'Director']
    for item in df2['crew']
]

# Number of director credits per name.
crew_name = {}
for directors in directors_per_film:
    for director in directors:
        crew_name[director] = crew_name.get(director, 0) + 1

# Column position for each director. Every counted name has at least one credit,
# so (exactly like the original `v > 0` check) all directors are kept.
final_crew = {}
for director in crew_name:
    final_crew[director] = len(final_crew)

np_crew = np.zeros((len(df2), len(final_crew)))
for row, directors in enumerate(directors_per_film):
    for director in directors:
        np_crew[row][final_crew[director]] = 1

# Share df2's index so the concat pairs the rows one-to-one.
df_crew = pd.DataFrame(np_crew, columns=list(final_crew.keys()), index=df2.index)

df3 = pd.concat([df2, df_crew], axis=1)
# Report the shape of the frame that was just built; the original printed
# df.shape, i.e. the frame from before any encoding.
print(df3.shape)
#df3.head(3)

In [None]:
# Doing the same concept but for a multitude of json filled column entries across the entire
# DataFrame. Each evolution is morphed into the next until we get to DF8 in this part.
# Each step one-hot encodes one JSON column and then removes the raw column.
df3 = df3.drop(columns=['crew'])
# genres
df4 = feature_engineering("genres", df3, "name").drop(columns=['genres'])
# keywords
df5 = feature_engineering('keywords', df4, 'name').drop(columns=['keywords'])
# production_companies
df6 = feature_engineering('production_companies', df5, 'name').drop(columns=['production_companies'])
# production_countries
df7 = feature_engineering('production_countries', df6, 'name').drop(columns=['production_countries'])
# spoken_languages (keyed by the 'iso_639_1' field rather than 'name')
df8 = feature_engineering('spoken_languages', df7, 'iso_639_1').drop(columns=['spoken_languages'])

# movieName, filmInfo, and award columns do not exist -- possible artifacts from team?

df8.head(3)

In [None]:
# DF_CLEAN is created from DF8 sans the below columns. It will then be encoded into OneHot,
# and then hopefully fit into training sets to create models.
# Columns left out of the matrix that is passed on to model training.
non_feature_columns = [
    "movie_title", "original_title", "overview", "tagline",
    "title", "original_language", "status", "release_date",
]
df_clean = df8.drop(columns=non_feature_columns)

print(df_clean.shape)
df_clean.head(3)

In [None]:
# Guard against the ValueError raised two cells further down: any row that
# contains a NaN is dropped outright.
df_clean = df_clean.dropna()

remaining_nulls = df_clean.isnull().sum().sum()
print(f"Number of null values in entire DF: {remaining_nulls}\n")

# Print the dtype of every column that is not float.
non_float_dtypes = [dtype for dtype in df_clean.dtypes if dtype != float]
for dtype in non_float_dtypes:
    print(dtype)

In [None]:
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
#?                      Big Brain Stuff Ahead                    ?#
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

# Model Training --- the prediction target is Bechdel_Test (it replaced 'winner').
# The original cell removed Bechdel_Test from X but still used 'winner' as y, so
# the label column stayed inside the feature matrix (target leakage). Features
# and label are now derived from the same target column.
target_column = 'Bechdel_Test'
X = df_clean[df_clean.columns.difference([target_column])]
# X = StandardScaler().fit_transform(X)
y = df_clean[target_column]

In [None]:
from sklearn.model_selection import train_test_split
# Shuffled 75% / 25% train/test split, stratified on y, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.75,
    test_size=0.25,
    shuffle=True,
    stratify=y,
    random_state=90001,
)

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training set only.
scaler = StandardScaler().fit(X_train)

# Apply the same fitted transform to both the training set and the test set.
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
from sklearn.decomposition import PCA
# PCA configured with n_components=0.95, fitted on the training set only.
pca = PCA(n_components=0.95)
fit = pca.fit(X_train)

# Project both splits with the components learned above.
X_train, X_test = pca.transform(X_train), pca.transform(X_test)

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with the settings below; every argument not listed here
# keeps its library default.
logisticRegr = LogisticRegression(
    penalty='elasticnet',
    l1_ratio=0,
    solver='saga',
    multi_class='ovr',
    max_iter=5000,
    random_state=288,
    n_jobs=-1,
    verbose=10,
)
logisticRegr.fit(X_train, y_train)

In [None]:
# Predict a label for every row of the test split and show the predictions
# together with the index of the rows they belong to.
predicted = logisticRegr.predict(X_test)
print(predicted)
print(y_test.index)

In [None]:
# Score of the fitted model on the training split (compare with the test score
# in the next cell to spot over-fitting).
logisticRegr.score(X_train, y_train)

In [None]:
# Score of the fitted model on the held-out test split.
logisticRegr.score(X_test, y_test)

In [None]:
from sklearn.metrics import confusion_matrix
import math
# Flatten the confusion matrix into its four cells.
tn, fp, fn, tp = confusion_matrix(y_test, predicted).ravel()

print(f"Total - Y_Test: {len(y_test)}")
print(f"Total - Predicted: {len(predicted)}\n")

print(f"True Negatives: {tn}")
print(f"False Positives: {fp}")
print(f"False Negatives: {fn}")
print(f"True Positives: {tp}\n")

# Metrics derived from the four cells; all are computed first and reported below.
sensitivity = tp/(tp+fn)
specificity = tn/(tn+fp)
precision = tp/(tp+fp)
npv = tn/(tn+fn)
miss_rate = fn/(fn+tp)
fall_out = fp/(fp+tn)
fdr = fp/(fp+tp)
fOMr = fn/(fn+tn)
threat_score = tp/(tp+fn+fp)
accuracy = (tp+tn)/(tp+tn+fp+fn)
f1_score = (2*tp)/((2*tp)+fp+fn)
mcc = ((tp*tn)-(fp*fn))/math.sqrt((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn))
informedness = (sensitivity+specificity-1)
markedness = (precision+npv-1)

# (label, value) pairs in reporting order.
metric_report = [
    ("Sensetivity / True Positive Rate", sensitivity),
    ("Specificity / True Negative Rate", specificity),
    ("Precision / Positive Predictive Value", precision),
    ("Negative Predictive Value", npv),
    ("Miss Rate / False Negative Rate", miss_rate),
    ("Fall-Out / False Positive Rate", fall_out),
    ("False Discovery Rate", fdr),
    ("False Omission Rate", fOMr),
    ("Threat Score / Critical Success Index", threat_score),
    ("Accuracy", accuracy),
    ("F1 Score / Harmonic Mean of Precision(PPV) and Sensetivity(TPR)", f1_score),
    ("Matthews Correlation Coefficient", mcc),
    ("Informedness / Bookmaker Informedness", informedness),
    ("Markedness", markedness),
]
for metric_label, metric_value in metric_report:
    print(f"{metric_label}: {metric_value}\n")

In [None]:
# 3-fold stratified cross validation
#
# Fixes relative to the previous version of this cell:
#   * it used StratifiedShuffleSplit with its default number of splits rather
#     than 3, which did not match the "3-Fold" label printed below;
#   * 'winner' was the label while also remaining among the feature columns;
#   * refitting logisticRegr in place overwrote the model trained above.
# NOTE(review): as before, the folds are fit on df_clean directly, without the
# scaling / PCA steps applied to the hold-out model -- confirm this is intended.
from sklearn.base import clone
from sklearn.model_selection import StratifiedKFold

cv_target = 'Bechdel_Test'
cv_X = df_clean[df_clean.columns.difference([cv_target])]
cv_y = df_clean[cv_target]

skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=55)

sumAccuracy = []
for train, test in skf.split(cv_X, cv_y):
    # split() yields positional indices, hence iloc. Each fold trains a fresh,
    # unfitted copy of the estimator with the same hyper-parameters.
    fold_model = clone(logisticRegr)
    fold_model.fit(cv_X.iloc[train], cv_y.iloc[train])
    sumAccuracy.append(fold_model.score(cv_X.iloc[test], cv_y.iloc[test]))
avg = np.mean(sumAccuracy)
print(f"\n\n3-Fold Cross Validation Mean Score: {avg}\n\n")