# TMDB Top 5000 movies dataset

## Goal: Analyze the careers of female film directors in contrast with male directors.

#### Some questions that can be answered:
    
    - how long until female directors get a chance to direct a 2nd movie?
    - do female directors hire a more diverse cast and crew?
    - do flops effect the careers more than they do male directors? (may be hard to answer due to small sample size)
    - do female directors direct certain genres more than others?
    - can we distinguish between movies directed by males/females?
    
#### TODO
    
    1) get recent movies from TMDB via API
    2) preprocess the crew and cast data and tag for female/male
    3) search keywords of movie df for movies directed by women
    4) preprocess movie dataframe for female/male classification and revenue analysis
    5) EDA - can we answer any of the above questions?
    6) Classifcation 

Comes from : https://www.kaggle.com/tmdb/tmdb-movie-metadata/data

In [11]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import ast

%matplotlib inline

In [52]:
credits = pd.read_csv("tmdb_5000_credits.csv",)
movies = pd.read_csv("tmdb_5000_movies.csv",parse_dates=['release_date'])

In [47]:
credits.head(2)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


#### Creatring Crew and Cast datasets

In [53]:
# function that adds in the movie_id for cast and crew df
def add_id_dicts(x,col):
    new_list = []
    for y in x[col]:
        y['movie_id'] = x['movie_id']
        new_list.append(y)
    return new_list
# function that converts string to list, adds in ids, then flattens and concatenates the cast/crew cols
def preprocess_col_dicts(df,col):
    # making so does not effect the original df
    df = df.copy()
    # convert to list structure instead of string
    df[col] = df[col].apply(lambda x: ast.literal_eval(x))
    # add in the movie id for when seperating out
    df[col] = df.apply(lambda x: add_id_dicts(x,col),axis=1)
    # flatten then concatenate
    return pd.DataFrame([item for sublist in list(df[col]) for item in sublist])

In [54]:
# create dataframes
cast = preprocess_col_dicts(credits,'cast')
crew = preprocess_col_dicts(credits,'crew')

In [55]:
crew.head()

Unnamed: 0,credit_id,department,gender,id,job,movie_id,name
0,52fe48009251416c750aca23,Editing,0,1721,Editor,19995,Stephen E. Rivkin
1,539c47ecc3a36810e3001f87,Art,2,496,Production Design,19995,Rick Carter
2,54491c89c3a3680fb4001cf7,Sound,0,900,Sound Designer,19995,Christopher Boyes
3,54491cb70e0a267480001bd0,Sound,0,900,Supervising Sound Editor,19995,Christopher Boyes
4,539c4a4cc3a36810c9002101,Production,1,1262,Casting,19995,Mali Finn


In [56]:
cast.head()

Unnamed: 0,cast_id,character,credit_id,gender,id,movie_id,name,order
0,242,Jake Sully,5602a8a7c3a3685532001c9a,2,65731,19995,Sam Worthington,0
1,3,Neytiri,52fe48009251416c750ac9cb,1,8691,19995,Zoe Saldana,1
2,25,Dr. Grace Augustine,52fe48009251416c750aca39,1,10205,19995,Sigourney Weaver,2
3,4,Col. Quaritch,52fe48009251416c750ac9cf,2,32747,19995,Stephen Lang,3
4,5,Trudy Chacon,52fe48009251416c750ac9d3,1,17647,19995,Michelle Rodriguez,4


### Tagging the Female Directors

In [58]:
#crew['job'].value_counts().head(10)
directors = crew[crew['job']=='Director'].copy()

In [59]:
directors.head()

Unnamed: 0,credit_id,department,gender,id,job,movie_id,name
6,52fe48009251416c750ac9c3,Directing,2,2710,Director,19995,James Cameron
154,52fe4232c3a36847f800b4fd,Directing,2,1704,Director,285,Gore Verbinski
186,52fe4d22c3a368484e1d8d77,Directing,2,39,Director,206647,Sam Mendes
343,52fe4781c3a36847f8139865,Directing,2,525,Director,49026,Christopher Nolan
558,52fe479ac3a36847f813ea65,Directing,2,7,Director,49529,Andrew Stanton


#### Build weak name classifier

In [63]:
import nltk

In [64]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [73]:
def pos_letter(word,x):
    return word[x]

def get_vowels(word):
    # crete target list and list of vowels
    vowels = ['a','e','i','o','u','y']    
    vowels_word = []
    
    for let in word:
        if let in vowels:
            vowels_word.append(let)
    return vowels_word#list(set(vowels_word))

pos_letter('Shrek',0)
pos_letter('Shrek',-1)

'k'

In [78]:
# from:
# http://www.nltk.org/book/ch06.html
 	
from nltk.corpus import names
labeled_names = ([(name, 'male') for name in names.words('male.txt')] +[(name, 'female') for name in names.words('female.txt')])
import random
random.shuffle(labeled_names)

# get labeled info to make df
nice_names = []
nice_labels = []
for i in range(len(labeled_names)):
    name = labeled_names[i][0]
    label = labeled_names[i][1]
    nice_names.append(name)
    nice_labels.append(label)
# make df
names_df = pd.DataFrame()
names_df['names'] = nice_names
names_df['labels'] = nice_labels
# get features
names_df['first'] = names_df['names'].apply(lambda x: pos_letter(x,0))
names_df['last'] = names_df['names'].apply(lambda x: pos_letter(x,-1))
names_df['vowels'] = names_df['names'].apply(lambda x: get_vowels(x))
names_df['n_vowels'] = names_df['vowels'].apply(len)
names_df['name_len'] = names_df['names'].apply(len)
names_df['vowel_prop'] = names_df['n_vowels']/names_df['name_len']

In [79]:
from sklearn.preprocessing import LabelBinarizer
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import AdaBoostClassifier,RandomForestClassifier
from sklearn.linear_model import LogisticRegression,LassoCV,SGDClassifier,RidgeClassifier

In [80]:
feats = ['first','last']

final_df = names_df.copy()
for col in  feats:
    temp = pd.get_dummies(names_df[col],prefix=col,drop_first=True)
    final_df = pd.concat([final_df,temp],axis=1)
    temp = None

In [113]:
x = final_df[final_df.columns[7:]]
# 1 hot encoder
lb = LabelBinarizer()
y = lb.fit_transform(names_df['labels'])

In [122]:
x_train,x_test,y_train,y_test = train_test_split(x,y,random_state=42,test_size= 0.5)
#nb = MultinomialNB()
#nb = LogisticRegression()
#nb = RidgeClassifier()
#nb = AdaBoostClassifier(n_estimators=500,learning_rate=0.9)
#nb = RandomForestClassifier(n_estimators=500,min_samples_split=20)

nb.fit(x_train,y_train.ravel())
train_preds = nb.predict(x_train)
print(classification_report(y_true=y_train,y_pred= train_preds))
preds = nb.predict(x_test)
print(classification_report(y_true=y_test,y_pred= preds))

             precision    recall  f1-score   support

          0       0.81      0.85      0.83      2510
          1       0.72      0.66      0.69      1462

avg / total       0.78      0.78      0.78      3972

             precision    recall  f1-score   support

          0       0.80      0.86      0.83      2491
          1       0.73      0.63      0.68      1481

avg / total       0.77      0.78      0.77      3972



####  Can we build a better classifier?

In [None]:
#featuresets = [(gender_features(n), gender) for (n, gender) in labeled_names]
#train_set, test_set = featuresets[500:], featuresets[:500]

#from sklearn.naive_bayes import MultinomialNB
#cf = MultinomialNB()
#cf.fit(train_set,)

#classifier = nltk.NaiveBayesClassifier.train(train_set)
#classifier.classify(gender_features('michael'))
#print(nltk.classify.accuracy(classifier, test_set))


## Preprocess Movies DF

In [127]:
def woman_director(x):
    fem_director = False
    for dic in x:
        if 'woman director' in dic.values():
            fem_director=True
    return fem_director

In [130]:
movies['keywords'] = movies['keywords'].apply(lambda x: ast.literal_eval(x))

In [131]:
movies['female_director'] = movies['keywords'].apply(lambda x: woman_director(x))
movies['female_director'].value_counts()

False    4479
True      324
Name: female_director, dtype: int64

With only 324 movies tagged as female director, we may need to apply the weak classifier on the director's names or grab more recent movie info from TMDB.

In [None]:
movies[movies['female_director']==True].head(5)

In [None]:
movies['revenue'].describe()

In [56]:
#grabbing the roles from the dictionary of crew memberts
roles = ['Director','Screenplay','Producer','Writer','Editor','Art Direction',
         'Casting','Costume Design','Executive Producer','Set Design']

for role in roles:
    
    names = []

    for i in range(len(credits['crew'])):

        result = []
        for j in range(len(credits['crew'][i])):
            if credits['crew'][i][j]['job'] == role:
                result.append(credits['crew'][i][j]['name'])

        if len(result) ==0:
            result.append('NA')

        names.append(result)
        
    
    credits[role] = names

In [52]:
#credits['Director'] = directors

In [None]:
#credits.head(2)

In [None]:
#credits['crew'][0][0:10]

#### Calculate Revenue Variables

In [63]:
movies['total_profit'] = movies['revenue'] - movies['budget']

In [67]:
movies['profit_margin'] = (((movies['revenue']-movies['budget'])/movies['revenue'])*100)

In [94]:
movies['budget'].describe()

count    3.766000e+03
mean     3.704284e+07
std      4.264651e+07
min      1.000000e+00
25%      8.000000e+06
50%      2.300000e+07
75%      5.000000e+07
max      3.800000e+08
Name: budget, dtype: float64

In [102]:
# only get movies with positive revenue?
movies = movies[movies['revenue'] > 0]
movies = movies[movies['budget'] > 1000]
# calculate roi
movies['roi'] = (movies['total_profit'] - movies['budget'])/movies['budget']

In [105]:
movies['roi'].describe()

count     3216.000000
mean         9.123721
std        239.882927
min         -1.999999
25%         -0.974031
50%          0.301077
75%          2.419082
max      12888.386667
Name: roi, dtype: float64

In [107]:
# which movie had the highest roi?
movies[movies['roi'] == movies['roi'].max()]

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,spoken_languages,status,tagline,title,vote_average,vote_count,total_profit,percent_profit,profit_margin,roi
4577,15000,"[{""id"": 27, ""name"": ""Horror""}, {""id"": 9648, ""n...",http://www.paranormalactivity-movie.com/,23827,"[{""id"": 10224, ""name"": ""haunting""}, {""id"": 147...",en,Paranormal Activity,"After a young, middle class couple moves into ...",47.456823,"[{""name"": ""Blumhouse Productions"", ""id"": 3172}...",...,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,What Happens When You Sleep?,Paranormal Activity,5.9,1316,193340800,1288939.0,99.992242,12888.386667
