In [3]:
import random

import nltk
import numpy as np
import pandas as pd
import sklearn
from nltk.corpus import names

### Training a Naive Bayes Classifier to classify gender of names

In [4]:
def gender_features(word):
    """Build the feature dict the gender classifier is trained and queried on.

    Args:
        word: The name to featurize (a string).

    Returns:
        A dict with two entries: 'suffix1' holds the last character of
        ``word`` and 'suffix2' holds its last two characters. Slices are used
        rather than indexing, so a string shorter than the suffix length
        simply yields the whole string (an empty string yields '').
    """
    last_char = word[-1:]
    last_two_chars = word[-2:]
    return {'suffix1': last_char, 'suffix2': last_two_chars}

In [5]:
# Label every name in the NLTK `names` corpus with the gender of the word list
# it comes from ('male.txt' / 'female.txt'), then shuffle the combined list.
SHUFFLE_SEED = 42  # fixed so a fresh Restart & Run All reproduces the same ordering

labeled_names = (
    [(name, 'male') for name in names.words('male.txt')]
    + [(name, 'female') for name in names.words('female.txt')]
)
# A dedicated Random instance keeps the shuffle reproducible without
# re-seeding the module-level generator.
random.Random(SHUFFLE_SEED).shuffle(labeled_names)

In [6]:
# Pair each name's suffix features with its gender label, then fit NLTK's
# Naive Bayes classifier on the result.
# NOTE(review): every labeled name is used for training here; no held-out
# split is kept, so accuracy is not measured in this cell.
featuresets = [
    (gender_features(first_name), label)
    for first_name, label in labeled_names
]

classifier = nltk.NaiveBayesClassifier.train(featuresets)

In [7]:
# Sanity check: classify a single example name with the trained classifier.
# NOTE(review): the classifier was trained on every name in the corpus, so
# confirm 'Trinity' is not already in the training data before calling it "new".
classifier.classify(gender_features('Trinity'))

'female'

### Use Classifier to Classify Names in Credits Dataset

In [8]:
# Load the credits table and preview its first rows.
# NOTE(review): the 'Unnamed: 0' column in the preview looks like an index that
# was written out with the CSV -- confirm before dropping it or using index_col=0.
CREDITS_CSV = 'data/credits.csv'
credits = pd.read_csv(CREDITS_CSV)
credits.head(n=5)

Unnamed: 0,person_id,id,name,character,role
0,3748,tm84618,Robert De Niro,Travis Bickle,ACTOR
1,14658,tm84618,Jodie Foster,Iris Steensma,ACTOR
2,7064,tm84618,Albert Brooks,Tom,ACTOR
3,3739,tm84618,Harvey Keitel,Matthew 'Sport' Higgins,ACTOR
4,48933,tm84618,Cybill Shepherd,Betsy,ACTOR


In [9]:
# Predict a gender label for each credited person from their first name.
# Each distinct first name is classified once and the result is mapped back
# onto the rows, so repeated names do not trigger repeated classifier calls.
# Rows whose name is missing or blank get NaN instead of raising.
credit_first_names = credits['name'].str.split().str[0]
credit_gender_lookup = {
    first_name: classifier.classify(gender_features(first_name))
    for first_name in credit_first_names.dropna().unique()
}
credits['gender'] = credit_first_names.map(credit_gender_lookup)

In [10]:
# Spot-check the first ten predictions. Labels are derived from name suffixes
# only, so expect some misclassified rows.
credits.head(10)

Unnamed: 0,person_id,id,name,character,role,gender
0,3748,tm84618,Robert De Niro,Travis Bickle,ACTOR,male
1,14658,tm84618,Jodie Foster,Iris Steensma,ACTOR,female
2,7064,tm84618,Albert Brooks,Tom,ACTOR,male
3,3739,tm84618,Harvey Keitel,Matthew 'Sport' Higgins,ACTOR,male
4,48933,tm84618,Cybill Shepherd,Betsy,ACTOR,male
5,32267,tm84618,Peter Boyle,Wizard,ACTOR,male
6,519612,tm84618,Leonard Harris,Senator Charles Palantine,ACTOR,male
7,29068,tm84618,Diahnne Abbott,Concession Girl,ACTOR,female
8,519613,tm84618,Gino Ardito,Policeman at Rally,ACTOR,male
9,3308,tm84618,Martin Scorsese,Passenger Watching Silhouette,ACTOR,male


### Use Classifier on titles_and_credits

In [11]:
# Load the merged titles-and-credits table and preview its first rows.
# NOTE(review): the 'Unnamed: 0*' columns in the preview look like indexes
# carried over from earlier CSV round-trips -- confirm before dropping them.
TITLES_AND_CREDITS_CSV = 'data/titles_and_credits.csv'
titles_and_credits = pd.read_csv(TITLES_AND_CREDITS_CSV)
titles_and_credits.head(n=5)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,id,title,type,description,release_year,age_certification,runtime,genres,...,seasons,imdb_id,imdb_score,imdb_votes,tmdb_popularity,tmdb_score,person_id,name,character,role
0,0,1,tm84618,Taxi Driver,MOVIE,A mentally unstable Vietnam War veteran works ...,1976,R,114,"['drama', 'crime']",...,,tt0075314,8.2,808582.0,40.965,8.179,3748,Robert De Niro,Travis Bickle,ACTOR
1,1,1,tm84618,Taxi Driver,MOVIE,A mentally unstable Vietnam War veteran works ...,1976,R,114,"['drama', 'crime']",...,,tt0075314,8.2,808582.0,40.965,8.179,14658,Jodie Foster,Iris Steensma,ACTOR
2,2,1,tm84618,Taxi Driver,MOVIE,A mentally unstable Vietnam War veteran works ...,1976,R,114,"['drama', 'crime']",...,,tt0075314,8.2,808582.0,40.965,8.179,7064,Albert Brooks,Tom,ACTOR
3,3,1,tm84618,Taxi Driver,MOVIE,A mentally unstable Vietnam War veteran works ...,1976,R,114,"['drama', 'crime']",...,,tt0075314,8.2,808582.0,40.965,8.179,3739,Harvey Keitel,Matthew 'Sport' Higgins,ACTOR
4,4,1,tm84618,Taxi Driver,MOVIE,A mentally unstable Vietnam War veteran works ...,1976,R,114,"['drama', 'crime']",...,,tt0075314,8.2,808582.0,40.965,8.179,48933,Cybill Shepherd,Betsy,ACTOR


In [12]:
# Predict a gender label for each row of the merged table from the person's
# first name. Each distinct first name is classified once and mapped back onto
# the rows; rows whose name is missing or blank get NaN instead of raising.
title_first_names = titles_and_credits['name'].str.split().str[0]
title_gender_lookup = {
    first_name: classifier.classify(gender_features(first_name))
    for first_name in title_first_names.dropna().unique()
}
titles_and_credits['gender'] = title_first_names.map(title_gender_lookup)

In [13]:
# Spot-check the first ten rows with the new `gender` column. Labels are
# derived from name suffixes only, so expect some misclassified rows.
titles_and_credits.head(10)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,id,title,type,description,release_year,age_certification,runtime,genres,...,imdb_id,imdb_score,imdb_votes,tmdb_popularity,tmdb_score,person_id,name,character,role,gender
0,0,1,tm84618,Taxi Driver,MOVIE,A mentally unstable Vietnam War veteran works ...,1976,R,114,"['drama', 'crime']",...,tt0075314,8.2,808582.0,40.965,8.179,3748,Robert De Niro,Travis Bickle,ACTOR,male
1,1,1,tm84618,Taxi Driver,MOVIE,A mentally unstable Vietnam War veteran works ...,1976,R,114,"['drama', 'crime']",...,tt0075314,8.2,808582.0,40.965,8.179,14658,Jodie Foster,Iris Steensma,ACTOR,female
2,2,1,tm84618,Taxi Driver,MOVIE,A mentally unstable Vietnam War veteran works ...,1976,R,114,"['drama', 'crime']",...,tt0075314,8.2,808582.0,40.965,8.179,7064,Albert Brooks,Tom,ACTOR,male
3,3,1,tm84618,Taxi Driver,MOVIE,A mentally unstable Vietnam War veteran works ...,1976,R,114,"['drama', 'crime']",...,tt0075314,8.2,808582.0,40.965,8.179,3739,Harvey Keitel,Matthew 'Sport' Higgins,ACTOR,male
4,4,1,tm84618,Taxi Driver,MOVIE,A mentally unstable Vietnam War veteran works ...,1976,R,114,"['drama', 'crime']",...,tt0075314,8.2,808582.0,40.965,8.179,48933,Cybill Shepherd,Betsy,ACTOR,male
5,5,1,tm84618,Taxi Driver,MOVIE,A mentally unstable Vietnam War veteran works ...,1976,R,114,"['drama', 'crime']",...,tt0075314,8.2,808582.0,40.965,8.179,32267,Peter Boyle,Wizard,ACTOR,male
6,6,1,tm84618,Taxi Driver,MOVIE,A mentally unstable Vietnam War veteran works ...,1976,R,114,"['drama', 'crime']",...,tt0075314,8.2,808582.0,40.965,8.179,519612,Leonard Harris,Senator Charles Palantine,ACTOR,male
7,7,1,tm84618,Taxi Driver,MOVIE,A mentally unstable Vietnam War veteran works ...,1976,R,114,"['drama', 'crime']",...,tt0075314,8.2,808582.0,40.965,8.179,29068,Diahnne Abbott,Concession Girl,ACTOR,female
8,8,1,tm84618,Taxi Driver,MOVIE,A mentally unstable Vietnam War veteran works ...,1976,R,114,"['drama', 'crime']",...,tt0075314,8.2,808582.0,40.965,8.179,519613,Gino Ardito,Policeman at Rally,ACTOR,male
9,9,1,tm84618,Taxi Driver,MOVIE,A mentally unstable Vietnam War veteran works ...,1976,R,114,"['drama', 'crime']",...,tt0075314,8.2,808582.0,40.965,8.179,3308,Martin Scorsese,Passenger Watching Silhouette,ACTOR,male


### Gender Proportion vs Popularity of Titles

In [14]:
# Group by title and compute the proportion of female names: one-hot encode gender, then average the male and female indicators within each title.
# For each title, create a separate observation for every genre that is listed.
# Draw a scatterplot of % female vs. popularity.

In [15]:
# Count the non-null gender labels for each `id` (the title identifier in the
# preview rows), as a first step toward a per-title gender proportion.
titles_and_credits.groupby('id')['gender'].count()

id
tm1000037    14
tm1000147    10
tm100015     25
tm1000185    21
tm100027     11
             ..
ts97584       2
ts9794        1
ts98252      11
ts98316       1
ts987         4
Name: gender, Length: 5208, dtype: int64