In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import names
import sklearn
import seaborn as sns

### Training a Naive Bayes Classifier to classify gender of names

In [2]:
def gender_features(word):
    return {'suffix1': word[-1:],
            'suffix2': word[-2:]}

In [3]:
import random
labeled_names = ([(name, 'male') for name in names.words('male.txt')]+
             [(name, 'female') for name in names.words('female.txt')])
random.shuffle(labeled_names)

In [4]:
featuresets = [(gender_features(n), gender) 
               for (n, gender) in labeled_names]

#featuresets is already shuffled, len = 7944
train_set, test_set = featuresets[500:], featuresets[:500]

classifier = nltk.NaiveBayesClassifier.train(train_set)

In [5]:
#test on a new name
print(classifier.classify(gender_features('Trinity')))

#calculate accuracy on test set
print(nltk.classify.accuracy(classifier, test_set))

female
0.808


### Use Classifier to Classify Names in Credits Dataset

In [6]:
credits = pd.read_csv('data/credits.csv')
credits.head()

Unnamed: 0,person_id,id,name,character,role
0,3748,tm84618,Robert De Niro,Travis Bickle,ACTOR
1,14658,tm84618,Jodie Foster,Iris Steensma,ACTOR
2,7064,tm84618,Albert Brooks,Tom,ACTOR
3,3739,tm84618,Harvey Keitel,Matthew 'Sport' Higgins,ACTOR
4,48933,tm84618,Cybill Shepherd,Betsy,ACTOR


In [7]:
# credits['gender'] = classifier.classify(gender_features(credits['name'].split()[0]))
credits['gender'] = credits['name'].apply(lambda x : classifier.classify(gender_features(x.split()[0])))

In [8]:
credits.head(10)

Unnamed: 0,person_id,id,name,character,role,gender
0,3748,tm84618,Robert De Niro,Travis Bickle,ACTOR,male
1,14658,tm84618,Jodie Foster,Iris Steensma,ACTOR,female
2,7064,tm84618,Albert Brooks,Tom,ACTOR,male
3,3739,tm84618,Harvey Keitel,Matthew 'Sport' Higgins,ACTOR,male
4,48933,tm84618,Cybill Shepherd,Betsy,ACTOR,male
5,32267,tm84618,Peter Boyle,Wizard,ACTOR,male
6,519612,tm84618,Leonard Harris,Senator Charles Palantine,ACTOR,male
7,29068,tm84618,Diahnne Abbott,Concession Girl,ACTOR,female
8,519613,tm84618,Gino Ardito,Policeman at Rally,ACTOR,male
9,3308,tm84618,Martin Scorsese,Passenger Watching Silhouette,ACTOR,male


### Use Classifier on titles_and_credits

In [9]:
titles_and_credits = pd.read_csv('data/titles_and_credits.csv')
titles_and_credits.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,id,title,type,description,release_year,age_certification,runtime,genres,...,seasons,imdb_id,imdb_score,imdb_votes,tmdb_popularity,tmdb_score,person_id,name,character,role
0,0,1,tm84618,Taxi Driver,MOVIE,A mentally unstable Vietnam War veteran works ...,1976,R,114,"['drama', 'crime']",...,,tt0075314,8.2,808582.0,40.965,8.179,3748,Robert De Niro,Travis Bickle,ACTOR
1,1,1,tm84618,Taxi Driver,MOVIE,A mentally unstable Vietnam War veteran works ...,1976,R,114,"['drama', 'crime']",...,,tt0075314,8.2,808582.0,40.965,8.179,14658,Jodie Foster,Iris Steensma,ACTOR
2,2,1,tm84618,Taxi Driver,MOVIE,A mentally unstable Vietnam War veteran works ...,1976,R,114,"['drama', 'crime']",...,,tt0075314,8.2,808582.0,40.965,8.179,7064,Albert Brooks,Tom,ACTOR
3,3,1,tm84618,Taxi Driver,MOVIE,A mentally unstable Vietnam War veteran works ...,1976,R,114,"['drama', 'crime']",...,,tt0075314,8.2,808582.0,40.965,8.179,3739,Harvey Keitel,Matthew 'Sport' Higgins,ACTOR
4,4,1,tm84618,Taxi Driver,MOVIE,A mentally unstable Vietnam War veteran works ...,1976,R,114,"['drama', 'crime']",...,,tt0075314,8.2,808582.0,40.965,8.179,48933,Cybill Shepherd,Betsy,ACTOR


In [10]:
titles_and_credits['gender'] = titles_and_credits['name'].apply(
    lambda x : classifier.classify(gender_features(x.split()[0])))

In [11]:
titles_and_credits.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,id,title,type,description,release_year,age_certification,runtime,genres,...,imdb_id,imdb_score,imdb_votes,tmdb_popularity,tmdb_score,person_id,name,character,role,gender
0,0,1,tm84618,Taxi Driver,MOVIE,A mentally unstable Vietnam War veteran works ...,1976,R,114,"['drama', 'crime']",...,tt0075314,8.2,808582.0,40.965,8.179,3748,Robert De Niro,Travis Bickle,ACTOR,male
1,1,1,tm84618,Taxi Driver,MOVIE,A mentally unstable Vietnam War veteran works ...,1976,R,114,"['drama', 'crime']",...,tt0075314,8.2,808582.0,40.965,8.179,14658,Jodie Foster,Iris Steensma,ACTOR,female
2,2,1,tm84618,Taxi Driver,MOVIE,A mentally unstable Vietnam War veteran works ...,1976,R,114,"['drama', 'crime']",...,tt0075314,8.2,808582.0,40.965,8.179,7064,Albert Brooks,Tom,ACTOR,male
3,3,1,tm84618,Taxi Driver,MOVIE,A mentally unstable Vietnam War veteran works ...,1976,R,114,"['drama', 'crime']",...,tt0075314,8.2,808582.0,40.965,8.179,3739,Harvey Keitel,Matthew 'Sport' Higgins,ACTOR,male
4,4,1,tm84618,Taxi Driver,MOVIE,A mentally unstable Vietnam War veteran works ...,1976,R,114,"['drama', 'crime']",...,tt0075314,8.2,808582.0,40.965,8.179,48933,Cybill Shepherd,Betsy,ACTOR,male


### Gender Proportion vs Popularity of Titles

In [12]:
#groupby title and calculate proportion of female names - by doing onehotencoding of gender, then take avg of male and female
#for each title, create a separate observation for every genre that is listed
#scatterplot of %female vs popularity

In [13]:
def prop_female(x):
    gender_count = x['gender'].value_counts()
    if (len(gender_count) == 2): #if there are females (all observations either have both M/F or only M names)
        return gender_count[1] / (gender_count[0] + gender_count[1])
    else: 
        return 0

df_prop_female = titles_and_credits.groupby('id').apply(prop_female).to_frame(name = 'prop_female')

In [14]:
titles = pd.read_csv('data/titles_clean.csv')

top_genres_mask = titles['genres'].apply(lambda x: ('comedy' in x)  or ('drama' in x) or ('documentation' in x) or ('thriller' in x) or ('action' in x))
titles = titles[top_genres_mask]

print(titles.shape)
titles.head()

(2497, 16)


Unnamed: 0.1,Unnamed: 0,id,title,type,description,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_id,imdb_score,imdb_votes,tmdb_popularity,tmdb_score
0,0,ts300399,Five Came Back: The Reference Films,SHOW,This collection includes 12 World War II-era p...,1945,TV-MA,51,['documentation'],['US'],1.0,,,,0.6,
1,1,tm84618,Taxi Driver,MOVIE,A mentally unstable Vietnam War veteran works ...,1976,R,114,"['drama', 'crime']",['US'],,tt0075314,8.2,808582.0,40.965,8.179
2,2,tm154986,Deliverance,MOVIE,Intent on seeing the Cahulawassee River before...,1972,R,109,"['drama', 'action', 'thriller', 'european']",['US'],,tt0068473,7.7,107673.0,10.01,7.3
3,3,tm127384,Monty Python and the Holy Grail,MOVIE,"King Arthur, accompanied by his squire, recrui...",1975,PG,91,"['fantasy', 'action', 'comedy']",['GB'],,tt0071853,8.2,534486.0,15.461,7.811
4,4,tm120801,The Dirty Dozen,MOVIE,12 American military prisoners in World War II...,1967,,150,"['war', 'action']","['GB', 'US']",,tt0061578,7.7,72662.0,20.398,7.6


In [15]:
titles = pd.merge(titles, df_prop_female, on = 'id')
print(titles.shape)
titles.head()

(2393, 17)


Unnamed: 0.1,Unnamed: 0,id,title,type,description,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_id,imdb_score,imdb_votes,tmdb_popularity,tmdb_score,prop_female
0,1,tm84618,Taxi Driver,MOVIE,A mentally unstable Vietnam War veteran works ...,1976,R,114,"['drama', 'crime']",['US'],,tt0075314,8.2,808582.0,40.965,8.179,0.243243
1,2,tm154986,Deliverance,MOVIE,Intent on seeing the Cahulawassee River before...,1972,R,109,"['drama', 'action', 'thriller', 'european']",['US'],,tt0068473,7.7,107673.0,10.01,7.3,0.26087
2,3,tm127384,Monty Python and the Holy Grail,MOVIE,"King Arthur, accompanied by his squire, recrui...",1975,PG,91,"['fantasy', 'action', 'comedy']",['GB'],,tt0071853,8.2,534486.0,15.461,7.811,0.479167
3,4,tm120801,The Dirty Dozen,MOVIE,12 American military prisoners in World War II...,1967,,150,"['war', 'action']","['GB', 'US']",,tt0061578,7.7,72662.0,20.398,7.6,0.236364
4,5,ts22164,Monty Python's Flying Circus,SHOW,A British sketch comedy series with the shows ...,1969,TV-14,30,"['comedy', 'european']",['GB'],4.0,tt0063929,8.8,73424.0,17.617,8.306,0.0


Making a dataframe for titles that include each of the 5 genres, and calculating descriptive stats for their prop_female and tmdb_popularity columns.

In [16]:
 # df of shows or movies that included 'comedy' as a genre label
includes_comedy_mask = titles['genres'].apply(lambda x: ('comedy' in x))
includes_comedy = titles[includes_comedy_mask]
print(includes_comedy.shape)
# print(includes_comedy.columns.values)
# print(includes_comedy[['tmdb_popularity', 'prop_female']])
includes_comedy.head()

includes_comedy[['tmdb_popularity', 'prop_female']].describe()
includes_comedy[['tmdb_popularity', 'prop_female']].median()


# df of shows or movies that included 'drama' as a genre label
includes_drama_mask = titles['genres'].apply(lambda x: ('drama' in x))
includes_drama = titles[includes_drama_mask] 
print(includes_drama.shape)
# print(includes_drama.columns.values)
# print(includes_drama[['tmdb_popularity', 'prop_female']])
includes_drama.head()

includes_drama[['tmdb_popularity', 'prop_female']].describe()
includes_drama[['tmdb_popularity', 'prop_female']].median()


# df of shows or movies that included 'documentation' as a genre label
includes_documentation_mask = titles['genres'].apply(lambda x: ('documentation' in x))
includes_documentation = titles[includes_documentation_mask]
print(includes_documentation.shape)
# print(includes_documentation.columns.values)
# print(includes_documentation[['tmdb_popularity', 'prop_female']])
includes_documentation.head()

includes_documentation[['tmdb_popularity', 'prop_female']].describe()
includes_documentation[['tmdb_popularity', 'prop_female']].median()


# df of shows or movies that included 'thriller' as a genre label
includes_thriller_mask = titles['genres'].apply(lambda x: ('thriller' in x))
includes_thriller = titles[includes_thriller_mask]
print(includes_thriller.shape)
# print(includes_thriller.columns.values)
# print(includes_thriller[['tmdb_popularity', 'prop_female']])
includes_thriller.head()

includes_thriller[['tmdb_popularity', 'prop_female']].describe()
includes_thriller[['tmdb_popularity', 'prop_female']].median()


# df of shows or movies that included 'action' as a genre label
includes_action_mask = titles['genres'].apply(lambda x: ('action' in x))
includes_action = titles[includes_action_mask]
print(includes_action.shape)
# print(includes_action.columns.values)
# print(includes_action[['tmdb_popularity', 'prop_female']])
includes_action.head()

includes_action[['tmdb_popularity', 'prop_female']].describe()
includes_action[['tmdb_popularity', 'prop_female']].median()

(1090, 17)
(1031, 17)
(593, 17)
(513, 17)
(491, 17)


tmdb_popularity    21.871000
prop_female         0.333333
dtype: float64

In [17]:
# includes_comedy.to_csv('data/includes_comedy.csv')
# includes_drama.to_csv('data/includes_drama.csv')
# includes_documentation.to_csv('data/includes_documentation.csv')
# includes_thriller.to_csv('data/includes_thriller.csv')
# includes_action.to_csv('data/includes_action.csv')