In [10]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import names
import sklearn
import seaborn as sns

### Training a Naive Bayes Classifier to classify gender of names

In [11]:
def gender_features(word):
    return {'suffix1': word[-1:],
            'suffix2': word[-2:]}

In [12]:
import random
labeled_names = ([(name, 'male') for name in names.words('male.txt')]+
             [(name, 'female') for name in names.words('female.txt')])
random.shuffle(labeled_names)

LookupError: 
**********************************************************************
  Resource [93mnames[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('names')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/names[0m

  Searched in:
    - 'C:\\Users\\judyr/nltk_data'
    - 'C:\\Users\\judyr\\anaconda3\\nltk_data'
    - 'C:\\Users\\judyr\\anaconda3\\share\\nltk_data'
    - 'C:\\Users\\judyr\\anaconda3\\lib\\nltk_data'
    - 'C:\\Users\\judyr\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


In [None]:
featuresets = [(gender_features(n), gender) 
               for (n, gender) in labeled_names]

classifier = nltk.NaiveBayesClassifier.train(featuresets)

In [None]:
#test on a new name
classifier.classify(gender_features('Trinity'))

### Use Classifier to Classify Names in Credits Dataset

In [None]:
credits = pd.read_csv('data/credits.csv')
credits.head()

In [None]:
# credits['gender'] = classifier.classify(gender_features(credits['name'].split()[0]))
credits['gender'] = credits['name'].apply(lambda x : classifier.classify(gender_features(x.split()[0])))

In [None]:
credits.head(10)

### Use Classifier on titles_and_credits

In [None]:
titles_and_credits = pd.read_csv('data/titles_and_credits.csv')
titles_and_credits.head()

In [None]:
titles_and_credits['gender'] = titles_and_credits['name'].apply(
    lambda x : classifier.classify(gender_features(x.split()[0])))

In [None]:
titles_and_credits.head()

### Gender Proportion vs Popularity of Titles

In [None]:
#groupby title and calculate proportion of female names - by doing onehotencoding of gender, then take avg of male and female
#for each title, create a separate observation for every genre that is listed
#scatterplot of %female vs popularity

In [None]:
def prop_female(x):
    gender_count = x['gender'].value_counts()
    if (len(gender_count) == 2): #if there are females (all observations either have both M/F or only M names)
        return gender_count[1] / (gender_count[0] + gender_count[1])
    else: 
        return 0

df_prop_female = titles_and_credits.groupby('id').apply(prop_female).to_frame(name = 'prop_female')

In [None]:
titles = pd.read_csv('data/titles_clean.csv')

top_genres_mask = titles['genres'].apply(lambda x: ('drama' in x)  or ('comedy' in x) or ('thriller' in x) or ('action' in x) or ('romance' in x) or ('documentation' in x) or ('crime' in x))
titles = titles[top_genres_mask]

print(titles.shape)
titles.head()

In [None]:
titles = pd.merge(titles, df_prop_female, on = 'id')
print(titles.shape)
titles.head()

Making a dataframe for titles that include each of the 7 genres, and calculating descriptive stats for their prop_female and imdb_popularity columns.

In [None]:
# df of shows or movies that included 'drama' as a genre label
includes_drama_mask = titles['genres'].apply(lambda x: ('drama' in x))
includes_drama = titles[includes_drama_mask] 
print(includes_drama.shape)
# print(includes_drama.columns.values)
# print(includes_drama[['tmdb_popularity', 'prop_female']])
includes_drama.head()

includes_drama[['tmdb_popularity', 'prop_female']].describe()
includes_drama[['tmdb_popularity', 'prop_female']].median()


 # df of shows or movies that included 'comedy' as a genre label
includes_comedy_mask = titles['genres'].apply(lambda x: ('comedy' in x))
includes_comedy = titles[includes_comedy_mask]
print(includes_comedy.shape)
# print(includes_comedy.columns.values)
# print(includes_comedy[['tmdb_popularity', 'prop_female']])
includes_comedy.head()

includes_comedy[['tmdb_popularity', 'prop_female']].describe()
includes_comedy[['tmdb_popularity', 'prop_female']].median()


# df of shows or movies that included 'thriller' as a genre label
includes_thriller_mask = titles['genres'].apply(lambda x: ('thriller' in x))
includes_thriller = titles[includes_thriller_mask]
print(includes_thriller.shape)
# print(includes_thriller.columns.values)
# print(includes_thriller[['tmdb_popularity', 'prop_female']])
includes_thriller.head()

includes_thriller[['tmdb_popularity', 'prop_female']].describe()
includes_thriller[['tmdb_popularity', 'prop_female']].median()


# df of shows or movies that included 'action' as a genre label
includes_action_mask = titles['genres'].apply(lambda x: ('action' in x))
includes_action = titles[includes_action_mask]
print(includes_action.shape)
# print(includes_action.columns.values)
# print(includes_action[['tmdb_popularity', 'prop_female']])
includes_action.head()

includes_action[['tmdb_popularity', 'prop_female']].describe()
includes_action[['tmdb_popularity', 'prop_female']].median()


# df of shows or movies that included 'romance' as a genre label
includes_romance_mask = titles['genres'].apply(lambda x: ('romance' in x))
includes_romance = titles[includes_romance_mask]
print(includes_romance.shape)
# print(includes_romance.columns.values)
# print(includes_romance[['tmdb_popularity', 'prop_female']])
includes_romance.head()

includes_romance[['tmdb_popularity', 'prop_female']].describe()
includes_romance[['tmdb_popularity', 'prop_female']].median()


# df of shows or movies that included 'documentation' as a genre label
includes_documentation_mask = titles['genres'].apply(lambda x: ('documentation' in x))
includes_documentation = titles[includes_documentation_mask]
print(includes_documentation.shape)
# print(includes_documentation.columns.values)
# print(includes_documentation[['tmdb_popularity', 'prop_female']])
includes_documentation.head()

includes_documentation[['tmdb_popularity', 'prop_female']].describe()
includes_documentation[['tmdb_popularity', 'prop_female']].median()


# df of shows or movies that included 'crime' as a genre label
includes_crime_mask = titles['genres'].apply(lambda x: ('crime' in x))
includes_crime = titles[includes_crime_mask]
print(includes_crime.shape)
# print(includes_crime.columns.values)
# print(includes_crime[['tmdb_popularity', 'prop_female']])
includes_crime.head()

includes_crime[['tmdb_popularity', 'prop_female']].describe()
includes_crime[['tmdb_popularity', 'prop_female']].median()

In [None]:
sns.catplot(data = includes_drama, x = "gender", y = "tmdb_popularity", kind = "bar")
sns.catplot(data = includes_comedy, x = "gender", y = "tmdb_popularity", kind = "bar")
sns.catplot(data = includes_thriller, x = "gender", y = "tmdb_popularity", kind = "bar")
sns.catplot(data = includes_action, x = "gender", y = "tmdb_popularity", kind = "bar")
sns.catplot(data = includes_romance, x = "gender", y = "tmdb_popularity", kind = "bar")
sns.catplot(data = includes_documentation, x = "gender", y = "tmdb_popularity", kind = "bar")
sns.catplot(data = includes_crime, x = "gender", y = "tmdb_popularity", kind = "bar")