In [18]:
import random
import json
from pprint import pprint
import nltk
import re
import os
import string
from nltk.tag.stanford import StanfordPOSTagger
from nltk.tag.stanford import StanfordNERTagger
from nltk.chunk import RegexpParser
import stanfordTaggers.nerTagger.nertclient
import stanfordTaggers.posTagger.posclient

#PART 0:
##Data Cleaning


We abtained the source data from IMDb, Wikipedia (via David Bamman and other researchers, Carnegie Mellon University, and the Social Security Administration (see writeup for citations).

We first eliminated TV shows and focused on only movies.

Then we reduced the movies to ones with 10 or more IMDB reviews so that we only analyze movies with decent infleunce.

We then matched the IMDB movie titles with their wikipedia counterparts, and this turned out to be a challenge on its own. Since our IMDB dataset is larger then the wikipedia dataset, many movies have IMDB summaries but not a wikipedia summary. Our final database included movies that have IMDB summaries and/or wikipedia summaries.

Lastly, we matched our movies to their country data, and reduced our dataset to just US movies. It contains about 34,000 movies.

#PART 1: 
##Named Entity Recognition Tagging for Movie Summaries 

### import Standford POS Tagger and Stanford NER Tagger

In [23]:
#to run StanfordPostTagger and NERTagger, 
#first download these two packages from http://nlp.stanford.edu/software/CRF-NER.shtml 
#I saved the downloaded files in lib/
post = StanfordPOSTagger('stanfordTaggers/lib/stanford-postagger-2014-08-27/models/english-bidirectional-distsim.tagger', 
                         'stanfordTaggers/lib/stanford-postagger-2014-08-27/stanford-postagger.jar', 'utf-8') # doctest: +SKIP
# post.tag('What is the airspeed of an unladen swallow ?'.split()) # doctest: +SKIP
nert = StanfordNERTagger('stanfordTaggers/lib/stanford-ner-2014-08-27/classifiers/english.all.3class.distsim.crf.ser.gz',
                         'stanfordTaggers/lib/stanford-ner-2014-08-27/stanford-ner.jar', 'utf-8')

###Load in cleaned dataset of USA moviews from IMDB and Wikiperida

In [None]:
with open('data/input/usamoviedict.array.json') as sample:
    sample = json.load(sample)

###More data cleaning

In [69]:
# get rid of ([[ in the wikipidia summaries
for movie in sample:
    summaries_wiki = movie[1]['summaries_wikipedia']
    if len(summaries_wiki) > 0:
        summaries_wiki = re.sub(r'\s\(\[\[', ' ', summaries_wiki[0])
        movie[1]['summaries_wikipedia'][0] = summaries_wiki

###NER Tagging

In [71]:
#pos tagging using nltk.pos_tag
#ner tagging using stanford ner tagger
def ie_preprocess(document, lower='false', stage="pos"):
#     if stage == 'pos':
#         sentences = nltk.sent_tokenize(document)
#         sentences = [nltk.word_tokenize(sent) for sent in sentences]
#         if lower == 'false':
#             for sent in sentences:
#                 for i in range(len(sent)):
#                     if sent[i] != sent[i].lower():
#                         sent[i] = sent[i].lower()
#         sentences = [nltk.pos_tag(sent) for sent in sentences]
#         return sentences
    if stage == "ner":
        return nertclient.nertclient(document)

In [None]:
# make stanford nert more accurate by seperating the last word
# in a sentence from the ending punctuation with a space.
def insert_space_before_punct(s):
    s = re.sub(r'([.,!?;:])', r' \1', s)
    return s

In [72]:
# pos + ner tag summaries and roles
for movie in sample:
    summaries_imdb = movie[1]["summaries_imdb"]
    summaries_wiki = movie[1]['summaries_wikipedia']
    roles = movie[1]['roles']
    if len(summaries_imdb) > 0:
        # sample[i][1]["summaries_imdb_pos"] = ie_preprocess(summaries_imdb[0])
        sample[i][1]["summaries_imdb_ner"] = ie_preprocess(insert_space_before_punct(summaries_imdb[0]), stage="ner")
        
    if len(summaries_wiki) > 0:
        # sample[i][1]["summaries_wikipedia_pos"] = ie_preprocess(summaries_wiki[0])
        sample[i][1]["summaries_wikipedia_ner"] = ie_preprocess(insert_space_before_punct(summaries_wiki[0]), stage='ner')


###Output NER tagged dataset as json file usamoviedict.sample.json

In [240]:
def set_default(obj):
    if isinstance(obj, set):
        d = {}
        for i in obj:
            d[i] = 1
        return d
    raise TypeError

In [None]:
with open(os.path.join('data/output/', 'usamoviedict.array.json.complete.json'), 'w') as outfile:
    json.dump(sample, outfile, ensure_ascii=False, default=set_default)
 

#PART 2: 
##Lift out character names from summaries, and associate the character names with names from the IMDB role list.

###Load in the previously NER tagged dataset

In [3]:
with open('data/output/usamoviedict.array.json.complete.json') as tagged_data:
    sample = json.load(tagged_data)

###Create a new master dictionary called "sum_and_char" containing character names and related info.

####First, add new fields "summaries_combined" and "summaries_combined_ner" in the dataset that stores wikipedia and IMDB summaries into a single string for each movie.

In [4]:
# combine wiki_ner and imdb_nerie
for movie in sample:
    combined = []
    combined_ner = ""
    if 'summaries_wikipedia_ner' in movie[1]:
        combined.extend(movie[1]['summaries_wikipedia'] )
        combined_ner = combined_ner + " " + (movie[1]['summaries_wikipedia_ner'] )
    if 'summaries_imdb_ner' in movie[1]:
        combined.extend(movie[1]["summaries_imdb"])
        combined_ner = combined_ner + " " + (movie[1]["summaries_imdb_ner"])
    movie[1]['summaries_combined'] = combined
    movie[1]['summaries_combined_ner'] = combined_ner

####Second, run a customized regex chuncker to lift out tagged names from combined summaries

Create a json file containing a list of names for the social security dataset, this file will be used in the person_extractor_and_replacer function. The resulting name_list.json contains 138,036 unique names (93,889 first names, 58,258 last names)

In [None]:
with open('data/input/names.json') as data:
    first_names = json.load(data)
with open('data/input/surnames.json') as data2:
    last_names = json.load(data2)

name_list = []

for key, value in first_names.items():
    name_list.append(key)
for key, value in last_names.items():
    name_list.append(key)

def set_default(obj):
    if isinstance(obj, set):
        d = {}
        for i in obj:
            d[i] = 1
        return d
    raise TypeError
                    
with open(os.path.join('data/output/', "name_list.json"), 'w') as outfile:
    json.dump(set(name_list), outfile, ensure_ascii=False, default=set_default)

In [15]:
#character name chunking.
#1. tag words in summaries that match the Social Security name dataset as "/PERSON"
#2. chunk together the consective "/PERSON" tagged words
#3. append to a chunk any number of capitalized word after a '/PERSON" tagged word (this helps capture the entirity
# of a name that spans multiple words. some of the words toward the end may not have been tagged correctly.)

def person_extractor_and_replacer(tagged_summary):
    with open('data/output/name_list.json', 'r') as json_str:
        ssn_names = json.load(json_str)

    words = tagged_summary.split()

    persons = []
    current_person = []

    for w in words:
        m = re.search('([^/]+)/([A-Z]+)', w)
        if m is not None:
            body = m.group(1)
            klass = m.group(2)
            if klass != "PERSON":
                if body in ssn_names:
                    klass = "PERSON"

            if klass == "PERSON":
                current_person.append(body)
            elif len(current_person) > 0 and re.search('^[A-Z]', w) is not None:
                current_person.append(body)
            elif len(current_person) > 0:
                persons.append(" ".join(current_person))
                current_person = []
            # search for matches with social security name list
    return persons

In [7]:
# creating a new dict called sum_and_char to store summaries and corresponding characters
sum_and_char = {}
for movie in sample:
#     sorting entity_chuncking results into summaries and characters
    result = person_extractor_and_replacer(movie[1]['summaries_combined_ner'])
    movie_name = movie[0]
    if len(result) <= 0:
        char_data = {}
    else:
        char_data = set(result)
    sum_and_char[movie_name] = {"sum": movie[1]['summaries_combined'][0], \
                                "char_raw": char_data,\
                                "char_info" : {}}

###Filter character names, and establish the link between filtered character names from summaries and their counter parts in the roles list. Store results in the field char_info dict within the master dict

In [9]:
for movie in sample:
#     making a bag of words for roles
    for role in movie[1]['roles']:
        role_words = role['role']
        role_bag = re.split(r' |/|\'|\"', role_words)
        role['role_bag'] = (set(role_bag))

In [10]:
# filter char_raw against role list and link char_names with their names in the role list.
''' 
match words in char_raw against roles in role list. the list of actual character names is a subset of char_raw.
This is because some of the captured names are not characters (e.g. director, actor, spriris and saints). We filter
out the false names by only keeping char_raw names if at least one word in the name has a matched word in the role list.
If there is zero word match, discard. If there is at least one
match, keep.

char_info in sum_and_char use names in the role list as keys, and contains role gender, char names
found in summaries, and roles found in summaries

The matching selects the highest # of matches between a role and a character name. Ties are broken by selecting the role with
the fewest words.
'''
for key, value in (sum_and_char.items()):
    for movie in sample:
#     locate correct movie
        if movie[0] == key:
#             for each filted char name
            for name in value['char_raw']:
                max_count = 0
                max_role = None
                max_gen = None
#                 tie-breaker on # of words contained in a role_bag
                max_role_bag_len = 0

                name_split = name.split()
#                 for each role
                for role in movie[1]['roles']:
                    count = 0
#                   for each word in char name
#                   increment count by 1 every time a word in the char name appears in a role
                    for word in name_split:
                        if word in role['role_bag']:
                            count += 1
#                   select the role with the most counts!
                    if (count > max_count) or (count == max_count and len(role['role_bag']) < max_role_bag_len):
                        max_count = count
                        max_role = role['role']
                        max_gen = role['gender']
                        max_role_bag_len = len(role['role_bag'])
                
#                 After looping through all roles in the role list, decide which is the max role
                if max_count < 1:
#                     print(name, "discarded")
                    pass
                else:   
                    if max_role not in value['char_info']:
                        value['char_info'][max_role] = {'gender':max_gen, 'roles_found_in_sums':[], 'names_found_in_sums':[name]}
                    else:
                        value['char_info'][max_role]['names_found_in_sums'].append(name)

###Lastly, output the master dict sum_and_char as a json file

In [13]:
with open('data/output/sum_char_char.complete.initial.json', 'w') as outfile:
    json.dump(os.path.join('data/output/sum_and_char', outfile, ensure_ascii=False, default=set_default)
 

#PART 3: 
###Find roles for character names in summary!
###Start by defining several regular expressions to find phrases that link a character's name to her role.

In [45]:
#Load data
with open('data/output/sum_char_char.complete.initial.json') as sum_and_char:
    sum_and_char = json.load(sum_and_char)

In [50]:
#define regex with parenthesis to keep parens from roles from unbalacing re
parenstrip = re.compile(r'[()]')
parenbracketstrip =re.compile(r'[[\]()]')

In [47]:
#regex1: 'is a/an'
for key, value in sum_and_char.items():
    for subkey, subvalue in value['char_info'].items():
        for char in subvalue['names_found_in_sums']:
            #print(char)
            subvalue['re_extracted_roles'] = []
            
            regex_str = re.sub(parenbracketstrip, '', char) + ' is an? ([^.]+?)\.'
            
                
                
            #print(regex_str)
            try:
                p = re.compile(regex_str)
                m = p.search(value['sum'])
            except:
                print(char)
                regex_str = re.escape(char) + ' is an? ([^.]+?)\.'
                p = re.compile(regex_str)
                m = p.search(value['sum'])
            if m:
                subvalue['re_extracted_roles'].append(m.group(1))

Head B**** In Charge


In [48]:
#regex2: ', a <role>'
for key, value in sum_and_char.items():
    for subkey, subvalue in value['char_info'].items():
        for char in subvalue['names_found_in_sums']:
            regex_str = re.sub(parenstrip, '', char) + ', a ([^.]+?)\.'
            try:
                p = re.compile(regex_str)
                m = p.search(value['sum'])
            except:
                print(char)
                regex_str = re.escape(char) + ' is an? ([^.]+?)\.'
                p = re.compile(regex_str)
                m = p.search(value['sum'])
            if m:
                subvalue['re_extracted_roles'].append(m.group(1))

Drew Cabot([[James Thomas
Head B**** In Charge


In [49]:
#regex3: 'named/by the name of'
for key, value in sum_and_char.items():
    for subkey, subvalue in value['char_info'].items():
        for char in subvalue['names_found_in_sums']:
            regex_str = '^([A-Za-z ]+ )(named|by the name of) ' + re.sub(parenstrip, '', char)
            #print(regex_str)
            try:
                p = re.compile(regex_str)
                m = p.search(value['sum'])
            except:
                print(char)
                regex_str = re.escape(char) + ' is an? ([^.]+?)\.'
                p = re.compile(regex_str)
                m = p.search(value['sum'])
            if m:
                subvalue['re_extracted_roles'].append(m.group(1))

Drew Cabot([[James Thomas
Head B**** In Charge


In [51]:
#regex4: '<role>(,) name'
for key, value in sum_and_char.items():
    for subkey, subvalue in value['char_info'].items():
        for char in subvalue['names_found_in_sums']:
            regex_str = '([A-Za-z]+),? ' + re.sub(parenbracketstrip, '', char)
            #print(regex_str)
            try:
                p = re.compile(regex_str)
                m = p.search(value['sum'])
            except:
                print(char)
                regex_str = re.escape(char) + ' is an? ([^.]+?)\.'
                p = re.compile(regex_str)
                m = p.search(value['sum'])
            if m:
                subvalue['re_extracted_roles'].append(m.group(1))

Head B**** In Charge


In [52]:
#regex5: 'his/her/their <role>'
for key, value in sum_and_char.items():
    for subkey, subvalue in value['char_info'].items():
        for char in subvalue['names_found_in_sums']:
            regex_str = re.sub(parenbracketstrip, '', char) +',? (his|her|their) (\S+) '
            #print(regex_str)
            try:
                p = re.compile(regex_str)
                m = p.search(value['sum'])
            except:
                print(char)
                regex_str = re.escape(char) + ' is an? ([^.]+?)\.'
                p = re.compile(regex_str)
                m = p.search(value['sum'])
            if m:
                subvalue['re_extracted_roles'].append(m.group(1))

Head B**** In Charge


In [53]:
#regex6: 'his/her <word>'
for key, value in sum_and_char.items():
    for subkey, subvalue in value['char_info'].items():
        for char in subvalue['names_found_in_sums']:
            regex_str = '[Hh](is|er) ([A-Za-z]+),? ' + re.sub(parenbracketstrip, '', char)
            try:
                p = re.compile(regex_str)
                m = p.search(value['sum'])
            except:
                print(char)
                regex_str = re.escape(char) + ' is an? ([^.]+?)\.'
                p = re.compile(regex_str)
                m = p.search(value['sum'])
            if m:
                subvalue['re_extracted_roles'].append(m.group(1)) 

Head B**** In Charge


###Tagging the Regex-extracted parts
After extracting phrases with the Regex patterns, we tag them to prepare them for chunking.

In [32]:
posclient.posclient('I am sitting on the bed')

[('I', 'PRP'),
 ('am', 'VBP'),
 ('sitting', 'VBG'),
 ('on', 'IN'),
 ('the', 'DT'),
 ('bed', 'NN')]

In [31]:
post.tag('I am sitting on the bed'.split())

[('I', 'PRP'),
 ('am', 'VBP'),
 ('sitting', 'VBG'),
 ('on', 'IN'),
 ('the', 'DT'),
 ('bed', 'NN')]

In [66]:
x=0
for film in sum_and_char:
    for role in sum_and_char[film]['char_info']:
        sum_and_char[film]['char_info'][role]['re_roles_tagged']=[posclient.posclient(re_role) for re_role in sum_and_char[film]['char_info'][role]['re_extracted_roles'] if type(re_role)==str]


###Lift out the roles of characters using chunking
A chunking grammar includes or excludes words from the tagged phrases to try to isolate roles.

In [67]:
grammar = '''CHUNK: 
                    {^<.*>*?<NN.*>+}
                    {<JJ><NN>}
                    {<NNP>+}
                    {<NN><NNP>}
                    }<IN|.*RB|VB.*|TO|.*DT|PRP.*>{
'''
cp = nltk.RegexpParser(grammar)

In [68]:
#pattern for breaking chunk string output and tags into list
chunktag_re = re.compile('\S+ /[A-Z]{2}')

###Splitting up chunks and isolating role-descriptive words

In [69]:
#loop through films in dictionary and add lists of chunks
for film in sum_and_char:
    for role in sum_and_char[film]['char_info']:
        sum_and_char[film]['char_info'][role]['chunks'] = []
        for tagged_role in sum_and_char[film]['char_info'][role]['re_roles_tagged']:
            for subtree in cp.parse(tagged_role).subtrees():
                if subtree.label() == 'CHUNK':
                    sum_and_char[film]['char_info'][role]['chunks'].append(" ".join([a + " /" + b for (a,b) in subtree.leaves()]))

In [70]:
#regex to remove trailing punctuation from chunks
strippattern = re.compile(r'\W$')

In [71]:
for film in sum_and_char:
    for role in sum_and_char[film]['char_info']:
        roleholdinglist= []
        for chunk in sum_and_char[film]['char_info'][role]['chunks']:
            chunkwords = chunktag_re.findall(chunk) #list of token-tag strings from chunk
            chunktemplist = []
            for chunkword in chunkwords:
                if ((chunkword[:-4] not in role) and (chunkword[:-4] not in roleholdinglist)
                    and ((chunkword.split('/')[1] == ('NN')) )):
                    chunktemplist.append(chunkword[:-4])
            if len(chunktemplist)>0:    
                roleholdinglist.append(' '.join(chunktemplist))
        if len(roleholdinglist)>0:
            sum_and_char[film]['char_info'][role]['roles_found_in_sums'] = [re.sub(strippattern, '', finalrole) for finalrole in roleholdinglist]

In [18]:
sum_and_char['bedazzled.2000']['char_info']

{'Alison/Nicole': {'chunks': ['sports /NNS reporter /NN',
   'colleague /NN',
   'colleague /NN',
   'neighbor /NN'],
  'gender': 'F',
  'names_found_in_sums': ['Alison', 'Alison Gardner', 'Nicole Delarusso'],
  're_extracted_roles': ['sports reporter, to lose interest in him shortly after they meet',
   'colleague',
   'colleague',
   'neighbor',
   'his',
   'is',
   'is'],
  're_roles_tagged': [[['sports', 'NNS'],
    ['reporter', 'NN'],
    [',', ','],
    ['to', 'TO'],
    ['lose', 'VB'],
    ['interest', 'NN'],
    ['in', 'IN'],
    ['him', 'PRP'],
    ['shortly', 'RB'],
    ['after', 'IN'],
    ['they', 'PRP'],
    ['meet', 'VBP']],
   [['colleague', 'NN']],
   [['colleague', 'NN']],
   [['neighbor', 'NN']],
   [['his', 'PRP$']],
   [['is', 'VBZ']],
   [['is', 'VBZ']]],
  'roles_found_in_sums': ['sports reporter', 'colleague', 'neighbor']},
 'Bob/Roberto/Beach Jock/Sportscaster/Lincoln Aide': {'chunks': [],
  'gender': 'M',
  'names_found_in_sums': ['Bob', 'Abraham Lincoln'],
  

In [241]:
#option to output full 
with open(os.path.join('data/output/', 'sum_and_char.complete.extracted.json'), 'w') as outfile:
    json.dump(sum_and_char, outfile, ensure_ascii=False, default=set_default)


#PART 4:
###Result Analysis
Aside from allowing one to easily view data for a particular film, this section mostly contains code we used to look at the data.  More detailed analysis is in the writeup.

In [2]:
#Load data
with open('data/output/sum_and_char.complete.extracted.json') as sum_and_char:
    sum_and_char = json.load(sum_and_char)



###To view extracted data for a particular film, enter it into the "thisfilm" variable below in the format "lower case film title.YYYY" (e.g. 'gone with the wind.1939' ).

In [29]:
thisfilm = 'a clockwork orange.1971'

for role in sum_and_char[thisfilm]['char_info']:
    print('Role from IMDB credit list: ', role)
    print('\nExtracted variants: ')
    print(sum_and_char[thisfilm]['char_info'][role]['roles_found_in_sums'])
    print('\n\n*****')
    #for extracted_role in sum_and_char[thisfilm]['char_info']['roles_found_in_sums']:
     #   print(extracted_role)

Role from IMDB credit list:  Pete

Extracted variants: 
[]


*****
Role from IMDB credit list:  Mustachioed Ludovico Technician

Extracted variants: 
[]


*****
Role from IMDB credit list:  Dim

Extracted variants: 
[]


*****
Role from IMDB credit list:  Minister

Extracted variants: 
[]


*****
Role from IMDB credit list:  Billyboy

Extracted variants: 
[]


*****
Role from IMDB credit list:  Alex

Extracted variants: 
['London']


*****
Role from IMDB credit list:  Mrs. Alexander

Extracted variants: 
[]


*****
Role from IMDB credit list:  Georgie

Extracted variants: 
[]


*****
Role from IMDB credit list:  Julian

Extracted variants: 
['manservant']


*****


In [30]:
sum_and_char[thisfilm]['sum']


'In futuristic London, Alex  is the leader of his "droogs", Pete , Georgie , and Dim , one of many youth gangs in the decaying metropolis. One night, after intoxicating themselves on "milk plus", they engage in an evening of "ultra-violence", including beating an elderly vagrant , and fighting a rival gang led by Billyboy .Both Burgess\' novel and Stanley Kubrick\'s published movie script have this character\'s name as one word "Billyboy" although the Internet Movie Database lists him in the credits with two words "Billy Boy". Stealing a car, they drive to the country home of writer F. Alexander Patrick Magee , where they beat Mr. Alexander to the point of crippling him for life. Alex then rapes his wife  while intoning "Singin\' in the Rain". The next day, while truant from school, Alex is approached by probation officer Mr. P. R. Deltoid , who is aware of Alex\'s violence and cautions him. In response, Alex visits a record store where he picks up two girls. Alex and the girls have se

In [96]:
commastrip=re.compile(r',')
roleslist = open('foundroles.csv', 'w')
roleslist.write('Role,Gender,Year,Film\n')


overallrolescounter=0
discoveredrolescounter=0
for film in sum_and_char:
        #print(film)
        #print(film[-4:])
    
        for role in sum_and_char[film]['char_info']:
            overallrolescounter+=1
            #print('\t', role)
            for foundrole in sum_and_char[film]['char_info'][role]['roles_found_in_sums']:
                discoveredrolescounter+=1
                #print('\t\t', foundrole)
                roleslist.write((re.sub(commastrip, '', foundrole).lower()))
                roleslist.write(',')
                roleslist.write(sum_and_char[film]['char_info'][role]['gender'])
                roleslist.write(',')
                roleslist.write(film[-4:])
                roleslist.write(',')
                roleslist.write(re.sub(commastrip, '', film[:-5]))
                roleslist.write('\n')
        #print('\n')
roleslist.close()

The cell below creates various lists of extracted roles divided by gender, year, and other attributes, on which we performed frequency distributions to obtain results.  

In [107]:
extractedroleslist = []
allroleslist = []
yeardict = {}
genderdict = {}

for film in sum_and_char:
    for role in sum_and_char[film]['char_info']:
        for foundrole in sum_and_char[film]['char_info'][role]['roles_found_in_sums']:
                extractedroleslist.append({'role': foundrole, 'gender': sum_and_char[film]['char_info'][role]['gender'], 'year': int(film[-4:])})

                
                
allroleslist = [item['role'].lower() for item in extractedroleslist]
malelist=[item['role'].lower() for item in extractedroleslist if item['gender']=='M']
femalelist=[item['role'].lower() for item in extractedroleslist if item['gender']=='F']
for x in extractedroleslist:
    if x['year'] not in yeardict:
        yeardict[x['year']] = [x['role'].lower()]
    else:
        yeardict[x['year']].append(x['role'].lower())

for x in extractedroleslist:
    if x['role'].lower() not in genderdict:
        genderdict[x['role'].lower()] = {'M':0, 'F':0}
        genderdict[x['role'].lower()][x['gender']] +=1
    else:
        genderdict[x['role'].lower()][x['gender']] +=1
               

In [165]:
nltk.FreqDist(allroleslist).most_common(100)

[('friend', 2156),
 ('wife', 1679),
 ('daughter', 1351),
 ('brother', 1139),
 ('son', 1135),
 ('girlfriend', 992),
 ('sister', 909),
 ('father', 863),
 ('mother', 763),
 ('man', 744),
 ('husband', 735),
 ('boyfriend', 625),
 ('girl', 541),
 ('woman', 505),
 ('agent', 475),
 ('detective', 468),
 ('partner', 419),
 ('friends', 401),
 ('boss', 365),
 ('student', 352),
 ('owner', 350),
 ('officer', 296),
 ('boy', 284),
 ('assistant', 279),
 ('reporter', 275),
 ('leader', 263),
 ('lawyer', 261),
 ('captain', 254),
 ('director', 252),
 ('professor', 249),
 ('cousin', 240),
 ('star', 227),
 ('singer', 223),
 ('attorney', 222),
 ('manager', 221),
 ('teacher', 214),
 ('neighbor', 204),
 ('lover', 195),
 ('john', 192),
 ('s', 187),
 ('uncle', 186),
 ('day', 176),
 ('writer', 171),
 ('actress', 169),
 ('worker', 166),
 ('secretary', 166),
 ('lieutenant', 166),
 ('artist', 162),
 ('school', 161),
 ('producer', 159),
 ('businessman', 154),
 ('cop', 154),
 ('photographer', 154),
 ('couple', 144),
 (

In [132]:
#calculate probability of female in a role
for role in genderdict:
    genderdict[role]['P(F)'] = genderdict[role]['F']/(genderdict[role]['F']+genderdict[role]['M'])

In [208]:
print(len(malelist))
print(len(femalelist))

45700
25516


In [159]:
decadedict = {}
for year in yeardict:
    if str(year)[:3] not in decadedict:
        decadedict[int(str(year)[:3])] = [role for role in yeardict[year]]
    else:
        for role in yeardict:
            decadedict[int(str(year)[:3])].append(role)

In [175]:
toptenthrudecades = {}
for decade in decadedict:
    #print(len(decadedict[decade]))
        tenlist = nltk.FreqDist(decadedict[decade]).most_common(10)
        toptenthrudecades[decade*10] = [(role[0], tenlist.index(role)) for role in tenlist]


In [183]:
decadelist = []
for decade in toptenthrudecades:
    for item in toptenthrudecades[decade]:
        decadelist.append([item[0], 10-item[1], decade])
print(decadelist)

topten = open('decadetopten.csv', 'w')
topten.write('Role,Rank,Year\n')
for item in decadelist:
    topten.write(item[0])
    topten.write(',')
    topten.write(str(item[1]))
    topten.write(',')
    topten.write(str(item[2]))
    topten.write('\n')
topten.close()
        

[['friend', 10, 1920], ['daughter', 9, 1920], ['brother', 8, 1920], ['girl', 7, 1920], ['sister', 6, 1920], ['lover', 5, 1920], ['wife', 4, 1920], ['nephew', 3, 1920], ['man', 2, 1920], ['lady', 1, 1920], ['dancer', 10, 1890], ['pioneer', 9, 1890], ['friend', 10, 1990], ['wife', 9, 1990], ['son', 8, 1990], ['daughter', 7, 1990], ['girlfriend', 6, 1990], ['mother', 5, 1990], ['father', 4, 1990], ['man', 3, 1990], ['brother', 2, 1990], ['sister', 1, 1990], ['wife', 10, 1960], ['friend', 9, 1960], ['lover', 8, 1960], ['girl', 7, 1960], ['partner', 6, 1960], ['daughter', 5, 1960], ['man', 4, 1960], ['girlfriend', 3, 1960], ['father', 2, 1960], ['son', 1, 1960], ['friend', 10, 1930], ['daughter', 9, 1930], ['wife', 8, 1930], ['girl', 7, 1930], ['brother', 6, 1930], ['sister', 5, 1930], ['cousin', 4, 1930], ['singer', 3, 1930], ['owner', 2, 1930], ['detective', 1, 1930], ['daughter', 10, 1900], ['sicilian', 9, 1900], ['sweetheart', 8, 1900], ['sprite', 7, 1900], ['musician', 6, 1900], ['pove

In [143]:
allrolesdist = nltk.FreqDist(allroleslist)
maledist = nltk.FreqDist(malelist)
femaledist = nltk.FreqDist(femalelist)
year1984dist = nltk.FreqDist(yeardict[1984])

In [140]:
allrolesdist.most_common(10)

[('friend', 2156),
 ('wife', 1679),
 ('daughter', 1351),
 ('brother', 1139),
 ('son', 1135),
 ('girlfriend', 992),
 ('sister', 909),
 ('father', 863),
 ('mother', 763),
 ('man', 744)]

In [194]:
print(maledist.most_common(30))
print(femaledist.most_common(30))

[('friend', 1457), ('brother', 1125), ('son', 1100), ('father', 802), ('man', 696), ('husband', 692), ('boyfriend', 610), ('detective', 417), ('agent', 380), ('partner', 338), ('boss', 327), ('owner', 283), ('friends', 278), ('boy', 258), ('officer', 249), ('captain', 229), ('leader', 229), ('director', 226), ('lawyer', 224), ('student', 202), ('professor', 196), ('manager', 188), ('attorney', 179), ('uncle', 174), ('reporter', 159), ('assistant', 157), ('cousin', 152), ('lieutenant', 151), ('producer', 142), ('businessman', 141)]
[('wife', 1625), ('daughter', 1316), ('girlfriend', 960), ('sister', 889), ('mother', 719), ('friend', 699), ('girl', 500), ('woman', 462), ('actress', 164), ('singer', 154), ('student', 150), ('secretary', 149), ('widow', 134), ('niece', 131), ('friends', 123), ('assistant', 122), ('nurse', 121), ('teacher', 118), ('reporter', 116), ('sweetheart', 100), ('john', 99), ('aunt', 95), ('agent', 95), ('neighbor', 94), ('star', 92), ('cousin', 88), ('teenager', 82

In [146]:
year1984dist.most_common(10)

[('friend', 19),
 ('brother', 17),
 ('wife', 16),
 ('girlfriend', 15),
 ('sister', 13),
 ('son', 10),
 ('boyfriend', 9),
 ('friends', 8),
 ('man', 8),
 ('father', 7)]

In [97]:
print(overallrolescounter)
print(discoveredrolescounter)

114922
71216
