In [26]:
import random
import json
from pprint import pprint
import nltk
import re
import string
from nltk.tag.stanford import StanfordPOSTagger
from nltk.tag.stanford import StanfordNERTagger
from nltk.chunk import RegexpParser

### Import Stanford POS Tagger and Stanford NER Tagger

In [27]:
# The Stanford POS tagger and NER tagger must be downloaded separately before
# this cell can run (see http://nlp.stanford.edu/software/CRF-NER.shtml).
# The paths below expect the downloaded files to be saved under lib/.
POS_MODEL_PATH = 'lib/stanford-postagger-2014-08-27/models/english-bidirectional-distsim.tagger'
POS_JAR_PATH = 'lib/stanford-postagger-2014-08-27/stanford-postagger.jar'
NER_MODEL_PATH = 'lib/stanford-ner-2014-08-27/classifiers/english.all.3class.distsim.crf.ser.gz'
NER_JAR_PATH = 'lib/stanford-ner-2014-08-27/stanford-ner.jar'

# Each tagger takes its model file, its jar file and the text encoding.
# Example usage: post.tag('What is the airspeed of an unladen swallow ?'.split())
post = StanfordPOSTagger(POS_MODEL_PATH, POS_JAR_PATH, 'utf-8')
nert = StanfordNERTagger(NER_MODEL_PATH, NER_JAR_PATH, 'utf-8')

### Make a random sample of 2 and a sample of 5 from the JSON data as starting data.
Note: only run this code once. It is currently commented out.

In [28]:
# with open('movies-with-roles-summaries.json')as data_file:
#     data = json.load(data_file)

# print(type(data))

In [29]:
# data_sample_2 = random.sample((data.items()), 2)

In [30]:
# data_sample_5 = random.sample((data.items()), 5)

In [31]:
# with open('data_sample_2.json', 'w') as outfile:
#     json.dump(data_sample_2, outfile)

In [32]:
# with open('data_sample_5.json', 'w') as outfile:
#     json.dump(data_sample_5, outfile)

###Load in the sample of five

In [33]:
# Load the previously saved sample into `sample`. The file handle gets its own
# name so that it is not confused with the parsed data.
# with open('movies-with-roles-summaries.json') as sample_file:
with open('data_sample_5.json') as sample_file:
    sample = json.load(sample_file)

###More data cleaning

In [34]:
# Remove the "([[" residue (together with the whitespace character in front of
# it) from the first Wikipedia summary of every movie, leaving a single space.
# Only element 0 of 'summaries_wikipedia' is cleaned.
WIKI_BRACKET_RESIDUE = re.compile(r'\s\(\[\[')
for movie in sample:
    wiki_texts = movie[1]['summaries_wikipedia']
    if len(wiki_texts) == 0:
        continue
    # wiki_texts is the list stored on the movie, so this updates it in place.
    wiki_texts[0] = WIKI_BRACKET_RESIDUE.sub(' ', wiki_texts[0])

###Tagging

In [35]:
#pos tagging using nltk.pos_tag
#ner tagging using stanford ner tagger
def ie_preprocess(document, lower='false', stage="pos"):
    """Tag a document with part-of-speech or named-entity labels.

    Parameters
    ----------
    document : str
        Raw text to tag.
    lower : str
        Only affects the "pos" stage. Tokens are lower-cased when this equals
        the string 'false' (the default); any other value leaves case intact.
        NOTE(review): this reads as inverted relative to the parameter name;
        it is kept as-is so existing results do not change -- confirm intent.
    stage : str
        "pos" to POS-tag with nltk.pos_tag, "ner" to tag with the module-level
        Stanford NER tagger `nert`.

    Returns
    -------
    list
        For "pos": one list per sentence holding the result of nltk.pos_tag on
        that sentence's tokens.
        For "ner": the result of nert.tag on the whitespace-split document
        (the text is not sentence-split, word-tokenized or lower-cased).

    Raises
    ------
    ValueError
        If `stage` is neither "pos" nor "ner" (previously this silently
        returned None).
    """
    if stage == "ner":
        # Return before any tokenizing / POS tagging: the NER path never used
        # that work, it only tags the whitespace-split raw document.
        return nert.tag(document.split())
    if stage != "pos":
        raise ValueError("stage must be 'pos' or 'ner', got %r" % (stage,))

    sentences = [nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(document)]
    if lower == 'false':
        sentences = [[token.lower() for token in sent] for sent in sentences]
    return [nltk.pos_tag(sent) for sent in sentences]

In [36]:
# POS- and NER-tag the first IMDb summary and the first Wikipedia summary of
# each movie. A source whose summary list is empty gets no *_pos / *_ner keys.
# Each row: (source key, key for the POS result, key for the NER result).
TAG_TARGETS = (
    ("summaries_imdb", "summaries_imdb_pos", "summaries_imdb_ner"),
    ("summaries_wikipedia", "summaries_wikipedia_pos", "summaries_wikipedia_ner"),
)
for movie in sample:
    info = movie[1]
    # Read every input up front, before any tagging is done.
    texts_by_source = {source: info[source] for source, _, _ in TAG_TARGETS}
    roles = info['roles']  # read here but not used further in this cell
    for source, pos_key, ner_key in TAG_TARGETS:
        texts = texts_by_source[source]
        if len(texts) > 0:
            info[pos_key] = ie_preprocess(texts[0])
            info[ner_key] = ie_preprocess(texts[0], stage="ner")

###Search for a ner tagged name in the roles bag of words

####First, combine wiki summaries and imdb summaries into one string. 

In [37]:
# combine wiki_ner and imdb_ner
# 'summaries_imdb_ner' / 'summaries_wikipedia_ner' only exist for movies that
# had at least one summary from that source, so a missing key counts as an
# empty list. The combined value is a NEW list: the stored IMDb list is not
# extended in place, which keeps this cell safe to re-run.
for movie in sample:
    info = movie[1]
    summaries_imdb_ner = info.get("summaries_imdb_ner", [])
    summaries_wiki_ner = info.get('summaries_wikipedia_ner', [])
    info['summaries_combined_ner'] = list(summaries_imdb_ner) + list(summaries_wiki_ner)

#### Run chunker to lift out tagged names from summaries

In [38]:
# Custom chunking rule: group runs of PERSON-tagged tokens together.
def chunker_rules(values):
    """Parse tagged tokens with a grammar that chunks consecutive PERSON tags.

    values: the tagged tokens to parse; in this notebook it is called with a
        movie's 'summaries_combined_ner' list (presumably (token, tag) pairs
        from the NER tagger -- verify against the tagger output).

    Returns whatever nltk.RegexpParser.parse returns for `values`; the caller
    walks it with .subtrees() and .leaves().
    """
    # One or more adjacent <PERSON> tags form a single PERSON chunk.
    person_grammar = r'''
        PERSON:
                {<PERSON>+}
            '''
    return nltk.RegexpParser(person_grammar).parse(values)

def entity_chunker(tagged_docs):
    """Return one space-joined string per subtree of the chunked document.

    tagged_docs: tagged tokens, passed straight to chunker_rules.

    Every subtree yielded by tree.subtrees() contributes one string built from
    the first element of each of its leaves. No filtering by label is done;
    the caller in this notebook treats element 0 as the summary text and the
    remaining elements as character names.
    """
    parsed = chunker_rules(tagged_docs)
    return [
        ' '.join(leaf[0] for leaf in node.leaves())
        for node in parsed.subtrees()
    ]

In [39]:
# creating a new dict called sum_and_char to store summaries and corresponding characters
# Per movie name it holds:
#   "sum"       - first string returned by entity_chunker
#   "char_raw"  - set of the remaining strings (always a set, possibly empty)
#   "roles_bag" - set of words obtained by splitting all role names
#   "char_info" - empty dict, filled in by a later cell
sum_and_char = {}
for movie in sample:
    movie_name = movie[0]
    # bag of words for roles: join every role name, then split on space, /, ' or "
    # (the joined text is deliberately NOT called `str`, which would shadow the
    # builtin for every later cell in the kernel)
    roles_text = " ".join(role['role'] for role in movie[1]['roles'])
    roles_bag = re.split(r' |/|\'|\"', roles_text)
    # sorting entity_chunking results into summaries and characters
    result = entity_chunker(movie[1]['summaries_combined_ner'])
    sum_and_char[movie_name] = {"sum": result[0],
                                "char_raw": set(result[1:]),
                                "roles_bag": set(roles_bag),
                                "char_info": {}}

####Check characters against roles for filtering

In [40]:
# Keep only the raw character names that share at least one whitespace-separated
# word with the movie's roles bag; the survivors are stored under 'char_filtered'.
for key, value in sum_and_char.items():
    sum_and_char[key]['char_filtered'] = [
        char
        for char in value['char_raw']
        if any(elem in value['roles_bag'] for elem in char.split())
    ]
        

### Establish the link between filtered character names from summaries and their counterparts in the roles list

In [41]:
# Give every role its own bag of words: the role name split on space, /, ' or ",
# stored as a set under role['role_bag'].
ROLE_WORD_SEPARATORS = re.compile(r' |/|\'|\"')
for movie in sample:
    for role in movie[1]['roles']:
        role['role_bag'] = set(ROLE_WORD_SEPARATORS.split(role['role']))

In [42]:
# TODO: add algorithm for tie-breaking

# link char_filter with their names in the role list.
# char_info in sum_and_char uses names in the role list as keys, and contains
# the role gender, the char names found in summaries, and the roles found in
# summaries ('roles_found_in_sums' is created empty here and not filled in
# by this cell).
for key, value in sum_and_char.items():
    for movie in sample:
        # locate correct movie
        if movie[0] != key:
            continue
        # for each filtered char name
        for name in value['char_filtered']:
            name_split = name.split()
            max_count = 0
            max_role = None
            max_gen = None
            for role in movie[1]['roles']:
                # number of words of the char name that appear in this role's bag
                count = sum(1 for word in name_split if word in role['role_bag'])
                # select the role with the most counts; the strict '>' means the
                # first role wins when several roles tie
                if count > max_count:
                    max_count = count
                    max_role = role['role']
                    max_gen = role['gender']
            if max_count < 1:
                # No role shares a word with this name: report which one and skip it.
                print("error: no role matches name %r in movie %r" % (name, key))
                continue
            entry = value['char_info'].setdefault(
                max_role,
                {'gender': max_gen, 'roles_found_in_sums': [], 'names_found_in_sums': []})
            # Guard against duplicates so that re-running this cell does not
            # append the same name a second time.
            if name not in entry['names_found_in_sums']:
                entry['names_found_in_sums'].append(name)