In [1]:
import mammoth
from bs4 import BeautifulSoup
from pprint import pprint
import pydot
from os import listdir
from os.path import isfile, join
import spacy

In [2]:
# Source: https://stackoverflow.com/questions/20656135/python-deep-merge-dictionary-data
def deep_merge_dicts(source, destination):
    """
    Recursively merge ``source`` into ``destination`` and return ``destination``.

    ``destination`` is modified in place. For every key in ``source``:

    * a dict value is merged recursively into the node stored under the same
      key in ``destination`` (an empty dict is created when the key is missing);
    * a list value is combined with an existing list under the same key and
      de-duplicated, keeping first-seen order with ``source`` items first
      (list items must be hashable);
    * any other value, or a list meeting a missing / non-list entry, simply
      overwrites what ``destination`` holds.

    >>> a = { 'first' : { 'all_rows' : { 'pass' : 'dog', 'number' : '1' } } }
    >>> b = { 'first' : { 'all_rows' : { 'fail' : 'cat', 'number' : '5' } } }
    >>> deep_merge_dicts(b, a) == { 'first' : { 'all_rows' : { 'pass' : 'dog', 'fail' : 'cat', 'number' : '5' } } }
    True
    >>> deep_merge_dicts({'k': ['x', 'y']}, {'k': ['y', 'z']})
    {'k': ['x', 'y', 'z']}
    """
    for key, value in source.items():
        if isinstance(value, dict):
            # get node or create one
            node = destination.setdefault(key, {})
            deep_merge_dicts(value, node)
        elif isinstance(value, list) and isinstance(destination.get(key), list):
            # dict.fromkeys de-duplicates while preserving insertion order, so
            # the merged list is identical from run to run (a set is not).
            destination[key] = list(dict.fromkeys(value + destination[key]))
        else:
            destination[key] = value

    return destination

In [3]:
def nested_list_to_dict(ul, doc_words_setter):
    """
    Turn a (possibly nested) HTML list into a nested dict keyed by item text.

    :param ul: list element whose direct ``li`` children are walked.
    :param doc_words_setter: callable invoked once with the stripped text of
        every item visited, parents before their children.
    :return: ``{item_text: subtree}`` where ``subtree`` is ``{}`` for an item
        without an inner ``ul`` and the converted inner list otherwise.
    """
    tree = {}

    for item in ul.findChildren('li', recursive=False):
        nested_ul = item.find('ul')
        # The item's label is whatever immediately follows the <li> tag.
        label = item.next_element.strip()

        doc_words_setter(label)
        tree[label] = (
            {} if nested_ul is None
            else nested_list_to_dict(nested_ul, doc_words_setter)
        )

    return tree

def file_to_html(file_path):
    """
    Convert a document to HTML with mammoth and parse it with BeautifulSoup.

    :param file_path: path of the file to open in binary mode.
    :return: a ``BeautifulSoup`` object built from the UTF-8 encoded HTML.
    """
    # The context manager closes the handle even if the conversion raises;
    # previously the file was left open for the garbage collector to reclaim.
    with open(file_path, 'rb') as docx_file:
        document = mammoth.convert_to_html(docx_file)

    # NOTE(review): no parser is named, so BeautifulSoup chooses one from what
    # is installed; pin it explicitly if results must match across machines.
    return BeautifulSoup(document.value.encode('utf8'))

def doc_to_dict(file_path):
    """
    Parse the 'call taker' section of a document into a nested dict.

    The section starts at the ``h1`` whose text is exactly ``'call taker'`` and
    ends at the next ``h1``. Every ``h2`` inside it opens a new situation;
    every ``p`` becomes an entry under the current situation. When a ``ul``
    directly follows the paragraph's text, that list is converted with
    ``nested_list_to_dict`` and stored as the paragraph's subtree.

    :param file_path: path of the document to parse.
    :return: ``(event, doc_words)`` where ``event`` maps
        ``situation -> paragraph text -> subtree`` and ``doc_words`` maps every
        paragraph / list-item text seen to ``[file_path]``.
    :raises ValueError: if the document has no 'call taker' ``h1``.
    """
    html = file_to_html(file_path)
    start = html.find('h1', string='call taker')
    if start is None:
        raise ValueError(
            "No 'call taker' <h1> heading found in {}".format(file_path)
        )

    event = {}
    doc_words = {}
    current_situation = None

    def set_doc_words(words):
        # Only the first sighting of a phrase is recorded for this file.
        doc_words.setdefault(words, [file_path])

    for element in start.next_elements:
        tag = element.name

        if tag == 'h1':
            break
        elif tag == 'h2':
            current_situation = element.text.strip()
            event[current_situation] = {}
        elif tag == 'p':
            if current_situation is None:
                # Paragraph before the first <h2>: there is no situation to
                # attach it to, so skip it instead of failing.
                continue

            text = element.text.strip()
            set_doc_words(text)

            # First hop lands on the paragraph's own content, second hop on
            # whatever follows it in document order.
            first_child = element.next_element
            next_next_element = (
                first_child.next_element if first_child is not None else None
            )

            if next_next_element is not None and next_next_element.name == 'ul':
                # The items of interest normally sit in a <ul> nested inside
                # the outer one; fall back to the outer list when it is flat.
                inner_ul = next_next_element.find('ul')
                if inner_ul is None:
                    inner_ul = next_next_element
                event[current_situation][text] = nested_list_to_dict(
                    inner_ul,
                    set_doc_words
                )
            else:
                # Use the stripped text so the key matches the doc_words key.
                event[current_situation][text] = {}

    return event, doc_words

In [4]:
def draw(graph, parent_name, child_name):
    """Add an edge linking ``parent_name`` and ``child_name`` to ``graph``."""
    graph.add_edge(pydot.Edge(parent_name, child_name))

def visit(graph, node, parents=None, allowed_nodes=None):
    """
    Walk a nested dict depth-first and draw an edge for every kept node.

    :param graph: graph object handed to ``draw``.
    :param node: nested dict of ``{name: children_dict}``.
    :param parents: names of the kept ancestors of ``node``, outermost first;
        ``None`` (the default) means ``node`` is the root level.
    :param allowed_nodes: container of names to keep, or ``None`` to keep all.

    A node that is not allowed is skipped: no edge is drawn to it and its
    children are linked to the nearest kept ancestor instead. Root-level
    names are always used as parents, even when they are not allowed.
    """
    # A fresh list per call avoids sharing one mutable default between calls.
    if parents is None:
        parents = []

    for name, children in node.items():
        is_allowed = allowed_nodes is None or name in allowed_nodes

        if parents and is_allowed:
            draw(graph, parents[-1], name)

        # Copy so that sibling subtrees never see each other's ancestors.
        child_parents = list(parents)
        if not parents or is_allowed:
            child_parents.append(name)

        visit(graph, children, child_parents, allowed_nodes)

def dict_to_graph(sop_dict, allowed_nodes=None):
    """
    Build a left-to-right pydot graph from a nested dict.

    :param sop_dict: nested dict of ``{name: children_dict}``.
    :param allowed_nodes: names to keep, forwarded to ``visit``; ``None``
        keeps every node.
    :return: the populated ``pydot.Dot`` graph.
    """
    sop_graph = pydot.Dot(graph_type='graph', rankdir='LR')
    visit(sop_graph, sop_dict, allowed_nodes=allowed_nodes)
    return sop_graph

In [5]:
def docs_to_dict(dir_path):
    """
    Parse every .docx file directly inside ``dir_path`` and merge the results.

    :param dir_path: directory to scan (not recursive).
    :return: ``(dir_dict, doc_words)`` — the merged situation tree and the
        merged phrase index, in which each phrase maps to the list of files
        that contain it.
    """
    # - endswith() avoids matching names that merely contain '.docx'
    #   (e.g. 'x.docx.bak');
    # - '~$' prefixed names are lock files Word leaves next to open documents,
    #   not real documents;
    # - sorting makes the merge order independent of listdir()'s ordering.
    files = sorted(
        f for f in listdir(dir_path)
        if isfile(join(dir_path, f))
        and f.lower().endswith('.docx')
        and not f.startswith('~$')
    )
    dir_dict = {}
    doc_words = {}

    for file_name in files:
        file_dict, file_doc_words = doc_to_dict(join(dir_path, file_name))
        # deep_merge_dicts folds its first argument into the second and
        # returns the second, so the accumulator is rebound on every pass.
        dir_dict = deep_merge_dicts(dir_dict, file_dict)
        doc_words = deep_merge_dicts(doc_words, file_doc_words)

    return dir_dict, doc_words

In [6]:
def filter_nodes(doc_words, doc_threshold = 2):
    """
    Select the phrases that occur in at most ``doc_threshold`` entries.

    :param doc_words: mapping of phrase -> sized collection (its length is
        what gets compared against the threshold).
    :param doc_threshold: inclusive upper bound on that length.
    :return: list of qualifying phrases, in the mapping's iteration order.
    """
    return [
        phrase
        for phrase, sources in doc_words.items()
        if len(sources) <= doc_threshold
    ]

In [7]:
# Parse one document: `sop` is its situation tree, `doc_words` maps every
# phrase found in it to the file it came from.
sop, doc_words = doc_to_dict("./data/A-ANIMAL.docx")

In [8]:
# Inspect the nested situation tree parsed from the single document.
pprint(sop)

{'Animal bites – Just occurred or with a time delay': {'Advise BCEHS as required': {},
                                                       'Refer to Animal Control': {}},
 'Animal left in a vehicle': {'If animal is in imminent distress': {'Transfer caller to Fire': {}},
                              'In all other instances': {'Refer to Animal Control': {}}},
 'Deceased animals': {'Large animal': {'On highways': {'Contact the Department of Highways': {}},
                                       'On municipal property': {'Refer to Public Works Yard': {}}},
                      'Small animal': {'Refer to Animal Control': {}}},
 'Dogs at large': {'If running at large and attacking or viciously pursuing a person or domestic animal or livestock': {'Create a call': {}},
                   'In all other cases': {'Refer to Animal Control': {}}},
 'Dogs barking': {'Refer to Animal Control': {}},
 'Livestock at large': {'Create a call': {'Can the animal be contained?': {},
                    

In [9]:
# Inspect the phrase -> source-file index built while parsing.
pprint(doc_words)

{'Advise BCEHS as required': ['./data/A-ANIMAL.docx'],
 'Bear sightings': ['./data/A-ANIMAL.docx'],
 'Can the animal be contained?': ['./data/A-ANIMAL.docx'],
 'Contact the Department of Highways': ['./data/A-ANIMAL.docx'],
 'Create a call': ['./data/A-ANIMAL.docx'],
 'Create a call if there is immediate danger': ['./data/A-ANIMAL.docx'],
 'Create an MVI call': ['./data/A-ANIMAL.docx'],
 'Description': ['./data/A-ANIMAL.docx'],
 'How many animals are loose': ['./data/A-ANIMAL.docx'],
 'If animal exhibits aggressive behaviour (e.g. stalking)': ['./data/A-ANIMAL.docx'],
 'If animal is in imminent distress': ['./data/A-ANIMAL.docx'],
 'If animal remains in a residential or urban area or is in a park, school yard or recreational area (excluding rural trails) that would likely be in use': ['./data/A-ANIMAL.docx'],
 'If running at large and attacking or viciously pursuing a person or domestic animal or livestock': ['./data/A-ANIMAL.docx'],
 'If stray rabbit': ['./data/A-ANIMAL.docx'],
 'If w

In [10]:
# Graph of the single-document tree, with no node filtering.
graph = dict_to_graph(sop)

In [11]:
# Save the single-document graph as a PNG.
graph.write_png('example1_graph.png')

In [12]:
# Parse and merge the .docx files found directly inside ./data;
# `file_doc_words` maps each phrase to the files that contain it.
big_dict, file_doc_words = docs_to_dict('./data')

In [13]:
# Graph of the merged tree, with no node filtering.
graph = dict_to_graph(big_dict)

In [14]:
# Save the unfiltered merged graph as a PNG.
graph.write_png('example2_graph.png')

In [15]:
# Keep only phrases listed for at most 2 files; a dropped node gets no edge
# and its children are linked to the nearest kept ancestor.
graph = dict_to_graph(big_dict, filter_nodes(file_doc_words, doc_threshold=2))

In [16]:
# Save the graph filtered at threshold 2 as a PNG.
graph.write_png('example3_graph.png')

In [17]:
# Stricter filter: keep only phrases listed for at most one file.
graph = dict_to_graph(big_dict, filter_nodes(file_doc_words, doc_threshold=1))

In [18]:
# Save the graph filtered at threshold 1 as a PNG.
graph.write_png('example4_graph.png')

In [19]:
# Load the spaCy pipeline that merge_dict_conditions uses for
# part-of-speech tagging.
nlp = spacy.load("en_core_web_sm")

In [20]:
def flatten_sop(sop_dict):
    """
    Apply ``merge_dict_conditions`` to the subtree of every top-level key.

    :param sop_dict: mapping of top-level key -> nested subtree.
    :return: a new dict with the same keys and flattened subtrees.
    """
    return {
        situation: merge_dict_conditions(subtree)
        for situation, subtree in sop_dict.items()
    }

In [21]:
def merge_dict_conditions(sop_dict, previous_levels = []):
    """
    Flatten chains of intermediate nodes in a nested SOP subtree.

    Each key is turned into a sentence (prefixed by the accumulated levels)
    and run through the module-level spaCy pipeline ``nlp``. A node is kept,
    together with its original, unflattened subtree, when it is a leaf
    (``value == {}``) or when the token selected from the parsed sentence is
    tagged ``VERB``. It is stored at the top level if ``previous_levels`` is
    empty, otherwise under every entry of ``previous_levels``. Any other node
    is treated as an intermediate level: its key is joined onto the
    accumulated levels with ``' , '`` and its children are processed
    recursively, the results being combined with ``deep_merge_dicts``.

    :param sop_dict: nested dict of ``{text: subtree}``.
    :param previous_levels: level strings accumulated by enclosing calls.
        The shared default list is only read here, never mutated.
    :return: dict holding the flattened structure.
    """
    new_sop_dict = {}
    
    for key, value in sop_dict.items():
        # Lower-case the first character of the key.
        # NOTE(review): an empty key raises IndexError here.
        sentence = key[0].lower() + key[1:]
        
        if len(previous_levels) > 0:
            sentence = ' , '.join(previous_levels) + ' , ' + sentence
            
        # Upper-case these role words before tagging — presumably so spaCy
        # treats them as proper nouns; TODO confirm.
        sentence = sentence.replace('caller', 'CALLER').replace('complainant', 'COMPLAINANT')
        
        doc = nlp(sentence)

        # Select the token that sits as many positions from the end as the key
        # has space-separated words, i.e. the key's first word whenever spaCy
        # yields exactly one token per word.
        # NOTE(review): punctuation inside the key produces extra tokens and
        # shifts this index — confirm this is acceptable.
        if doc[-len(key.strip().split(' '))].pos_ == 'VERB' or value == {}:
            if len(previous_levels) == 0:
                new_sop_dict[key] = value
            else:
                # Attach the node under every accumulated level string.
                for level in previous_levels:
                    if level not in new_sop_dict.keys():
                        new_sop_dict[level] = {}

                    new_sop_dict[level][key] = value
        else:
            # Fold this key into the level strings and flatten its children.
            flattened_levels = []
            if len(previous_levels) == 0:
                flattened_levels.append(key)
            else:
                flattened_levels += list(map(lambda x: x + ' , ' + key, previous_levels))
            
            new_dict = merge_dict_conditions(value, flattened_levels)
            # deep_merge_dicts merges its first argument into the second and
            # returns the second, so the accumulator is rebound to new_dict.
            new_sop_dict = deep_merge_dicts(new_sop_dict, new_dict)   
            
    return new_sop_dict

In [22]:
# Graph the merged tree after flatten_sop has collapsed its intermediate levels.
graph = dict_to_graph(flatten_sop(big_dict))

In [23]:
# Save the flattened graph as a PNG.
graph.write_png('example5_graph.png')

In [26]:
# Run a sample sentence through spaCy so its tagging can be inspected below.
doc = nlp('In all other instances,, instruct CALLER not to remove animals from the trailer')

In [27]:
# Dump spaCy's annotations for the sample sentence, one token per line.
for token in doc:
    print(*(getattr(token, attr) for attr in (
        'text', 'lemma_', 'pos_', 'tag_', 'dep_',
        'shape_', 'is_alpha', 'is_stop',
    )))

In in ADP IN prep Xx True True
all all DET DT det xxx True True
other other ADJ JJ amod xxxx True True
instances instance NOUN NNS pobj xxxx True False
, , PUNCT , punct , False False
, , PUNCT , punct , False False
instruct instruct VERB VBP compound xxxx True False
CALLER CALLER PROPN NNP nsubj XXXX True False
not not PART RB neg xxx True True
to to PART TO aux xx True True
remove remove VERB VB ROOT xxxx True False
animals animal NOUN NNS dobj xxxx True False
from from ADP IN prep xxxx True True
the the DET DT det xxx True True
trailer trailer NOUN NN pobj xxxx True False


![](Situation.jpeg)