In [9]:
import mammoth
from bs4 import BeautifulSoup
from pprint import pprint
import pydot
from IPython.display import Image, display
from os import listdir
from os.path import isfile, join

In [2]:
def nested_list_to_dict(ul):
    """Convert a (possibly nested) HTML ``<ul>`` element into a nested dict.

    Each direct ``<li>`` child of ``ul`` becomes one key of the result.  The
    key is ``li.next_element`` -- the first node inside the item, which for a
    plain-text item is its text (whitespace is kept exactly as parsed).  The
    value is the dict built recursively from the first ``<ul>`` found inside
    that item, or an empty dict when the item has no sub-list.

    Args:
        ul: BeautifulSoup ``Tag`` for a ``<ul>`` element.

    Returns:
        dict: Mapping of item key -> nested dict of that item's sub-items.
    """
    level = {}

    # recursive=False: only this list's own items; deeper items are handled
    # by the recursive call on their own <ul>.
    for li in ul.find_all('li', recursive=False):
        child_ul = li.find('ul')
        level[li.next_element] = (
            {} if child_ul is None else nested_list_to_dict(child_ul)
        )

    return level

def file_to_html(file_path):
    """Convert a Word document to HTML with mammoth and parse the result.

    Args:
        file_path: Path of the document to convert; opened in binary mode.

    Returns:
        BeautifulSoup: Parsed tree of the HTML produced by mammoth.
    """
    # The context manager closes the file even if the conversion raises.
    with open(file_path, 'rb') as docx_file:
        document = mammoth.convert_to_html(docx_file)

    # Pass the generated HTML text straight to bs4 (no encode/decode round
    # trip) and name the parser explicitly: without it bs4 emits a warning
    # and uses whichever parser happens to be installed.
    return BeautifulSoup(document.value, 'html.parser')

def doc_to_dict(file_path, section='call taker'):
    """Extract one ``<h1>`` section of a document as a nested dict.

    The document is converted with ``file_to_html``.  Starting at the
    ``<h1>`` whose text equals ``section``, every following element is
    scanned until the next ``<h1>``:

    * each ``<h2>`` opens a new top-level entry keyed by its text;
    * each ``<p>`` becomes a key under the current ``<h2>``; its value is the
      dict built from the ``<ul>`` that directly follows the paragraph's
      text, or ``{}`` when no list follows.

    Args:
        file_path: Path of the document to read.
        section: Exact text of the ``<h1>`` heading that starts the section.

    Returns:
        dict: ``{h2 text: {paragraph text: nested list dict}}``.  Empty when
        the document has no matching ``<h1>``.  Paragraphs that appear before
        the first ``<h2>`` are skipped because they belong to no entry.
    """
    html = file_to_html(file_path)
    start = html.find('h1', string=section)
    event = {}

    if start is None:
        # No such heading: nothing to extract from this document.
        return event

    current_situation = None

    for element in start.next_elements:
        tag = element.name

        if tag == 'h1':
            # The next top-level heading ends the section.
            break
        elif tag == 'h2':
            current_situation = element.text
            event[current_situation] = {}
        elif tag == 'p':
            if current_situation is None:
                continue
            event[current_situation][element.text] = _list_after_paragraph(element)

    return event


def _list_after_paragraph(paragraph):
    """Return the dict for the ``<ul>`` directly following ``paragraph``'s text, else ``{}``."""
    first = paragraph.next_element
    following = first.next_element if first is not None else None

    # ``following`` is None when the paragraph ends the document.
    if getattr(following, 'name', None) != 'ul':
        return {}

    # Descend to the first nested <ul> when there is one (presumably the
    # converter wraps indented bullets in an outer list -- TODO confirm);
    # otherwise use the list itself instead of passing None downstream.
    nested = following.find('ul')
    return nested_list_to_dict(nested if nested is not None else following)

In [3]:
# Parse the 'call taker' section of one Word document into a nested dict.
sop = doc_to_dict("./data/A-ANIMAL.docx")

In [4]:
# Pretty-print the nested dict to inspect the parsed structure.
pprint(sop)

{'Animal bites – Just occurred or with a time delay': {'Advise BCEHS as required': {},
                                                       'Refer to Animal Control': {}},
 'Animal left in a vehicle': {'If animal is in imminent distress': {'Transfer caller to Fire': {}},
                              'In all other instances': {'Refer to Animal Control': {}}},
 'Deceased animals': {' Large animal': {'On highways': {'Contact the Department of Highways': {}},
                                        'On municipal property': {'Refer to Public Works Yard': {}}},
                      'Small animal': {'Refer to Animal Control': {}}},
 'Dogs at large': {'If running at large and attacking or viciously pursuing a person or domestic animal or livestock': {'Create a call': {}},
                   'In all other cases ': {'Refer to Animal Control': {}}},
 'Dogs barking': {'Refer to Animal Control': {}},
 'Livestock at large': {'Create a call': {'Can the animal be contained?': {},
                 

In [5]:
def draw(graph, parent_name, child_name):
    """Add one edge from ``parent_name`` to ``child_name`` to a pydot graph.

    Args:
        graph: Graph object exposing ``add_edge``.
        parent_name: Name of the edge's source node.
        child_name: Name of the edge's destination node.
    """
    graph.add_edge(pydot.Edge(parent_name, child_name))

def visit(graph, node, parent=None):
    """Walk a nested dict depth-first, adding a parent -> child edge per key.

    Only edges are added: keys of the outermost dict get no incoming edge,
    so an outermost key whose value is an empty dict does not appear in the
    graph at all.

    Args:
        graph: Graph passed through to ``draw``.
        node: Dict mapping a label to the nested dict of its children.
        parent: Label that owns ``node``; ``None`` for the outermost dict.
    """
    for label, children in node.items():
        # Test against None, not truthiness, so a falsy label such as an
        # empty string still gets edges to its children.
        if parent is not None:
            draw(graph, parent, label)
        visit(graph, children, label)

def dict_to_graph(sop_dict):
    """Build an undirected pydot graph from a nested dict.

    Args:
        sop_dict: Nested dict whose keys become node names; each key is
            linked to the keys of its value by ``visit``.

    Returns:
        The ``pydot.Dot`` graph (``graph_type='graph'``) after ``visit`` has
        populated it.
    """
    sop_graph = pydot.Dot(graph_type='graph')
    visit(sop_graph, sop_dict)
    return sop_graph

In [6]:
# Build a pydot graph whose edges link each dict key to its child keys.
graph = dict_to_graph(sop)

In [7]:
# Render the single-document graph to a PNG (path is relative to the working directory).
graph.write_png('example1_graph.png')

In [12]:
def docs_to_dict(dir_path):
    """Parse every ``.docx`` file directly inside ``dir_path`` into one dict.

    Each document is parsed with ``doc_to_dict`` and the results are merged
    shallowly: when two documents share a top-level key, the entry from the
    file that sorts later by name replaces the earlier one.

    Args:
        dir_path: Directory to scan (sub-directories are not descended into).

    Returns:
        dict: Union of the per-document dicts; empty if nothing matched.
    """
    # Sort because listdir order is arbitrary and would otherwise make the
    # winner of a key collision vary between runs.  Only .docx files are
    # parsed, and Word's '~$' lock files are skipped, so stray files in the
    # directory do not abort the whole run.
    docx_names = sorted(
        name for name in listdir(dir_path)
        if isfile(join(dir_path, name))
        and name.lower().endswith('.docx')
        and not name.startswith('~$')
    )

    dir_dict = {}
    for name in docx_names:
        # Update in place instead of rebuilding the merged dict per file.
        dir_dict.update(doc_to_dict(join(dir_path, name)))

    return dir_dict

In [13]:
# Parse the documents in ./data and merge the per-file dicts into one.
big_dict = docs_to_dict('./data')

In [15]:
# Build the graph for the merged dict; this rebinds `graph` from the single-document example.
graph = dict_to_graph(big_dict)

In [16]:
# Render the merged graph to a second PNG (path is relative to the working directory).
graph.write_png('example2_graph.png')