### ASIC Mapper

Company records are an essential part of many investigations. This script takes company record PDFs from the Australian Securities & Investments Commission (ASIC) and converts them into an interactive network graph, in order to more easily find otherwise obscure connections between companies. 

It uses the fitz (PyMuPDF) library to extract the text of each PDF into a table of key/value pairs, then pyvis to draw the interactive network graph.  

In [1]:
import PyPDF2
import fitz
import os
import pandas as pd
import pyvis
from pyvis.network import Network
import networkx as nx

In [2]:
def convert_PDFs_to_DatFrame(directory_where_PDFs_are):
    """Extract ``label: value`` pairs from every PDF in a directory.

    Each page's text is split into lines. A line that ends with ``:`` is
    treated as a label, and the line immediately after it as that label's
    value. For the label ``Address`` the value may continue onto a second
    line, which is appended unless it contains ``:`` or ``Class``.

    Parameters
    ----------
    directory_where_PDFs_are : str
        Directory to scan. Only entries whose name ends in ``.pdf``
        (case-insensitive) are opened.

    Returns
    -------
    pandas.DataFrame
        Columns ``file`` (file name without extension), ``key`` (upper-cased
        label) and ``value`` (upper-cased value), with exact duplicate rows
        removed. The frame has these columns even when nothing was found.
    """
    columns = ['file', 'key', 'value']
    rows = []

    # Sorted so the row order does not depend on the order os.listdir returns.
    pdf_names = sorted(
        name for name in os.listdir(directory_where_PDFs_are)
        if name.lower().endswith('.pdf')
    )

    for file in pdf_names:
        file_label = os.path.splitext(file)[0]

        # The context manager closes the document once its pages are read.
        with fitz.open(os.path.join(directory_where_PDFs_are, file)) as doc:
            for page in doc:
                lines = page.get_text().split('\n')

                # The last line is skipped: it has no following line to act
                # as its value.
                for n in range(len(lines) - 1):
                    line = lines[n].strip()
                    if not line.endswith(':'):
                        continue

                    key = line.replace(':', '').strip()
                    value = lines[n + 1]

                    # Only look at the continuation line when it exists; an
                    # 'Address:' label near the end of a page has none.
                    if key == 'Address' and n + 2 < len(lines):
                        continuation = lines[n + 2]
                        if ':' not in continuation and 'Class' not in continuation:
                            value = lines[n + 1] + continuation

                    rows.append([
                        file_label,
                        key.upper(),
                        value.replace('  ', ' ').strip().upper(),
                    ])

    # Build the frame once instead of enlarging it row by row.
    df = pd.DataFrame(rows, columns=columns)
    # Keep the 1-based row labels that row-by-row insertion used to produce.
    df.index = pd.RangeIndex(1, len(df) + 1)

    return df.drop_duplicates()

In [43]:
## Make df and really strip it down to the essentials: names, addresses, ABNs/ACNs

def simplify_the_DataFrame(df):
    """Filter and normalise the parsed key/value table before graphing.

    Uses the module-level settings ``exclude_pdfs``, ``exclude_values``,
    ``exclude_keys`` and ``simplify_addresses``.

    Steps:
      1. Drop rows whose ``file``, ``value`` or ``key`` is in the matching
         exclusion list.
      2. Relabel the keys ``REGISTERED ADDRESS`` and ``BUSINESS ADDRESS``
         (matched case-insensitively) as ``ADDRESS``.
      3. Replace any value containing a ``simplify_addresses`` key with that
         key's replacement text. When several keys match, the last one in
         the dict wins.
      4. Drop exact duplicate rows, including those created by steps 2-3.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with ``file``, ``key`` and ``value`` columns. It is not
        modified.

    Returns
    -------
    pandas.DataFrame
        A new, filtered and normalised table with the same columns.
    """
    excluded = (
        df['file'].isin(exclude_pdfs)
        | df['value'].isin(exclude_values)
        | df['key'].isin(exclude_keys)
    )
    # Explicit copy so the assignments below write to an independent frame
    # rather than to a slice of the caller's data.
    dfSIMPLE = df.loc[~excluded].copy()

    # Keys arrive upper-cased from the PDF parser, so compare in upper case;
    # a mixed-case comparison would never match them.
    is_address_key = dfSIMPLE['key'].str.upper().isin(
        ['REGISTERED ADDRESS', 'BUSINESS ADDRESS']
    )
    dfSIMPLE.loc[is_address_key, 'key'] = 'ADDRESS'

    dfSIMPLE = dfSIMPLE.drop_duplicates()

    # Match against a snapshot of the values so an earlier replacement cannot
    # influence which later keys match.
    original_values = dfSIMPLE['value'].copy()
    for needle, replacement in simplify_addresses.items():
        hits = original_values.str.contains(needle, regex=False, na=False)
        dfSIMPLE.loc[hits, 'value'] = replacement

    # Relabelling can make rows identical (e.g. two addresses of one file
    # collapsing to the same label); keep a single copy of each.
    return dfSIMPLE.drop_duplicates()

In [44]:
def make_interactive_network_graph(df):
    """Build a directed pyvis network from the simplified key/value table.

    Every row becomes an edge from the row's ``file`` to its ``value``, with
    the row's ``key`` as the edge title.

    Node styling:
      * Nodes whose id is a ``file`` value are pink circles, sized at 10x the
        number of times the id occurs in the ``file`` and ``value`` columns
        combined. They are labelled with the first 10 characters of the
        file's ``NAME`` value, after shortening AUSTRALIA/AUSTRALIAN to AUS.
      * Other nodes that occur more than once are coloured ``#550022`` and
        sized at 2x their occurrence count.
      * All remaining nodes get size 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with ``file``, ``key`` and ``value`` columns.

    Returns
    -------
    pyvis.network.Network
        The populated graph, ready for ``show``.
    """
    ### create nodes and edges

    dimension_size = 1000
    g = Network(height=dimension_size, width=dimension_size, notebook=True, directed=True)

    for source, target, edge_text in zip(df['file'], df['value'], df['key']):
        g.add_node(source, color='silver')
        g.add_node(target, color='silver')
        g.add_edge(source, target, weight=5, title=edge_text, color='#aaccff')

    ####  format graph

    files = set(df['file'].unique())

    # Occurrence count of every node id, computed once rather than per node.
    degree = pd.concat([df['file'], df['value']]).value_counts()

    # First NAME value recorded for each file.
    names = (
        df.loc[df['key'] == 'NAME']
        .drop_duplicates(subset='file')
        .set_index('file')['value']
    )

    for node in g.nodes:

        node['borderWidth'] = 1
        node['borderWidthSelected'] = 2

        node_id = node['id']
        count = int(degree[node_id])

        if node_id in files:
            node['color'] = 'pink'
            node['size'] = count * 10
            node['shape'] = 'circle'

            # A file without a NAME row keeps the label it already has.
            if node_id in names.index:
                # Replace the longer word first; replacing 'AUSTRALIA' first
                # would turn 'AUSTRALIAN' into 'AUSN'.
                node['label'] = (
                    str(names[node_id])
                    .replace('AUSTRALIAN', 'AUS')
                    .replace('AUSTRALIA', 'AUS')[0:10]
                )

        elif count > 1:
            node['color'] = '#550022'
            node['size'] = count * 2

        else:
            node['size'] = 1

    # NOTE(review): the Layout object created here is discarded, so this
    # call does not change `g`. Kept as-is to preserve the current layout.
    pyvis.options.Layout(improvedLayout=True)

    return g

In [45]:
# File labels (PDF file names without the extension) to leave out of the graph.
exclude_pdfs = [
    '2d3132303738333036313',
    '2d3134343731373030343',
    '2d3332393933313038353',
    '2d3439383731393736323',
    '383532333137393537323',
    '2d3839383238393439353',
    '373730323339383630323',
    '363536363032383233323',
    '383935353632313435383',
    '363536363032383233323',  # repeats an entry above; harmless for membership tests
]

# Rows whose value is exactly one of these strings are dropped.
exclude_values = [
    'LIMITED BY SHARES',
    'REGISTERED',
    'DEREGISTERED',
    'STRIKE-OFF ACTION IN PROGRESS',
    'AUSTRALIAN PROPRIETARY COMPANY',
    'PROPRIETARY COMPANY',
    'AUSTRALIAN PUBLIC COMPANY',
    'UNKNOWN',
    'LIMITED BY GUARANTEE',
    'LISTED PUBLIC COMPANY',
]

# Rows whose key is exactly one of these labels are dropped.
exclude_keys = [
    'REGISTERED IN',
    'APPOINTMENT DATE',
    'BORN',
    'START DATE',
    'REGISTRATION DATE',
    'NEXT REVIEW DATE',
    'NAME START DATE',
    'CEASE DATE',
    'DATE DEREGISTERED',
    'DEREGISTRATION REASON',
    'DEREGISTRATION CODE',
    'PREVIOUS STATE NUMBER',
    'SUBCLASS',
    'NAME START',
]

In [46]:
# Substring -> replacement text. A value containing the substring is replaced
# wholesale by the text on the right. Entry order is significant to the
# consumer: when several substrings match, the last entry wins.
simplify_addresses = {
    'SUITE 401, 54 MILLER STREET, NORTH SYDNEY NSW 2060': '401 / 54 MILLER STREET, NORTH SYDNEY',
    # Each of these substrings is relabelled as 'ADCREDA ACCOUNTING'.
    **dict.fromkeys(
        [
            '8 BEULAH',
            '245 FULLARTON',
            'GPO BOX 2447, ADELAIDE',
            'ADCREDA ACCOUNTING',
        ],
        'ADCREDA ACCOUNTING',
    ),
}

In [47]:
# Run the pipeline end to end: parse the PDFs in ./documents, reduce the
# table to the rows worth graphing, then build the network.
df = convert_PDFs_to_DatFrame(directory_where_PDFs_are='documents')
dfSIMPLE = simplify_the_DataFrame(df=df)
g = make_interactive_network_graph(df=dfSIMPLE)

# Write the interactive graph to test.html.
g.show('test.html')