In [1]:
import numpy as np
import pandas as pd

# Raise pandas' display limits so long/wide frames and long strings are not truncated.
pd.set_option('display.max_rows', 999)
pd.set_option('display.max_columns', 999)
pd.set_option('display.max_colwidth', 150)

## Load data

In [2]:
# Columns read from every entity file; this list also fixes the column order used later.
keep_cols = [
    'node_id', 'sourceID', 'name', 'incorporation_date', 'country_codes', 'countries',
    'jurisdiction_description', 'jurisdiction', 'service_provider', 'status',
]

# Explicit dtypes for the columns above ('incorporation_date' is left to pandas' inference).
dtypes = {
    'node_id': 'int32',
    'sourceID': 'category',
    'name': 'object',
    'country_codes': 'category',
    'countries': 'category',
    'jurisdiction_description': 'category',
    'jurisdiction': 'category',
    'service_provider': 'category',
    'status': 'category',
}

# The four files are read with identical options, so state those options once.
read_options = dict(usecols=keep_cols, dtype=dtypes)

bahamas_entity_raw = pd.read_csv('../data/raw/bahamas_leaks/bahamas_leaks.nodes.entity.csv', **read_options)
offshore_entity_raw = pd.read_csv('../data/raw/offshore_leaks/offshore_leaks.nodes.entity.csv', **read_options)
panama_entity_raw = pd.read_csv('../data/raw/panama_papers/panama_papers.nodes.entity.csv', **read_options)
paradise_entity_raw = pd.read_csv('../data/raw/paradise_papers/paradise_papers.nodes.entity.csv', **read_options)

## Standardize column order and concatenate dataframes

In [3]:
# Re-select the columns in `keep_cols` order for each raw frame so all four share one layout.
bahamas_entity_raw, offshore_entity_raw, panama_entity_raw, paradise_entity_raw = [
    frame[keep_cols]
    for frame in (bahamas_entity_raw, offshore_entity_raw, panama_entity_raw, paradise_entity_raw)
]

# Stack the four sources into a single frame with a fresh 0..n-1 index.
entity_df = pd.concat(
    [
        bahamas_entity_raw,
        offshore_entity_raw,
        panama_entity_raw,
        paradise_entity_raw,
    ],
    ignore_index=True,
)

## Standardize formatting

- Null entries imported as a string 'nan' instead of an actual null value.
- Capitalization is all over the place. Standardized as title case.

In [4]:
# Turn literal 'nan' strings into real missing values, across every column, in place.
entity_df.replace(to_replace='nan', value=np.nan, inplace=True)

# Names arrive with inconsistent capitalization; store them in title case.
title_cased_names = entity_df['name'].str.title()
entity_df['name'] = title_cased_names

## Consolidate entries in 'status' column

In [5]:
# Lower-case first so the substring rules below are effectively case-insensitive.
entity_df['status'] = entity_df['status'].str.lower()

# (pattern, canonical label) pairs. They are applied in this order, and each rule
# sees the labels written by the rules before it.
status_rules = [
    ('liquidation', 'in liquidation'),
    ('liquidated', 'liquidated'),
    ('resigned', 'resigned agent'),
    ('sundry', 'sundry account'),
    ('dissolved', 'dissolved'),
    ('struck|defunct|deregistered', 'struck / defunct / deregistered'),
]
for pattern, label in status_rules:
    matches = entity_df['status'].str.contains(pattern, na=False)
    entity_df.loc[matches, 'status'] = label

# Present the consolidated labels in title case.
entity_df['status'] = entity_df['status'].str.title()

## Format date

In [6]:
# Two textual date layouts are parsed here:
#   * '%d-%b-%Y' — recognised by a run of three capital letters (e.g. 30-NOV-1990)
#   * '%b %d, %Y' — recognised by a comma (e.g. Nov 30, 1990)
# Previously the second pd.to_datetime assignment replaced the whole 'formatted_date'
# column produced by the first one, so every comma-style date was silently dropped.
# Both layouts are now parsed into a single series.
raw_dates = entity_df['incorporation_date']
is_dmy = raw_dates.str.contains('[A-Z]{3}', na=False)
is_mdy = raw_dates.str.contains(',', na=False)

# Day-month-year strings whose trailing four characters (the year) fall outside
# 1800-2018 are treated as missing, exactly as before.
year_text = raw_dates.str[-4:]
out_of_range = is_dmy & ((year_text > '2018') | (year_text < '1800'))
usable = ~out_of_range

# Start from all-NaT and fill in each layout. The day-month-year layout is written
# last so it takes precedence if a value ever matched both patterns.
parsed_dates = pd.Series(pd.NaT, index=entity_df.index, dtype='datetime64[ns]')
mdy_rows = is_mdy & usable
dmy_rows = is_dmy & usable
parsed_dates.loc[mdy_rows] = pd.to_datetime(raw_dates[mdy_rows], format='%b %d, %Y')
parsed_dates.loc[dmy_rows] = pd.to_datetime(raw_dates[dmy_rows], format='%d-%b-%Y')

# Swap the text column for the parsed one. Dropping first keeps the parsed column
# as the last column of the frame, as in the previous version of this cell.
entity_df.drop(columns=['incorporation_date'], inplace=True)
entity_df['incorporation_date'] = parsed_dates

## Address duplicates and worthless entries

- 'country_codes' and 'countries' are strings that actually hold lists. The order of values within each list is not standardized, which causes duplicates when counting values.
- 'jurisdiction' and 'jurisdiction_description' included worthless values that should be null

In [7]:
# 'country_codes' and 'countries' hold ';'-separated values in no fixed order.
# Split each non-null entry and sort the pieces so equal sets compare equal;
# null entries stay null (the assignment aligns on the index).
for list_col in ('country_codes', 'countries'):
    has_value = entity_df[list_col].notnull()
    entity_df[list_col] = entity_df.loc[has_value, list_col].str.split(';').apply(sorted)

# Title-case the jurisdiction descriptions, then blank out placeholder values.
# case=False is required because the title-casing above changes the capitalization
# of 'Recorded in leaked files as "fund"', so a case-sensitive search never matched it.
# na=False keeps the mask strictly boolean; a mask containing NaN cannot be used with .loc.
entity_df['jurisdiction_description'] = entity_df['jurisdiction_description'].str.title()
is_placeholder = entity_df['jurisdiction_description'].str.contains(
    'Undetermined|Recorded in leaked files as "fund"', case=False, na=False
)
entity_df.loc[is_placeholder, 'jurisdiction_description'] = np.nan

# 'XXX' is a placeholder jurisdiction code; treat it as missing.
# (The former self-assignment of 'jurisdiction' was a no-op and has been removed.)
entity_df.loc[entity_df['jurisdiction'] == 'XXX', 'jurisdiction'] = np.nan

In [8]:
# Show the first three rows of the frame as a quick visual check.
entity_df.head(3)

Unnamed: 0,node_id,sourceID,name,country_codes,countries,jurisdiction_description,jurisdiction,service_provider,status,incorporation_date
0,20003127,Bahamas Leaks,Dalma Corporation Limited,,,Bahamas,BAH,,,1990-11-30
1,20010494,Bahamas Leaks,Asia Construction Corporation Limited,,,Bahamas,BAH,,,1992-08-14
2,20010495,Bahamas Leaks,Euro Logistics Limited,,,Bahamas,BAH,,,1992-08-14


In [3]:
# NOTE(review): `datasets` is not defined in the cells visible here; confirm that an
# earlier cell builds it (presumably a collection of the per-source frames).
# For each frame, print its first sourceID followed by its sorted column names.
for frame in datasets:
    source_label = frame['sourceID'][0]
    column_names = sorted(frame.columns)
    print(source_label)
    print(column_names, '\n')

Bahamas Leaks
['address', 'closed_date', 'company_type', 'countries', 'country_codes', 'ibcRUC', 'incorporation_date', 'jurisdiction', 'jurisdiction_description', 'labels(n)', 'name', 'node_id', 'note', 'service_provider', 'sourceID', 'status', 'type', 'valid_until'] 

Offshore Leaks
['closed_date', 'company_type', 'countries', 'country_codes', 'ibcRUC', 'inactivation_date', 'incorporation_date', 'jurisdiction', 'jurisdiction_description', 'name', 'node_id', 'note', 'service_provider', 'sourceID', 'status', 'struck_off_date', 'valid_until'] 

Panama Papers
['closed_date', 'company_type', 'countries', 'country_codes', 'ibcRUC', 'inactivation_date', 'incorporation_date', 'jurisdiction', 'jurisdiction_description', 'name', 'node_id', 'note', 'service_provider', 'sourceID', 'status', 'struck_off_date', 'valid_until'] 

Paradise Papers - Aruba corporate registry
['closed_date', 'company_type', 'countries', 'country_codes', 'ibcRUC', 'inactivation_date', 'incorporation_date', 'jurisdiction',

In [7]:
# NOTE(review): `datasets` is not defined in the cells visible here; confirm that an
# earlier cell builds it.
# For each frame, print its first distinct sourceID, its shape, and the number of
# distinct node_id values (compare with the row count to check that ids are unique).
for frame in datasets:
    source_label = frame['sourceID'].unique()[0]
    distinct_node_ids = frame['node_id'].nunique()
    print(source_label)
    print(frame.shape)
    print(distinct_node_ids, '\n')

Bahamas Leaks
(175888, 14)
175888 

Offshore Leaks
(105516, 14)
105516 

Panama Papers
(213634, 14)
213634 

Paradise Papers - Aruba corporate registry
(290086, 14)
290086 



In [10]:
def clean_country_and_code(df):
    """Normalize the 'country_codes' and 'countries' columns of ``df`` in place.

    Each cell is expected to be a ';'-separated string or an already-split
    list/tuple of values. For every cell the placeholder token ('XXX' for
    'country_codes', 'Not identified' for 'countries') is dropped and the
    remaining tokens are stored as a sorted list. Cells that are null, or that
    contain nothing but placeholders, become NaN.

    Fixes over the previous version:
      * the results of ``.str.replace`` were discarded, so placeholders inside
        multi-value strings (e.g. 'XXX;CHE') were never removed;
      * ``Series.replace(..., inplace=True)`` on a column selection is not
        guaranteed to write back to ``df``;
      * cells that were already lists made the function fail, so it could not
        be run on a frame whose columns had been split earlier, nor run twice.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'country_codes' and 'countries' columns. Both columns are
        overwritten.

    Returns
    -------
    None
    """
    def _normalize(value, placeholder):
        # Accept both the raw ';'-joined string and an already-split sequence.
        if isinstance(value, str):
            tokens = value.split(';')
        elif isinstance(value, (list, tuple)):
            tokens = list(value)
        else:
            # NaN / None / anything unexpected is treated as missing.
            return np.nan
        kept = sorted(token for token in tokens if token != placeholder)
        return kept if kept else np.nan

    for column, placeholder in (('country_codes', 'XXX'), ('countries', 'Not identified')):
        # Cast to object first so categorical columns are mapped element-wise
        # rather than through their categories.
        df[column] = df[column].astype(object).map(
            lambda value, token=placeholder: _normalize(value, token)
        )
    
# Clean the country columns of entity_df; the function assigns the cleaned
# columns back onto the frame it is given.
clean_country_and_code(entity_df)

In [None]:
# Write the frame to the intermediate data folder. With no `index` argument given,
# to_csv also writes the row index as the first, unnamed column.
entities_csv_path = '../data/intermediate/entities.csv'
entity_df.to_csv(entities_csv_path)