In [1]:
import numpy as np
import pandas as pd

# Raise the pandas display limits (rows, columns, column width) for notebook output.
pd.set_option('display.max_rows', 999)
pd.set_option('display.max_columns', 999)
pd.set_option('display.max_colwidth', 150)

In [2]:
# Columns kept from every officer-node CSV, in the order they should appear.
keep_cols = ['node_id', 'sourceID', 'name', 'countries', 'country_codes']
# Explicit dtypes passed to read_csv for the kept columns.
dtypes = {'node_id':'int32', 'sourceID':'category', 'name':'object', 'country_codes':'category', 'countries':'category'}


def load_officer_nodes(leak_name):
    """Read the officer-node CSV of one leak from ``../data/raw``.

    Parameters
    ----------
    leak_name : str
        Directory name and file prefix of the leak, e.g. ``'panama_papers'``
        for ``../data/raw/panama_papers/panama_papers.nodes.officer.csv``.

    Returns
    -------
    pandas.DataFrame
        Only the ``keep_cols`` columns, in ``keep_cols`` order, read with the
        dtypes given in ``dtypes``.
    """
    csv_path = '../data/raw/{0}/{0}.nodes.officer.csv'.format(leak_name)
    officer_nodes = pd.read_csv(csv_path, usecols=keep_cols, dtype=dtypes)
    # read_csv ignores the element order of `usecols`, so select again to
    # enforce the keep_cols column order.
    return officer_nodes[keep_cols]


# One raw frame per leak; all four share the same columns and dtypes.
bahamas_officer_raw = load_officer_nodes('bahamas_leaks')
offshore_officer_raw = load_officer_nodes('offshore_leaks')
panama_officer_raw = load_officer_nodes('panama_papers')
paradise_officer_raw = load_officer_nodes('paradise_papers')

### To Do

- Address duplicates
- Why are there company names in here??
- Are there names that go across the 4 datasets?

In [15]:
# Stack the four raw officer frames into one table with a fresh integer index,
# then preview the first rows.
officer_df = pd.concat(
    objs=[
        bahamas_officer_raw,
        offshore_officer_raw,
        panama_officer_raw,
        paradise_officer_raw,
    ],
    ignore_index=True,
)
officer_df.head(n=5)

Unnamed: 0,node_id,sourceID,name,countries,country_codes
0,22013341,Bahamas Leaks,ALPHA DIRECTION LTD.,,
1,22017206,Bahamas Leaks,GAMMA SECRETARIES,,
2,22023260,Bahamas Leaks,THETA DIRECTION LTD.,,
3,22006231,Bahamas Leaks,ALPHA DIRECTION LTD.,,
4,22012516,Bahamas Leaks,GAMMA SECRETARIES LTD.,,


In [4]:
# Number of officer rows per source dataset.
officer_df.loc[:, 'sourceID'].value_counts()

Panama Papers                                        238402
Paradise Papers - Barbados corporate registry        127912
Paradise Papers - Malta corporate registry           107209
Offshore Leaks                                       107190
Paradise Papers - Appleby                             76915
Paradise Papers - Aruba corporate registry            31286
Bahamas Leaks                                         25262
Paradise Papers - Samoa corporate registry             5828
Paradise Papers - Cook Islands corporate registry       644
Paradise Papers - Nevis corporate registry              202
Paradise Papers - Bahamas corporate registry             11
Paradise Papers - Lebanon corporate registry              1
Name: sourceID, dtype: int64

## Countries and country codes

In [5]:
# Observations on the raw `countries` column:
# - Country names differ slightly from the entity table,
#   e.g. "United States" vs "United States of America".
# - Some rows hold multiple countries; these will need to be turned into a sorted list.
# - The unknown category also has a different name here ("Not identified");
#   non-country options like this need to be removed.
officer_df.loc[:, 'countries'].value_counts()

Malta                                                                                                                             44916
Not identified                                                                                                                    39450
China                                                                                                                             36751
Hong Kong                                                                                                                         29413
United States                                                                                                                     26351
Taiwan                                                                                                                            20202
British Virgin Islands                                                                                                            17733
United Kingdom                                  

In [6]:
# Observations on the raw `country_codes` column:
# - Some rows hold multiple country codes; these will need to be turned into a sorted list.
# - The unknown category 'XXX' needs to be removed.
officer_df.loc[:, 'country_codes'].value_counts()

MLT                                                    44916
XXX                                                    39450
CHN                                                    36751
HKG                                                    29413
USA                                                    26355
TWN                                                    20205
VGB                                                    17741
GBR                                                    14030
JEY                                                     7994
RUS                                                     7475
ITA                                                     7310
SGP                                                     7019
CHE                                                     6927
PAN                                                     6182
WSM                                                     5213
CYP                                                     5019
GGY                     

In [17]:
# Country names are slightly different than in the entity table,
# e.g. "United States" vs "United States of America".
# NOTE(review): the saved output of this cell shows list-like values such as
# `[Malta]`; no visible cell produces that form, so it presumably came from a
# since-removed step -- confirm by re-running on a fresh kernel.
officer_df.loc[:, 'countries'].value_counts()

[Malta]                                                                          44916
[China]                                                                          36751
[Hong Kong]                                                                      29413
[United States]                                                                  26351
[Taiwan]                                                                         20202
[British Virgin Islands]                                                         17733
[United Kingdom]                                                                 14029
[Jersey]                                                                          7994
[Russia]                                                                          7475
[Italy]                                                                           7310
[Singapore]                                                                       7019
[Switzerland]                              

## Names

In [9]:
# Findings from the title-cased name counts:
# - There are a lot of duplicate entries; `The Bearer` alone has over 70k rows.
# - 'The Bearer' also appears under several spellings: 'The Bearer', 'Bearer 1', 'Bearer'.
# - These are officers, so why are there company names in here?
# - Open question: split into first, middle, and last names?
officer_df.loc[:, 'name'].str.title().value_counts()

The Bearer                                                                71850
El Portador                                                                9351
Bearer 1                                                                   2667
Bearer                                                                     1367
Carmichael Trevor A.                                                       1196
Clementi Limited                                                           1112
Tanah Merah Limited                                                        1047
Bukit Merah Limited                                                         963
Cst Administration (Baham                                                   835
The Corporate Secretary Limited                                             700
Court Administration Limi                                                   474
Primary Management Limite                                                   449
Barnes Deborah J.                       

In [19]:
# Store officer names in title case, then preview the first rows.
officer_df['name'] = officer_df.loc[:, 'name'].str.title()
officer_df.head(n=5)

Unnamed: 0,node_id,sourceID,name,countries,country_codes
0,22013341,Bahamas Leaks,Alpha Direction Ltd.,,
1,22017206,Bahamas Leaks,Gamma Secretaries,,
2,22023260,Bahamas Leaks,Theta Direction Ltd.,,
3,22006231,Bahamas Leaks,Alpha Direction Ltd.,,
4,22012516,Bahamas Leaks,Gamma Secretaries Ltd.,,


In [4]:
def _clean_multi_value(series, unknown):
    """Return ``series`` with each ``;``-separated cell filtered, sorted and re-joined.

    Tokens equal to ``unknown`` (and empty tokens) are dropped; the remaining
    tokens are sorted and joined with ``', '``. Null cells, and cells with no
    tokens left, become ``NaN``.
    """
    def _clean_cell(cell):
        if pd.isnull(cell):
            return np.nan
        kept = sorted(token for token in str(cell).split(';') if token and token != unknown)
        return ', '.join(kept) if kept else np.nan

    # Cast to object first so categorical input is mapped value by value.
    return series.astype(object).map(_clean_cell)


def clean_countries_and_codes(df):
    """Normalise the ``country_codes`` and ``countries`` columns of ``df`` in place.

    Each cell holds zero or more ``;``-separated values. In both columns the
    placeholder for an unknown country (``'XXX'`` for codes, ``'Not identified'``
    for names) is removed, and the remaining values are sorted and joined with
    ``', '``. Cells that are null, or that contain only the placeholder,
    become ``NaN``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``country_codes`` and ``countries`` columns. It is modified
        in place: both columns are replaced by their cleaned versions and end
        up as the last two columns, in the order ``country_codes``, ``countries``.

    Returns
    -------
    None
    """
    # Splitting and filtering tokens (rather than a regex `str.replace`) gives
    # the same result whatever the installed pandas uses as its `regex` default.
    cleaned_codes = _clean_multi_value(df['country_codes'], 'XXX')
    cleaned_countries = _clean_multi_value(df['countries'], 'Not identified')

    # Drop the originals first so the cleaned columns are appended at the end,
    # codes before names.
    df.drop(columns=['country_codes', 'countries'], inplace=True)
    df['country_codes'] = cleaned_codes
    df['countries'] = cleaned_countries


# Rebuild the combined officer table from the four raw frames, title-case the
# names, clean the country columns in place, and preview the result.
officer_df = pd.concat(
    objs=[
        bahamas_officer_raw,
        offshore_officer_raw,
        panama_officer_raw,
        paradise_officer_raw,
    ],
    ignore_index=True,
)
officer_df['name'] = officer_df.loc[:, 'name'].str.title()
clean_countries_and_codes(officer_df)

officer_df.head(n=5)

Unnamed: 0,node_id,sourceID,name,country_codes,countries
0,22013341,Bahamas Leaks,Alpha Direction Ltd.,,
1,22017206,Bahamas Leaks,Gamma Secretaries,,
2,22023260,Bahamas Leaks,Theta Direction Ltd.,,
3,22006231,Bahamas Leaks,Alpha Direction Ltd.,,
4,22012516,Bahamas Leaks,Gamma Secretaries Ltd.,,


In [12]:
# Write officer_df to the intermediate data folder as CSV, omitting the row index.
officer_df.to_csv(path_or_buf='../data/intermediate/officers.csv', index=False)