In [1]:
import numpy as np
import pandas as pd

import nltk
import string
import re

import datetime as dt

pd.options.display.max_rows = 999
pd.options.display.max_columns = 999
pd.options.display.max_colwidth = 150

In [2]:
keep_cols = ['node_id', 'sourceID', 'name', 'incorporation_date', 'country_codes', 'countries',
             'jurisdiction_description', 'jurisdiction', 'service_provider', 'status']
dtypes = {'node_id': 'int32', 'sourceID':'category', 'name':'object', 'country_codes':'category', 'countries':'category',
          'jurisdiction_description':'category', 'jurisdiction':'category', 'service_provider':'category', 'status':'category'}

bahamas_entity_raw = pd.read_csv('../data/raw/bahamas_leaks/bahamas_leaks.nodes.entity.csv', 
                                 usecols = keep_cols,
                                 dtype=dtypes)
offshore_entity_raw = pd.read_csv('../data/raw/offshore_leaks/offshore_leaks.nodes.entity.csv', 
                                 usecols = keep_cols,
                                 dtype=dtypes)
panama_entity_raw = pd.read_csv('../data/raw/panama_papers/panama_papers.nodes.entity.csv', 
                                 usecols = keep_cols,
                                 dtype=dtypes)
paradise_entity_raw = pd.read_csv('../data/raw/paradise_papers/paradise_papers.nodes.entity.csv', 
                                 usecols = keep_cols,
                                 dtype=dtypes)

In [3]:
bahamas_entity_raw = bahamas_entity_raw[keep_cols]
offshore_entity_raw = offshore_entity_raw[keep_cols]
panama_entity_raw = panama_entity_raw[keep_cols]
paradise_entity_raw = paradise_entity_raw[keep_cols]

In [137]:
entity_df = pd.concat([bahamas_entity_raw, offshore_entity_raw, panama_entity_raw, paradise_entity_raw], ignore_index=True)
entity_df['company'] = entity_df['name'].str.title()

entity_df['clean_status'] = entity_df['status'].str.lower()
entity_df.loc[entity_df['clean_status'].str.contains('liquidation', na=False), 'status'] = 'in liquidation'
entity_df.loc[entity_df['clean_status'].str.contains('liquidated', na=False), 'status'] = 'liquidated'
entity_df.loc[entity_df['clean_status'].str.contains('resigned', na=False), 'status'] = 'resigned agent'
entity_df.loc[entity_df['clean_status'].str.contains('sundry', na=False), 'status'] = 'sundry account'
entity_df.loc[entity_df['clean_status'].str.contains('dissolved', na=False), 'status'] = 'dissolved'
entity_df.loc[entity_df['clean_status'].str.contains('struck|defunct|deregistered', na=False), 'status'] = 'struck / defunct / deregistered'   
entity_df['clean_status'] = entity_df['clean_status'].str.title()

entity_df.replace('nan', np.nan, inplace=True)

entity_df.loc[
    (entity_df['incorporation_date'].str.contains('[A-Z]{3}', na=False)) & ((entity_df['incorporation_date'].str[-4:] > '2018') | (entity_df['incorporation_date'].str[-4:] < '1800')
                                                                           ), 'incorporation_date'] = np.nan
entity_df['formatted_date'] = pd.to_datetime(entity_df.loc[entity_df['incorporation_date'].str.contains(',', na=False), 'incorporation_date'], format='%b %d, %Y')
entity_df['formatted_date'] = pd.to_datetime(entity_df.loc[entity_df['incorporation_date'].str.contains('[A-Z]{3}', na=False), 'incorporation_date'], format='%d-%b-%Y')

entity_df['clean_co_codes'] = entity_df['country_codes'].str.split(';').apply(lambda x: sorted(x))
entity_df['clean_countries'] = entity_df['countries'].str.split(';').apply(lambda x: sorted(x))
entity_df['clean_jurisdiction_desc'] = entity_df['jurisdiction_description'].str.title()
entity_df.loc[entity_df['clean_jurisdiction_desc'].str.contains('Undetermined|Recorded in leaked files as "fund"'), 'clean_jursdiction_desc'] = np.nan
entity_df['clean_jurisdiction'] = entity_df['jurisdiction']
entity_df.loc[entity_df['clean_jurisdiction'] == 'XXX', 'clean_jurisdiction'] = np.nan

entity_df.head()

Unnamed: 0,node_id,sourceID,name,incorporation_date,country_codes,countries,jurisdiction_description,jurisdiction,service_provider,status,company,clean_status
0,20003127,Bahamas Leaks,DALMA CORPORATION LIMITED,30-NOV-1990,,,Bahamas,BAH,,,Dalma Corporation Limited,
1,20010494,Bahamas Leaks,ASIA CONSTRUCTION CORPORATION LIMITED,14-AUG-1992,,,Bahamas,BAH,,,Asia Construction Corporation Limited,
2,20010495,Bahamas Leaks,EURO LOGISTICS LIMITED,14-AUG-1992,,,Bahamas,BAH,,,Euro Logistics Limited,
3,20010496,Bahamas Leaks,EURO LEISURE LIMITED,14-AUG-1992,,,Bahamas,BAH,,,Euro Leisure Limited,
4,20010497,Bahamas Leaks,EURO DATA PROCUREMENT LIMITED,14-AUG-1992,,,Bahamas,BAH,,,Euro Data Procurement Limited,


## Duplicates

In [5]:
entity_df.loc[entity_df['company'].duplicated(keep=False), ['node_id', 'name', 'company', 'incorporation_date', 'sourceID']].sort_values(by='company')

Unnamed: 0,node_id,name,company,incorporation_date,sourceID
63177,85039232,"""A.T.L.I."" ARUBA'S TRAINING AND LANGUAGE INSTITUTE","""A.T.L.I."" Aruba'S Training And Language Institute",15-AUG-2008,Paradise Papers - Aruba corporate registry
63590,85039653,"""A.T.L.I."" ARUBA'S TRAINING AND LANGUAGE INSTITUTE","""A.T.L.I."" Aruba'S Training And Language Institute",05-NOV-2008,Paradise Papers - Aruba corporate registry
54003,85029855,"""CENTRO MUSICAL"" DO-RE-MI","""Centro Musical"" Do-Re-Mi",01-MAY-2002,Paradise Papers - Aruba corporate registry
56508,85032414,"""CENTRO MUSICAL"" DO-RE-MI","""Centro Musical"" Do-Re-Mi",04-MAR-2004,Paradise Papers - Aruba corporate registry
65220,85041336,"""FACES""","""Faces""",26-MAR-2010,Paradise Papers - Aruba corporate registry
63823,85039897,"""FACES""","""Faces""",01-FEB-2009,Paradise Papers - Aruba corporate registry
67602,85043767,"""FLORTIEK"" BY WILSON","""Flortiek"" By Wilson",28-MAR-2012,Paradise Papers - Aruba corporate registry
63625,85039688,"""FLORTIEK"" BY WILSON","""Flortiek"" By Wilson",15-NOV-2008,Paradise Papers - Aruba corporate registry
58957,85034915,# 1 APPLIANCES SERVICE AND PARTS,# 1 Appliances Service And Parts,17-AUG-2005,Paradise Papers - Aruba corporate registry
50787,85026562,# 1 APPLIANCES SERVICE AND PARTS,# 1 Appliances Service And Parts,18-APR-2000,Paradise Papers - Aruba corporate registry


There are duplicate companies between investigations, but there are also duplicates that just have different incorporation dates. They've got different node_ids, which will need to be addressed. Will need to check against the address to see if they're really duplicates, or if they're actually different companies that just happen to have the same (oddly specific in some cases) name.

## Incorporation date

Received an error when attepting to look at the dates. Looking at the head and tail when sorted by date shows there are at least two date formats.

In [89]:
print(entity_df['formatted_date'].min())
print(entity_df['formatted_date'].max())

1865-10-26 00:00:00
2157-05-09 00:00:00


In [150]:
entity_df.loc[entity_df['incorporation_date'].notna(), 'incorporation_date'].sort_values().head()

523482    01-APR-1918
569160    01-APR-1926
569251    01-APR-1931
523466    01-APR-1933
569285    01-APR-1933
Name: incorporation_date, dtype: object

In [149]:
entity_df.loc[entity_df['incorporation_date'].notna(), ['incorporation_date', 'sourceID']].sort_values('incorporation_date').tail(17)

Unnamed: 0,incorporation_date,sourceID
699001,31-OCT-2016,Paradise Papers - Malta corporate registry
699003,31-OCT-2016,Paradise Papers - Malta corporate registry
674578,31-OCT-2016,Paradise Papers - Nevis corporate registry
507099,"Apr 01, 2004",Paradise Papers - Malta corporate registry
507100,"Apr 08, 1998",Paradise Papers - Malta corporate registry
507097,"Aug 26, 2008",Paradise Papers - Malta corporate registry
507089,"Dec 10, 2013",Paradise Papers - Malta corporate registry
507093,"Feb 23, 2011",Paradise Papers - Malta corporate registry
507098,"Jan 09, 2007",Paradise Papers - Malta corporate registry
507094,"Jan 25, 2005",Paradise Papers - Malta corporate registry


Was able to standardize the format version Mon dd, YYYY. However, received 'out of range' errors while converting the dd-MON-YYYY format. Based on the error message, discovered there are typos in the year, giving years greater than 2018 (the last update to the dataset) as well as less than 1000. Values like 2812 and 0990 are probaby typos of 2012 and 1990, but the others are much less clear. Quick Google searches for the incorporation date don't reveal the correct date. Removing these values as there is no easy way to fix them and I doubt the data entry personal had precognition.

In [7]:
entity_df.loc[
    (entity_df['incorporation_date'].str.contains('[A-Z]{3}', na=False)) & ((entity_df['incorporation_date'].str[-4:] > '2018') | (entity_df['incorporation_date'].str[-4:] < '1800')
                                                                           ), 'incorporation_date'] = np.nan
entity_df['formatted_date'] = pd.to_datetime(entity_df.loc[entity_df['incorporation_date'].str.contains(',', na=False), 'incorporation_date'], format='%b %d, %Y')
entity_df['formatted_date'] = pd.to_datetime(entity_df.loc[entity_df['incorporation_date'].str.contains('[A-Z]{3}', na=False), 'incorporation_date'], format='%d-%b-%Y')

In [148]:
entity_df[(entity_df['formatted_date'].dt.year > 2018) | (entity_df['formatted_date'].dt.year < 1000)]

Unnamed: 0,node_id,sourceID,name,incorporation_date,country_codes,countries,jurisdiction_description,jurisdiction,service_provider,status,company,clean_status,formatted_date


## Countries, codes, and jurisdictions

Country codes include the codes listed both ways when there are more than one. Example: VGB;HKG - HKG;VGB. Same for countries.

Jurisdiction descriptions have multiple entries for the same entities due to differences in capicalization. Example: 'United States of America' and 'United States Of America'.

In [13]:
entity_df[entity_df['country_codes'].notnull().str.split(';')]

AttributeError: Can only use .str accessor with string values, which use np.object_ dtype in pandas

In [21]:
entity_df[entity_df['country_codes'].str.split(';')]

ValueError: cannot index with vector containing NA / NaN values

In [16]:
entity_df['clean_co_codes'] = entity_df['country_codes'].str.split(';').apply(lambda x: sorted(x))
entity_df['clean_countries'] = entity_df['countries'].str.split(';').apply(lambda x: sorted(x))
entity_df['clean_jurisdiction_desc'] = entity_df['jurisdiction_description'].str.title()
entity_df.loc[entity_df['clean_jurisdiction_desc'].str.contains('Undetermined|Recorded in leaked files as "fund"', na=False), 'clean_jursdiction_desc'] = np.nan
entity_df['clean_jurisdiction'] = entity_df['jurisdiction']
entity_df.loc[entity_df['clean_jurisdiction'] == 'XXX', 'clean_jurisdiction'] = np.nan

TypeError: 'float' object is not iterable

In [174]:
for col in ['jurisdiction_description', 'jurisdiction']:
    print(col)
    print(entity_df[col].value_counts(), '\n')

jurisdiction_description
Bahamas                               209686
British Virgin Islands                153872
Malta                                  83934
Saint Kitts and Nevis                  70602
Undetermined                           53571
Aruba                                  49050
Panama                                 48406
Barbados                               40845
Seychelles                             15964
Samoa                                  15013
Niue                                    9611
Bermuda                                 9456
Cayman Islands                          8755
Cook Islands                            4135
British Anguilla                        3253
Hong Kong                               1644
Isle of Man                             1494
Nevada                                  1260
Jersey                                   894
Singapore                                747
Cayman                                   668
Mauritius                     

In [23]:
entity_df[entity_df['country_codes'].str.contains(';', na=False)]

Unnamed: 0,node_id,sourceID,name,incorporation_date,country_codes,countries,jurisdiction_description,jurisdiction,service_provider,status,formatted_date
175888,67028,Offshore Leaks,Zodiak Ltd,24-AUG-2007,VGB;CYP,British Virgin Islands;Cyprus,Undetermined,XXX,Commonwealth Trust Limited,Dead,2007-08-24
175889,67243,Offshore Leaks,Zven Ltd.,29-JAN-2007,XXX;VGB,Not identified;British Virgin Islands,Undetermined,XXX,Commonwealth Trust Limited,Dead,2007-01-29
175890,67258,Offshore Leaks,Anson 11A Ltd.,09-MAY-2000,SGP;VGB,Singapore;British Virgin Islands,Undetermined,XXX,Commonwealth Trust Limited,Liquidated,2000-05-09
175891,67266,Offshore Leaks,Giada Ltd,14-JUN-2007,CYP;VGB,Cyprus;British Virgin Islands,Undetermined,XXX,Commonwealth Trust Limited,Transferred Out,2007-06-14
175892,108050,Offshore Leaks,Scott D. Howard,,USA;XXX,United States;Not identified,Undetermined,XXX,Commonwealth Trust Limited,,NaT
175894,108110,Offshore Leaks,"Van Vector, Ltd.",09-APR-2003,RUS;VGB,Russia;British Virgin Islands,Undetermined,XXX,Commonwealth Trust Limited,Active,2003-04-09
175895,108141,Offshore Leaks,Southglen Limited,15-JUN-1998,XXX;NLD,Not identified;Netherlands,Undetermined,XXX,Commonwealth Trust Limited,Dead,1998-06-15
175896,108147,Offshore Leaks,Julius J. Grodski,,XXX;USA,Not identified;United States,Undetermined,XXX,Commonwealth Trust Limited,,NaT
175898,108197,Offshore Leaks,Miraley Ltd.,16-JUN-2005,VGB;RUS,British Virgin Islands;Russia,Undetermined,XXX,Commonwealth Trust Limited,Dead,2005-06-16
175899,109720,Offshore Leaks,Bahiti Ltd,06-JUL-2007,VGB;CYP,British Virgin Islands;Cyprus,Undetermined,XXX,Commonwealth Trust Limited,Active,2007-07-06


In [37]:
entity_df.loc[entity_df['country_codes'].notnull(), 'country_codes'].str.split(';').apply(lambda x: sorted(x)).head()

175888    [CYP, VGB]
175889    [VGB, XXX]
175890    [SGP, VGB]
175891    [CYP, VGB]
175892    [USA, XXX]
Name: country_codes, dtype: object

In [6]:
entity_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 785124 entries, 0 to 785123
Data columns (total 11 columns):
node_id                     785124 non-null int64
sourceID                    785124 non-null object
name                        785095 non-null object
incorporation_date          771330 non-null object
country_codes               488105 non-null object
countries                   488105 non-null object
jurisdiction_description    785124 non-null object
jurisdiction                785124 non-null object
service_provider            344086 non-null object
status                      337782 non-null object
formatted_date              771314 non-null datetime64[ns]
dtypes: datetime64[ns](1), int64(1), object(9)
memory usage: 65.9+ MB


In [49]:
entity_df.drop(columns=['incorporation_date'], inplace=True)
print(entity_df.columns)
entity_df.rename(columns={'formatted_date':'incorporation_date'})

Index(['node_id', 'sourceID', 'name', 'country_codes', 'countries',
       'jurisdiction_description', 'jurisdiction', 'service_provider',
       'status', 'formatted_date'],
      dtype='object')


Unnamed: 0,node_id,sourceID,name,country_codes,countries,jurisdiction_description,jurisdiction,service_provider,status,incorporation_date
0,20003127,Bahamas Leaks,Dalma Corporation Limited,,,Bahamas,BAH,,,1990-11-30
1,20010494,Bahamas Leaks,Asia Construction Corporation Limited,,,Bahamas,BAH,,,1992-08-14
2,20010495,Bahamas Leaks,Euro Logistics Limited,,,Bahamas,BAH,,,1992-08-14
3,20010496,Bahamas Leaks,Euro Leisure Limited,,,Bahamas,BAH,,,1992-08-14
4,20010497,Bahamas Leaks,Euro Data Procurement Limited,,,Bahamas,BAH,,,1992-08-14
5,20010498,Bahamas Leaks,Sterling Industries Limited,,,Bahamas,BAH,,,1992-08-14
6,20010499,Bahamas Leaks,General Commercial Enterprises Limited,,,Bahamas,BAH,,,1992-08-14
7,20010500,Bahamas Leaks,Newark Industries Limited,,,Bahamas,BAH,,,1992-08-14
8,20010501,Bahamas Leaks,Summit Enterprises Inc.,,,Bahamas,BAH,,,1992-08-14
9,20010502,Bahamas Leaks,Armaco International Ltd.,,,Bahamas,BAH,,,1992-08-14


In [5]:
entity_df = pd.concat([bahamas_entity_raw, offshore_entity_raw, panama_entity_raw, paradise_entity_raw], ignore_index=True)
entity_df.replace('nan', np.nan, inplace=True)
entity_df['name'] = entity_df['name'].str.title()

entity_df['status'] = entity_df['status'].str.lower()
entity_df.loc[entity_df['status'].str.contains('liquidation', na=False), 'status'] = 'in liquidation'
entity_df.loc[entity_df['status'].str.contains('liquidated', na=False), 'status'] = 'liquidated'
entity_df.loc[entity_df['status'].str.contains('resigned', na=False), 'status'] = 'resigned agent'
entity_df.loc[entity_df['status'].str.contains('sundry', na=False), 'status'] = 'sundry account'
entity_df.loc[entity_df['status'].str.contains('dissolved', na=False), 'status'] = 'dissolved'
entity_df.loc[entity_df['status'].str.contains('struck|defunct|deregistered', na=False), 'status'] = 'struck / defunct / deregistered'   
entity_df['status'] = entity_df['status'].str.title()


entity_df.loc[
    (entity_df['incorporation_date'].str.contains('[A-Z]{3}', na=False)) & ((entity_df['incorporation_date'].str[-4:] > '2018') | (entity_df['incorporation_date'].str[-4:] < '1800')
                                                                           ), 'incorporation_date'] = np.nan
entity_df['formatted_date'] = pd.to_datetime(entity_df.loc[entity_df['incorporation_date'].str.contains(',', na=False), 'incorporation_date'], format='%b %d, %Y')
entity_df['formatted_date'] = pd.to_datetime(entity_df.loc[entity_df['incorporation_date'].str.contains('[A-Z]{3}', na=False), 'incorporation_date'], format='%d-%b-%Y')

entity_df['country_codes'] = entity_df.loc[entity_df['country_codes'].notnull(), 'country_codes'].str.split(';').apply(lambda x: sorted(x))
entity_df['countries'] = entity_df.loc[entity_df['countries'].notnull(), 'countries'].str.split(';').apply(lambda x: sorted(x))
entity_df['jurisdiction_description'] = entity_df['jurisdiction_description'].str.title()
entity_df.loc[entity_df['jurisdiction_description'].str.contains('Undetermined|Recorded in leaked files as "fund"'), 'jurisdiction_description'] = np.nan
entity_df['jurisdiction'] = entity_df['jurisdiction']
entity_df.loc[entity_df['jurisdiction'] == 'XXX', 'jurisdiction'] = np.nan

entity_df.drop(columns=['incorporation_date'], inplace=True)
entity_df.rename(columns={'formatted_date':'incorporation_date'}, inplace=True)

entity_df.head()

Unnamed: 0,node_id,sourceID,name,country_codes,countries,jurisdiction_description,jurisdiction,service_provider,status,incorporation_date
0,20003127,Bahamas Leaks,Dalma Corporation Limited,,,Bahamas,BAH,,,1990-11-30
1,20010494,Bahamas Leaks,Asia Construction Corporation Limited,,,Bahamas,BAH,,,1992-08-14
2,20010495,Bahamas Leaks,Euro Logistics Limited,,,Bahamas,BAH,,,1992-08-14
3,20010496,Bahamas Leaks,Euro Leisure Limited,,,Bahamas,BAH,,,1992-08-14
4,20010497,Bahamas Leaks,Euro Data Procurement Limited,,,Bahamas,BAH,,,1992-08-14


In [8]:
entity_df.to_csv('../data/intermediate/entities.csv')