In [1]:
import numpy as np
import pandas as pd

# Raise pandas' display limits (rows, columns, and per-cell text width) for
# the frames rendered in this notebook.
pd.set_option('display.max_rows', 999)
pd.set_option('display.max_columns', 999)
pd.set_option('display.max_colwidth', 150)

In [2]:
# Locations of the address-node CSV for each of the four investigations.
bahamas_csv = '../data/raw/bahamas_leaks/bahamas_leaks.nodes.address.csv'
offshore_csv = '../data/raw/offshore_leaks/offshore_leaks.nodes.address.csv'
panama_csv = '../data/raw/panama_papers/panama_papers.nodes.address.csv'
paradise_csv = '../data/raw/paradise_papers/paradise_papers.nodes.address.csv'

# Read every file with pandas' default settings (all columns, inferred dtypes).
# NOTE(review): the recorded output ends with the tail of a warning raised
# while this cell ran - possibly a mixed-dtype warning; confirm on re-run.
bahamas_address_raw = pd.read_csv(bahamas_csv)
offshore_address_raw = pd.read_csv(offshore_csv)
panama_address_raw = pd.read_csv(panama_csv)
paradise_address_raw = pd.read_csv(paradise_csv)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Gather the four raw address frames so the same checks can be looped over all of them.
datasets = [bahamas_address_raw, offshore_address_raw, panama_address_raw, paradise_address_raw]

# For each dataset, print the investigation it comes from and its column names
# (sorted) so the schemas can be compared side by side.
for ds in datasets:
    # .iloc[0] reads the first row by position, so it does not rely on the
    # index happening to contain the label 0.
    print(ds['sourceID'].iloc[0])
    print(sorted(ds.columns), '\n')

Bahamas Leaks
['address', 'closed_date', 'company_type', 'countries', 'country_codes', 'ibcRUC', 'incorporation_date', 'jurisdiction', 'jurisdiction_description', 'labels(n)', 'name', 'node_id', 'note', 'service_provider', 'sourceID', 'status', 'type', 'valid_until'] 

Offshore Leaks
['address', 'countries', 'country_codes', 'name', 'node_id', 'note', 'sourceID', 'valid_until'] 

Panama Papers
['address', 'countries', 'country_codes', 'name', 'node_id', 'note', 'sourceID', 'valid_until'] 

Paradise Papers - Appleby
['address', 'countries', 'country_codes', 'name', 'node_id', 'note', 'sourceID', 'valid_until'] 



In [4]:
# Column dtypes and non-null counts for the Bahamas Leaks address nodes.
bahamas_address_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 551 entries, 0 to 550
Data columns (total 18 columns):
labels(n)                   551 non-null object
valid_until                 551 non-null object
country_codes               551 non-null object
countries                   551 non-null object
node_id                     551 non-null int64
sourceID                    551 non-null object
address                     551 non-null object
name                        0 non-null float64
jurisdiction_description    0 non-null float64
service_provider            0 non-null float64
jurisdiction                0 non-null float64
closed_date                 0 non-null float64
incorporation_date          0 non-null float64
ibcRUC                      0 non-null float64
type                        0 non-null float64
status                      0 non-null float64
company_type                0 non-null float64
note                        23 non-null object
dtypes: float64(10), int64(1), object(7)
me

In [5]:
# Column dtypes and non-null counts for the Offshore Leaks address nodes.
offshore_address_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57600 entries, 0 to 57599
Data columns (total 8 columns):
node_id          57600 non-null int64
name             0 non-null float64
address          57600 non-null object
country_codes    57596 non-null object
countries        57596 non-null object
sourceID         57600 non-null object
valid_until      57600 non-null object
note             0 non-null float64
dtypes: float64(2), int64(1), object(5)
memory usage: 3.5+ MB


In [6]:
# Column dtypes and non-null counts for the Panama Papers address nodes.
panama_address_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 93454 entries, 0 to 93453
Data columns (total 8 columns):
node_id          93454 non-null int64
name             0 non-null float64
address          93453 non-null object
country_codes    92567 non-null object
countries        92567 non-null object
sourceID         93454 non-null object
valid_until      93454 non-null object
note             0 non-null float64
dtypes: float64(2), int64(1), object(5)
memory usage: 5.7+ MB


In [7]:
# Column dtypes and non-null counts for the Paradise Papers address nodes.
paradise_address_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 223350 entries, 0 to 223349
Data columns (total 8 columns):
node_id          223350 non-null int64
name             223349 non-null object
address          203420 non-null object
country_codes    99391 non-null object
countries        99391 non-null object
sourceID         223350 non-null object
valid_until      223350 non-null object
note             0 non-null float64
dtypes: float64(1), int64(1), object(6)
memory usage: 13.6+ MB


In [8]:
# For every dataset print the investigation name, the (rows, columns) shape,
# and the number of distinct node_id values; comparing the last number with
# the row count shows whether node_id is unique within that dataset.
for ds in datasets:
    n_unique_ids = ds['node_id'].nunique()
    print(ds['sourceID'].unique()[0])
    print(ds.shape)
    print(n_unique_ids, '\n')

Bahamas Leaks
(551, 18)
551 

Offshore Leaks
(57600, 8)
57600 

Panama Papers
(93454, 8)
93454 

Paradise Papers - Appleby
(223350, 8)
223350 



## node_id to be used as unique identifier in database

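Within each dataset, the number of distinct `node_id` values equals the number of rows, so every row has its own `node_id`. We'll use it as the unique identifier going forward. (Note: uniqueness has only been checked within each dataset so far, not across the four datasets combined.)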

In [9]:
# Profile the `valid_until` column of every dataset: how many rows are
# populated, how many distinct values exist, and how often each value occurs.
for ds in datasets:
    ds_col = ds['valid_until']
    n_rows = ds_col.shape[0]
    n_populated = ds_col.count()   # non-null entries only
    n_unique = ds_col.nunique()    # distinct non-null entries

    print('Investigation:', ds['sourceID'].unique()[0])
    print('Number of rows in dataset:', n_rows)
    print('Number of populated rows:', n_populated)
    print('Difference between the number of rows and the populated rows:', n_rows - n_populated)
    print('Number of unique entries:', n_unique)
    print('Difference between the number of populated entries and the number of unique entries:', n_populated - n_unique)
    print(ds_col.value_counts(), '\n')


Investigation: Bahamas Leaks
Number of rows in dataset: 551
Number of populated rows: 551
Difference between the number of rows and the populated rows: 0
Number of unique entries: 1
Difference between the number of populated entries and the number of unique entries: 550
The Bahamas Leaks data is current through early 2016.    551
Name: valid_until, dtype: int64 

Investigation: Offshore Leaks
Number of rows in dataset: 57600
Number of populated rows: 57600
Difference between the number of rows and the populated rows: 0
Number of unique entries: 1
Difference between the number of populated entries and the number of unique entries: 57599
The Offshore Leaks data is current through 2010    57600
Name: valid_until, dtype: int64 

Investigation: Panama Papers
Number of rows in dataset: 93454
Number of populated rows: 93454
Difference between the number of rows and the populated rows: 0
Number of unique entries: 1
Difference between the number of populated entries and the number of unique ent

In [10]:
# Profile the `name` column of the Paradise Papers dataset only: populated
# rows versus total rows, and distinct values versus populated rows.
ds = paradise_address_raw

# Select the column with brackets: `name` is then looked up strictly as a
# column label, never through attribute resolution on the DataFrame.
ds_col = ds['name']
n_rows = ds_col.shape[0]
n_populated = ds_col.count()   # non-null entries only
n_unique = ds_col.nunique()    # distinct non-null entries

print('Investigation:', ds['sourceID'].unique()[0])
print('Number of rows in dataset:', n_rows)
print('Number of populated rows:', n_populated)
print('Difference between the number of rows and the populated rows:', n_rows - n_populated)
print('Number of unique entries:', n_unique)
print('Difference between the number of populated entries and the number of unique entries:', n_populated - n_unique)
print('\n')

Investigation: Paradise Papers - Appleby
Number of rows in dataset: 223350
Number of populated rows: 223349
Difference between the number of rows and the populated rows: 1
Number of unique entries: 222810
Difference between the number of populated entries and the number of unique entries: 539




In [11]:
# Profile the `address` column of every dataset: populated rows versus total
# rows, and distinct values versus populated rows (i.e. repeated addresses).
for ds in datasets:
    ds_col = ds['address']
    n_rows = ds_col.shape[0]
    n_populated = ds_col.count()   # non-null entries only
    n_unique = ds_col.nunique()    # distinct non-null entries

    print('Investigation:', ds['sourceID'].unique()[0])
    print('Number of rows in dataset:', n_rows)
    print('Number of populated rows:', n_populated)
    print('Difference between the number of rows and the populated rows:', n_rows - n_populated)
    print('Number of unique entries:', n_unique)
    print('Difference between the number of populated entries and the number of unique entries:', n_populated - n_unique)
    print('\n')

Investigation: Bahamas Leaks
Number of rows in dataset: 551
Number of populated rows: 551
Difference between the number of rows and the populated rows: 0
Number of unique entries: 550
Difference between the number of populated entries and the number of unique entries: 1


Investigation: Offshore Leaks
Number of rows in dataset: 57600
Number of populated rows: 57600
Difference between the number of rows and the populated rows: 0
Number of unique entries: 57479
Difference between the number of populated entries and the number of unique entries: 121


Investigation: Panama Papers
Number of rows in dataset: 93454
Number of populated rows: 93453
Difference between the number of rows and the populated rows: 1
Number of unique entries: 93452
Difference between the number of populated entries and the number of unique entries: 1


Investigation: Paradise Papers - Appleby
Number of rows in dataset: 223350
Number of populated rows: 203420
Difference between the number of rows and the populated ro

In [12]:
# Preview the first rows of the Paradise Papers address nodes.
# In the rows recorded below, `name` carries the full semicolon-separated
# address while `address` carries only its first segment.
paradise_address_raw.head()

Unnamed: 0,node_id,name,address,country_codes,countries,sourceID,valid_until,note
0,81014050,6B Chenyu Court; 22-24 Kennedy Road; Hong Kong,6B Chenyu Court,HKG,Hong Kong,Paradise Papers - Appleby,Appleby data is current through 2014,
1,81014052,15C Suchun Industrial Square; Suzhou Industrial Park; 215126 Suzhou; People's Republic of China,15C Suchun Industrial Square,CHN,China,Paradise Papers - Appleby,Appleby data is current through 2014,
2,81014055,"8F., No. 68; Minfu 13th St. Taoyuan County; 330; Taiwan","8F., No. 68",TWN,Taiwan,Paradise Papers - Appleby,Appleby data is current through 2014,
3,81014056,"No. 66, Ln. 20; Dafu Rd., Shengang Township; 429 Taichung County; Taiwan","No. 66, Ln. 20",TWN,Taiwan,Paradise Papers - Appleby,Appleby data is current through 2014,
4,81014057,"1F., No. 18, Ln. 274; Chang'an St., Luzhou City; Taipei County 247, Taiwan; People's Republic of China","1F., No. 18, Ln. 274",CHN,China,Paradise Papers - Appleby,Appleby data is current through 2014,


In [13]:
# Show the Paradise Papers row(s) whose `name` value is missing.
missing_name = paradise_address_raw['name'].isnull()
paradise_address_raw[missing_name]

Unnamed: 0,node_id,name,address,country_codes,countries,sourceID,valid_until,note
42796,81073210,,,,,Paradise Papers - Appleby,Appleby data is current through 2014,


In [14]:
# Preview the first three rows of the Bahamas Leaks address nodes.
bahamas_address_raw.head(3)

Unnamed: 0,labels(n),valid_until,country_codes,countries,node_id,sourceID,address,name,jurisdiction_description,service_provider,jurisdiction,closed_date,incorporation_date,ibcRUC,type,status,company_type,note
0,"[""Address""]",The Bahamas Leaks data is current through early 2016.,BHS,Bahamas,24000001,Bahamas Leaks,"ANNEX FREDERICK & SHIRLEY STS, P.O. BOX N-4805, NASSAU, BAHAMAS",,,,,,,,,,,
1,"[""Address""]",The Bahamas Leaks data is current through early 2016.,BHS,Bahamas,24000002,Bahamas Leaks,"SUITE E-2,UNION COURT BUILDING, P.O. BOX N-8188, NASSAU, BAHAMAS",,,,,,,,,,,
2,"[""Address""]",The Bahamas Leaks data is current through early 2016.,BHS,Bahamas,24000003,Bahamas Leaks,"LYFORD CAY HOUSE, LYFORD CAY, P.O. BOX N-7785, NASSAU, BAHAMAS",,,,,,,,,,,


In [15]:
# Preview the first three rows of the Offshore Leaks address nodes.
offshore_address_raw.head(3)

Unnamed: 0,node_id,name,address,country_codes,countries,sourceID,valid_until,note
0,2004267,,One Bearer Secured Debenture,XXX,Not identified,Offshore Leaks,The Offshore Leaks data is current through 2010,
1,67277,,"11 Coomber Road, The Peak, Hong Kong",HKG,Hong Kong,Offshore Leaks,The Offshore Leaks data is current through 2010,
2,67346,,"4 Irish Place 2nd Floor, Gibraltar.",GIB,Gibraltar,Offshore Leaks,The Offshore Leaks data is current through 2010,


In [16]:
# Preview the first three rows of the Panama Papers address nodes.
panama_address_raw.head(3)

Unnamed: 0,node_id,name,address,country_codes,countries,sourceID,valid_until,note
0,14000001,,-\t27 ROSEWOOD DRIVE #16-19 SINGAPORE 737920,SGP,Singapore,Panama Papers,The Panama Papers data is current through 2015,
1,14000002,,"""Almaly Village"" v.5, Almaty Kazakhstan",KAZ,Kazakhstan,Panama Papers,The Panama Papers data is current through 2015,
2,14000003,,"""Cantonia"" South Road St Georges Hill Weybridge, Surrey",GBR,United Kingdom,Panama Papers,The Panama Papers data is current through 2015,


In [None]:
# Columns to keep when re-reading the CSVs, and the dtype to apply to each.
keep_cols = ['node_id', 'sourceID', 'address', 'name', 'country_codes', 'countries']
dtypes = {'node_id': 'int32', 'sourceID': 'category', 'address': 'object', 'name': 'object',
          'country_codes': 'category', 'countries': 'category'}


def load_address_nodes(csv_path):
    """Read one address-node CSV, restricted to ``keep_cols`` and typed per ``dtypes``.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The columns listed in ``keep_cols`` with the dtypes given in ``dtypes``.
    """
    return pd.read_csv(csv_path, usecols=keep_cols, dtype=dtypes)


# NOTE(review): these assignments re-bind the `*_address_raw` names that the
# first load created. The `datasets` list built earlier still holds the
# original frames, so rebuild that list if later cells should iterate over
# the trimmed frames.
bahamas_address_raw = load_address_nodes('../data/raw/bahamas_leaks/bahamas_leaks.nodes.address.csv')
offshore_address_raw = load_address_nodes('../data/raw/offshore_leaks/offshore_leaks.nodes.address.csv')
panama_address_raw = load_address_nodes('../data/raw/panama_papers/panama_papers.nodes.address.csv')
paradise_address_raw = load_address_nodes('../data/raw/paradise_papers/paradise_papers.nodes.address.csv')