In [2]:
import numpy as np
import pandas as pd

# Raise pandas' display limits (rows, columns, column width) for this notebook.
pd.set_option("display.max_rows", 999)
pd.set_option("display.max_columns", 999)
pd.set_option("display.max_colwidth", 150)

In [3]:
# Read the intermediary node CSV of each of the four datasets, in a fixed order.
intermediary_csv_paths = [
    '../data/raw/bahamas_leaks/bahamas_leaks.nodes.intermediary.csv',
    '../data/raw/offshore_leaks/offshore_leaks.nodes.intermediary.csv',
    '../data/raw/panama_papers/panama_papers.nodes.intermediary.csv',
    '../data/raw/paradise_papers/paradise_papers.nodes.intermediary.csv',
]
(
    bahamas_intermediary_raw,
    offshore_intermediary_raw,
    panama_intermediary_raw,
    paradise_intermediary_raw,
) = [pd.read_csv(csv_path) for csv_path in intermediary_csv_paths]

In [4]:
# Group the four raw frames so the same check can be run on each of them.
datasets = [
    bahamas_intermediary_raw,
    offshore_intermediary_raw,
    panama_intermediary_raw,
    paradise_intermediary_raw,
]

# Print the sourceID found at index label 0, then the alphabetically sorted
# column names, for every dataset.
for ds in datasets:
    first_source_id = ds['sourceID'][0]
    column_names = sorted(ds.columns)
    print(first_source_id)
    print(column_names, '\n')

Bahamas Leaks
['address', 'closed_date', 'company_type', 'countries', 'country_codes', 'ibcRUC', 'incorporation_date', 'jurisdiction', 'jurisdiction_description', 'labels(n)', 'name', 'node_id', 'note', 'service_provider', 'sourceID', 'status', 'type', 'valid_until'] 

Offshore Leaks
['countries', 'country_codes', 'name', 'node_id', 'note', 'sourceID', 'status', 'valid_until'] 

Panama Papers
['countries', 'country_codes', 'name', 'node_id', 'note', 'sourceID', 'status', 'valid_until'] 

Paradise Papers - Bahamas corporate registry
['countries', 'country_codes', 'name', 'node_id', 'note', 'sourceID', 'valid_until'] 



In [8]:
# Summarize column dtypes and non-null counts for the Bahamas Leaks intermediaries.
bahamas_intermediary_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541 entries, 0 to 540
Data columns (total 18 columns):
labels(n)                   541 non-null object
valid_until                 541 non-null object
country_codes               539 non-null object
countries                   539 non-null object
node_id                     541 non-null int64
sourceID                    541 non-null object
address                     0 non-null float64
name                        541 non-null object
jurisdiction_description    0 non-null float64
service_provider            0 non-null float64
jurisdiction                0 non-null float64
closed_date                 0 non-null float64
incorporation_date          0 non-null float64
ibcRUC                      0 non-null float64
type                        0 non-null float64
status                      0 non-null float64
company_type                0 non-null float64
note                        2 non-null object
dtypes: float64(10), int64(1), object(7)
mem

In [4]:
# Summarize column dtypes and non-null counts for the Offshore Leaks intermediaries.
offshore_intermediary_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9526 entries, 0 to 9525
Data columns (total 8 columns):
node_id          9526 non-null int64
name             9525 non-null object
country_codes    9526 non-null object
countries        9526 non-null object
status           0 non-null float64
sourceID         9526 non-null object
valid_until      9526 non-null object
note             1 non-null object
dtypes: float64(1), int64(1), object(6)
memory usage: 595.5+ KB


In [5]:
# Summarize column dtypes and non-null counts for the Panama Papers intermediaries.
panama_intermediary_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14110 entries, 0 to 14109
Data columns (total 8 columns):
node_id          14110 non-null int64
name             14110 non-null object
country_codes    12598 non-null object
countries        12598 non-null object
status           12621 non-null object
sourceID         14110 non-null object
valid_until      14110 non-null object
note             3 non-null object
dtypes: int64(1), object(7)
memory usage: 882.0+ KB


In [6]:
# Summarize column dtypes and non-null counts for the Paradise Papers intermediaries.
paradise_intermediary_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1568 entries, 0 to 1567
Data columns (total 7 columns):
node_id          1568 non-null int64
name             1568 non-null object
country_codes    489 non-null object
countries        489 non-null object
sourceID         1568 non-null object
valid_until      1568 non-null object
note             1 non-null object
dtypes: int64(1), object(6)
memory usage: 85.8+ KB


In [9]:
# For every dataset print: its first distinct sourceID, its (rows, columns)
# shape, and the number of distinct node_id values (to compare with the row count).
for ds in datasets:
    source_label = ds['sourceID'].unique()[0]
    distinct_node_ids = ds['node_id'].nunique()
    print(source_label)
    print(ds.shape)
    print(distinct_node_ids, '\n')

Bahamas Leaks
(541, 18)
541 

Offshore Leaks
(9526, 8)
9526 

Panama Papers
(14110, 8)
14110 

Paradise Papers - Bahamas corporate registry
(1568, 7)
1568 



In [6]:
# For every dataset print its first distinct sourceID and how many distinct
# values the `countries` and `country_codes` columns hold.
for ds in datasets:
    source_label = ds['sourceID'].unique()[0]
    print(source_label)
    country_cardinality = ds[['countries', 'country_codes']].nunique()
    print(country_cardinality, '\n')

Bahamas Leaks
countries        1
country_codes    1
dtype: int64 

Offshore Leaks
countries        137
country_codes    137
dtype: int64 

Panama Papers
countries        241
country_codes    241
dtype: int64 

Paradise Papers - Bahamas corporate registry
countries        22
country_codes    22
dtype: int64 



## node_id to be used as unique identifier in database

Each row has its own `node_id`: in every dataset the number of distinct `node_id` values equals the number of rows. We'll use it as the unique identifier going forward.

In [11]:
# Columns selected from each dataset in the previews that follow.
keep_cols = [
    'node_id',
    'sourceID',
    'name',
    'countries',
    'country_codes',
]

In [12]:
# Preview the first rows of the retained columns for the Bahamas Leaks intermediaries.
bahamas_intermediary_raw[keep_cols].head()

Unnamed: 0,node_id,sourceID,name,countries,country_codes
0,23000001,Bahamas Leaks,Internal User,,
1,23000002,Bahamas Leaks,M & A CORPORATE SERVICES LIMITED,Bahamas,BHS
2,23000003,Bahamas Leaks,BRITANNIA CONSULTING GROUP,Bahamas,BHS
3,23000004,Bahamas Leaks,ABACO FSC LTD.,Bahamas,BHS
4,23000005,Bahamas Leaks,OL PRIVATE CORPORATE COUNSEL LTD.,Bahamas,BHS


In [13]:
# Preview the first rows of the retained columns for the Offshore Leaks intermediaries.
offshore_intermediary_raw[keep_cols].head()

Unnamed: 0,node_id,sourceID,name,countries,country_codes
0,51122,Offshore Leaks,"Peng, Wan-Hsiung",Taiwan,TWN
1,51149,Offshore Leaks,Shin Tae Young,United Arab Emirates,ARE
2,51162,Offshore Leaks,Lin Celina,Hong Kong,HKG
3,51224,Offshore Leaks,R E Douglas B Juday,United States,USA
4,51270,Offshore Leaks,Chun Thong Ping,Singapore,SGP


In [14]:
# Preview the first rows of the retained columns for the Panama Papers intermediaries.
panama_intermediary_raw[keep_cols].head()

Unnamed: 0,node_id,sourceID,name,countries,country_codes
0,11000001,Panama Papers,"MICHAEL PAPAGEORGE, MR.",South Africa,ZAF
1,11000002,Panama Papers,CORFIDUCIA ANSTALT,Liechtenstein,LIE
2,11000003,Panama Papers,"DAVID, RONALD",Monaco,MCO
3,11000004,Panama Papers,"DE BOUTSELIS, JEAN-PIERRE",Belgium,BEL
4,11000005,Panama Papers,THE LEVANT LAWYERS (TLL),Lebanon,LBN


In [15]:
# Preview the first rows of the retained columns for the Paradise Papers intermediaries.
paradise_intermediary_raw[keep_cols].head()

Unnamed: 0,node_id,sourceID,name,countries,country_codes
0,34304771,Paradise Papers - Bahamas corporate registry,CITITRUST (BAHAMAS) LIMITED,Bahamas,BHS
1,80000189,Paradise Papers - Appleby,Appleby Management (Bermuda) Ltd.,Bermuda,BMU
2,80000191,Paradise Papers - Appleby,Appleby Services (Bermuda) Ltd.,Bermuda,BMU
3,80000392,Paradise Papers - Appleby,Appleby Corporate Services (BVI) Limited,British Virgin Islands,VGB
4,80007709,Paradise Papers - Appleby,Appleby Corporate Services (HK) Limited,Hong Kong,HKG


In [None]:
# Re-read the four intermediary files, keeping only the columns of interest
# and assigning compact dtypes at load time.
keep_cols = ['node_id', 'sourceID', 'name', 'countries', 'country_codes']

# node_id has no missing values in any of the four files (see the .info()
# outputs), so a plain integer dtype is safe.
# NOTE(review): int32 assumes every node_id fits in 32 bits; the values shown
# in the previews do, but confirm against the full files.
dtypes = {'node_id': 'int32', 'sourceID': 'category', 'name': 'object',
          'countries': 'category', 'country_codes': 'category'}

# NOTE: these assignments rebind the *_raw names; a `datasets` list built
# before this cell still refers to the previously loaded frames.
bahamas_intermediary_raw = pd.read_csv('../data/raw/bahamas_leaks/bahamas_leaks.nodes.intermediary.csv',
                                       usecols=keep_cols,
                                       dtype=dtypes)
offshore_intermediary_raw = pd.read_csv('../data/raw/offshore_leaks/offshore_leaks.nodes.intermediary.csv',
                                        usecols=keep_cols,
                                        dtype=dtypes)
panama_intermediary_raw = pd.read_csv('../data/raw/panama_papers/panama_papers.nodes.intermediary.csv',
                                      usecols=keep_cols,
                                      dtype=dtypes)
paradise_intermediary_raw = pd.read_csv('../data/raw/paradise_papers/paradise_papers.nodes.intermediary.csv',
                                        usecols=keep_cols,
                                        dtype=dtypes)