In [1]:
import numpy as np
import pandas as pd

# Raise pandas' display limits so wide frames and long text values are not truncated
# when they are rendered in the notebook.
pd.set_option('display.max_rows', 999)
pd.set_option('display.max_columns', 999)
pd.set_option('display.max_colwidth', 150)

In [2]:
# Read the officer node export of each investigation, unmodified, into its own frame.
# The files are read in the order listed, one per target name on the left.
bahamas_officer_raw, offshore_officer_raw, panama_officer_raw, paradise_officer_raw = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/raw/bahamas_leaks/bahamas_leaks.nodes.officer.csv',
        '../data/raw/offshore_leaks/offshore_leaks.nodes.officer.csv',
        '../data/raw/panama_papers/panama_papers.nodes.officer.csv',
        '../data/raw/paradise_papers/paradise_papers.nodes.officer.csv',
    )
)

  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
# Collect the four raw frames so the following cells can loop over them.
datasets = [bahamas_officer_raw, offshore_officer_raw, panama_officer_raw, paradise_officer_raw]

# For each investigation, print its name (taken from the first row's sourceID)
# followed by its column names in alphabetical order.
for officer_df in datasets:
    investigation = officer_df['sourceID'][0]
    column_names = sorted(officer_df.columns)
    print(investigation)
    print(column_names, '\n')

Bahamas Leaks
['address', 'closed_date', 'company_type', 'countries', 'country_codes', 'ibcRUC', 'incorporation_date', 'jurisdiction', 'jurisdiction_description', 'labels(n)', 'name', 'node_id', 'note', 'service_provider', 'sourceID', 'status', 'type', 'valid_until'] 

Offshore Leaks
['countries', 'country_codes', 'name', 'node_id', 'note', 'sourceID', 'valid_until'] 

Panama Papers
['countries', 'country_codes', 'name', 'node_id', 'note', 'sourceID', 'valid_until'] 

Paradise Papers - Malta corporate registry
['countries', 'country_codes', 'name', 'node_id', 'note', 'sourceID', 'status', 'valid_until'] 



In [8]:
# Summarise the Bahamas Leaks officers: row count, column dtypes and non-null counts.
bahamas_officer_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25262 entries, 0 to 25261
Data columns (total 18 columns):
labels(n)                   25262 non-null object
valid_until                 25262 non-null object
country_codes               81 non-null object
countries                   81 non-null object
node_id                     25262 non-null int64
sourceID                    25262 non-null object
address                     0 non-null float64
name                        25262 non-null object
jurisdiction_description    0 non-null float64
service_provider            0 non-null float64
jurisdiction                0 non-null float64
closed_date                 0 non-null float64
incorporation_date          0 non-null float64
ibcRUC                      0 non-null float64
type                        0 non-null float64
status                      0 non-null float64
company_type                0 non-null float64
note                        83 non-null object
dtypes: float64(10), int64(1), 

#### The extra columns in the Bahamas Leaks data are all empty (0 non-null values) and can be disregarded.

In [11]:
# Summarise the Offshore Leaks officers: row count, column dtypes and non-null counts.
offshore_officer_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 107190 entries, 0 to 107189
Data columns (total 7 columns):
node_id          107190 non-null int64
name             107132 non-null object
country_codes    107186 non-null object
countries        107186 non-null object
sourceID         107190 non-null object
valid_until      107190 non-null object
note             1 non-null object
dtypes: int64(1), object(6)
memory usage: 5.7+ MB


In [12]:
# Summarise the Panama Papers officers: row count, column dtypes and non-null counts.
panama_officer_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 238402 entries, 0 to 238401
Data columns (total 7 columns):
node_id          238402 non-null int64
name             238399 non-null object
country_codes    144789 non-null object
countries        144789 non-null object
sourceID         238402 non-null object
valid_until      238402 non-null object
note             49 non-null object
dtypes: int64(1), object(6)
memory usage: 12.7+ MB


In [9]:
# Summarise the Paradise Papers officers: row count, column dtypes and non-null counts.
paradise_officer_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 350008 entries, 0 to 350007
Data columns (total 8 columns):
node_id          350008 non-null int64
name             350007 non-null object
country_codes    171627 non-null object
countries        171610 non-null object
status           158 non-null object
sourceID         350008 non-null object
valid_until      350008 non-null object
note             3612 non-null object
dtypes: int64(1), object(7)
memory usage: 21.4+ MB


In [10]:
# List the distinct values (NaN included) found in the Paradise Papers 'status' column.
paradise_officer_raw['status'].unique()

array([nan, 'Removed'], dtype=object)

#### I'm not terribly interested in the 'status' column: its only non-null value is 'Removed'. As we're just interested in examining the data, not in making sure it's up to date and correct, I'm disregarding this column.

In [13]:
# For each investigation, print its name, the frame's (rows, columns) shape and the
# number of distinct node_id values, so the row count can be compared with the id count.
for officer_df in datasets:
    source_name = officer_df['sourceID'].unique()[0]
    distinct_ids = officer_df['node_id'].nunique()
    print(source_name)
    print(officer_df.shape)
    print(distinct_ids, '\n')

Bahamas Leaks
(25262, 18)
25262 

Offshore Leaks
(107190, 7)
107190 

Panama Papers
(238402, 7)
238402 

Paradise Papers - Malta corporate registry
(350008, 8)
350008 



## node_id to be used as unique identifier in database

Each row does in fact have its own `node_id`: in every dataset the number of distinct `node_id` values equals the number of rows. We'll use it as the unique identifier going forward.

In [14]:
# Profile the 'note' column of every investigation: how many rows are populated,
# how many distinct values exist, and how often each value occurs.
for officer_df in datasets:
    print('Investigation:', officer_df['sourceID'].unique()[0])
    note_col = officer_df['note']
    total_rows = note_col.shape[0]
    populated_rows = note_col.count()    # non-null entries only
    distinct_values = note_col.nunique()  # NaN is not counted
    print('Number of rows in dataset:', total_rows)
    print('Number of populated rows:', populated_rows)
    print('Difference between the number of rows and the populated rows:', total_rows - populated_rows)
    print('Number of unique entries:', distinct_values)
    print('Difference between the number of populated entries and the number of unique entries:', populated_rows - distinct_values)
    print(note_col.value_counts(), '\n')

Investigation: Bahamas Leaks
Number of rows in dataset: 25262
Number of populated rows: 83
Difference between the number of rows and the populated rows: 25179
Number of unique entries: 3
Difference between the number of populated entries and the number of unique entries: 80
Record manually added from leaked documents                                                                                                                                                                                        81
The end date of the position held by Korbak Lynn was updated manually in the database based on documents provided to ICIJ by a communications firm representing Morneau Shepell. (Updated on September 23rd , 2016)                 1
The end date of the position held by Francis Morneau William was updated manually in the database based on documents provided to ICIJ by a communications firm representing Morneau Shepell. (Updated on September 23rd , 2016)     1
Name: note, dtype: int64 

Investig

### Discard 'note'

The 'note' column doesn't provide anything particularly interesting for analysis. As we're just interested in examining the data, not in making sure it's up to date and correct, I'm disregarding this column.

In [18]:
# Profile the 'valid_until' column of every investigation: fill rate, number of
# distinct values, and the frequency of each value.
for officer_df in datasets:
    print('Investigation:', officer_df['sourceID'].unique()[0])
    valid_until_col = officer_df['valid_until']
    n_rows = valid_until_col.shape[0]
    n_filled = valid_until_col.count()    # non-null entries only
    n_unique = valid_until_col.nunique()  # NaN is not counted
    print('Number of rows in dataset:', n_rows)
    print('Number of populated rows:', n_filled)
    print('Difference between the number of rows and the populated rows:', n_rows - n_filled)
    print('Number of unique entries:', n_unique)
    print('Difference between the number of populated entries and the number of unique entries:', n_filled - n_unique)
    print(valid_until_col.value_counts(), '\n')


Investigation: Bahamas Leaks
Number of rows in dataset: 25262
Number of populated rows: 25262
Difference between the number of rows and the populated rows: 0
Number of unique entries: 1
Difference between the number of populated entries and the number of unique entries: 25261
The Bahamas Leaks data is current through early 2016.    25262
Name: valid_until, dtype: int64 

Investigation: Offshore Leaks
Number of rows in dataset: 107190
Number of populated rows: 107190
Difference between the number of rows and the populated rows: 0
Number of unique entries: 1
Difference between the number of populated entries and the number of unique entries: 107189
The Offshore Leaks data is current through 2010    107190
Name: valid_until, dtype: int64 

Investigation: Panama Papers
Number of rows in dataset: 238402
Number of populated rows: 238402
Difference between the number of rows and the populated rows: 0
Number of unique entries: 1
Difference between the number of populated entries and the number

### Discard 'valid_until'

As we're just interested in examining the data, not in making sure it's up to date and correct, I'm disregarding this column. If we want it later, we can create a 'current through' table in the database.

The `countries` / `country_codes` columns are only sparsely populated in some of the datasets (e.g. 81 of 25,262 rows in Bahamas Leaks), but I'd like to keep them to compare against any other provided data.

In [31]:
# Columns to keep for the previews, in display order; the preview cells that follow reuse this list.
col_order = ['sourceID', 'name', 'countries', 'country_codes']

# First rows of the Bahamas Leaks officers, restricted to those columns.
bahamas_officer_raw.loc[:, col_order].head()

Unnamed: 0_level_0,sourceID,name,countries,country_codes
node_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
22013341,Bahamas Leaks,ALPHA DIRECTION LTD.,,
22017206,Bahamas Leaks,GAMMA SECRETARIES,,
22023260,Bahamas Leaks,THETA DIRECTION LTD.,,
22006231,Bahamas Leaks,ALPHA DIRECTION LTD.,,
22012516,Bahamas Leaks,GAMMA SECRETARIES LTD.,,


In [32]:
# First rows of the Offshore Leaks officers, restricted to the columns in col_order.
offshore_officer_raw[col_order].head()

Unnamed: 0_level_0,sourceID,name,countries,country_codes
node_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
51113,Offshore Leaks,Bisbro Investment Company Ltd.,British Virgin Islands,VGB
51114,Offshore Leaks,Mohamed Mustafa Bin A Mohamed Ismail,Malaysia,MYS
51115,Offshore Leaks,Teh Hong Eng,Hong Kong,HKG
51116,Offshore Leaks,New Sights Enterprises Limited,British Virgin Islands,VGB
51117,Offshore Leaks,Yong Meow Mui,Singapore,SGP


In [33]:
# First rows of the Panama Papers officers, restricted to the columns in col_order.
panama_officer_raw[col_order].head()

Unnamed: 0_level_0,sourceID,name,countries,country_codes
node_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
12000001,Panama Papers,KIM SOO IN,South Korea,KOR
12000002,Panama Papers,Tian Yuan,China,CHN
12000003,Panama Papers,GREGORY JOHN SOLOMON,Australia,AUS
12000004,Panama Papers,MATSUDA MASUMI,Japan,JPN
12000005,Panama Papers,HO THUY NGA,Viet Nam,VNM


In [34]:
# First rows of the Paradise Papers officers, restricted to the columns in col_order.
paradise_officer_raw[col_order].head()

Unnamed: 0_level_0,sourceID,name,countries,country_codes
node_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
59160036,Paradise Papers - Malta corporate registry,RADOMIR VUKCEVIC,,
59178341,Paradise Papers - Malta corporate registry,ANA DUKANOVIC,,
59190179,Paradise Papers - Malta corporate registry,ANA KOLAREVIC,,
59152278,Paradise Papers - Malta corporate registry,Ranin Ltd,,
59127837,Paradise Papers - Malta corporate registry,DAVID MARINELLI,,


## Final code to pull CSVs

In [22]:
# Columns read from every officer CSV. 'node_id' is read so it can become the index;
# the remaining four are the data columns kept for analysis.
keep_cols = ['node_id', 'sourceID', 'name', 'countries', 'country_codes']
# Order of the data columns in the resulting frames (node_id is the index, not a column).
col_order = ['sourceID', 'name', 'countries', 'country_codes']
# Compact dtypes: the repeated string columns are stored as categoricals.
dtypes = {'node_id':'int32', 'sourceID':'category', 'name':'object', 'country_codes':'category', 'countries':'category'}


def load_officers(csv_path):
    """Read one officer-node CSV into a frame indexed by node_id.

    Only the columns in ``keep_cols`` are read, using the dtypes in ``dtypes``.
    ``node_id`` is moved into the index instead of being dropped by the column
    selection, so every row keeps the identifier this notebook chose as its
    unique key.

    Parameters
    ----------
    csv_path : str
        Path to an ``*.nodes.officer.csv`` file.

    Returns
    -------
    pandas.DataFrame
        Columns ``col_order`` (in that order), indexed by ``node_id``.

    Raises
    ------
    ValueError
        If ``node_id`` contains duplicate values (``verify_integrity=True``).
    """
    officers = pd.read_csv(csv_path, usecols=keep_cols, dtype=dtypes)
    # verify_integrity guards the "one node_id per row" finding made earlier in this notebook.
    return officers.set_index('node_id', verify_integrity=True)[col_order]


bahamas_officer_raw = load_officers('../data/raw/bahamas_leaks/bahamas_leaks.nodes.officer.csv')
offshore_officer_raw = load_officers('../data/raw/offshore_leaks/offshore_leaks.nodes.officer.csv')
panama_officer_raw = load_officers('../data/raw/panama_papers/panama_papers.nodes.officer.csv')
paradise_officer_raw = load_officers('../data/raw/paradise_papers/paradise_papers.nodes.officer.csv')