In [2]:
import numpy as np
import pandas as pd

pd.options.display.max_rows = 999
pd.options.display.max_columns = 999
pd.options.display.max_colwidth = 150

## Challenges
- Can't pull in all the data for any one investigation (Panama, Paradise, Bahamas, Offshore) due to memory problems
- Doesn't appear that the column names are exactly the same between investigations
- Doesn't appear that all the columns are populated in each of the CSVs

## Solutions
- Whittle the number of columns down to just the ones you need
- Set the column types so the system doesn't have to guess and uses the most efficient ones

In [1]:
# edge_raw = pd.read_csv('../data/raw/bahamas_leaks/bahamas_leaks.edges.csv')
# address_raw = pd.read_csv('../data/raw/bahamas_leaks/bahamas_leaks.nodes.address.csv')
entity_raw = pd.read_csv('../data/raw/bahamas_leaks/bahamas_leaks.nodes.entity.csv', index_col='node_id')
# intermediary_raw = pd.read_csv('../data/raw/bahamas_leaks/bahamas_leaks.nodes.intermediary.csv')
# officer_raw = pd.read_csv('../data/raw/bahamas_leaks/bahamas_leaks.nodes.officer.csv')
# other_raw = pd.read_csv('../data/raw/bahamas_leaks/bahamas_leaks.nodes.other.csv')

- Went through the CSVs. Pulled out the names of the columns that looked interesting
- Loaded the data to dataframes, looked at the null values, removed a couple more columns

In [None]:
['country_codes', 'countries', 'node_id', 'sourceID', 'name', 'jurisdiction_description', 'service_provider',
 'jurisdiction', 'closed_date', 'incorporation_date', 'ibcRUC', 'status', 'company_type', 'note']

In [6]:
bahamas_entity_raw = pd.read_csv('../data/raw/bahamas_leaks/bahamas_leaks.nodes.entity.csv', 
                                usecols = ['node_id', 'country_codes', 'countries', 'node_id', 'sourceID', 'name',
                                           'jurisdiction_description', 'service_provider', 'jurisdiction',
                                           'closed_date', 'incorporation_date', 'ibcRUC', 'status',
                                           'company_type', 'note'])
offshore_entity_raw = pd.read_csv('../data/raw/offshore_leaks/offshore_leaks.nodes.entity.csv', 
                                usecols = ['node_id', 'country_codes', 'countries', 'node_id', 'sourceID', 'name',
                                           'jurisdiction_description', 'service_provider', 'jurisdiction',
                                           'closed_date', 'incorporation_date', 'ibcRUC', 'status',
                                           'company_type', 'note'])
panama_entity_raw = pd.read_csv('../data/raw/panama_papers/panama_papers.nodes.entity.csv', 
                                usecols = ['node_id', 'country_codes', 'countries', 'node_id', 'sourceID', 'name',
                                           'jurisdiction_description', 'service_provider', 'jurisdiction',
                                           'closed_date', 'incorporation_date', 'ibcRUC', 'status',
                                           'company_type', 'note'])
paradise_entity_raw = pd.read_csv('../data/raw/paradise_papers/paradise_papers.nodes.entity.csv',
                                usecols = ['node_id', 'country_codes', 'countries', 'node_id', 'sourceID', 'name',
                                           'jurisdiction_description', 'service_provider', 'jurisdiction',
                                           'closed_date', 'incorporation_date', 'ibcRUC', 'status',
                                           'company_type', 'note'])

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


# Row counts for all columns for all investigations

info() gives you counts of all the non-null rows for each column

In [87]:
bahamas_entity_raw.head()

Unnamed: 0_level_0,country_codes,countries,sourceID,name,jurisdiction_description,service_provider,jurisdiction,incorporation_date,status
node_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
20003127,,,Bahamas Leaks,DALMA CORPORATION LIMITED,Bahamas,,BAH,1990-11-30,
20010494,,,Bahamas Leaks,ASIA CONSTRUCTION CORPORATION LIMITED,Bahamas,,BAH,1992-08-14,
20010495,,,Bahamas Leaks,EURO LOGISTICS LIMITED,Bahamas,,BAH,1992-08-14,
20010496,,,Bahamas Leaks,EURO LEISURE LIMITED,Bahamas,,BAH,1992-08-14,
20010497,,,Bahamas Leaks,EURO DATA PROCUREMENT LIMITED,Bahamas,,BAH,1992-08-14,


In [13]:
bahamas_entity_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175888 entries, 0 to 175887
Data columns (total 14 columns):
country_codes               0 non-null float64
countries                   0 non-null float64
node_id                     175888 non-null int64
sourceID                    175888 non-null object
name                        175888 non-null object
jurisdiction_description    175888 non-null object
service_provider            0 non-null float64
jurisdiction                175888 non-null object
closed_date                 0 non-null float64
incorporation_date          175871 non-null object
ibcRUC                      175888 non-null object
status                      0 non-null float64
company_type                0 non-null float64
note                        1 non-null object
dtypes: float64(6), int64(1), object(7)
memory usage: 18.8+ MB


### Bahamas entities that are empty
- closed_date
- company_type
- country_codes
- countries
- service_provider
- status

Only 1 note

In [14]:
offshore_entity_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105516 entries, 0 to 105515
Data columns (total 14 columns):
node_id                     105516 non-null int64
name                        105516 non-null object
jurisdiction                105516 non-null object
jurisdiction_description    105516 non-null object
country_codes               105516 non-null object
countries                   105516 non-null object
incorporation_date          96137 non-null object
closed_date                 0 non-null float64
ibcRUC                      93599 non-null object
status                      96475 non-null object
company_type                103227 non-null object
service_provider            105516 non-null object
sourceID                    105516 non-null object
note                        8986 non-null object
dtypes: float64(1), int64(1), object(12)
memory usage: 11.3+ MB


### Offshore entities that are empty
- closed_date

In [15]:
panama_entity_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 213634 entries, 0 to 213633
Data columns (total 14 columns):
node_id                     213634 non-null int64
name                        213630 non-null object
jurisdiction                213634 non-null object
jurisdiction_description    213634 non-null object
country_codes               212844 non-null object
countries                   212844 non-null object
incorporation_date          213599 non-null object
closed_date                 0 non-null float64
ibcRUC                      161771 non-null object
status                      213631 non-null object
company_type                0 non-null float64
service_provider            213634 non-null object
sourceID                    213634 non-null object
note                        7 non-null object
dtypes: float64(2), int64(1), object(11)
memory usage: 22.8+ MB


### Panama entities that are empty
- closed_date
- company_type

In [16]:
paradise_entity_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 290086 entries, 0 to 290085
Data columns (total 14 columns):
node_id                     290086 non-null int64
name                        290061 non-null object
jurisdiction                290086 non-null object
jurisdiction_description    290086 non-null object
country_codes               169745 non-null object
countries                   169745 non-null object
incorporation_date          285746 non-null object
closed_date                 45305 non-null object
ibcRUC                      107263 non-null object
status                      27676 non-null object
company_type                19271 non-null object
service_provider            24936 non-null object
sourceID                    290086 non-null object
note                        32763 non-null object
dtypes: int64(1), object(13)
memory usage: 31.0+ MB


### Paradise entities that are empty
- None

# Unique identifier analysis

node_id appears to be a unique identifier. Let's double check.

In [7]:
datasets = [bahamas_entity_raw, offshore_entity_raw, panama_entity_raw, paradise_entity_raw]

for ds in datasets:
    print(ds.sourceID.unique()[0])
    print(ds.shape)
    print(ds.node_id.nunique(), '\n')

Bahamas Leaks
(175888, 14)
175888 

Offshore Leaks
(105516, 14)
105516 

Panama Papers
(213634, 14)
213634 

Paradise Papers - Aruba corporate registry
(290086, 14)
290086 



## node_id to be used as unique identifier in database

Each row does in fact have its own node id. We'll use this as the unique identifier going forward

### Determine which columns you want and what data type to use

All columns:

In [22]:
bahamas_entity_raw.columns

Index(['country_codes', 'countries', 'node_id', 'sourceID', 'name',
       'jurisdiction_description', 'service_provider', 'jurisdiction',
       'closed_date', 'incorporation_date', 'ibcRUC', 'status', 'company_type',
       'note'],
      dtype='object')

### Definitely keep:
- node_id: the unique identifier
- sourceID: which investigation the info came from
- name: name of the company
- jurisdiction_description: spelled out jurisdiction
- jurisdiction: abbreviated jurisdiction - Do I need to keep both jurisdiction and jurisdiction_description?
- incorporation_date: gives you dates to play with

### Are these columns worth having?
- country_codes
- countries
- service_provider
- closed_date
- status
- company_type
- note
- ibcRUC: What is this?

# ibcRUC

In [30]:
for ds in datasets:
    print('Investigation:', ds.sourceID.unique()[0])
    print('Number of rows in dataset:', ds.ibcRUC.shape[0])
    print('Number of populated rows:', ds.ibcRUC.count())
    print('Difference between the number of rows and the populated rows:', ds.ibcRUC.shape[0] - ds.ibcRUC.count())
    print('Number of unique entries:', ds.ibcRUC.nunique())
    print('Difference between the number of populated entries and the number of unique entries:', ds.ibcRUC.count() - ds.ibcRUC.nunique(), '\n')

Investigation: Bahamas Leaks
Number of rows in dataset: 175888
Number of populated rows: 175888
Difference between the number of rows and the populated rows: 0
Number of unique entries: 175150
Difference between the number of populated entries and the number of unique entries: 738 

Investigation: Offshore Leaks
Number of rows in dataset: 105516
Number of populated rows: 93599
Difference between the number of rows and the populated rows: 11917
Number of unique entries: 93077
Difference between the number of populated entries and the number of unique entries: 522 

Investigation: Panama Papers
Number of rows in dataset: 213634
Number of populated rows: 161771
Difference between the number of rows and the populated rows: 51863
Number of unique entries: 120046
Difference between the number of populated entries and the number of unique entries: 41725 

Investigation: Paradise Papers - Aruba corporate registry
Number of rows in dataset: 290086
Number of populated rows: 107263
Difference bet

### Discard 'ibcRUC'

ibcRUC isn't unique and there are quite a few missing values. Also wasn't able to find what this value represents. Removing from consideration for database.

# note

In [32]:
for ds in datasets:
    print('Investigation:', ds.sourceID.unique()[0])
    ds_col = ds.note
    print('Number of rows in dataset:', ds_col.shape[0])
    print('Number of populated rows:', ds_col.count())
    print('Difference between the number of rows and the populated rows:', ds_col.shape[0] - ds_col.count())
    print('Number of unique entries:', ds_col.nunique())
    print('Difference between the number of populated entries and the number of unique entries:', ds_col.count() - ds_col.nunique())
    print(ds_col.value_counts(), '\n')

Investigation: Bahamas Leaks
Number of rows in dataset: 175888
Number of populated rows: 1
Difference between the number of rows and the populated rows: 175887
Number of unique entries: 1
Difference between the number of populated entries and the number of unique entries: 0
The end date of the positions held by Francis Morneau William and Korbak Lynn were updated manually in the database based on documents provided to ICIJ by a communications firm representing Morneau Shepell. (Updated on September 23rd , 2016)    1
Name: note, dtype: int64 

Investigation: Offshore Leaks
Number of rows in dataset: 105516
Number of populated rows: 8986
Difference between the number of rows and the populated rows: 96530
Number of unique entries: 2
Difference between the number of populated entries and the number of unique entries: 8984
This is not an offshore entity even though it was logged as such in original database ICIJ received. It isi an internal account created by the agent to record miscellaneo

### Discard 'note'

Note doesn't provide anything particularly interesting for analysis. As we're just interested in examining the data, not making sure it's up to date and correct, disregrding this column

# country_codes and countries

In [34]:
for ds in datasets:
    print('Investigation:', ds.sourceID.unique()[0])
    ds_col = ds.country_codes
    print('Number of rows in dataset:', ds_col.shape[0])
    print('Number of populated rows:', ds_col.count())
    print('Difference between the number of rows and the populated rows:', ds_col.shape[0] - ds_col.count())
    print('Number of unique entries:', ds_col.nunique())
    print('Difference between the number of populated entries and the number of unique entries:', ds_col.count() - ds_col.nunique(), '\n')

Investigation: Bahamas Leaks
Number of rows in dataset: 175888
Number of populated rows: 0
Difference between the number of rows and the populated rows: 175888
Number of unique entries: 0
Difference between the number of populated entries and the number of unique entries: 0 

Investigation: Offshore Leaks
Number of rows in dataset: 105516
Number of populated rows: 105516
Difference between the number of rows and the populated rows: 0
Number of unique entries: 400
Difference between the number of populated entries and the number of unique entries: 105116 

Investigation: Panama Papers
Number of rows in dataset: 213634
Number of populated rows: 212844
Difference between the number of rows and the populated rows: 790
Number of unique entries: 159
Difference between the number of populated entries and the number of unique entries: 212685 

Investigation: Paradise Papers - Aruba corporate registry
Number of rows in dataset: 290086
Number of populated rows: 169745
Difference between the numb

In [37]:
for ds in datasets:
    print('Investigation:', ds.sourceID.unique()[0])
    ds_col = ds.countries
    print('Number of rows in dataset:', ds_col.shape[0])
    print('Number of populated rows:', ds_col.count())
    print('Difference between the number of rows and the populated rows:', ds_col.shape[0] - ds_col.count())
    print('Number of unique entries:', ds_col.nunique())
    print('Difference between the number of populated entries and the number of unique entries:', ds_col.count() - ds_col.nunique(), '\n')

Investigation: Bahamas Leaks
Number of rows in dataset: 175888
Number of populated rows: 0
Difference between the number of rows and the populated rows: 175888
Number of unique entries: 0
Difference between the number of populated entries and the number of unique entries: 0 

Investigation: Offshore Leaks
Number of rows in dataset: 105516
Number of populated rows: 105516
Difference between the number of rows and the populated rows: 0
Number of unique entries: 400
Difference between the number of populated entries and the number of unique entries: 105116 

Investigation: Panama Papers
Number of rows in dataset: 213634
Number of populated rows: 212844
Difference between the number of rows and the populated rows: 790
Number of unique entries: 159
Difference between the number of populated entries and the number of unique entries: 212685 

Investigation: Paradise Papers - Aruba corporate registry
Number of rows in dataset: 290086
Number of populated rows: 169745
Difference between the numb

### Keep 'country_code' and 'countries'

country_code and countries have the same number of populated rows and unique entries, except for a single row in the Panama Papers investigation.

Will want to compare these to addresses associated with the companies. Put them in the table

# service_provider

In [39]:
for ds in datasets:
    print('Investigation:', ds.sourceID.unique()[0])
    ds_col = ds.service_provider
    print('Number of rows in dataset:', ds_col.shape[0])
    print('Number of populated rows:', ds_col.count())
    print('Difference between the number of rows and the populated rows:', ds_col.shape[0] - ds_col.count())
    print('Number of unique entries:', ds_col.nunique())
    print('Difference between the number of populated entries and the number of unique entries:', ds_col.count() - ds_col.nunique())
    print(ds_col.value_counts(), '\n')

Investigation: Bahamas Leaks
Number of rows in dataset: 175888
Number of populated rows: 0
Difference between the number of rows and the populated rows: 175888
Number of unique entries: 0
Difference between the number of populated entries and the number of unique entries: 0
Series([], Name: service_provider, dtype: int64) 

Investigation: Offshore Leaks
Number of rows in dataset: 105516
Number of populated rows: 105516
Difference between the number of rows and the populated rows: 0
Number of unique entries: 2
Difference between the number of populated entries and the number of unique entries: 105514
Portcullis Trustnet           61123
Commonwealth Trust Limited    44393
Name: service_provider, dtype: int64 

Investigation: Panama Papers
Number of rows in dataset: 213634
Number of populated rows: 213634
Difference between the number of rows and the populated rows: 0
Number of unique entries: 1
Difference between the number of populated entries and the number of unique entries: 213633
Mo

### Keep 'service_provider'

Not sure what service these companies are providing, but there may be something interesting if subsetting by provider

# closed_date

In [40]:
for ds in datasets:
    print('Investigation:', ds.sourceID.unique()[0])
    ds_col = ds.closed_date
    print('Number of rows in dataset:', ds_col.shape[0])
    print('Number of populated rows:', ds_col.count())
    print('Difference between the number of rows and the populated rows:', ds_col.shape[0] - ds_col.count())
    print('Number of unique entries:', ds_col.nunique())
    print('Difference between the number of populated entries and the number of unique entries:', ds_col.count() - ds_col.nunique(), '\n')

Investigation: Bahamas Leaks
Number of rows in dataset: 175888
Number of populated rows: 0
Difference between the number of rows and the populated rows: 175888
Number of unique entries: 0
Difference between the number of populated entries and the number of unique entries: 0 

Investigation: Offshore Leaks
Number of rows in dataset: 105516
Number of populated rows: 0
Difference between the number of rows and the populated rows: 105516
Number of unique entries: 0
Difference between the number of populated entries and the number of unique entries: 0 

Investigation: Panama Papers
Number of rows in dataset: 213634
Number of populated rows: 0
Difference between the number of rows and the populated rows: 213634
Number of unique entries: 0
Difference between the number of populated entries and the number of unique entries: 0 

Investigation: Paradise Papers - Aruba corporate registry
Number of rows in dataset: 290086
Number of populated rows: 45305
Difference between the number of rows and th

### Discard 'closed_date'

Column is empty in three of the investigations. Majority are missing in the fourth. Not worth keeping.

# status

In [42]:
for ds in datasets:
    print('Investigation:', ds.sourceID.unique()[0])
    ds_col = ds.status
    print('Number of rows in dataset:', ds_col.shape[0])
    print('Number of populated rows:', ds_col.count())
    print('Difference between the number of rows and the populated rows:', ds_col.shape[0] - ds_col.count())
    print('Number of unique entries:', ds_col.nunique())
    print('Difference between the number of populated entries and the number of unique entries:', ds_col.count() - ds_col.nunique())
    print(ds_col.value_counts(), '\n')

Investigation: Bahamas Leaks
Number of rows in dataset: 175888
Number of populated rows: 0
Difference between the number of rows and the populated rows: 175888
Number of unique entries: 0
Difference between the number of populated entries and the number of unique entries: 0
Series([], Name: status, dtype: int64) 

Investigation: Offshore Leaks
Number of rows in dataset: 105516
Number of populated rows: 96475
Difference between the number of rows and the populated rows: 9041
Number of unique entries: 21
Difference between the number of populated entries and the number of unique entries: 96454
Active                                   46004
Dead                                     23095
Struck / Defunct / Deregistered          19486
Transferred OUT                           2637
Transferred Out                           1807
Liquidated                                 826
Company liquidated                         742
Not To Be Renewed / In Deregistration      663
Shelf                    

### Keep 'status', modify entries to standardize

There are enough entries to be interesting. Also, will want to standardize the responses. Use text and cell manipulation to update.

# company_type

In [43]:
for ds in datasets:
    print('Investigation:', ds.sourceID.unique()[0])
    ds_col = ds.company_type
    print('Number of rows in dataset:', ds_col.shape[0])
    print('Number of populated rows:', ds_col.count())
    print('Difference between the number of rows and the populated rows:', ds_col.shape[0] - ds_col.count())
    print('Number of unique entries:', ds_col.nunique())
    print('Difference between the number of populated entries and the number of unique entries:', ds_col.count() - ds_col.nunique())
    print(ds_col.value_counts(), '\n')

Investigation: Bahamas Leaks
Number of rows in dataset: 175888
Number of populated rows: 0
Difference between the number of rows and the populated rows: 175888
Number of unique entries: 0
Difference between the number of populated entries and the number of unique entries: 0
Series([], Name: company_type, dtype: int64) 

Investigation: Offshore Leaks
Number of rows in dataset: 105516
Number of populated rows: 103227
Difference between the number of rows and the populated rows: 2289
Number of unique entries: 55
Difference between the number of populated entries and the number of unique entries: 103172
Standard International Company                    40072
Standard Company under IBC Act                    26214
Business Company Limited by Shares                25130
Client Sundry Account                              8692
Cook Islands Asset Protection Trust                 547
Domestic Company                                    462
Cook Islands Asset Protection Trust - 3520A         445
S

### Discard 'company_type'

Completely empty column in two of the investigations. Not enough values to be interesting in the other two.

## Final decision on columns to keep

In [35]:
keep_cols = ['node_id', 'sourceID', 'name', 'incorporation_date', 'country_codes', 'countries',
             'jurisdiction_description', 'jurisdiction', 'service_provider', 'status']

bahamas_entity_raw = pd.read_csv('../data/raw/bahamas_leaks/bahamas_leaks.nodes.entity.csv',
                                 usecols = keep_cols)
offshore_entity_raw = pd.read_csv('../data/raw/offshore_leaks/offshore_leaks.nodes.entity.csv',
                                 usecols = keep_cols)
panama_entity_raw = pd.read_csv('../data/raw/panama_papers/panama_papers.nodes.entity.csv',
                                 usecols = keep_cols)
paradise_entity_raw = pd.read_csv('../data/raw/paradise_papers/paradise_papers.nodes.entity.csv',
                                 usecols = keep_cols)

  interactivity=interactivity, compiler=compiler, result=result)


### Determine what data types to use for each column

Blogs on reducing dataframe memory usage:

- https://www.dataquest.io/blog/pandas-big-data/
- https://towardsdatascience.com/why-and-how-to-use-pandas-with-large-data-9594dda2ea4c

In [18]:
panama_entity_raw[keep_cols].head()

Unnamed: 0,node_id,sourceID,name,incorporation_date,country_codes,countries,jurisdiction_description,jurisdiction,service_provider,status
0,10000001,Panama Papers,"TIANSHENG INDUSTRY AND TRADING CO., LTD.",23-MAR-2006,HKG,Hong Kong,Samoa,SAM,Mossack Fonseca,Defaulted
1,10000002,Panama Papers,"NINGBO SUNRISE ENTERPRISES UNITED CO., LTD.",27-MAR-2006,HKG,Hong Kong,Samoa,SAM,Mossack Fonseca,Defaulted
2,10000003,Panama Papers,"HOTFOCUS CO., LTD.",10-JAN-2006,HKG,Hong Kong,Samoa,SAM,Mossack Fonseca,Defaulted
3,10000004,Panama Papers,"SKY-BLUE GIFTS & TOYS CO., LTD.",06-JAN-2006,HKG,Hong Kong,Samoa,SAM,Mossack Fonseca,Defaulted
4,10000005,Panama Papers,FORTUNEMAKER INVESTMENTS CORPORATION,19-APR-2006,HKG,Hong Kong,Samoa,SAM,Mossack Fonseca,Changed agent


In [38]:
datasets = [bahamas_entity_raw, offshore_entity_raw, panama_entity_raw, paradise_entity_raw]

for ds in datasets:
    print(ds.sourceID[0])
    print(ds.info(memory_usage='deep'), '\n')

Bahamas Leaks
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175888 entries, 0 to 175887
Data columns (total 10 columns):
country_codes               0 non-null float64
countries                   0 non-null float64
node_id                     175888 non-null int64
sourceID                    175888 non-null object
name                        175888 non-null object
jurisdiction_description    175888 non-null object
service_provider            0 non-null float64
jurisdiction                175888 non-null object
incorporation_date          175871 non-null object
status                      0 non-null float64
dtypes: float64(4), int64(1), object(5)
memory usage: 64.1 MB
None 

Offshore Leaks
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105516 entries, 0 to 105515
Data columns (total 10 columns):
node_id                     105516 non-null int64
name                        105516 non-null object
jurisdiction                105516 non-null object
jurisdiction_description    105516 

In [20]:
for ds in datasets:
    print(ds.sourceID[0])
    for dtype in ['int', 'float', 'object']:
        selected_dtype = ds.select_dtypes(include=[dtype])
        mb = 1024*1024
        mean_usage_mb = selected_dtype.memory_usage(deep=True).mean() / mb
        print('Average memory usage for {} columns: {:03.2f} MB'.format(dtype, mean_usage_mb))
    print('\n')

Bahamas Leaks
Average memory usage for int columns: 0.00 MB
Average memory usage for float columns: 1.07 MB
Average memory usage for object columns: 9.56 MB


Offshore Leaks
Average memory usage for int columns: 0.00 MB
Average memory usage for float columns: 0.00 MB
Average memory usage for object columns: 6.40 MB


Panama Papers
Average memory usage for int columns: 0.00 MB
Average memory usage for float columns: 0.00 MB
Average memory usage for object columns: 12.53 MB


Paradise Papers - Aruba corporate registry
Average memory usage for int columns: 0.00 MB
Average memory usage for float columns: 0.00 MB
Average memory usage for object columns: 15.04 MB




In [39]:
for ds in datasets:
    print(ds.sourceID[0])
    mb = 1024*1024
    print(ds.memory_usage(deep=True)/mb, '\n')

Bahamas Leaks
Index                        0.000076
country_codes                1.341919
countries                    1.341919
node_id                      1.341919
sourceID                    11.741791
name                        13.436182
jurisdiction_description    10.735352
service_provider             1.341919
jurisdiction                10.064392
incorporation_date          11.405727
status                       1.341919
dtype: float64 

Offshore Leaks
Index                       0.000076
node_id                     0.805023
name                        8.280141
jurisdiction                6.041529
jurisdiction_description    7.239896
country_codes               6.218044
countries                   8.053644
incorporation_date          6.520695
status                      6.568155
service_provider            7.944076
sourceID                    7.144581
dtype: float64 

Panama Papers
Index                        0.000076
node_id                      1.629898
name                  

In [21]:
for ds in datasets:
    print(ds.sourceID[0])
    print(ds.nunique())
    print('\n')

Bahamas Leaks
country_codes                    0
countries                        0
node_id                     175888
sourceID                         1
name                        175514
jurisdiction_description         1
service_provider                 0
jurisdiction                     1
incorporation_date            6732
status                           0
dtype: int64


Offshore Leaks
node_id                     105516
name                        105114
jurisdiction                    27
jurisdiction_description        28
country_codes                  400
countries                      400
incorporation_date            4868
status                          21
service_provider                 2
sourceID                         1
dtype: int64


Panama Papers
node_id                     213634
name                        206525
jurisdiction                    21
jurisdiction_description        21
country_codes                  159
countries                      159
incorporation_dat

### Notes on areas not recorded in notebook

- Turning node_id into an index increases the size of the index by a substantial amount (int32 data format uses 0.2 - 0.55 MB on the node_id column depending on the dataset, whereas when formatted as an index uses 5.81 - 12.21 MB). Probaby for the same reason indexing on a column in a database makes that column consume more memory
- Ability of int categories to hold data:
  + bool: consumes 1 byte, true or false
  + int8 / uint8: -128 - 127 or 255
  + float16 / int16 / uint16: -32,786 - 32,767 or 65,535
  + float32 / int32 / uint32: -2,147,483,648 - 2,147,483,647 or 4,294,987,295
  + float64 / int64 / uint64: if you need more than this, you're doing it wrong

# Final decision on data types

In [40]:
keep_cols = ['node_id', 'sourceID', 'name', 'incorporation_date', 'country_codes', 'countries',
             'jurisdiction_description', 'jurisdiction', 'service_provider', 'status']
dtypes = {'node_id': 'int32', 'sourceID':'category', 'name':'object', 'country_codes':'category', 'countries':'category',
          'jurisdiction_description':'category', 'jurisdiction':'category', 'service_provider':'category', 'status':'category'}

bahamas_entity_raw = pd.read_csv('../data/raw/bahamas_leaks/bahamas_leaks.nodes.entity.csv', 
                                 usecols = keep_cols,
                                 dtype=dtypes,
                                 parse_dates=['incorporation_date'])
offshore_entity_raw = pd.read_csv('../data/raw/offshore_leaks/offshore_leaks.nodes.entity.csv', 
                                 usecols = keep_cols,
                                 dtype=dtypes,
                                 parse_dates=['incorporation_date'])
panama_entity_raw = pd.read_csv('../data/raw/panama_papers/panama_papers.nodes.entity.csv', 
                                 usecols = keep_cols,
                                 dtype=dtypes,
                                 parse_dates=['incorporation_date'])
paradise_entity_raw = pd.read_csv('../data/raw/paradise_papers/paradise_papers.nodes.entity.csv', 
                                 usecols = keep_cols,
                                 dtype=dtypes,
                                 parse_dates=['incorporation_date'])

In [41]:
datasets = [bahamas_entity_raw, offshore_entity_raw, panama_entity_raw, paradise_entity_raw]

for ds in datasets:
    print(ds.sourceID[0])
    print(ds.info(memory_usage='deep'), '\n')

Bahamas Leaks
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175888 entries, 0 to 175887
Data columns (total 10 columns):
country_codes               0 non-null category
countries                   0 non-null category
node_id                     175888 non-null int32
sourceID                    175888 non-null category
name                        175888 non-null object
jurisdiction_description    175888 non-null category
service_provider            0 non-null category
jurisdiction                175888 non-null category
incorporation_date          175871 non-null datetime64[ns]
status                      0 non-null category
dtypes: category(7), datetime64[ns](1), int32(1), object(1)
memory usage: 16.6 MB
None 

Offshore Leaks
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105516 entries, 0 to 105515
Data columns (total 10 columns):
node_id                     105516 non-null int32
name                        105516 non-null object
jurisdiction                105516 non-null cate

In [42]:
for ds in datasets:
    print(ds.sourceID[0])
    for dtype in ['int', 'float', 'object']:
        selected_dtype = ds.select_dtypes(include=[dtype])
        mb = 1024*1024
        mean_usage_mb = selected_dtype.memory_usage(deep=True).mean() / mb
        print('Average memory usage for {} columns: {:03.2f} MB'.format(dtype, mean_usage_mb))
    print('\n')

Bahamas Leaks
Average memory usage for int columns: 0.34 MB
Average memory usage for float columns: 0.00 MB
Average memory usage for object columns: 6.72 MB


Offshore Leaks
Average memory usage for int columns: 0.20 MB
Average memory usage for float columns: 0.00 MB
Average memory usage for object columns: 4.14 MB


Panama Papers
Average memory usage for int columns: 0.41 MB
Average memory usage for float columns: 0.00 MB
Average memory usage for object columns: 8.26 MB


Paradise Papers - Aruba corporate registry
Average memory usage for int columns: 0.55 MB
Average memory usage for float columns: 0.00 MB
Average memory usage for object columns: 13.77 MB




In [43]:
for ds in datasets:
    print(ds.sourceID[0])
    mb = 1024*1024
    print(ds.memory_usage(deep=True)/mb, '\n')

Bahamas Leaks
Index                        0.000076
country_codes                0.167740
countries                    0.167740
node_id                      0.670959
sourceID                     0.167807
name                        13.436182
jurisdiction_description     0.167801
service_provider             0.167740
jurisdiction                 0.167797
incorporation_date           1.341919
status                       0.167740
dtype: float64 

Offshore Leaks
Index                       0.000076
node_id                     0.402512
name                        8.280141
jurisdiction                0.103417
jurisdiction_description    0.103644
country_codes               0.245098
countries                   0.252342
incorporation_date          0.805023
status                      0.102695
service_provider            0.100780
sourceID                    0.100696
dtype: float64 

Panama Papers
Index                        0.000076
node_id                      0.814949
name                  