In [None]:
import numpy as np
import pandas as pd

# Raise pandas' display limits so long listings and wide frames are shown
# with far less truncation.
pd.set_option('display.max_rows', 999)
pd.set_option('display.max_columns', 999)
pd.set_option('display.max_colwidth', 150)

In [40]:
# Columns kept from each investigation's entity-node CSV.
keep_cols = [
    'node_id', 'sourceID', 'name', 'incorporation_date', 'country_codes', 'countries',
    'jurisdiction_description', 'jurisdiction', 'service_provider', 'status',
]

# Explicit dtypes for every kept column except `incorporation_date`, which is
# handed to `parse_dates` instead.
dtypes = {
    'node_id': 'int32',
    'sourceID': 'category',
    'name': 'object',
    'country_codes': 'category',
    'countries': 'category',
    'jurisdiction_description': 'category',
    'jurisdiction': 'category',
    'service_provider': 'category',
    'status': 'category',
}


def _read_entity_nodes(csv_path):
    """Read one entity-node CSV using the shared column subset, dtypes and date parsing."""
    return pd.read_csv(
        csv_path,
        usecols=keep_cols,
        dtype=dtypes,
        parse_dates=['incorporation_date'],
    )


bahamas_entity_raw = _read_entity_nodes('../data/raw/bahamas_leaks/bahamas_leaks.nodes.entity.csv')
offshore_entity_raw = _read_entity_nodes('../data/raw/offshore_leaks/offshore_leaks.nodes.entity.csv')
panama_entity_raw = _read_entity_nodes('../data/raw/panama_papers/panama_papers.nodes.entity.csv')
paradise_entity_raw = _read_entity_nodes('../data/raw/paradise_papers/paradise_papers.nodes.entity.csv')

In [None]:
# All four entity tables, collected so the per-dataset summary loops can iterate over them.
datasets = [
    bahamas_entity_raw,
    offshore_entity_raw,
    panama_entity_raw,
    paradise_entity_raw,
]

## status

In [42]:
# Completeness / cardinality report for the `status` column of each dataset.
for ds in datasets:
    print('Investigation:', ds.sourceID.unique()[0])
    ds_col = ds.status

    # Compute each figure once and reuse it in the derived differences below.
    n_rows = ds_col.shape[0]
    n_populated = ds_col.count()   # non-null entries
    n_unique = ds_col.nunique()    # distinct non-null labels

    print('Number of rows in dataset:', n_rows)
    print('Number of populated rows:', n_populated)
    print('Difference between the number of rows and the populated rows:', n_rows - n_populated)
    print('Number of unique entries:', n_unique)
    print('Difference between the number of populated entries and the number of unique entries:', n_populated - n_unique)
    print(ds_col.value_counts(), '\n')

Investigation: Bahamas Leaks
Number of rows in dataset: 175888
Number of populated rows: 0
Difference between the number of rows and the populated rows: 175888
Number of unique entries: 0
Difference between the number of populated entries and the number of unique entries: 0
Series([], Name: status, dtype: int64) 

Investigation: Offshore Leaks
Number of rows in dataset: 105516
Number of populated rows: 96475
Difference between the number of rows and the populated rows: 9041
Number of unique entries: 21
Difference between the number of populated entries and the number of unique entries: 96454
Active                                   46004
Dead                                     23095
Struck / Defunct / Deregistered          19486
Transferred OUT                           2637
Transferred Out                           1807
Liquidated                                 826
Company liquidated                         742
Not To Be Renewed / In Deregistration      663
Shelf                    

In [None]:
# NOTE(review): the statement that used to run here operated on `df['comorbidity']`.
# No `df` is defined anywhere in this notebook, so the cell raised NameError under
# Restart & Run All and stopped every later cell from executing.
# It appears to be a pasted reference for the `.loc[... .str.contains(...)]`
# relabelling pattern, so it is kept as a comment only:
#   df.loc[df['comorbidity'].str.contains('anxi|depression|ptsd', na=False), 'comorbidity'] = 'Psych'

In [53]:
# Preview the first few Offshore Leaks status labels after title-casing.
offshore_entity_raw['status'].str.title().head()

node_id
67028                Dead
67243                Dead
67258          Liquidated
67266     Transferred Out
108050                NaN
Name: status, dtype: object

In [62]:
import nltk
import string
import re

In [72]:
# Ordered (regex pattern, canonical label) rules for collapsing status variants.
# Order matters: each rule is evaluated against the column as already rewritten
# by the rules before it.
status_rules = [
    ('liquidation', 'in liquidation'),
    ('liquidated', 'liquidated'),
    ('resigned', 'resigned agent'),
    ('sundry', 'sundry account'),
    ('dissolved', 'dissolved'),
    ('struck|defunct|deregistered', 'struck / defunct / deregistered'),
]

# Bahamas Leaks is not in this list; the summary above reports no populated
# status rows for it.
for ds in [offshore_entity_raw, panama_entity_raw, paradise_entity_raw]:
    # Lower-case first so matching is case-insensitive.
    ds['status'] = ds['status'].str.lower()
    for pattern, canonical in status_rules:
        matches = ds['status'].str.contains(pattern, na=False)  # NaN rows never match
        ds.loc[matches, 'status'] = canonical
    # Title-case at the end so case-only variants share a single label.
    ds['status'] = ds['status'].str.title()


In [86]:
# List the distinct title-cased labels among Offshore Leaks statuses containing "transferred".
(
    offshore_entity_raw
    .loc[offshore_entity_raw['status'].str.lower().str.contains('transferred', na=False), 'status']
    .str.title()
    .unique()
)

array(['Transferred Out'], dtype=object)

In [73]:
# Same per-dataset `status` report as earlier in the notebook, printed again
# so the label counts can be compared.
for ds in datasets:
    print('Investigation:', ds.sourceID.unique()[0])
    ds_col = ds.status
    total, filled, distinct = ds_col.shape[0], ds_col.count(), ds_col.nunique()

    report = [
        ('Number of rows in dataset:', total),
        ('Number of populated rows:', filled),
        ('Difference between the number of rows and the populated rows:', total - filled),
        ('Number of unique entries:', distinct),
        ('Difference between the number of populated entries and the number of unique entries:', filled - distinct),
    ]
    for label, value in report:
        print(label, value)
    print(ds_col.value_counts(), '\n')

Investigation: Bahamas Leaks
Number of rows in dataset: 175888
Number of populated rows: 0
Difference between the number of rows and the populated rows: 175888
Number of unique entries: 0
Difference between the number of populated entries and the number of unique entries: 0
Series([], Name: status, dtype: int64) 

Investigation: Offshore Leaks
Number of rows in dataset: 105516
Number of populated rows: 96475
Difference between the number of rows and the populated rows: 9041
Number of unique entries: 21
Difference between the number of populated entries and the number of unique entries: 96454
Active                                   46004
Dead                                     23095
Struck / Defunct / Deregistered          19486
Transferred OUT                           2637
Transferred Out                           1807
Liquidated                                 826
Company liquidated                         742
Not To Be Renewed / In Deregistration      663
Shelf                    

In [None]:
# Write the four entity tables to the intermediate data directory.
# All paths are relative (`../data/intermediate/`), matching the `../data/raw/`
# paths used when loading. The Paradise Papers path previously started with
# `/data/...`, an absolute path pointing at a different location.
bahamas_entity_raw.to_csv('../data/intermediate/bahamas_entity.csv')
offshore_entity_raw.to_csv('../data/intermediate/offshore_entity.csv')
panama_entity_raw.to_csv('../data/intermediate/panama_entity.csv')
paradise_entity_raw.to_csv('../data/intermediate/paradise_entity.csv')