In [1]:
import numpy as np
import pandas as pd

pd.options.display.max_rows = 999
pd.options.display.max_columns = 999
pd.options.display.max_colwidth = 150

In [5]:
keep_cols = ['node_id', 'sourceID', 'name', 'countries', 'country_codes']
dtypes = {'node_id': 'int32', 'sourceID':'category', 'name':'object',
            'countries':'category', 'country_codes':'category'}

bahamas_intermediary_raw = pd.read_csv('../data/raw/bahamas_leaks/bahamas_leaks.nodes.intermediary.csv', 
                                 usecols = keep_cols,
                                 dtype=dtypes)
bahamas_intermediary_raw = bahamas_intermediary_raw[keep_cols]

offshore_intermediary_raw = pd.read_csv('../data/raw/offshore_leaks/offshore_leaks.nodes.intermediary.csv', 
                                 usecols = keep_cols,
                                 dtype=dtypes)
offshore_intermediary_raw = offshore_intermediary_raw[keep_cols]

panama_intermediary_raw = pd.read_csv('../data/raw/panama_papers/panama_papers.nodes.intermediary.csv', 
                                 usecols = keep_cols,
                                 dtype=dtypes)
panama_intermediary_raw = panama_intermediary_raw[keep_cols]

paradise_intermediary_raw = pd.read_csv('../data/raw/paradise_papers/paradise_papers.nodes.intermediary.csv', 
                                 usecols = keep_cols,
                                 dtype=dtypes)
paradise_intermediary_raw = paradise_intermediary_raw[keep_cols]

### To Do

- Standardize the names (Title case)
- There are companies and people in the name field. Do you want to break this into different tables? One with specific people and one with companies?
- Break peope into first and last name?
- Companies are broken out by their branch office. Do you want to make a Corporate column that unites these offices?

In [6]:
intermediary_df = pd.concat([bahamas_intermediary_raw, offshore_intermediary_raw, panama_intermediary_raw, paradise_intermediary_raw])
intermediary_df.head()

Unnamed: 0,node_id,sourceID,name,countries,country_codes
0,23000001,Bahamas Leaks,Internal User,,
1,23000002,Bahamas Leaks,M & A CORPORATE SERVICES LIMITED,Bahamas,BHS
2,23000003,Bahamas Leaks,BRITANNIA CONSULTING GROUP,Bahamas,BHS
3,23000004,Bahamas Leaks,ABACO FSC LTD.,Bahamas,BHS
4,23000005,Bahamas Leaks,OL PRIVATE CORPORATE COUNSEL LTD.,Bahamas,BHS


In [18]:
intermediary_df['sourceID'].value_counts()

Panama Papers                                    14110
Offshore Leaks                                    9526
Paradise Papers - Barbados corporate registry      974
Bahamas Leaks                                      541
Paradise Papers - Bahamas corporate registry       239
Paradise Papers - Appleby                          185
Paradise Papers - Nevis corporate registry          96
Paradise Papers - Aruba corporate registry          74
Name: sourceID, dtype: int64

In [12]:
print(intermediary_df[intermediary_df['name'].duplicated(keep=False)].sort_values('name').shape, '\n')
print(intermediary_df[intermediary_df['name'].duplicated(keep=False)].sort_values('name').nunique(), '\n')
intermediary_df[intermediary_df['name'].duplicated(keep=False)].sort_values('name').head()

(1706, 5) 

node_id          1706
sourceID            7
name              583
countries          61
country_codes      61
dtype: int64 



Unnamed: 0,node_id,sourceID,name,countries,country_codes
1402,135063428,Paradise Papers - Barbados corporate registry,(BARBADOS) INC. SG GLOBAL CONSULTING,,
1254,135126973,Paradise Papers - Barbados corporate registry,(BARBADOS) INC. SG GLOBAL CONSULTING,,
607,135020603,Paradise Papers - Barbados corporate registry,(BARBADOS) INC. SG GLOBAL CONSULTING,,
1164,135075791,Paradise Papers - Barbados corporate registry,(BARBADOS) LTD. AMICORP,,
971,135116404,Paradise Papers - Barbados corporate registry,(BARBADOS) LTD. AMICORP,,


In [14]:
print(intermediary_df[intermediary_df['name'].str.title().duplicated(keep=False)].sort_values('name').shape, '\n')
print(intermediary_df[intermediary_df['name'].str.title().duplicated(keep=False)].sort_values('name').nunique(), '\n')
intermediary_df[intermediary_df['name'].str.title().duplicated(keep=False)].sort_values('name').tail()

(2193, 5) 

node_id          2193
sourceID            8
name             1070
countries          69
country_codes      69
dtype: int64 



Unnamed: 0,node_id,sourceID,name,countries,country_codes
9121,297421,Offshore Leaks,YenBle Management Limited,Taiwan,TWN
9134,297440,Offshore Leaks,Yew Su-Sun,Hong Kong,HKG
9135,297441,Offshore Leaks,Yew Su-Sun,Hong Kong,HKG
3567,11003568,Panama Papers,ZETLAND CORPORATE SERVICES LIMITED,Hong Kong,HKG
2990,291134,Offshore Leaks,Zetland Corporate Services Limited,Hong Kong,HKG


In [15]:
for col in ['countries', 'country_codes']:
    print(col)
    print(intermediary_df[col].value_counts(), '\n')
    
# countries = Not Identified. There are some in lists. You'll need to get rid of those too
# country_codes = XXX. There are some in lists. You'll need to get rid of those too

countries
Hong Kong                                              4895
United Kingdom                                         1545
United States                                          1510
Taiwan                                                 1323
Switzerland                                            1321
Singapore                                              1307
Bahamas                                                 897
China                                                   646
Panama                                                  559
Indonesia                                               522
Guatemala                                               441
Luxembourg                                              403
Brazil                                                  402
Not identified                                          394
Thailand                                                358
Malaysia                                                350
Ecuador                       

In [17]:
intermediary_df['name'] = intermediary_df['name'].str.title()
intermediary_df.head()

Unnamed: 0,node_id,sourceID,name,countries,country_codes
0,23000001,Bahamas Leaks,Internal User,,
1,23000002,Bahamas Leaks,M & A Corporate Services Limited,Bahamas,BHS
2,23000003,Bahamas Leaks,Britannia Consulting Group,Bahamas,BHS
3,23000004,Bahamas Leaks,Abaco Fsc Ltd.,Bahamas,BHS
4,23000005,Bahamas Leaks,Ol Private Corporate Counsel Ltd.,Bahamas,BHS


In [None]:
intermediary_df.to_csv('../data/intermediate/intermediary.csv')