In [1]:
import pandas as pd
import numpy as np

# Notebook-wide display settings: floats are rendered with one decimal place
# and the column limit is lifted so wide frames are shown in full.
pd.set_option('display.float_format', '{:.1f}'.format)
pd.options.display.max_columns = None
# pd.set_option('display.max_rows', None)

In [6]:
# Stata intermediate data outputs.
# The target names and the .dta paths below are paired by position.
(
    full_data_stata,
    before_mcp_stata,
    rca_stata,
    mcp_stata,
    ecomp_stata,
    reliable_stata,
) = (
    pd.read_stata(dta_path)
    for dta_path in (
        "../../../stata_files/full_data.dta",
        "../../../stata_files/before_mcp_matrix.dta",
        "../../../stata_files/mcp_rca_matrix.dta",
        "../../../stata_files/mcp_matrix.dta",
        "../../../stata_files/ecomplexity.dta",
        "../../../stata_files/complexity_reliable_countries.dta",
    )
)
# c_all = pd.read_stata("../../../stata_files/complexity_all_countries.dta")
# c_allcp = pd.read_stata("../../../stata_files/complexity_all_countries_all_products.dta")

In [7]:
# Python intermediate data outputs.
# The target names and the parquet paths below are paired by position.
(
    full_data_py,
    before_mcp_py,
    rca_py,
    mcp_py,
    ecomp_py,
    reliable_py,
) = map(
    pd.read_parquet,
    [
        "../data/intermediate/H0_2015_complexity_all_countries.parquet",
        "../data/intermediate/H0_2015_before_mcp.parquet",
        "../data/intermediate/H0_2015_mcp_rca.parquet",
        "../data/intermediate/H0_2015_mcp.parquet",
        "../data/intermediate/H0_2015_complexitytest.parquet",
        "../data/intermediate/H0_2015_reliable_countries.parquet",
    ],
)


In [8]:
def stata_python_data_review(df_stata, df_py, merge_on, value_checks):
    """Compare a Stata output frame against its Python counterpart.

    The two frames are outer-merged on ``merge_on``; columns present in both
    frames get the suffixes ``_py`` and ``_stata``. For every name in
    ``value_checks`` a ``<name>_diff`` column (Stata minus Python) is added
    and the pair is tested with ``np.isclose`` (default tolerances). Progress
    and the per-column verdict are printed.

    Parameters
    ----------
    df_stata, df_py : pandas.DataFrame
        Frames to compare. Each checked column must exist in both frames so
        that the merge produces ``<name>_py`` and ``<name>_stata``.
    merge_on : list of str
        Key columns used for the outer merge.
    value_checks : iterable of str
        Unsuffixed names of the numeric columns to compare.

    Returns
    -------
    merged : pandas.DataFrame
        The merged frame including every ``<name>_diff`` column.
    errors : pandas.DataFrame
        Rows of ``merged`` where at least one checked column is not close.
        Each failing row appears once and carries all ``_diff`` columns.
        Rows found in only one input have NaN on the other side and are
        therefore reported here as well (NaN is never close).

    Raises
    ------
    KeyError
        If a checked column is missing its ``_py`` or ``_stata`` variant.
    """
    print(f"stata df shape: {df_stata.shape} py df shape {df_py.shape}")
    merged = df_py.merge(df_stata, on=merge_on, how='outer', suffixes=('_py', '_stata'))

    # Accumulate one row-level mask instead of concatenating per-column
    # snapshots: snapshots taken mid-loop lack the later *_diff columns and
    # duplicate rows that fail more than one check.
    any_mismatch = np.zeros(len(merged), dtype=bool)
    for col in value_checks:
        print(f"reviewing {col}")
        stata_vals = merged[f'{col}_stata']
        py_vals = merged[f'{col}_py']
        merged[f'{col}_diff'] = stata_vals - py_vals
        is_close = np.asarray(np.isclose(stata_vals, py_vals), dtype=bool)
        print(f"floating point precision is met: {is_close.all()}")
        any_mismatch |= ~is_close

    # Select after the loop so every error row has the full set of diff columns.
    errors = merged[any_mismatch]
    return merged, errors
    

In [5]:
# full data
# The Stata frame's `year` column is dropped before the comparison.
# errors='ignore' keeps this cell re-runnable: once the name has been rebound
# to the frame without `year`, a plain drop would raise KeyError on a re-run.
full_data_stata = full_data_stata.drop(columns='year', errors='ignore')
full_data_merged, not_close = stata_python_data_review(
    full_data_stata,
    full_data_py,
    ['exporter', 'commoditycode'],
    ['export_value', 'import_value'],
)

stata df shape: (289224, 7) py df shape (289224, 7)
reviewing export_value
floating point precision is met: True
reviewing import_value
floating point precision is met: True


In [6]:
# before mcp
# The bare `.columns` / `.shape` tuples that used to precede this call were
# evaluated and discarded (only a cell's last expression is rendered);
# stata_python_data_review already prints both shapes.
before_mcp_merged, not_close = stata_python_data_review(
    before_mcp_stata,
    before_mcp_py,
    ['exporter', 'commoditycode'],
    ['export_value'],
)

stata df shape: (163989, 5) py df shape (163989, 5)
reviewing export_value
floating point precision is met: True


In [7]:
# rca matrix
# Rename the Stata "HH" column so it carries the name that is checked below.
rename_dict = {"HH": "HH_index"}
rca_stata = rca_stata.rename(columns=rename_dict)
rca_merged, not_close_rca = stata_python_data_review(
    df_stata=rca_stata,
    df_py=rca_py,
    merge_on=['exporter', 'commoditycode'],
    value_checks=['export_value', 'rca', 'mcp', 'HH_index'],
)

stata df shape: (163989, 11) py df shape (163989, 8)
reviewing export_value
floating point precision is met: True
reviewing rca
floating point precision is met: True
reviewing mcp
floating point precision is met: True
reviewing HH_index
floating point precision is met: True


In [8]:
# mcp
# Map the Stata column names onto the names used in the comparison below.
rename_dict = {
    'HH': 'HH_index',
    'cumshare': 'cumul_share',
    'ene': 'eff_exporters',
    'c1': 'flag_for_small_share',
    'c2': 'flag_for_few_exporters',
    'c3': 'flag_for_low_ubiquity',
    'call': 'exclude_flag'
}

mcp_stata = mcp_stata.rename(columns = rename_dict)
# Column names of interest; not referenced by the comparison call below,
# which value-checks only a subset of them.
mcp_cols = ['commoditycode', 'export_value', 'HH_index', 'mcp', 'share',
        'cumul_share', 'eff_exporters', 'flag_for_small_share',
        'flag_for_few_exporters', 'flag_for_low_ubiquity', 'exclude_flag']
# stata_python_data_review returns (merged, errors). Unpack both so that
# mcp_merged is a DataFrame like the other *_merged frames, not a tuple.
mcp_merged, not_close_mcp = stata_python_data_review(
    mcp_stata,
    mcp_py,
    ['commoditycode'],
    ['export_value', 'HH_index', 'mcp', 'share', 'exclude_flag'],
)

stata df shape: (1233, 11) py df shape (1233, 11)
reviewing export_value
floating point precision is met: True
reviewing HH_index
floating point precision is met: True
reviewing mcp
floating point precision is met: True
reviewing share
floating point precision is met: True
reviewing exclude_flag
floating point precision is met: True


In [10]:
# ecomp
# The bare `.shape` / `.columns` tuples that used to open this cell were
# evaluated and discarded (only a cell's last expression is rendered);
# stata_python_data_review already prints both shapes.
# Rename Stata's "M" to "mcp" and cast the columns to explicit dtypes before
# comparing (presumably to match the Python frame's dtypes - confirm).
ecomp_stata = (
    ecomp_stata
    .rename(columns={'M': 'mcp'})
    .astype({
        'exporter': 'object',
        'commoditycode': 'object',
        'export_value': 'float32',
        'diversity': 'int64',
        'ubiquity': 'int64',
        'mcp': 'int64',
        'eci': 'float64',
        'pci': 'float64',
        'density': 'float64',
        'coi': 'float64',
        'cog': 'float64',
        'rca': 'float32'
    })
)
ecomp_merged, not_close_ecomp = stata_python_data_review(
    ecomp_stata,
    ecomp_py,
    ['exporter', 'commoditycode'],
    ['export_value', 'diversity', 'ubiquity', 'mcp', 'eci', 'pci', 'density', 'coi', 'cog', 'rca'],
)

stata df shape: (150822, 14) py df shape (150822, 13)
reviewing export_value
floating point precision is met: True
reviewing diversity
floating point precision is met: True
reviewing ubiquity
floating point precision is met: True
reviewing mcp
floating point precision is met: True
reviewing eci
floating point precision is met: True
reviewing pci
floating point precision is met: True
reviewing density
floating point precision is met: False
reviewing coi
floating point precision is met: False
reviewing cog
floating point precision is met: False
reviewing rca
floating point precision is met: True


In [11]:
# Column order for review: keys first, then (py, stata, diff) triples for each
# compared metric, then the columns that are not compared.
cols = ['year', 'exporter', 'commoditycode', 
          'export_value_py', 'export_value_stata', 'export_value_diff',
          'diversity_py', 'diversity_stata', 'diversity_diff',
          'ubiquity_py', 'ubiquity_stata', 'ubiquity_diff',
          'mcp_py', 'mcp_stata', 'mcp_diff',
          'eci_py', 'eci_stata', 'eci_diff',
          'pci_py', 'pci_stata', 'pci_diff',
          'density_py', 'density_stata', 'density_diff',
          'coi_py', 'coi_stata', 'coi_diff',
          'cog_py', 'cog_stata', 'cog_diff',
          'rca_py', 'rca_stata', 'rca_diff',
          'population', 'gdp_pc']
not_close_ecomp = not_close_ecomp[cols]
ecomp_merged = ecomp_merged[cols]
# cog_diff is signed (Stata minus Python), so filter on its magnitude;
# a one-sided `> 0.1` hides rows where the Python value is the larger one.
ecomp_merged[ecomp_merged['cog_diff'].abs() > 0.1]

Unnamed: 0,year,exporter,commoditycode,export_value_py,export_value_stata,export_value_diff,diversity_py,diversity_stata,diversity_diff,ubiquity_py,ubiquity_stata,ubiquity_diff,mcp_py,mcp_stata,mcp_diff,eci_py,eci_stata,eci_diff,pci_py,pci_stata,pci_diff,density_py,density_stata,density_diff,coi_py,coi_stata,coi_diff,cog_py,cog_stata,cog_diff,rca_py,rca_stata,rca_diff,population,gdp_pc


In [25]:
# Preview the first rows of the Stata reliable-countries frame and of the
# Python ecomplexity frame.
# NOTE(review): only the last expression of a cell is rendered, so the
# reliable_stata preview on the next line is computed and then discarded.
reliable_stata.head()
ecomp_py.head()

Unnamed: 0,exporter,commoditycode,export_value,year,diversity,ubiquity,mcp,eci,pci,density,coi,cog,rca
0,AGO,101,0.0,2015,8,15,0,-1.1,0.7,0.0,-1.2,0.5,0.0
1,AGO,102,1114.0,2015,8,41,0,-1.1,-0.2,0.0,-1.2,-0.1,0.0
2,AGO,103,0.0,2015,8,17,0,-1.1,1.9,0.0,-1.2,0.6,0.0
3,AGO,104,0.0,2015,8,21,0,-1.1,-1.6,0.0,-1.2,-0.2,0.0
4,AGO,105,9279.8,2015,8,33,0,-1.1,0.9,0.0,-1.2,0.3,0.0


In [9]:
# reliable country set
# Strip Stata's "1" suffixes and map "M" to "mcp" before comparing.
rename = {
    "rca1": "rca",
    "M": "mcp",
    "eci1": "eci",
    "pci1": "pci",
    "density1": "density",
}
reliable_stata = reliable_stata.rename(columns=rename)
reliable_merged, not_close_reliable = stata_python_data_review(
    df_stata=reliable_stata,
    df_py=reliable_py,
    merge_on=['exporter', 'commoditycode'],
    value_checks=['eci', 'pci', 'rca'],
)

stata df shape: (150822, 14) py df shape (150822, 19)
reviewing eci
floating point precision is met: True
reviewing pci
floating point precision is met: True
reviewing rca
floating point precision is met: True


In [34]:
# Narrow the reliable-country mismatches to the key, RCA and ECI columns,
# ordered by the ECI difference, then show the rows whose Python RCA is below 1.
eci_review = (
    not_close_reliable
    .loc[:, ['exporter', 'commoditycode', 'rca_py', 'rca_stata', 'eci_py', 'eci_stata', 'eci_diff']]
    .sort_values(by='eci_diff')
)
eci_review.loc[eci_review['rca_py'] < 1]

Unnamed: 0,exporter,commoditycode,rca_py,rca_stata,eci_py,eci_stata,eci_diff
100368,NGA,5504,0.0,0.0,-0.1,-2.2,-2.1
100372,NGA,5509,0.0,0.0,-0.1,-2.2,-2.1
100373,NGA,5510,0.0,0.0,-0.1,-2.2,-2.1
100371,NGA,5508,0.1,0.1,-0.1,-2.2,-2.1
100370,NGA,5506,0.2,0.2,-0.1,-2.2,-2.1
...,...,...,...,...,...,...,...
68690,JPN,6214,0.1,0.1,0.0,2.4,2.4
68557,JPN,4812,0.4,0.4,0.0,2.4,2.4
68558,JPN,4813,0.0,0.0,0.0,2.4,2.4
68523,JPN,4410,0.0,0.0,0.0,2.4,2.4


In [9]:
# List the column names of the Stata reliable-countries frame.
reliable_stata.columns

Index(['exporter', 'commoditycode', 'export_value', 'population', 'gdp_pc',
       'rca', 'mcp', 'density1', 'eci', 'pci', 'diversity', 'ubiquity', 'coi',
       'cog'],
      dtype='object')

In [None]:
reliabl