In [1]:
### RUN EVERY TIME: COMTRADE DATASETS EXTRACTING

In [2]:
### RUN EVERY TIME: INITIALIZATION

import pandas as pd
import numpy as np
import requests
import json
import gc
import os
import datetime
import time
import itertools
import networkx as nx

In [3]:
### RUN EVERY TIME: VERSION CONTROL

from platform import python_version
print('python version: ', python_version())
print('numpy version: ', np.__version__)
print('pandas version: ', pd.__version__)

python version:  3.7.4
numpy version:  1.17.2
pandas version:  0.25.3


In [12]:
### RUN EVERY TIME: MAIN CONSTANTS

### MultiIndex level slice constant:
All = slice(None)
### Universe path:
str_path_universe = 'Data_Files/Source_Files/acadian_universe.xlsx'
### Dates:
str_date_end = '2022-12-31'
date_start = pd.Timestamp('1989-12-29')
date_end = pd.Timestamp(str_date_end)
date_ison = pd.Timestamp('1994-12-31')
### NA for MS Excel files:
list_na_excel_values = ['', '#N/A', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan', '1.#IND', '1.#QNAN', 'N/A', 'NULL', 'NaN', 'n/a', 'nan', 'null',
                        '#N/A Requesting Data...', '#N/A Invalid Security', '#N/A Field Not Applicable', '---']
### Checked EBOPS service IDs list (df_serv_to_gics['GICS Group Code']):
list_services = ['206', '210', '214', '218', '219', '223', '227', '231', '232', '237', '240', '246', '247', '250', '251', '254', '255', '256', '257', '258', '263',
                 '264', '269', '272', '273', '288', '289', '292', '293', '294', '310', '391', '431', '500', '888', '891', '892', '894', '950']
### UN Comtrade authentication:
unc_login = 'pavelb'
unc_pass = 'bodoapux'
unc_token = 'wqgBfTCn0Idq0LZWWAFKgj3YQYRKczgdnfmlQ3CkanmvQzoAlnL1oK1OJ0yVoCjSLjkUozAj0/dD4eCkSLJO/6pLCqK+iweXMqMazaADI+YqBOUPFySpbXM0CEZepZEuNl5bqxg50EPVB5lCrifsoA=='
### UN Comtrade raw data containers:
str_path_unc_raw_comm_annual = 'Data_Files/Source_Files/unc_raw_comm_annual.h5'
str_path_unc_raw_serv_annual = 'Data_Files/Source_Files/unc_raw_serv_annual.h5'
str_key_unc_raw = 'unc_raw'
### UN Comtrade adopted data containers:
str_path_unc_res_all_annual = 'Data_Files/Source_Files/unc_res_all_annual.h5'
str_path_unc_res_all_zz = 'Data_Files/Source_Files/unc_res_all_world.h5'
str_key_unc_res = 'unc_res'
### File with aggregated flows:
str_path_unc_res_flows = 'Data_Files/Source_Files/unc_res_flows.h5'
### Universal HDF5 key:
str_key_unc_res = 'unc_res'
### Augmented bilateral export:
str_path_export_bilateral = 'Data_Files/Source_Files/comtrade_export_bilateral.h5'
### World export:
str_path_export_world = 'Data_Files/Source_Files/comtrade_export_world.h5'
### Export key:
str_key_unc_export = 'export_augmented'
### Augmented bilateral import:
str_path_import_bilateral = 'Data_Files/Source_Files/comtrade_import_bilateral.h5'
### World import:
str_path_import_world = 'Data_Files/Source_Files/comtrade_import_world.h5'
### Import key:
str_key_unc_import = 'import_augmented'
### Factor options:
str_path_factor_xlsx = 'Data_Files/Source_Files/comtrade_factor.xlsx'

In [5]:
### DEFINING WEIGHTED AVERAGE CALCULATOR

def weighted_average(ser_data, ser_weight = None, int_min_count = 0):
    ### Default output:
    num_result = np.NaN
    ### Checking for data presence:
    if (ser_data.count() > int_min_count):       
        ### Checking for weights dataset:
        if ser_weight is None:
            ### Calculating of simple average:
            num_result = np.nanmean(ser_data.values)
        else:
            ### Weights filtering:
            list_weight = ser_weight[ser_data.dropna().index].values
            ### Checking for weights presence:
            if np.nansum(list_weight):
                ### Data filtering:
                list_data = ser_data.dropna().values
                ### Weighted average calculating:
                num_result = np.nansum(list_data * list_weight) / np.nansum(list_weight)
    ### Results output:
    return num_result

In [6]:
### DEFINING COUNTRY CODES EXTRACTOR

def get_country_codes(use_local_copy = False):  
    ### In case if URL is unavailable:
    if (use_local_copy):
        url_country_code = 'Data_Files/Source_Files/countrycode.html'
    ### Online extraction:
    else:
        url_country_code = 'https://countrycode.org/'
    df_full_codes = pd.read_html(url_country_code, index_col = 'COUNTRY')[0]
    df_full_codes[['ISO SHORT', 'ISO LONG']] = df_full_codes['ISO CODES'].str.split(' / ', expand = True)
    df_result = df_full_codes[['ISO SHORT', 'ISO LONG']].sort_index()    
    df_result.index = df_result.index.str.upper()
    ### Results output:
    return df_result

In [7]:
### DEFINING EXTRACTION UNIVERSE DATA FROM MS EXCEL SOURCE (TO BE IGNORED IN PRODUCT CODE)

def ison_membership_converting(str_path_universe, date_end, bool_daily = False, int_backfill_months = 0):
    ### Defining business-month-end reindexation on country level:
    def country_modify(ser_raw_country, date_end):
        ser_res_country = ser_raw_country.droplevel(0).resample('MS').last().resample('BM').last()
        range_country = pd.date_range(ser_res_country.index[0], date_end, freq = 'BM')
        return ser_res_country.reindex(range_country).ffill()
    ### Markets encoding table:
    dict_markets = {50 : 'DM', 57 : 'EM', 504 : 'FM', 0: np.NaN}     
    ### Loading source file:
    df_raw_universe = pd.read_excel(engine = 'openpyxl', io = str_path_universe, sheet_name = 'Switchers', header = 0, parse_dates = True, index_col = [0, 1],
                                 na_values = ['', '#N/A', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan', '1.#IND', 
                                             '1.#QNAN', 'N/A', 'NULL', 'NaN', 'n/a', 'nan', 'null'], keep_default_na = False)
    ### Converting source file:
    df_raw_universe.index.names = ['Country', 'Date']
    ser_raw_universe = df_raw_universe['Region']
    ser_raw_universe.fillna(0, inplace = True)
    ser_raw_universe.name = 'Market'
    ### By country reindexation and translation:
    ser_res_universe = ser_raw_universe.groupby('Country').apply(country_modify, date_end)
    ser_res_universe.index.names = ['Country', 'Date']
    ser_res_universe = ser_res_universe.replace(dict_markets).reorder_levels([1, 0]).sort_index() 
    ### Expanding membership for primary regions members by backfilling:
    if int_backfill_months:
        ### List of regions:
        list_region = list(ser_res_universe.dropna().unique())
        ### Initialising of collection of series with backfilled data for each region:
        list_ison_backfill = []
        ### Regions looping:
        for iter_region in list_region:
            ### Defining start of region date:
            date_first_valid = ser_res_universe.loc[ser_res_universe == iter_region].first_valid_index()[0]
            ### Creating dates index to backfilling:
            idx_date_backfill = pd.date_range(end = date_first_valid, periods = int_backfill_months + 1, freq = 'BM')[: -1]
            ### Creating primary countries index to backfilling:            
            idx_region_backfill = ser_res_universe.loc[ser_res_universe == iter_region].loc[date_first_valid, All].index.get_level_values('Country')
            ### Creating full index:
            idx_ison_backfill = pd.MultiIndex.from_product([idx_date_backfill, idx_region_backfill])
            ### Series with backfilled data:
            list_ison_backfill.append(pd.Series(iter_region, index = idx_ison_backfill))
        ### Combination of backfilled series and original ISON data:    
        ser_res_universe = ser_res_universe.combine_first(pd.concat(list_ison_backfill, axis = 0)).sort_index()  
        ser_res_universe.index.names = ['Date', 'Country']
    ### Converting to daily frequency:
    if bool_daily:
        ser_res_universe = ser_res_universe.reset_index('Country').groupby('Country').resample('B').ffill()['Market'].swaplevel().sort_index()    
    ### Results output:
    ser_res_universe.name = 'Market'
    return ser_res_universe

In [8]:
### RUN EVERY TIME: COMMON DATA EXTRACTION STEPS

### World Country Codes:
df_country_codes = get_country_codes()
### ISON membership history:
ser_ison_membership = ison_membership_converting(str_path_universe, pd.to_datetime(str_date_end))
ser_ison_membership.index.names = ['Date', 'Reporter']
### ISON Members:
list_ison_countries = sorted(ser_ison_membership.index.get_level_values('Reporter').unique())
### ISON status for the last available date:
ser_ison_status = ser_ison_membership.loc[ser_ison_membership.index[-1][0]]

In [9]:
### UN COMTRADE: DATA EXTRACTION

In [10]:
### UN COMTRADE: GENERAL DATA PREPARATION

### Constants:
int_unc_limit = 10 # 5
int_seconds_to_sleep = 2
### USA before 1981 code:
str_usa_1981 = '841'
### UN Comtrade country names to rename:
dict_map_to_replace = {'BOLIVIA (PLURINATIONAL STATE OF)': 'BOLIVIA',
                       'BOSNIA HERZEGOVINA': 'BOSNIA AND HERZEGOVINA',
                       'BR. INDIAN OCEAN TERR.': 'BRITISH INDIAN OCEAN TERRITORY',
                       'BR. VIRGIN ISDS': 'BRITISH VIRGIN ISLANDS',
                       'BRUNEI DARUSSALAM': 'BRUNEI',
                       'CABO VERDE': 'CAPE VERDE',
                       'CAYMAN ISDS': 'CAYMAN ISLANDS',
                       'CENTRAL AFRICAN REP.': 'CENTRAL AFRICAN REPUBLIC',
                       'CHRISTMAS ISDS': 'CHRISTMAS ISLAND',
                       'COCOS ISDS': 'COCOS ISLANDS',
                       'COOK ISDS': 'COOK ISLANDS',                    
                       'CURAÇAO': 'CURACAO',                          
                       'CZECHIA': 'CZECH REPUBLIC',                    
                       'DEM. REP. OF THE CONGO': 'DEMOCRATIC REPUBLIC OF THE CONGO',                          
                       'DOMINICAN REP.': 'DOMINICAN REPUBLIC',                    
                       'TIMOR-LESTE': 'EAST TIMOR',                          
                       'FALKLAND ISDS (MALVINAS)': 'FALKLAND ISLANDS',                    
                       'FAEROE ISDS': 'FAROE ISLANDS',                                           
                       'CHINA, HONG KONG SAR': 'HONG KONG',                          
                       'CÔTE D\'IVOIRE': 'IVORY COAST',                                           
                       'LAO PEOPLE\'S DEM. REP.': 'LAOS',                                         
                       'CHINA, MACAO SAR': 'MACAU',                          
                       'TFYR OF MACEDONIA': 'MACEDONIA',                    
                       'MARSHALL ISDS': 'MARSHALL ISLANDS',                          
                       'FS MICRONESIA': 'MICRONESIA',                    
                       'REP. OF MOLDOVA': 'MOLDOVA',                          
                       'NETH. ANTILLES': 'NETHERLANDS ANTILLES',                          
                       'DEM. PEOPLE\'S REP. OF KOREA': 'NORTH KOREA',                          
                       'N. MARIANA ISDS': 'NORTHERN MARIANA ISLANDS',                    
                       'STATE OF PALESTINE': 'PALESTINE',                          
                       'CONGO': 'REPUBLIC OF THE CONGO',                          
                       'RÉUNION': 'REUNION',                    
                       'RUSSIAN FEDERATION': 'RUSSIA',                          
                       'SOLOMON ISDS': 'SOLOMON ISLANDS',                    
                       'REP. OF KOREA': 'SOUTH KOREA',                                       
                       'UNITED REP. OF TANZANIA': 'TANZANIA',     
                       'OTHER ASIA, NES': 'TAIWAN',
                       'TURKS AND CAICOS ISDS': 'TURKS AND CAICOS ISLANDS',                    
                       'US VIRGIN ISDS': 'U.S. VIRGIN ISLANDS',                          
                       'USA': 'UNITED STATES',                          
                       'HOLY SEE (VATICAN CITY STATE)': 'VATICAN',                    
                       'VIET NAM': 'VIETNAM',                          
                       'WALLIS AND FUTUNA ISDS': 'WALLIS AND FUTUNA'
                      }

In [11]:
### UN COMTRADE: COUNTRIES DATA EXTRACTION AND MODIFICATION

def get_un_comtrade_country_id(df_country_codes):
    ### Getting UN Comtrade country info from post request:
    str_UNC_countries_set = 'http://comtrade.un.org/data/cache/partnerAreas.json'
    obj_UNC_countries_set = requests.post(str_UNC_countries_set)
    ### Object to dataframe transformation:
    list_UNC_countries = json.loads(obj_UNC_countries_set.text.encode().decode('utf-8-sig'))['results']
    df_UNC_countries = pd.DataFrame(list_UNC_countries)
    df_UNC_countries.columns = ['UNC ID', 'COUNTRY']
    df_UNC_countries['COUNTRY'] = df_UNC_countries['COUNTRY'].str.upper()
    df_UNC_countries.replace(dict_map_to_replace, inplace = True)
    df_UNC_countries.set_index('COUNTRY', append = False, drop = True, inplace = True)
    df_UNC_country_id = df_UNC_countries.join(df_country_codes, on = 'COUNTRY', how = 'left').dropna(how = 'any').reset_index(drop = True)
    df_UNC_country_id.drop('ISO LONG', axis = 1, inplace = True)
    df_UNC_country_id.columns = ['Comtrade_ID', 'Country']
    ser_UNC_country_id = df_UNC_country_id.set_index('Country').squeeze().sort_index()
    ### Results output:
    return ser_UNC_country_id

### Getting UN Comtrade country IDs:
ser_UNC_country_id = get_un_comtrade_country_id(df_country_codes)
#### Filtering ISON countries only & adding USA BEFORE 1981 Code & adding World Code:
#ser_UNC_country_id = ser_UNC_country_id.reindex(ser_ison_membership.index.get_level_values(1).unique())
#ser_UNC_country_id = ser_UNC_country_id.append(pd.Series(str_usa_1981, ['US'])).append(pd.Series('0', ['ZZ'])).sort_index() ### 'ZZ' ~ 'WORLD' TO BE LAST
### Only adding World Code
ser_UNC_country_id = ser_UNC_country_id.append(pd.Series('0', ['ZZ'])).sort_index() ### 'ZZ' ~ 'WORLD' TO BE LAST

In [12]:
### UN COMTRADE: DATA REQUEST EXECUTION

def get_un_comtrade_data(str_rep_country_id, str_par_country_id, int_max_rec = 500000, str_type = 'C', str_freq = 'A', str_classification_system = 'S1', 
                         str_period = 'all', str_trade_flow = 'All', str_classification_code = 'TOTAL', str_token = unc_token):
    ### Trade flows codification:
    dict_trade_flow = {'All': 'all', 'Import': '1', 'Export': '2', 're-Export': '3', 're-Import': '4', 'Both': '1,2'}
    ### URL prefix:
    str_url_base = 'http://comtrade.un.org/api/get?'
    ### Columns list:
    list_dataset_columns = ['Date', 'Reporter_ID', 'Partner_ID', 'Flow_ID', 'Commodity_ID', 'Value']
    ### Request URL preparation:
    str_url_request = str_url_base
    list_parameters = []
    list_parameters.append('max=' + str(int_max_rec)) # Usage limit
    list_parameters.append('type=' + str_type) # C = Commodities (merchandise trade data) / S = Services (trade in services data)
    list_parameters.append('freq=' + str_freq) # A = Annual, M = Monthly
    list_parameters.append('px=' + str_classification_system) # Trade data classification scheme. See list of valid classifications
    list_parameters.append('ps=' + str_period) # Time period
    list_parameters.append('r=' + str_rep_country_id) # Reporter country
    list_parameters.append('p=' + str_par_country_id) # Partner country
    list_parameters.append('rg=' + dict_trade_flow[str_trade_flow]) # Trade direction
    list_parameters.append('cc=' + str_classification_code) # Commodity code
    list_parameters.append('token=' + str_token) # Authorization code   
    list_parameters.append('fmt=json') # Response data format
    str_url_request += '&'.join(list_parameters)
    ### Getting UN Comtrade data from post request:
    request_session = requests.Session()
    obj_unc_dataset = request_session.post(str_url_request)
    print(str_url_request)
    ### Object to dataframe transformation:        
    if ('dataset' in obj_unc_dataset.json().keys()):
        list_unc_dataset = obj_unc_dataset.json()['dataset']
        if (len(list_unc_dataset) > 1):
            df_unc_dataset = pd.DataFrame(list_unc_dataset)[['period', 'rtCode', 'ptCode', 'rgCode', 'cmdCode', 'TradeValue']]
            df_unc_dataset.columns = list_dataset_columns
            if (str_freq == 'M'):
                df_unc_dataset['Date'] = pd.to_datetime(df_unc_dataset['Date'], format = '%Y%m') + pd.offsets.BMonthEnd()    
            elif (str_freq == 'A'):
                df_unc_dataset['Date'] = pd.to_datetime(df_unc_dataset['Date'], format = '%Y') + pd.offsets.BYearEnd()    
            df_unc_dataset = df_unc_dataset[list_dataset_columns]
        elif (len(list_unc_dataset) == 1):
            df_unc_dataset = pd.DataFrame(columns = list_dataset_columns)    
        else:
            df_unc_dataset = pd.DataFrame([[np.NaN] * len(list_dataset_columns)], columns = list_dataset_columns)
    else:
        df_unc_dataset = pd.DataFrame(columns = list_dataset_columns)
    ### Results output:
    return df_unc_dataset

In [None]:
### UN COMTRADE: SERVICES ANNUAL DATA EXTRACTION SCRIPT

### Concatenation aggregator initializing:
list_dataset = []
### 5-length country pairs collection:
list_empty_requests = list(itertools.product(range((len(ser_UNC_country_id.index) - 1) // int_unc_limit + 1), repeat = 2))
### Looping over 5-length country pairs:
while list_empty_requests:
    iter_country_pair = list_empty_requests[0]
    iter_reporter_group = iter_country_pair[0]    
    iter_partner_group = iter_country_pair[1]
    print(iter_reporter_group * int_unc_limit, '-', (iter_reporter_group + 1) * int_unc_limit - 1, '/', 
          iter_partner_group * int_unc_limit, '-', (iter_partner_group + 1) * int_unc_limit - 1)     
    ### Country groups preparing:
    list_reporter_group = ser_UNC_country_id.iloc[iter_reporter_group * int_unc_limit : (iter_reporter_group + 1) * int_unc_limit].to_list()
    list_partner_group = ser_UNC_country_id.iloc[iter_partner_group * int_unc_limit : (iter_partner_group + 1) * int_unc_limit].to_list()    
    str_reporter_group = ','.join(ser_UNC_country_id.iloc[iter_reporter_group * int_unc_limit : (iter_reporter_group + 1) * int_unc_limit].to_list())
    str_partner_group = ','.join(ser_UNC_country_id.iloc[iter_partner_group * int_unc_limit : (iter_partner_group + 1) * int_unc_limit].to_list())    
    ### Last single country list control to avoid endless loop:
    if ((len(list_reporter_group) > 1) | (len(list_partner_group) > 1)):
        ### Request performing:
        df_iter_dataset = get_un_comtrade_data(str_reporter_group, str_partner_group, str_trade_flow = 'Both', str_type = 'S', \
                                               str_classification_code = 'all', str_classification_system = 'EB02',
                                               str_period = ','.join(map(str, range(date_start.year, date_end.year + 1))))
        if (len(df_iter_dataset) > 1):        
            list_dataset += [df_iter_dataset]
            print(len(df_iter_dataset), 'rows of data loaded successfully')
            list_empty_requests.remove(iter_country_pair)
        elif (df_iter_dataset.isna().sum().sum() == len(df_iter_dataset.columns)):
            print('Empty response (no data found)')
            list_empty_requests.remove(iter_country_pair)
        else:
            print('API error occured')
        gc.collect()
        time.sleep(int_seconds_to_sleep)    
    else:
        print('Two lists contains the same single country: no data')
        list_empty_requests.remove(iter_country_pair)
#    break

In [15]:
### UN COMTRADE: SERVICES ANNUAL RAW DATA SAVING

### Raw data concatenating:
df_loop_dataset = pd.concat(list_dataset, axis = 0, sort = False, ignore_index = True)
del list_dataset
gc.collect()
### Raw data saving:
df_loop_dataset.to_hdf(path_or_buf = str_path_unc_raw_serv_annual, key = str_key_unc_raw, mode = 'w', format = 'table')

In [None]:
### UN COMTRADE: GOODS ANNUAL DATA EXTRACTION SCRIPT

gc.collect()
### Concatenation aggregator initializing:
list_dataset = []
### 5-length country pairs collection:
list_empty_requests = list(itertools.product(range((len(ser_UNC_country_id.index) - 1) // int_unc_limit + 1), repeat = 2))#[0 : 5]
### Looping over 5-length country pairs:
while list_empty_requests:
    iter_country_pair = list_empty_requests[0]
    iter_reporter_group = iter_country_pair[0]    
    iter_partner_group = iter_country_pair[1]
    print(iter_reporter_group * int_unc_limit, '-', (iter_reporter_group + 1) * int_unc_limit - 1, '/', 
          iter_partner_group * int_unc_limit, '-', (iter_partner_group + 1) * int_unc_limit - 1)     
    ### Country groups preparing:
    list_reporter_group = ser_UNC_country_id.iloc[iter_reporter_group * int_unc_limit : (iter_reporter_group + 1) * int_unc_limit].to_list()
    list_partner_group = ser_UNC_country_id.iloc[iter_partner_group * int_unc_limit : (iter_partner_group + 1) * int_unc_limit].to_list()    
    str_reporter_group = ','.join(ser_UNC_country_id.iloc[iter_reporter_group * int_unc_limit : (iter_reporter_group + 1) * int_unc_limit].to_list())
    str_partner_group = ','.join(ser_UNC_country_id.iloc[iter_partner_group * int_unc_limit : (iter_partner_group + 1) * int_unc_limit].to_list())    
    ### Last single country list control to avoid endless loop:
    if ((len(list_reporter_group) > 1) | (len(list_partner_group) > 1)):
        ### Request performing:
        df_iter_dataset = get_un_comtrade_data(str_reporter_group, str_partner_group, str_trade_flow = 'Both', 
                                               str_classification_code = 'AG2', str_classification_system = 'H0', 
                                               str_period = ','.join(map(str, range(date_start.year, date_end.year + 1))))
        if (len(df_iter_dataset) > 1):        
            list_dataset += [df_iter_dataset]
            print(len(df_iter_dataset), 'rows of data loaded successfully')
            list_empty_requests.remove(iter_country_pair)
        elif (df_iter_dataset.isna().sum().sum() == len(df_iter_dataset.columns)):
            print('Empty response (no data found)')
            list_empty_requests.remove(iter_country_pair)
        else:
            print('API error occured')
        gc.collect()
        time.sleep(int_seconds_to_sleep)    
    else:
        print('Two lists contains the same single country: no data')
        list_empty_requests.remove(iter_country_pair)                                
#    break

In [None]:
### UN COMTRADE: GOODS ANNUAL RAW DATA SAVING

if (os.path.exists(str_path_unc_raw_comm_annual)):
    os.remove(str_path_unc_raw_comm_annual)
### Raw data saving:
for df_iter in list_dataset:
    df_iter.to_hdf(path_or_buf = str_path_unc_raw_comm_annual, key = str_key_unc_raw, mode = 'a', format = 'table', append = True)
del list_dataset
gc.collect()

In [None]:
### UN COMTRADE: SERVICES ANNUAL RAW DATA LOADING & CONVERTING

list_chunks = []
for df_iter_cast in pd.read_hdf(path_or_buf = str_path_unc_raw_serv_annual, key = str_key_unc_raw, chunksize = 1000000):
    gc.collect()
    ### Casting trade volume:
    df_iter_cast['Value'] = (df_iter_cast['Value'] / 1000).astype('int32')
    df_iter_cast['Value'].clip(lower = 0, inplace = True)
#    df_iter_cast = df_iter_cast[df_iter_cast['Reporter_ID'] != df_iter_cast['Partner_ID']]
    ### Replacing codes with values:
    df_iter_cast['Reporter_ID'].replace(dict(zip(list(map(int, ser_UNC_country_id.values)), ser_UNC_country_id.index.to_list())), inplace = True)
    df_iter_cast['Partner_ID'].replace(dict(zip(list(map(int, ser_UNC_country_id.values)), ser_UNC_country_id.index.to_list())), inplace = True)    
    df_iter_cast.loc[All, 'Partner_ID'].replace({'ZZ': 'World'}, inplace = True)    
    ### Dropping Saudi Arabia to Taiwan Export (https://unstats.un.org/wiki/display/comtrade/Taiwan%2C+Province+of+China+Trade+data) 
    df_iter_cast.drop(df_iter_cast[(df_iter_cast['Reporter_ID'] == 'SA') & (df_iter_cast['Partner_ID'] == 'TW')].index, inplace = True)    
#    df_iter_cast['Commodity_ID'].replace(dict_comm_classification, inplace = True)
    df_iter_cast['Flow_ID'].replace({1: 'Import', 2: 'Export'}, inplace = True)
    ### Adding Type column:
    df_iter_cast['Type'] = 'Services'
    ### Categorization:    
    df_iter_cast.columns = ['Date', 'Reporter', 'Partner', 'Flow', 'Commodity_ID', 'Value', 'Type']
    df_iter_cast = df_iter_cast.astype({'Reporter': 'category', 'Partner': 'category', 'Flow': 'category', 'Type': 'category'})
#    df_iter_cast = df_iter_cast.astype({'Reporter': 'category', 'Partner': 'category', 'Flow': 'category', 'Commodity_ID': 'category', 'Type': 'category'})    
    df_iter_cast['Reporter'] = df_iter_cast['Reporter'].cat.set_categories(ser_UNC_country_id.index.to_list()[: -1] + ['World'])
    df_iter_cast['Partner'] = df_iter_cast['Partner'].cat.set_categories(ser_UNC_country_id.index.to_list()[: -1] + ['World'])
    df_iter_cast['Flow'] = df_iter_cast['Flow'].cat.set_categories(['Export', 'Import'])
    df_iter_cast['Type'] = df_iter_cast['Type'].cat.set_categories(['Goods', 'Services'])    
    ### Adding chunk to collection:
    list_chunks.append(df_iter_cast)
#    break

In [None]:
### UN COMTRADE: GOODS ANNUAL RAW DATA LOADING & CONVERTING

#list_chunks = []
for df_iter_cast in pd.read_hdf(path_or_buf = str_path_unc_raw_comm_annual, key = str_key_unc_raw, chunksize = 1000000):
    gc.collect()
    ### Casting trade volume:
    df_iter_cast['Value'] = (df_iter_cast['Value'] / 1000).astype('int32')
#    df_iter_cast = df_iter_cast[df_iter_cast['Reporter_ID'] != df_iter_cast['Partner_ID']]
    ### Replacing codes with values:
    df_iter_cast['Reporter_ID'].replace(dict(zip(list(map(int, ser_UNC_country_id.values)), ser_UNC_country_id.index.to_list())), inplace = True)
    df_iter_cast['Partner_ID'].replace(dict(zip(list(map(int, ser_UNC_country_id.values)), ser_UNC_country_id.index.to_list())), inplace = True)    
    df_iter_cast.loc[All, 'Partner_ID'].replace({'ZZ': 'World'}, inplace = True)    
    ### Dropping Saudi Arabia to Taiwan Export (https://unstats.un.org/wiki/display/comtrade/Taiwan%2C+Province+of+China+Trade+data) 
    df_iter_cast.drop(df_iter_cast[(df_iter_cast['Reporter_ID'] == 'SA') & (df_iter_cast['Partner_ID'] == 'TW')].index, inplace = True)    
#    df_iter_cast['Commodity_ID'].replace(dict_comm_classification, inplace = True)
    df_iter_cast['Flow_ID'].replace({1: 'Import', 2: 'Export'}, inplace = True)
    ### Adding Type column:
    df_iter_cast['Type'] = 'Goods'
    ### Categorization:    
    df_iter_cast.columns = ['Date', 'Reporter', 'Partner', 'Flow', 'Commodity_ID', 'Value', 'Type']
    df_iter_cast = df_iter_cast.astype({'Reporter': 'category', 'Partner': 'category', 'Flow': 'category', 'Type': 'category'})
#    df_iter_cast = df_iter_cast.astype({'Reporter': 'category', 'Partner': 'category', 'Flow': 'category', 'Commodity_ID': 'category', 'Type': 'category'})    
    df_iter_cast['Reporter'] = df_iter_cast['Reporter'].cat.set_categories(ser_UNC_country_id.index.to_list()[: -1] + ['World'])
    df_iter_cast['Partner'] = df_iter_cast['Partner'].cat.set_categories(ser_UNC_country_id.index.to_list()[: -1] + ['World'])
    df_iter_cast['Flow'] = df_iter_cast['Flow'].cat.set_categories(['Export', 'Import'])
    df_iter_cast['Type'] = df_iter_cast['Type'].cat.set_categories(['Goods', 'Services'])    
    ### Adding chunk to collection:    
    list_chunks.append(df_iter_cast)
#    break

In [19]:
### UN COMTRADE: SERVICES & GOODS CONVERTED ANNUAL DATA AGGREGATING & RE-SAVING

gc.collect()
df_cast_dataset = pd.concat(list_chunks, axis = 0, sort = False, ignore_index = True)
del list_chunks
gc.collect()
print(df_cast_dataset.memory_usage().sum())
df_cast_dataset['Commodity_ID'] = df_cast_dataset['Commodity_ID'].astype('category')
print(df_cast_dataset.memory_usage().sum())
ser_cast_dataset = df_cast_dataset.set_index(['Date', 'Reporter', 'Partner', 'Flow', 'Type', 'Commodity_ID']).squeeze().sort_index()
del df_cast_dataset
gc.collect()
print(ser_cast_dataset.memory_usage())
ser_cast_dataset.to_hdf(path_or_buf = str_path_unc_res_all_annual, key = str_key_unc_res, mode = 'w', format = 'table', complevel = 9)

948254266
729443916
474153092


In [20]:
### UN COMTRADE: AGGREGATED TOTAL EXTRACTING TO USE IN ACADIAN STAND

ser_zz_data = pd.read_hdf(path_or_buf = str_path_unc_res_all_annual, key = str_key_unc_res, where = "(Partner = 'World') & ('Flow' in ['Export', 'Import'])")
ser_zz_data.groupby(['Date', 'Reporter', 'Flow']).sum().dropna().to_hdf(path_or_buf = str_path_unc_res_all_zz, key = str_key_unc_res, mode = 'w', format = 'table')

In [None]:
### TEMP

pd.read_hdf(path_or_buf = str_path_unc_res_all_annual, key = str_key_unc_res, stop = 10000)
pd.read_hdf(path_or_buf = str_path_unc_res_all_zz, key = str_key_unc_res, stop = 10000)

In [None]:
### EXPORT AND REVERTED IMPORT CONCATENATION

gc.collect()
### File deleting:
if (os.path.exists(str_path_unc_res_flows)):
    os.remove(str_path_unc_res_flows)
### Results container:
list_export_aug = []
### Countries portion length:
int_portion = 5
### Looping over countrie portions:
for iter_num in range(len(ser_UNC_country_id.index.to_list()) // int_portion + 1):
#for iter_num in range(2):
    gc.collect()    
    ### Portion of countries selecting:
    list_iter_countries = list(ser_UNC_country_id.index.to_list())[int_portion * iter_num : int_portion * (iter_num + 1)]
    if (len(list_iter_countries) > 0):
        print(list_iter_countries)
        ### Export data loading:
        ser_unc_export = pd.read_hdf(str_path_unc_res_all_annual, key = str_key_unc_res,
                                     where = "(Flow = 'Export') & (Reporter in list_iter_countries) & (Partner != 'World')").droplevel('Flow')
        print('Export dataset loaded')
        ### Import data loading:
        ser_unc_import = pd.read_hdf(str_path_unc_res_all_annual, key = str_key_unc_res, 
                                     where = "(Flow = 'Import') & (Partner in list_iter_countries)").droplevel('Flow')
        print('Import dataset loaded')    
        ### Import data reverting:
        ser_unc_import.index.set_names('Partner_Inv', level = 1, inplace = True)
        ser_unc_import.index.set_names('Reporter', level = 2, inplace = True)
        ser_unc_import.index.set_names('Partner', level = 1, inplace = True)
        ser_unc_import = ser_unc_import.swaplevel('Reporter', 'Partner').sort_index()
        print('Import dataset reverted')
        ### Datasets concatenation:
        df_export_aug = pd.concat([ser_unc_export, ser_unc_import], axis = 1, names = 'Source Flow', keys = ['Export', 'Import']).astype('float32')
        del ser_unc_export
        del ser_unc_import    
        gc.collect()    
        print('Export and reverted Import dataset concatenated')
        ### Columns categorization:
        df_export_aug.to_hdf(str_path_unc_res_flows, key = str_key_unc_res, mode = 'a', format = 'table', complevel = 9, append = True)                
#        df_export_aug = df_export_aug.astype({'Export': 'Int32', 'Import': 'Int32'})
#        df_export_reseted['Reporter'] = df_export_reseted['Reporter'].cat.set_categories(ser_UNC_country_id.index.to_list()[: -1] + ['World'])
#        df_export_reseted['Partner'] = df_export_reseted['Partner'].cat.set_categories(ser_UNC_country_id.index.to_list()[: -1] + ['World'])
#        df_export_reseted['Type'] = df_export_reseted['Type'].cat.set_categories(['Goods', 'Services'])
#        df_export_aug = df_export_aug.join(ser_ison_status, on = 'Reporter').set_index('Market', append = True).sort_index()
#        print('Concatenated dataset index sorted')
#        list_export_aug.append(df_export_aug)
        print('Aggregated dataset added to container')
    #    break
#### Container elements concatenation:
#df_export_aug_full = pd.concat(list_export_aug, axis = 0, sort = False)
#del list_export_aug
#gc.collect()    
#### Aggregated table indexation:
#df_export_aug_full = df_export_aug_full.sort_index()
#### Results saving:
#df_export_aug_full.astype(float).to_hdf(str_path_unc_res_flows, key = str_key_unc_res, mode = 'w', format = 'table', complevel = 9)

In [None]:
### CIF COEFFICIENTS CALCULATION & IMPLEMENTATION

gc.collect()
### Files deleting:
if (os.path.exists(str_path_export_bilateral)):
    os.remove(str_path_export_bilateral)
if (os.path.exists(str_path_import_bilateral)):
    os.remove(str_path_import_bilateral)
### Getting list of commodities:
str_date = '2020-12-31'
list_commodity_id = sorted(pd.read_hdf(str_path_unc_res_flows, key = str_key_unc_res, where = "Date in str_date").index.get_level_values('Commodity_ID').unique())
### Bounds to filter bilateral Import to Export ratio before median calculation:
flo_lower_bound = 1.0
flo_upper_bound = 2.0
### Bilateral median calculation procedure:
def get_obs_median(df_comm):
    ### Export to Import ratio:
    ser_obs_coeff = df_comm['Import'] / df_comm['Export']
    ### Ratio filtering:
    ser_obs_coeff = ser_obs_coeff.loc[(ser_obs_coeff >= flo_lower_bound) & (ser_obs_coeff <= flo_upper_bound)]
    ### Filtered timeseries median as a result:
    return ser_obs_coeff.median()
### Calulation CIF coefficient for all commodities:
for iter_commodity in list_commodity_id:
    gc.collect()
    df_iter_flows = pd.read_hdf(str_path_unc_res_flows, key = str_key_unc_res, where = "Commodity_ID = iter_commodity")
    ser_cif_median = df_iter_flows.droplevel('Commodity_ID').groupby(['Reporter', 'Partner']).apply(get_obs_median)
    ### General commodity median calculation:
    flo_median = ser_cif_median.median()
    print(iter_commodity, ':', flo_median)
    ### Filling missed bilateral values with general commodity median:
    if not (np.isnan(flo_median)):
        ser_cif_median.fillna(flo_median, inplace = True)        
    ser_cif_median.name = 'CIF_Coefficient'              
    ### Adding CIF coefficients to dataset:
    df_export_cif = df_iter_flows.merge(ser_cif_median, left_index = True, right_index = True)
    df_export_cif = df_export_cif.reorder_levels(['Date', 'Reporter', 'Partner', 'Type', 'Commodity_ID'])
    ### Import correction:
    df_export_cif['Import_Corrected'] = df_export_cif['Import'] / df_export_cif['CIF_Coefficient']
    ### Export correction:
    df_export_cif['Export_Corrected'] = df_export_cif['Export'] * df_export_cif['CIF_Coefficient']
    ### Combining Export & Import data:
    ser_export_cif = df_export_cif['Export'].combine_first(df_export_cif['Import_Corrected']).astype('float32')
    ser_import_cif = df_export_cif['Import'].combine_first(df_export_cif['Export_Corrected']).astype('float32')
#    del df_export_cif
    gc.collect()
    ser_import_cif = ser_import_cif.reorder_levels(['Date', 'Partner', 'Reporter', 'Type', 'Commodity_ID']).sort_index()                               
    ser_import_cif.index.names = ['Date', 'Reporter', 'Partner', 'Type', 'Commodity_ID']
    gc.collect()
    ### Incorporation options:
    ser_export_cif.squeeze().to_hdf(str_path_export_bilateral, key = str_key_unc_export, mode = 'a', format = 'table', complevel = 9, append = True, 
                                    min_itemsize = {'Type': 8, 'Commodity_ID': 3})
    ser_import_cif.squeeze().to_hdf(str_path_import_bilateral, key = str_key_unc_import, mode = 'a', format = 'table', complevel = 9, append = True, 
                                    min_itemsize = {'Type': 8, 'Commodity_ID': 3}) 
#    break

In [12]:
### BILATERAL EXPORT DATA LOADING TO PERFORM FACTOR CALCULATION

gc.collect()
list_export_chunks = []
for num_iter_number, ser_iter_chunk in enumerate(pd.read_hdf(str_path_export_bilateral, key = str_key_unc_export, chunksize = 1000000)):
    gc.collect()
    print(num_iter_number, ': Extraction started')
    ser_iter_chunk = ser_iter_chunk[ser_iter_chunk > 0.0].astype('int32')
    df_iter_chunk = ser_iter_chunk.reset_index()
    df_iter_chunk = df_iter_chunk[(df_iter_chunk['Reporter'] != df_iter_chunk['Partner']) & \
                                ((df_iter_chunk['Type'] == 'Goods') | df_iter_chunk['Commodity_ID'].isin(list_services))]\
                               .drop('Type', axis = 1)    
    print(num_iter_number, ': Filtering performed')    
    ser_iter_chunk = df_iter_chunk.set_index(['Date', 'Reporter', 'Partner', 'Commodity_ID']).squeeze().sort_index()
    del df_iter_chunk
    gc.collect()
    list_export_chunks.append(ser_iter_chunk)
    print(num_iter_number, ': Chunk added to container')    
ser_bilateral_export = pd.concat(list_export_chunks, axis = 0, sort = False).sort_index()
ser_bilateral_export.name = 'Export'
del list_export_chunks
gc.collect()

76

In [14]:
### BILATERAL IMPORT DATA LOADING TO PERFORM FACTOR CALCULATION

gc.collect()
list_import_chunks = []
for num_iter_number, ser_iter_chunk in enumerate(pd.read_hdf(str_path_import_bilateral, key = str_key_unc_import, chunksize = 1000000)):
    gc.collect()
    print(num_iter_number, ': Extraction started')
    ser_iter_chunk = ser_iter_chunk[ser_iter_chunk > 0.0].astype('int32')
    df_iter_chunk = ser_iter_chunk.reset_index()
    df_iter_chunk = df_iter_chunk[(df_iter_chunk['Reporter'] != df_iter_chunk['Partner']) & \
                                ((df_iter_chunk['Type'] == 'Goods') | df_iter_chunk['Commodity_ID'].isin(list_services))]\
                               .drop('Type', axis = 1)    
    print(num_iter_number, ': Filtering performed')    
    ser_iter_chunk = df_iter_chunk.set_index(['Date', 'Reporter', 'Partner', 'Commodity_ID']).squeeze().sort_index()
    del df_iter_chunk
    gc.collect()
    list_import_chunks.append(ser_iter_chunk)
    print(num_iter_number, ': Chunk added to container')    
ser_bilateral_import = pd.concat(list_import_chunks, axis = 0, sort = False).sort_index()
ser_bilateral_import.name = 'Import'
del list_import_chunks
gc.collect()

76

In [16]:
### REPORTER / COMMODITY BY DATE TOTAL EXPORT & IMPORT & TRADE

gc.collect()
### Export totals:
ser_country_comm_export = ser_bilateral_export.groupby(['Date', 'Reporter', 'Commodity_ID']).sum().dropna()
ser_country_comm_export.name = 'Export'
### Import totals:
ser_country_comm_import = ser_bilateral_import.groupby(['Date', 'Reporter', 'Commodity_ID']).sum().dropna()
ser_country_comm_import.name = 'Import'
### Adding trade totals:
df_country_comm_trade = pd.concat([ser_country_comm_export, ser_country_comm_import], axis = 1)
df_country_comm_trade = df_country_comm_trade.unstack('Date').stack('Date', dropna = False)
df_country_comm_trade = df_country_comm_trade.unstack('Reporter').stack('Reporter', dropna = False)
df_country_comm_trade = df_country_comm_trade.unstack('Commodity_ID').stack('Commodity_ID', dropna = False).fillna(0.0)
df_country_comm_trade['Trade'] = df_country_comm_trade['Export'] + df_country_comm_trade['Import']

In [17]:
### TOTAL COMMODITY BY DATE TRADE VOLUME & CROSS-SECTIONAL WEIGHT

### Total commodity volume:
ser_commodity_trade = df_country_comm_trade.groupby(['Date', 'Commodity_ID'])['Trade'].sum().swaplevel().sort_index()
ser_commodity_trade.name = 'Commodity_Total'
df_commodity_trade = ser_commodity_trade.to_frame()
### Total commodity weight:
df_commodity_trade['Commodity_Weight'] = df_commodity_trade['Commodity_Total'].groupby('Date').transform(lambda df_group: df_group / df_group.sum())
### Adding missed years:
def reindex_annual(df_group):
    df_result = df_group.droplevel('Commodity_ID').resample('BY').last()
    df_result = df_result.reindex(pd.date_range(df_result.index[0], str_date_end, freq = 'BY'))
    return df_result
df_commodity_trade = df_commodity_trade.groupby('Commodity_ID').apply(reindex_annual)
### Resampling to monthly data:
def reindex_monthly(df_group):
    df_result = df_group.droplevel('Commodity_ID').resample('BM').last()
    df_result = df_result.reindex(pd.date_range(df_result.index[0], str_date_end, freq = 'BM'))
    df_result = df_result.ffill()
    return df_result
df_commodity_trade = df_commodity_trade.groupby('Commodity_ID').apply(reindex_monthly)
df_commodity_trade.index.names = ['Commodity_ID', 'Date']

In [19]:
### REPORTER / COMMODITY BY DATE PAGE RANK

gc.collect()
def get_pagerank(df_group):
    nx_graph = nx.from_pandas_edgelist(df_group, 'Reporter', 'Partner', edge_attr = 'Export', create_using = nx.DiGraph)
    dict_pagerank = nx.pagerank(nx_graph)
    ser_pagerank = pd.Series(dict_pagerank)
    return ser_pagerank
    
ser_export_pagerank = ser_bilateral_export.reset_index().groupby(['Date', 'Commodity_ID']).apply(get_pagerank)
ser_export_pagerank.name = 'PG_Rank_Local'
ser_export_pagerank.index.names = ['Date', 'Commodity_ID', 'Reporter']
ser_export_pagerank = ser_export_pagerank.reorder_levels(['Date', 'Reporter', 'Commodity_ID']).sort_index()

In [20]:
### JOINING PAGE RANK TO REPORTER / COMMODITY BY DATE TRADE & REINDEXAITION / RESAMPLING TO MONTHLY FREQUENCY

gc.collect()

df_country_comm_ranked = df_country_comm_trade['Trade'].to_frame().join(ser_export_pagerank, how = 'right')\
                                                       .reorder_levels(['Commodity_ID', 'Reporter', 'Date']).sort_index()
del df_country_comm_trade
gc.collect()
### Adding missed years:
def reindex_annual(df_group):
    df_result = df_group.droplevel(['Commodity_ID', 'Reporter']).resample('BY').last()
    df_result = df_result.reindex(pd.date_range(df_result.index[0], str_date_end, freq = 'BY'))
    return df_result
df_country_comm_ranked = df_country_comm_ranked.groupby(['Commodity_ID', 'Reporter']).apply(reindex_annual).fillna(-1.0)
### Resampling to monthly data:
def reindex_monthly(df_group):
    df_result = df_group.droplevel(['Commodity_ID', 'Reporter']).resample('BM').last()
    df_result = df_result.reindex(pd.date_range(df_result.index[0], str_date_end, freq = 'BM'))
    df_result = df_result.ffill()
    return df_result
df_country_comm_ranked = df_country_comm_ranked.groupby(['Commodity_ID', 'Reporter']).apply(reindex_monthly).replace({-1.0: np.NaN})
df_country_comm_ranked.index.names = ['Commodity_ID', 'Reporter', 'Date']

In [23]:
### ADDING ISON REGIONS AND COMMODITY TOTAL TRADE VOLUME

### HERE WE CAN SELECT TO USE BY DATE ISON MEMBERSHIP OR LAST DATE (CURRENT) ISON STATUS !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

gc.collect()
### Adding Regions by Date:
#df_country_comm_isoned = df_country_comm_ranked.join(ser_ison_membership.swaplevel().sort_index())\
#                                               .dropna(subset = ['Market']).reorder_levels(['Commodity_ID', 'Date', 'Reporter']).sort_index()
#### Adding last date Regions:
#df_country_comm_isoned = df_country_comm_ranked.join(ser_ison_status).dropna(subset = ['Market']).reorder_levels(['Commodity_ID', 'Date', 'Reporter']).sort_index()
#df_country_comm_isoned = df_country_comm_isoned.join(df_commodity_trade['Commodity_Weight']).reorder_levels(['Commodity_ID', 'Reporter', 'Date']).sort_index()
#df_country_comm_isoned = df_country_comm_isoned.set_index('Market', append = True)
### Not adding Markets:
df_country_comm_isoned = df_country_comm_ranked.reorder_levels(['Commodity_ID', 'Date', 'Reporter']).sort_index()
del df_country_comm_ranked
gc.collect()
df_country_comm_isoned = df_country_comm_isoned.join(df_commodity_trade['Commodity_Weight']).reorder_levels(['Commodity_ID', 'Reporter', 'Date']).sort_index()

In [24]:
### ADDING ALTERNATIVE PAGERANK & CALCULATING OF WEIGHTED AVERAGE FOR BOTH OPTIONS:

### GLOBAL RANK IS A LOCAL ONE MULTIPLIED BY TOTAL COMMODITY CROSS-SECTIONAL WEIGHT !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

gc.collect()
### Page Rank Weighted by Commodity by Date Weight:
df_country_comm_isoned['PG_Rank_Global'] = df_country_comm_isoned['PG_Rank_Local'] * df_country_comm_isoned['Commodity_Weight']
### Page Ranks Weighting:
df_pg_rank_mean = pd.DataFrame()
for iter_pg_rank in ['PG_Rank_Local', 'PG_Rank_Global']:
#    df_pg_rank_mean[iter_pg_rank] = df_country_comm_isoned.groupby(['Reporter', 'Date', 'Market'])\
#                                                          .apply(lambda df_group: weighted_average(df_group[iter_pg_rank], df_group['Trade']))
    df_pg_rank_mean[iter_pg_rank] = df_country_comm_isoned.groupby(['Reporter', 'Date'])\
                                                          .apply(lambda df_group: weighted_average(df_group[iter_pg_rank], df_group['Trade']))    

In [25]:
### FACTOR SAVING

df_pg_rank_mean.to_excel(str_path_factor_xlsx, merge_cells = False)

In [27]:
### TEMP

ser_export_pagerank

Date        Reporter  Commodity_ID
1989-12-29  AD        01              0.003644
                      02              0.003563
                      03              0.002449
                      04              0.003163
                      05              0.002600
                                        ...   
2022-12-30  ZW        94              0.002828
                      95              0.001660
                      96              0.002464
                      97              0.001832
                      99              0.002258
Name: PG_Rank_Local, Length: 744944, dtype: float64