In [1]:
### RUN EVERY TIME: COMTRADE DATASETS EXTRACTING

In [2]:
### RUN EVERY TIME: INITIALIZATION

import pandas as pd
import numpy as np
import gc
import os
import datetime
import time

In [3]:
### RUN EVERY TIME: VERSION CONTROL

### Interpreter and key libraries versions, one per line:
from platform import python_version
print(*('{} {}'.format(str_label, str_version)
        for str_label, str_version in (('python version: ', python_version()),
                                       ('numpy version: ', np.__version__),
                                       ('pandas version: ', pd.__version__))),
      sep = '\n')

python version:  3.7.4
numpy version:  1.17.2
pandas version:  0.25.3


In [4]:
### RUN EVERY TIME: MAIN CONSTANTS

### Slice that selects every label of a MultiIndex level:
All = slice(None)
### Universe membership source file:
str_path_universe = 'Data_Files/Source_Files/acadian_universe.xlsx'
### Key dates (date_end is the timestamp form of str_date_end):
str_date_end = '2022-12-31'
date_start, date_end, date_ison = (pd.Timestamp(iter_date) for iter_date in ('1989-12-29', str_date_end, '1994-12-31'))
### Strings to be treated as missing values when reading source files:
list_na_excel_values = [
    '', '#N/A', '#N/A N/A', '#NA',
    '-1.#IND', '-1.#QNAN', '-NaN', '-nan',
    '1.#IND', '1.#QNAN', 'N/A', 'NULL',
    'NaN', 'n/a', 'nan', 'null',
    '#N/A Requesting Data...', '#N/A Invalid Security', '#N/A Field Not Applicable', '---',
]
### EBOPS service IDs that are kept by the bilateral export filter:
list_services = [
    '206', '210', '214', '218', '219', '223', '227', '231', '232', '237',
    '240', '246', '247', '250', '251', '254', '255', '256', '257', '258',
    '263', '264', '269', '272', '273', '288', '289', '292', '293', '294',
    '310', '391', '431', '500', '888', '891', '892', '894', '950',
]
### Augmented bilateral export storage and its key:
str_path_export_bilateral = 'Data_Files/Source_Files/comtrade_export_bilateral.h5'
str_key_unc_export = 'export_augmented'
### Industry market capitalisation file:
str_path_mcap_csv = 'Data_Files/Source_Files/market_cap.csv'
### Commodity to industry map workbook and its sheets:
str_path_industry_map_xlsx = 'Data_Files/Source_Files/goods_to_industries_hs.xlsx'
str_page_services = 'EBOPS'
str_page_goods = 'AG2'
### Factor outputs (HDF5 storage with its key, MS Excel and text copies):
str_path_industry_exp = 'Data_Files/Source_Files/comtrade_industry_exp.h5'
str_key_comtrade_factor = 'comtrade_factor'
str_path_factor_xlsx = 'Data_Files/Source_Files/comtrade_factor.xlsx'
str_path_factor_csv = 'Data_Files/Source_Files/comtrade_factor.csv'

In [5]:
### DEFINING WEIGHTED AVERAGE CALCULATOR

def weighted_average(ser_data, ser_weight = None, int_min_count = 0):
    """
    Average of the non-missing values of a series, optionally weighted.

    Parameters:
        ser_data: pandas series of values, missing values are ignored;
        ser_weight: pandas series of weights that is indexed by the labels of ser_data
            (None - simple average is calculated);
        int_min_count: the result is calculated only if the number of non-missing values
            in ser_data is strictly greater than this threshold.
    Returns:
        The average as a float, or np.nan when there are too few values or the sum of the
        selected weights is zero.
    """
    ### Default output (np.nan is used since the np.NaN alias was removed in NumPy 2.0):
    num_result = np.nan
    ### Checking for data presence:
    if (ser_data.count() > int_min_count):
        ### Checking for weights dataset:
        if ser_weight is None:
            ### Calculating of simple average:
            num_result = np.nanmean(ser_data.values)
        else:
            ### Data filtering (performed once and reused for weights selection):
            ser_valid = ser_data.dropna()
            ### Weights filtering:
            list_weight = ser_weight[ser_valid.index].values
            ### Checking for weights presence (zero total weight leaves the default output):
            if np.nansum(list_weight):
                ### Weighted average calculating (pairs with a missing weight are ignored by nansum):
                num_result = np.nansum(ser_valid.values * list_weight) / np.nansum(list_weight)
    ### Results output:
    return num_result

In [6]:
### DEFINING EXTRACTION UNIVERSE DATA FROM MS EXCEL SOURCE (TO BE IGNORED IN PRODUCT CODE)

def ison_membership_converting(str_path_universe, date_end, bool_daily = False, int_backfill_months = 0):
    """
    Builds the market membership history from the 'Switchers' sheet of an MS Excel file.

    Parameters:
        str_path_universe: path of the MS Excel source file;
        date_end: last date of the business-month-end range each country history is extended to;
        bool_daily: if True, the result is forward filled to business-day frequency by country;
        int_backfill_months: number of business month ends by which each market is extended backwards
            for the countries that belong to it on the first date the market appears (0 - no backfilling).
    Returns:
        Series named 'Market' with ('Date', 'Country') index. Region codes 50 / 57 / 504 are translated
        to 'DM' / 'EM' / 'FM', missing codes become NaN.
    Relies on the notebook level constants list_na_excel_values and All.
    """
    ### Defining business-month-end reindexation on country level:
    ### NOTE(review): the 'BM' alias is reported as deprecated by recent pandas releases in favour of 'BME' - revisit on environment upgrade.
    def country_modify(ser_raw_country, date_end):
        ser_res_country = ser_raw_country.droplevel(0).resample('MS').last().resample('BM').last()
        range_country = pd.date_range(ser_res_country.index[0], date_end, freq = 'BM')
        return ser_res_country.reindex(range_country).ffill()
    ### Markets encoding table (np.nan is used since the np.NaN alias was removed in NumPy 2.0):
    dict_markets = {50 : 'DM', 57 : 'EM', 504 : 'FM', 0: np.nan}
    ### Loading source file:
    df_raw_universe = pd.read_excel(engine = 'openpyxl', io = str_path_universe, sheet_name = 'Switchers', header = 0, parse_dates = True, index_col = [0, 1],
                                 na_values = list_na_excel_values, keep_default_na = False)
    ### Converting source file (missing codes are set to 0 by a regular assignment instead of in-place filling of a column selection):
    df_raw_universe.index.names = ['Country', 'Date']
    ser_raw_universe = df_raw_universe['Region'].fillna(0)
    ser_raw_universe.name = 'Market'
    ### By country reindexation and translation:
    ser_res_universe = ser_raw_universe.groupby('Country').apply(country_modify, date_end)
    ser_res_universe.index.names = ['Country', 'Date']
    ser_res_universe = ser_res_universe.replace(dict_markets).reorder_levels([1, 0]).sort_index()
    ### Expanding membership for primary regions members by backfilling:
    if int_backfill_months:
        ### List of regions:
        list_region = list(ser_res_universe.dropna().unique())
        ### Initialising of collection of series with backfilled data for each region:
        list_ison_backfill = []
        ### Regions looping:
        for iter_region in list_region:
            ### Defining start of region date:
            date_first_valid = ser_res_universe.loc[ser_res_universe == iter_region].first_valid_index()[0]
            ### Creating dates index to backfilling (the start date itself is excluded):
            idx_date_backfill = pd.date_range(end = date_first_valid, periods = int_backfill_months + 1, freq = 'BM')[: -1]
            ### Creating primary countries index to backfilling:
            idx_region_backfill = ser_res_universe.loc[ser_res_universe == iter_region].loc[date_first_valid, All].index.get_level_values('Country')
            ### Creating full index:
            idx_ison_backfill = pd.MultiIndex.from_product([idx_date_backfill, idx_region_backfill])
            ### Series with backfilled data:
            list_ison_backfill.append(pd.Series(iter_region, index = idx_ison_backfill))
        ### Combination of backfilled series and original ISON data (original values take priority):
        ser_res_universe = ser_res_universe.combine_first(pd.concat(list_ison_backfill, axis = 0)).sort_index()
        ser_res_universe.index.names = ['Date', 'Country']
    ### Converting to daily frequency:
    if bool_daily:
        ser_res_universe = ser_res_universe.reset_index('Country').groupby('Country').resample('B').ffill()['Market'].swaplevel().sort_index()
    ### Results output:
    ser_res_universe.name = 'Market'
    return ser_res_universe

In [7]:
### RUN EVERY TIME: COMMON DATA EXTRACTION STEPS

### Membership history with the country level renamed to 'Reporter':
ser_ison_membership = ison_membership_converting(str_path_universe, pd.to_datetime(str_date_end)).rename_axis(['Date', 'Reporter'])
### Alphabetically ordered list of all countries met in the history:
list_ison_countries = sorted(ser_ison_membership.index.unique(level = 'Reporter'))
### Membership cross-section for the date of the last history row:
ser_ison_status = ser_ison_membership.loc[ser_ison_membership.index.get_level_values('Date')[-1]]

In [None]:
### BILATERAL EXPORT DATA LOADING TO PERFORM FACTOR CALCULATION

gc.collect()
list_export_chunks = []
### Source data is read and filtered chunk by chunk, filtered chunks are collected to be concatenated afterwards:
for num_iter_number, ser_iter_chunk in enumerate(pd.read_hdf(str_path_export_bilateral, key = str_key_unc_export, chunksize = 1000000)):
    gc.collect()
    print(num_iter_number, ': Extraction started')
    ### Only positive flows are kept and converted to 32-bit integers:
    ### NOTE(review): int32 is limited by 2,147,483,647 - confirm that no flow in the source units can exceed it.
    ser_iter_chunk = ser_iter_chunk[ser_iter_chunk > 0.0].astype('int32')
    df_iter_chunk = ser_iter_chunk.reset_index()
    ### Dropping self-trade and 'World' aggregates, keeping all goods and only the checked services:
    df_iter_chunk = df_iter_chunk[(df_iter_chunk['Reporter'] != df_iter_chunk['Partner']) &
                                  (df_iter_chunk['Reporter'] != 'World') &
                                  (df_iter_chunk['Partner'] != 'World') &
                                  ((df_iter_chunk['Type'] == 'Goods') | df_iter_chunk['Commodity_ID'].isin(list_services))]\
                               .drop('Type', axis = 1)
    print(num_iter_number, ': Filtering performed')
    ### Columns-only squeezing: a chunk left with a single row stays a series instead of collapsing to a scalar:
    ser_iter_chunk = df_iter_chunk.set_index(['Date', 'Reporter', 'Partner', 'Commodity_ID']).squeeze(axis = 1).sort_index()
    del df_iter_chunk
    gc.collect()
    list_export_chunks.append(ser_iter_chunk)
    print(num_iter_number, ': Chunk added to container')
ser_bilateral_export = pd.concat(list_export_chunks, axis = 0, sort = False).sort_index()
ser_bilateral_export.name = 'Export'
del list_export_chunks
gc.collect()

In [9]:
### REPORTER / COMMODITY BY DATE TOTAL EXPORT

### Summing flows over partners within each date / reporter / commodity combination:
ser_country_comm_export = ser_bilateral_export.groupby(level = ['Date', 'Reporter', 'Commodity_ID']).sum().dropna().rename('Export')

### Bilateral data is not used further, so its memory is released:
del ser_bilateral_export
gc.collect()

23

In [10]:
### LOADING COMMODITIES TO INDUSTRIES MAP

### Services sheet: service ID -> GICS group code (rows with a missing ID or code are dropped):
df_services_to_ind = pd.read_excel(engine = 'openpyxl', io = str_path_industry_map_xlsx, sheet_name = str_page_services, header = 0, dtype = 'object',
                                usecols = ['Service ID', 'GICS Group Code'], na_values = list_na_excel_values, keep_default_na = False)
### Column selection already returns a series; squeeze() is not applied since it would turn a single-row map into a scalar:
ser_services_to_ind = df_services_to_ind.dropna().set_index('Service ID')['GICS Group Code']
ser_services_to_ind.index.names = ['Commodity_ID']
ser_services_to_ind.name = 'Services'
### Goods sheet: commodity ID -> GICS group code (rows with a missing ID or code are dropped):
df_goods_to_ind = pd.read_excel(engine = 'openpyxl', io = str_path_industry_map_xlsx, sheet_name = str_page_goods, header = 0, dtype = 'object',
                                usecols = ['Commodity ID', 'GICS Group Code'], na_values = list_na_excel_values, keep_default_na = False)
ser_goods_to_ind = df_goods_to_ind.dropna().set_index('Commodity ID')['GICS Group Code']
ser_goods_to_ind.index.names = ['Commodity_ID']
ser_goods_to_ind.name = 'Goods'

### Single map: goods rows followed by services rows:
ser_comm_to_ind_map = pd.concat([ser_goods_to_ind, ser_services_to_ind], axis = 0)
ser_comm_to_ind_map.name = 'Industry_ID'

In [11]:
### MAPPING DATA TO INDUSTRIES

### Steps: selecting reporters present in ser_ison_status -> attaching industry IDs by commodity -> dropping unmapped commodities ->
### replacing the commodity level with the industry level -> summing export by date / reporter / industry.
### Columns-only squeezing keeps the result a series even if a single row is left after filtering (bare squeeze() would return a scalar):
ser_country_industry_exp = ser_country_comm_export.loc[:, ser_ison_status.index, :].to_frame().join(ser_comm_to_ind_map, how = 'left')\
                                                  .dropna(subset = ['Industry_ID']).set_index('Industry_ID', append = True).squeeze(axis = 1).droplevel('Commodity_ID')\
                                                  .sort_index().groupby(['Date', 'Reporter', 'Industry_ID']).sum()
### Industry IDs are converted to strings (Industry_ID is the last index level after grouping):
ser_country_industry_exp.index = ser_country_industry_exp.index.set_levels(ser_country_industry_exp.index.levels[-1].astype(str), level= 'Industry_ID')

In [18]:
### DUMMY INDUSTRY MARKET CAP CREATION AND SAVING

### Disabled code: when uncommented, it writes random integers from 1 to 999 for every Date / Industry_ID pair
### found in ser_country_industry_exp to the file str_path_mcap_csv.
### NOTE(review): no random seed is set, so the generated values differ on every run; the file written here
### is the same one the market cap loading cell reads, so running this replaces any existing market cap data.
#idx_market_caps = ser_country_industry_exp.droplevel('Reporter').index.drop_duplicates()
#ser_industry_cap = pd.Series(np.random.randint(1, 1000, len(idx_market_caps)), index = idx_market_caps)
#ser_industry_cap.name = 'Market_Cap'
#ser_industry_cap.to_csv(str_path_mcap_csv, sep = ';', header = True)

In [38]:
### LOADING INDUSTRY MARKET CAPS & JOINING IT TO MAIN DATASET

### Market caps by date and industry; columns-only squeezing keeps a series even for a single-row file (bare squeeze() would return a scalar):
ser_industry_cap = pd.read_csv(str_path_mcap_csv, sep = ';', header = 0, na_values = list_na_excel_values, keep_default_na = False, parse_dates = ['Date'])\
                                                .set_index(['Date', 'Industry_ID']).squeeze(axis = 1).sort_index()
### Industry IDs are converted to strings to match the export dataset index:
ser_industry_cap.index = ser_industry_cap.index.set_levels(ser_industry_cap.index.levels[-1].astype(str), level= 'Industry_ID')
ser_industry_cap.name = 'Market_Cap'

### Percentile rank of industry market cap within each date:
ser_industry_rank = ser_industry_cap.groupby('Date').rank(pct = True)
ser_industry_rank.name = 'Industry_Rank'

### Export dataset with market cap and rank columns attached (left joins keep every export row):
df_country_industry_exp = ser_country_industry_exp.reorder_levels(['Date', 'Industry_ID', 'Reporter']).sort_index().to_frame()\
                                                  .join(ser_industry_cap, how = 'left').join(ser_industry_rank, how = 'left')

In [45]:
### FACTOR CALCULATION

gc.collect()
### One factor column per measure: export-weighted average of the measure within each date / reporter group:
df_mcap_factor = pd.DataFrame()
for iter_column in ['Market_Cap', 'Industry_Rank']:
    ser_iter_factor = df_country_industry_exp.groupby(['Date', 'Reporter']).apply(
        lambda df_group: weighted_average(df_group[iter_column], df_group['Export'])
    )
    df_mcap_factor[iter_column] = ser_iter_factor.sort_index()

In [49]:
### MARKET CAP FACTOR RESAMPLING

gc.collect()
### Resampling to monthly data:
def reindex_monthly(df_group):
    """
    Converts factor values of one reporter to business-month-end frequency.

    The 'Reporter' level is dropped, the rows are aligned to business year ends running from the
    date of the first row of the group to str_date_end (notebook level constant), and the result
    is forward filled to business month ends.
    """
    ### Business year ends from the first date of the group:
    idx_year_ends = pd.date_range(df_group.index[0][0], str_date_end, freq = 'BY')
    ### Alignment to year ends (dates absent from the group become missing rows):
    df_year_ends = df_group.droplevel(['Reporter']).reindex(idx_year_ends)
    ### Forward filling to business month ends:
    return df_year_ends.resample('BM').ffill()
### Applying by reporter and moving the date level to the first position:
df_mcap_monthly = df_mcap_factor.groupby('Reporter').apply(reindex_monthly).swaplevel()
df_mcap_monthly.index.names = ['Date', 'Reporter']

In [51]:
### FACTOR SAVING

### MS Excel copy (index labels are written on every row instead of merged cells):
df_mcap_monthly.to_excel(str_path_factor_xlsx, merge_cells = False)
### Semicolon separated text copy:
df_mcap_monthly.to_csv(str_path_factor_csv, sep = ';', header = True)
### HDF5 copy; the key is passed by name since positional passing of to_hdf arguments other than the path is deprecated in recent pandas:
df_mcap_monthly.to_hdf(str_path_industry_exp, key = str_key_comtrade_factor)