In [1]:
### RUN EVERY TIME: GRAVITY SOURCE DATASETS EXTRACTION

In [2]:
### RUN EVERY TIME: INITIALIZATION

import pandas as pd
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
import numpy as np
import math
import itertools
import requests
import json ### To correct JSON structure before unpacking
import gc
import os
import datetime
import time ### UN COMTRADE Only
import xml.etree.ElementTree as et
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.ticker as mticker
import seaborn as sns
#%load_ext line_profiler

In [3]:
### RUN EVERY TIME: VERSION CONTROL

from platform import python_version
### Reporting the library and interpreter versions the notebook is run with:
for str_version_label, str_version_value in (('pandas version: ', pd.__version__), ('python version: ', python_version())):
    print(str_version_label, str_version_value)

pandas version:  0.25.3
python version:  3.7.4


In [4]:
### RUN EVERY TIME: MAIN CONSTANTS

### Slice that selects every label of a MultiIndex level:
All = slice(None)
### Universe workbook location:
str_path_universe = 'Data_Files/Source_Files/acadian_universe.xlsx'
### CEPII dataset: HDF file location and the key inside it:
str_path_cepii_dataset, str_distance_dataset = 'Data_Files/Source_Files/cepii_dataset.h5', 'distance_dataset'
### WB WDI GDP dataset: HDF file location and the key inside it:
str_path_wb_gdp_dataset, str_wb_gdp_dataset = 'Data_Files/Source_Files/gdp_dataset.h5', 'gdp_dataset'
### Technical Constants: the end date is kept both as a string and as a timestamp:
str_date_end = '2022-09-30'
date_start, date_end, date_ison = (pd.Timestamp(iter_date) for iter_date in ('1989-12-29', str_date_end, '1994-12-31'))

In [5]:
### DEFINING COUNTRY CODES EXTRACTOR

def get_country_codes(use_local_copy = False):
    """
    Loads the country code table and returns the 'ISO SHORT' and 'ISO LONG' codes per country.

    Parameters
    ----------
    use_local_copy : bool, default False
        If truthy, the table is parsed from the saved page 'Data_Files/Source_Files/countrycode.html';
        otherwise it is parsed from 'https://countrycode.org/'.

    Returns
    -------
    pd.DataFrame
        Columns 'ISO SHORT' and 'ISO LONG'. Rows are sorted by the country name as spelled in the
        source table, and the index is upper-cased after sorting.
    """
    ### Source choice: saved page for the case the URL is unavailable, online page otherwise:
    str_code_source = 'Data_Files/Source_Files/countrycode.html' if use_local_copy else 'https://countrycode.org/'
    ### Only the first table of the page is used, indexed by its 'COUNTRY' column:
    list_page_tables = pd.read_html(str_code_source, index_col = 'COUNTRY')
    df_codes = list_page_tables[0]
    ### 'ISO CODES' cells are split on ' / ' into the two code columns:
    list_code_columns = ['ISO SHORT', 'ISO LONG']
    df_codes[list_code_columns] = df_codes['ISO CODES'].str.split(' / ', expand = True)
    ### Sorting happens before upper-casing, so the row order follows the original spelling:
    df_sorted_codes = df_codes[list_code_columns].sort_index()
    df_sorted_codes.index = df_sorted_codes.index.str.upper()
    return df_sorted_codes

In [6]:
### DEFINING EXTRACTION UNIVERSE DATA FROM MS EXCEL SOURCE (TO BE IGNORED IN PRODUCT CODE)

def ison_membership_converting(str_path_universe, date_end, bool_daily = False, int_backfill_months = 0):
    """
    Builds the ISON universe membership history from the 'Switchers' sheet of an MS Excel workbook.

    Parameters
    ----------
    str_path_universe : str
        Path to the workbook. The first two columns are read as the (country, date) index and the
        'Region' column as the numeric market code.
    date_end : datetime-like
        Last date up to which each country's last known market is carried forward.
    bool_daily : bool, default False
        If True, the business-month-end history is forward-filled to business-day frequency per country.
    int_backfill_months : int, default 0
        If non-zero, every region is extended backwards by this number of business month ends: the
        countries that belong to the region on its first date receive the region label for the
        preceding month ends. Values already present in the history take precedence.

    Returns
    -------
    pd.Series
        Series named 'Market' indexed by ('Date', 'Country'). Codes 50 / 57 / 504 are translated to
        'DM' / 'EM' / 'FM'; code 0 (which also stands for a missing 'Region' cell) becomes NaN.
    """
    ### Offset object is used instead of the 'BM' alias string, which recent pandas versions deprecate:
    offset_bm = pd.offsets.BMonthEnd()
    ### Defining business-month-end reindexation on country level:
    def country_modify(ser_raw_country, date_end):
        ### Last observation per calendar month is moved to the business month end:
        ser_res_country = ser_raw_country.droplevel(0).resample('MS').last().resample(offset_bm).last()
        ### History runs from the first observed month to the end date, gaps are forward-filled:
        range_country = pd.date_range(ser_res_country.index[0], date_end, freq = offset_bm)
        return ser_res_country.reindex(range_country).ffill()
    ### Markets encoding table (np.nan is used because the np.NaN alias was removed in NumPy 2.0):
    dict_markets = {50 : 'DM', 57 : 'EM', 504 : 'FM', 0: np.nan}
    ### Loading source file:
    df_raw_universe = pd.read_excel(engine = 'openpyxl', io = str_path_universe, sheet_name = 'Switchers', header = 0, parse_dates = True, index_col = [0, 1],
                                 na_values = ['', '#N/A', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan', '1.#IND', 
                                             '1.#QNAN', 'N/A', 'NULL', 'NaN', 'n/a', 'nan', 'null'], keep_default_na = False)
    ### Converting source file (missing codes are filled with 0 on a new series, the loaded frame is left untouched):
    df_raw_universe.index.names = ['Country', 'Date']
    ser_raw_universe = df_raw_universe['Region'].fillna(0).rename('Market')
    ### By country reindexation and translation:
    ser_res_universe = ser_raw_universe.groupby('Country').apply(country_modify, date_end)
    ser_res_universe.index.names = ['Country', 'Date']
    ser_res_universe = ser_res_universe.replace(dict_markets).reorder_levels([1, 0]).sort_index() 
    ### Expanding membership for primary regions members by backfilling:
    if int_backfill_months:
        ### List of regions:
        list_region = list(ser_res_universe.dropna().unique())
        ### Initialising of collection of series with backfilled data for each region:
        list_ison_backfill = []
        ### Regions looping:
        for iter_region in list_region:
            ### Members of the region over the whole history:
            ser_region_members = ser_res_universe.loc[ser_res_universe == iter_region]
            ### Defining start of region date:
            date_first_valid = ser_region_members.first_valid_index()[0]
            ### Creating dates index to backfilling (the region start date itself is excluded):
            idx_date_backfill = pd.date_range(end = date_first_valid, periods = int_backfill_months + 1, freq = offset_bm)[: -1]
            ### Creating primary countries index to backfilling (local slice(None) keeps the function independent of notebook globals):
            idx_region_backfill = ser_region_members.loc[date_first_valid, slice(None)].index.get_level_values('Country')
            ### Creating full index:
            idx_ison_backfill = pd.MultiIndex.from_product([idx_date_backfill, idx_region_backfill])
            ### Series with backfilled data:
            list_ison_backfill.append(pd.Series(iter_region, index = idx_ison_backfill))
        ### Combination of backfilled series and original ISON data:    
        ser_res_universe = ser_res_universe.combine_first(pd.concat(list_ison_backfill, axis = 0)).sort_index()  
        ser_res_universe.index.names = ['Date', 'Country']
    ### Name is set before the daily conversion, which selects the 'Market' column by this name:
    ser_res_universe.name = 'Market'
    ### Converting to daily frequency:
    if bool_daily:
        ser_res_universe = ser_res_universe.reset_index('Country').groupby('Country').resample('B').ffill()['Market'].swaplevel().sort_index()    
    ### Results output:
    ser_res_universe.name = 'Market'
    return ser_res_universe

In [7]:
### RUN EVERY TIME: COMMON DATA EXTRACTION STEPS

### World Country Codes:
df_country_codes = get_country_codes()
### ISON membership history up to the configured end date:
ser_ison_membership = ison_membership_converting(str_path_universe, pd.to_datetime(str_date_end))
### ISO LONG IDs of the countries that appear anywhere in the membership history:
idx_ison_short = ser_ison_membership.index.get_level_values('Country').unique()
ser_is_ison_country = df_country_codes['ISO SHORT'].isin(idx_ison_short)
list_ison_long = list(df_country_codes.loc[ser_is_ison_country, 'ISO LONG'].values)
### ISON status on the end date, with the 'Date' level removed:
ser_ison_status = ser_ison_membership.loc[str_date_end].droplevel('Date')
### ISON stats: size of the full universe followed by the size of each market on the end date:
int_ison_number = len(list_ison_long)
list_regions = ['DM', 'EM', 'FM']
dict_ison_len = {'Full Universe': int_ison_number}
for iter_region in list_regions:
    dict_ison_len[iter_region] = len(ser_ison_status.loc[ser_ison_status == iter_region])
ser_market_len = pd.Series(dict_ison_len).rename_axis('Market')

In [8]:
### WORLD BANK: WDI: GDP

In [9]:
### WORLD BANK: WDI: GDP: GENERAL DATA PREPARATION

### MultiIndex level slice constant (same value as in the main constants cell):
All = slice(None)
### World Bank API v2 root; the HTTPS endpoint is used so that responses are not transferred unencrypted:
str_wdi_base_url = 'https://api.worldbank.org/v2/'
### Query string shared by all requests: JSON output with per_page = 29999 (the cells below read a single response page):
str_wdi_request_format = '?format=json&per_page=29999'

In [10]:
### WORLD BANK: WDI: INDICATORS LIST

### Session initializing:
request_session = requests.Session()
### Data loading:
obj_wb_indicators = request_session.get(str_wdi_base_url + 'indicator' + str_wdi_request_format)
### HTTP errors are raised here instead of surfacing later as JSON parsing or indexing errors:
obj_wb_indicators.raise_for_status()
### Records are read from the second element of the response body, so its presence is checked explicitly:
list_wb_response = obj_wb_indicators.json()
if (not isinstance(list_wb_response, list)) or (len(list_wb_response) < 2) or (not list_wb_response[1]):
    raise ValueError('World Bank API returned no indicator records: ' + str(list_wb_response)[: 500])
list_indicators = list_wb_response[1]
df_indicators = pd.DataFrame(list_indicators)
print('Indicators registry length:', len(df_indicators))
### Indicators search and saving: names that start with 'GDP' and mention 'current US':
print(df_indicators.loc[df_indicators['name'].str.startswith('GDP') & df_indicators['name'].str.contains('current US'), ['id', 'name']])
### Indicator chosen from the printed list for the GDP extraction:
str_gdp_dataset = 'NY.GDP.MKTP.CD'

Indicators registry length: 21034
                   id                                  name
12105  NY.GDP.MKTP.CD                     GDP (current US$)
12124  NY.GDP.PCAP.CD          GDP per capita (current US$)
12231   NYGDPMKTPSACD  GDP,current US$,millions,seas. adj.,


In [20]:
### WORLD BANK: WDI: GDP: DATA EXTRACTING

### Session initializing:
request_session = requests.Session()
### URL for API request ('all' is the documented selector for every economy; the empty path segment used before relied on undocumented server behaviour):
str_gdp_url = str_wdi_base_url + 'country/all/indicator/' + str_gdp_dataset + \
              str_wdi_request_format + '&date=' + str(date_start.year) + ':' + str(date_end.year)
### API response (HTTP errors are raised here instead of surfacing later as parsing errors):
obj_gdp_dataset = request_session.get(str_gdp_url)
obj_gdp_dataset.raise_for_status()
### Response body is parsed once; records are read from its second element, so its presence is checked explicitly:
list_gdp_response = obj_gdp_dataset.json()
if (not isinstance(list_gdp_response, list)) or (len(list_gdp_response) < 2) or (not list_gdp_response[1]):
    raise ValueError('World Bank API returned no GDP records: ' + str(list_gdp_response)[: 500])
list_gdp_records = list_gdp_response[1]
### Data converting from JSON to pandas (country ID is read straight from the nested 'country' dictionary):
ser_country_id = pd.Series([iter_record['country']['id'] for iter_record in list_gdp_records], name = 'id')
df_raw_dataset = pd.concat([ser_country_id, pd.DataFrame(list_gdp_records)[['date', 'value']]], axis = 1)
df_raw_dataset.columns = ['Country', 'Year', 'Value']
### Each year is stamped with its last business day:
df_raw_dataset['Date'] = pd.to_datetime(df_raw_dataset['Year']) + pd.offsets.BYearEnd()
### Adding data to container (membership test keeps known country codes only and cannot fail on codes absent from the response):
ser_full_gdp = df_raw_dataset.set_index(['Date', 'Country'])['Value']
ser_full_gdp = ser_full_gdp.loc[ser_full_gdp.index.get_level_values('Country').isin(df_country_codes['ISO SHORT'].values)].sort_index()
### Data saving:
ser_full_gdp.to_hdf(path_or_buf = str_path_wb_gdp_dataset, key = str_wb_gdp_dataset, mode = 'w')

In [21]:
### CEPII DISTANCES

In [32]:
### CEPII DISTANCES: DATA LOADING AND REPACKING

### Constants:
str_path_cepii_source = 'Data_Files/Source_Files/CEPII Distance Data/dist_cepii.xls'
### Source workbook loading; its first two columns form the row index:
df_distance_source = pd.read_excel(str_path_cepii_source, index_col = [0, 1])
### ISO LONG -> ISO SHORT lookup shared by both joins:
ser_long_to_short = df_country_codes.set_index('ISO LONG').squeeze()
### Long to Short Country ID's converting: 'iso_o' gives From_ID, 'iso_d' gives To_ID:
df_distance_data = df_distance_source.join(ser_long_to_short, on = 'iso_o').rename(columns = {'ISO SHORT': 'From_ID'})
df_distance_data = df_distance_data.join(ser_long_to_short, on = 'iso_d').rename(columns = {'ISO SHORT': 'To_ID'})
### ISON countries filtering: rows with any missing value are dropped, then From_ID is limited to ISON countries:
list_ison_countries = ser_ison_membership.index.get_level_values(1).unique()
list_distance_columns = ['dist', 'distcap', 'distw', 'distwces']
df_distance_data = df_distance_data.dropna().set_index(['From_ID', 'To_ID'])
df_distance_data = df_distance_data.loc[(list_ison_countries, All), list_distance_columns]
### '.' placeholders replacing: 'distw' falls back to 'dist', then 'distwces' falls back to the repaired 'distw':
ser_distw_is_dot = (df_distance_data['distw'] == '.')
df_distance_data.loc[ser_distw_is_dot, 'distw'] = df_distance_data['dist']
ser_distwces_is_dot = (df_distance_data['distwces'] == '.')
df_distance_data.loc[ser_distwces_is_dot, 'distwces'] = df_distance_data['distw']
### All distance columns are stored as integers:
df_distance_data = df_distance_data.astype(int)
### Result saving:
df_distance_data.to_hdf(path_or_buf = str_path_cepii_dataset, key = str_distance_dataset, mode = 'w')