In [1]:
### INITIALIZATION

import pandas as pd # Data structures
import numpy as np # Matrix algebra
import matplotlib.pyplot as plt
from pandas.plotting import register_matplotlib_converters # Plotting
register_matplotlib_converters() # Plotting
import seaborn as sns # Plotting
import gc # Garbage collection
import os # To read all files in folder

In [2]:
### VERSIONS CONTROL

from platform import python_version
### One line per component, formatted as '<name> version:  <number>':
for str_component, str_version in (('python', python_version()), ('numpy', np.__version__), ('pandas', pd.__version__)):
    print(str_component + ' version: ', str_version)

python version:  3.7.4
numpy version:  1.17.2
pandas version:  0.25.3


In [3]:
### MAIN CONSTANTS

### Slice that selects every label of a MultiIndex level:
All = slice(None)
### Path to the universe workbook:
str_path_universe = 'Data_Files/Source_Files/acadian_universe.xlsx'
### Last date of the sample:
str_date_end = '2021-12-31'
### Cell contents of MS Excel files to be treated as missing values:
list_na_excel_values = [
    '', '#N/A', '#N/A N/A', '#NA',
    '-1.#IND', '-1.#QNAN', '-NaN', '-nan',
    '1.#IND', '1.#QNAN',
    'N/A', 'NULL', 'NaN', 'n/a', 'nan', 'null',
    '#N/A Requesting Data...',
    '#N/A Invalid Security',
    '#N/A Field Not Applicable',
    '---',
]
### Folder with the WIOT matrices in MS Excel format:
str_wiot_folder = 'Data_Files/Source_Files/wiots_in_excel/'
### WIOT datasets (HDF5 file path and key inside it):
str_path_wiot = 'Data_Files/Source_Files/wiot_matrices.h5'
str_key_wiot = 'wiot'

In [4]:
### DEFINING EXTRACTION UNIVERSE DATA FROM MS EXCEL SOURCE (TO BE IGNORED IN PRODUCT CODE)

def ison_membership_converting(str_path_universe, date_end, bool_daily = False, int_backfill_months = 0):
    """
    Build the universe membership history from the 'Switchers' sheet of an MS Excel workbook.

    Parameters:
        str_path_universe: path to the workbook; the first two columns of the sheet are used as the
            (country, date) index and the 'Region' column supplies the numeric market codes.
        date_end: last date of the business-month-end calendar each country is extended to (forward filled).
        bool_daily: if True, the monthly result is upsampled to business-day frequency with forward filling.
        int_backfill_months: if non-zero, the countries that belong to a market on the first date that market
            appears are also marked as its members for this many business month ends before that date.

    Returns:
        pd.Series named 'Market' indexed by ('Date', 'Country'). Codes 50 / 57 / 504 are translated to
        'DM' / 'EM' / 'FM'; code 0 and missing source values become NaN.
    """
    ### Defining business-month-end reindexation on country level:
    def country_modify(ser_raw_country, date_end):
        ### Country level is dropped, then the last observation of each month is moved to the business month end:
        ser_res_country = ser_raw_country.droplevel(0).resample('MS').last().resample('BM').last()
        ### Calendar from the first observation of the country up to date_end, gaps are forward filled:
        range_country = pd.date_range(ser_res_country.index[0], date_end, freq = 'BM')
        return ser_res_country.reindex(range_country).ffill()
    ### Markets encoding table (np.nan is used because the np.NaN alias was removed in NumPy 2.0):
    dict_markets = {50 : 'DM', 57 : 'EM', 504 : 'FM', 0: np.nan}
    ### Loading source file:
    df_raw_universe = pd.read_excel(engine = 'openpyxl', io = str_path_universe, sheet_name = 'Switchers', header = 0, parse_dates = True, index_col = [0, 1],
                                 na_values = ['', '#N/A', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan', '1.#IND', 
                                             '1.#QNAN', 'N/A', 'NULL', 'NaN', 'n/a', 'nan', 'null'], keep_default_na = False)
    ### Converting source file:
    df_raw_universe.index.names = ['Country', 'Date']
    ### Missing codes are set to 0 (translated to NaN below); a new series is built instead of mutating
    ### a column of the loaded frame in place:
    ser_raw_universe = df_raw_universe['Region'].fillna(0).rename('Market')
    ### By country reindexation and translation:
    ser_res_universe = ser_raw_universe.groupby('Country').apply(country_modify, date_end)
    ser_res_universe.index.names = ['Country', 'Date']
    ser_res_universe = ser_res_universe.replace(dict_markets).reorder_levels([1, 0]).sort_index()
    ### Expanding membership for primary regions members by backfilling:
    if int_backfill_months:
        ### List of regions:
        list_region = list(ser_res_universe.dropna().unique())
        ### Initialising of collection of series with backfilled data for each region:
        list_ison_backfill = []
        ### Regions looping:
        for iter_region in list_region:
            ### Members of the region over the whole history:
            ser_iter_region = ser_res_universe.loc[ser_res_universe == iter_region]
            ### Defining start of region date (the series is sorted by date, so the first label carries it):
            date_first_valid = ser_iter_region.first_valid_index()[0]
            ### Creating dates index to backfilling (the start date itself is excluded):
            idx_date_backfill = pd.date_range(end = date_first_valid, periods = int_backfill_months + 1, freq = 'BM')[: -1]
            ### Creating primary countries index to backfilling:
            idx_region_backfill = ser_iter_region.loc[date_first_valid, All].index.get_level_values('Country')
            ### Creating full index:
            idx_ison_backfill = pd.MultiIndex.from_product([idx_date_backfill, idx_region_backfill])
            ### Series with backfilled data:
            list_ison_backfill.append(pd.Series(iter_region, index = idx_ison_backfill))
        ### Combination of backfilled series and original ISON data (original values take precedence):
        ser_res_universe = ser_res_universe.combine_first(pd.concat(list_ison_backfill, axis = 0)).sort_index()
        ser_res_universe.index.names = ['Date', 'Country']
    ### Converting to daily frequency:
    if bool_daily:
        ser_res_universe = ser_res_universe.reset_index('Country').groupby('Country').resample('B').ffill()['Market'].swaplevel().sort_index()
    ### Results output:
    ser_res_universe.name = 'Market'
    return ser_res_universe

In [5]:
### DEFINING COUNTRY CODES EXTRACTOR

def get_country_codes(use_local_copy = False):
    """
    Load the table of ISO country codes.

    Parameters:
        use_local_copy: if True, the table is parsed from the saved HTML file
            'Data_Files/Source_Files/countrycode.html'; otherwise from 'https://countrycode.org/'.

    Returns:
        pd.DataFrame with columns 'ISO SHORT' and 'ISO LONG' (the two parts of the source 'ISO CODES'
        column split by ' / '), sorted by country name, with the country names in the index upper-cased.
    """
    ### Source selection (the local file is the fallback for the case when URL is unavailable):
    str_source = 'Data_Files/Source_Files/countrycode.html' if use_local_copy else 'https://countrycode.org/'
    ### First table of the page, indexed by country name:
    df_codes = pd.read_html(str_source, index_col = 'COUNTRY')[0]
    ### Splitting the combined codes into separate columns:
    df_split = df_codes['ISO CODES'].str.split(' / ', expand = True)
    df_codes[['ISO SHORT', 'ISO LONG']] = df_split
    ### Keeping the code columns only, ordered by country name:
    df_iso = df_codes[['ISO SHORT', 'ISO LONG']].sort_index()
    df_iso.index = df_iso.index.str.upper()
    ### Results output:
    return df_iso

In [6]:
### COMMON DATA EXTRACTION STEPS

### Table of world country codes (ISO short / long codes by country name):
df_country_codes = get_country_codes()
### ISON membership history up to the configured end date:
ser_ison_membership = ison_membership_converting(str_path_universe, pd.Timestamp(str_date_end))
### Rows of the codes table whose short ISO code belongs to an ISON country:
df_iso_codes = df_country_codes[df_country_codes['ISO SHORT'].isin(ser_ison_membership.index.unique(level = 'Country'))]
### Long -> short ISO code mapping and the list of long codes:
dict_iso_codes = {iter_long: iter_short for iter_long, iter_short in zip(df_iso_codes['ISO LONG'], df_iso_codes['ISO SHORT'])}
list_ison_iso_long = list(dict_iso_codes)

In [8]:
### LOADING WIOT MATRICES

### Workbooks to load. os.listdir returns entries in arbitrary order and may include files that are not
### workbooks (e.g. MS Excel '~$' lock files), while the first file decides whether the HDF5 file is created
### or appended to, so the list is filtered and sorted to make the run reproducible:
list_wiot_files = sorted(iter_file for iter_file in os.listdir(str_wiot_folder)
                         if iter_file.lower().endswith('.xlsx') and not iter_file.startswith('~$'))
### Looping over files in folder:
for iter_number, iter_xlsx in enumerate(list_wiot_files):
    gc.collect()
    print(iter_xlsx)
    ### Loading matrix (all sheets; two header rows and four index columns form the MultiIndexes):
    dict_iter_xlsx = pd.read_excel(engine = 'openpyxl', io = os.path.join(str_wiot_folder, iter_xlsx), sheet_name = None, 
                                   skiprows = [0, 1, 3, 5], header = [0, 1], index_col =  [0, 1, 2, 3],                                  
                                   na_values = list_na_excel_values, keep_default_na = False)
    ### Extracting year number (name of the first sheet):
    str_iter_year = list(dict_iter_xlsx.keys())[0]
    ### Moving data from dictionary to dataframe:
    df_iter_raw = dict_iter_xlsx[str_iter_year]
    del dict_iter_xlsx
    gc.collect()
    ### Creating ISIC Industries registry (once, from the rows of the first country of the first file):
    if (iter_number == 0):
        str_first_country = df_iter_raw.index.levels[-2][0]
        df_first_country_idx = df_iter_raw.loc[(All, All, str_first_country, All), ].reset_index([0, 1]).drop(df_iter_raw.columns[0], axis = 1).droplevel(0)
        dict_isic_ind = dict(zip(df_first_country_idx['level_0'], df_first_country_idx['level_1']))
    ### Extracting ISON countries data (index levels 1 and 3 are not needed for the selection):
    df_iter_edit = df_iter_raw.droplevel([1, 3])
    df_iter_edit = df_iter_edit.loc[(list(dict_isic_ind.keys()), list_ison_iso_long), (list(dict_isic_ind.keys()), list_ison_iso_long)]
    ### Dataset transformation (both column levels are moved to the index):
    ser_iter_edit = df_iter_edit.stack([0, 1])
    del df_iter_edit
    gc.collect()    
    ser_iter_edit.index.names = ['Reporter_ISIC_ID', 'Reporter_Country', 'Partner_ISIC_ID', 'Partner_Country']
    df_iter_res = ser_iter_edit.reset_index()
    del ser_iter_edit
    gc.collect()
    ### Replacing long ISO country codes to short ones. The result is assigned back explicitly: an inplace
    ### replace on a selected column is chained assignment, which leaves the frame unchanged under pandas Copy-on-Write:
    for iter_column in ['Reporter_Country', 'Partner_Country']:
        df_iter_res[iter_column] = df_iter_res[iter_column].replace(dict_iso_codes)
    ### Index levels casting to categorical:    
    ser_iter_res = df_iter_res.astype({'Reporter_ISIC_ID': 'category', 'Reporter_Country': 'category', 'Partner_ISIC_ID': 'category', 'Partner_Country': 'category'})\
                              .set_index(['Reporter_ISIC_ID', 'Reporter_Country', 'Partner_ISIC_ID', 'Partner_Country']).squeeze().round(2).astype('float32')
    del df_iter_res
    gc.collect()    
    ser_iter_res.name = 'Value'
    ### Adding the year as the outer index level:
    ser_iter_year = pd.concat({str_iter_year: ser_iter_res}, names = ['Year'])
    del ser_iter_res
    gc.collect()
    ### Adding annual data to common file (the first file creates it, the following ones are appended):
    if (iter_number == 0): 
        ser_iter_year.to_hdf(str_path_wiot, key = str_key_wiot, mode = 'w', format = 'table', complevel = 9)        
    else:
        ser_iter_year.to_hdf(str_path_wiot, key = str_key_wiot, mode = 'r+', append = True, format = 'table', complevel = 9)
    del ser_iter_year
    gc.collect()

WIOT2000_Nov16_ROW.xlsx
WIOT2001_Nov16_ROW.xlsx
WIOT2002_Nov16_ROW.xlsx
WIOT2003_Nov16_ROW.xlsx
WIOT2004_Nov16_ROW.xlsx
WIOT2005_Nov16_ROW.xlsx
WIOT2006_Nov16_ROW.xlsx
WIOT2007_Nov16_ROW.xlsx
WIOT2008_Nov16_ROW.xlsx
WIOT2009_Nov16_ROW.xlsx
WIOT2010_Nov16_ROW.xlsx
WIOT2011_Nov16_ROW.xlsx
WIOT2012_Nov16_ROW.xlsx
WIOT2013_Nov16_ROW.xlsx
WIOT2014_Nov16_ROW.xlsx


In [8]:
#### WIOT DATA AGGREGATION & SAVING
#
#ser_wiot_data = pd.concat(dict_collection, names = ['Year'], axis = 0)
#del dict_collection
#gc.collect()
#ser_wiot_data.to_hdf(str_path_wiot, key = str_key_wiot, mode = 'a', append = True, format = 'table', complevel = 9)

In [12]:
### TEMP: check of the saved file - reading back the rows of a single year from the HDF5 table via a `where` filter

pd.read_hdf(str_path_wiot, key = str_key_wiot, where = "Year = '2014'")

Year  Reporter_ISIC_ID  Reporter_Country  Partner_ISIC_ID  Partner_Country
2014  A01               AU                A01              AU                 12924.179688
                                                           AT                     0.010000
                                                           BE                    14.020000
                                                           BG                     0.000000
                                                           BR                     0.090000
                                                                                  ...     
      U                 US                U                SI                     0.000000
                                                           SE                     0.000000
                                                           TR                     0.000000
                                                           TW                     0.000000
               

In [None]:
### TEMP: memory footprint (in bytes) of the full WIOT series

### The series is read back from the HDF5 file first: the only visible assignment of ser_wiot_data is in
### commented-out code, so a bare reference fails with NameError after a kernel restart
ser_wiot_data = pd.read_hdf(str_path_wiot, key = str_key_wiot)
ser_wiot_data.memory_usage()