In [1]:
### ENVIRONMENT FACTORS SOURCE DATASETS EXTRACTING

In [2]:
### RUN EVERY TIME: INITIALIZATION

import pandas as pd
import numpy as np
import requests

In [3]:
### XLSX IMPORT OPTIONS

### Strings to be treated as missing values when reading MS Excel files:
list_na_excel_values = [
    ### Short markers (blank cell, '#N/A' / 'NaN' / 'NULL' style spellings, indeterminate number texts):
    '', '#N/A', '#N/A N/A', '#NA',
    '-1.#IND', '-1.#QNAN', '-NaN', '-nan',
    '1.#IND', '1.#QNAN', 'N/A', 'NULL',
    'NaN', 'n/a', 'nan', 'null',
    ### Longer '#N/A ...' placeholder texts:
    '#N/A Requesting Data...', '#N/A Invalid Security', '#N/A Field Not Applicable',
]

In [4]:
### CONSTANTS (RESEARCH VERSION ONLY)

### All paths below are relative to the notebook working directory.
### Each resulting dataset has an HDF5 file path and the key under which the series is stored in that file.

### Universe membership source workbook (MS Excel):
str_path_universe = 'Data_Files/Source_Files/acadian_universe.xlsx'
### Yale University Environmental Performance Index source workbook (MS Excel) holding the biodiversity & habitat (BDH) data:
str_path_bdh_source = 'Data_Files/Source_Files/bdh_history.xlsx'

### OECD Climate resulting dataset (HDF5 path and key):
str_path_oecd_dataset = 'Data_Files/Source_Files/oecd_climate_dataset.h5'
str_key_oecd_dataset = 'oecd_climate_dataset'
### World Bank API resulting dataset (HDF5 path and key):
str_path_wb_api_dataset = 'Data_Files/Source_Files/wb_api_dataset.h5'
str_key_wb_api_dataset = 'wb_api_dataset'
### World Bank What a Waste resulting dataset (HDF5 path and key):
str_path_what_waste_dataset = 'Data_Files/Source_Files/what_waste_dataset.h5'
str_key_what_waste_dataset = 'what_waste_dataset'
### Yale University Environmental Performance Index (BDH) resulting dataset (HDF5 path and key):
str_path_bdh_dataset = 'Data_Files/Source_Files/bdh_dataset.h5'
str_key_bdh_dataset = 'bdh_dataset'

In [5]:
### MAIN CONSTANTS

### MultiIndex level slice constant (selects every label of a level inside .loc[...]):
All = slice(None)
### Dates: first and last dates of the research period.
### They are used as year bounds of the API requests and as bounds of the date ranges the datasets are reindexed to:
date_start = pd.Timestamp('1989-12-29')
date_end = pd.Timestamp('2022-12-30')

In [6]:
### DEFINING & LAUNCH COUNTRY CODES EXTRACTOR (RESEARCH VERSION ONLY)

def get_country_codes(use_local_copy = False):
    """
    Load the world country codes table and return the ISO codes by country.

    Parameters:
        use_local_copy: when truthy, the table is parsed from the saved HTML file
            'Data_Files/Source_Files/countrycode.html' (for the case the URL is unavailable);
            otherwise it is parsed online from 'https://countrycode.org/'.

    Returns:
        DataFrame indexed by upper-cased country name (the 'COUNTRY' column of the source table)
        with two columns, 'ISO SHORT' and 'ISO LONG', obtained by splitting the source
        'ISO CODES' column on ' / '.
    """
    ### Source choice: saved HTML copy or online page:
    str_codes_source = 'Data_Files/Source_Files/countrycode.html' if use_local_copy else 'https://countrycode.org/'
    ### First table of the page, keyed by country name:
    df_codes_table = pd.read_html(str_codes_source, index_col = 'COUNTRY')[0]
    ### Splitting the combined codes column into short and long ISO codes:
    df_iso_codes = df_codes_table['ISO CODES'].str.split(' / ', expand = True)
    df_iso_codes.columns = ['ISO SHORT', 'ISO LONG']
    ### Sorting by country name before the names are upper-cased:
    df_iso_codes = df_iso_codes.sort_index()
    df_iso_codes.index = df_iso_codes.index.str.upper()
    ### Results output:
    return df_iso_codes

### World Country Codes (called with the default use_local_copy = False, i.e. the table is read online;
### pass use_local_copy = True to parse the saved HTML copy instead):
df_country_codes = get_country_codes()

In [7]:
### DEFINING EXTRACTION UNIVERSE DATA FROM MS EXCEL SOURCE (RESEARCH VERSION ONLY)

def ison_membership_converting(str_path_universe, date_end, bool_daily = False, int_backfill_months = 0):
    """
    Build the universe membership history from the 'Switchers' sheet of an MS Excel workbook.

    The sheet is read with its first two columns as index (renamed to 'Country' and 'Date') and
    its 'Region' column as the membership code. Codes are translated with the table
    50 -> 'DM', 57 -> 'EM', 504 -> 'FM'; empty cells are first turned into code 0, which is
    carried forward like any other code and finally translated to NaN.

    Parameters:
        str_path_universe: path to the source workbook.
        date_end: last date of the resulting history; every country is extended (forward filled)
            from its first observation up to this date on business month ends.
        bool_daily: when truthy, the monthly history is additionally resampled per country
            to business days with forward filling.
        int_backfill_months: when non-zero, for each market the countries that belong to it on
            the market's first date get this market for that many business month ends before
            the first date (only where the history has no value yet).

    Returns:
        Series named 'Market' with a ('Date', 'Country') MultiIndex.
    """
    ### Business-month-end frequency as an offset object (the 'BM' string alias is deprecated in recent pandas releases):
    offset_month_end = pd.offsets.BMonthEnd()
    ### Full-level MultiIndex slice, kept local so the function does not depend on notebook globals:
    slice_all = slice(None)
    ### Defining business-month-end reindexation on country level:
    def country_modify(ser_raw_country, date_end):
        ### Last observation of each calendar month, relabelled to the business month end:
        ser_res_country = ser_raw_country.droplevel(0).resample('MS').last().resample(offset_month_end).last()
        ### Extending from the first observation up to the end date with forward filling:
        range_country = pd.date_range(ser_res_country.index[0], date_end, freq = offset_month_end)
        return ser_res_country.reindex(range_country).ffill()
    ### Markets encoding table (np.nan instead of the np.NaN alias, which was removed in NumPy 2.0):
    dict_markets = {50 : 'DM', 57 : 'EM', 504 : 'FM', 0: np.nan}
    ### Loading source file:
    df_raw_universe = pd.read_excel(engine = 'openpyxl', io = str_path_universe, sheet_name = 'Switchers', header = 0, parse_dates = True, index_col = [0, 1],
                                 na_values = ['', '#N/A', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan', '1.#IND',
                                             '1.#QNAN', 'N/A', 'NULL', 'NaN', 'n/a', 'nan', 'null'], keep_default_na = False)
    ### Converting source file:
    df_raw_universe.index.names = ['Country', 'Date']
    ### Empty cells become the explicit code 0, so they are not skipped by the monthly .last()
    ### and are carried forward until the next observation (a new series instead of in-place filling of a frame column):
    ser_raw_universe = df_raw_universe['Region'].fillna(0).rename('Market')
    ### By country reindexation and translation:
    ser_res_universe = ser_raw_universe.groupby('Country').apply(country_modify, date_end)
    ser_res_universe.index.names = ['Country', 'Date']
    ser_res_universe = ser_res_universe.replace(dict_markets).reorder_levels([1, 0]).sort_index()
    ### Expanding membership for primary regions members by backfilling:
    if int_backfill_months:
        ### List of regions:
        list_region = list(ser_res_universe.dropna().unique())
        ### Initialising of collection of series with backfilled data for each region:
        list_ison_backfill = []
        ### Regions looping:
        for iter_region in list_region:
            ser_iter_region = ser_res_universe.loc[ser_res_universe == iter_region]
            ### Defining start of region date:
            date_first_valid = ser_iter_region.first_valid_index()[0]
            ### Creating dates index to backfilling (the first region date itself is excluded):
            idx_date_backfill = pd.date_range(end = date_first_valid, periods = int_backfill_months + 1, freq = offset_month_end)[: -1]
            ### Creating primary countries index to backfilling:
            idx_region_backfill = ser_iter_region.loc[date_first_valid, slice_all].index.get_level_values('Country')
            ### Creating full index:
            idx_ison_backfill = pd.MultiIndex.from_product([idx_date_backfill, idx_region_backfill])
            ### Series with backfilled data:
            list_ison_backfill.append(pd.Series(iter_region, index = idx_ison_backfill))
        ### Combination of backfilled series and original ISON data (original values take precedence):
        ser_res_universe = ser_res_universe.combine_first(pd.concat(list_ison_backfill, axis = 0)).sort_index()
        ser_res_universe.index.names = ['Date', 'Country']
    ### The name is set before the daily branch, which selects the 'Market' column by this name:
    ser_res_universe.name = 'Market'
    ### Converting to daily frequency:
    if bool_daily:
        ser_res_universe = ser_res_universe.reset_index('Country').groupby('Country').resample('B').ffill()['Market'].swaplevel().sort_index()
    ### Results output:
    ser_res_universe.name = 'Market'
    return ser_res_universe

### ISON membership history (monthly, no backfilling: bool_daily and int_backfill_months are left at their defaults):
ser_ison_membership = ison_membership_converting(str_path_universe, date_end)
### ISON Long IDs as Index, Short IDs - as Values (country codes table rows are limited to the countries of the membership history):
ser_ison_long = df_country_codes[df_country_codes['ISO SHORT'].isin(ser_ison_membership.index.get_level_values('Country').unique())].set_index('ISO LONG').squeeze()
### ISON current status: cross-section of the membership history at the end date (indexed by country):
ser_ison_status = ser_ison_membership[date_end]

  df_raw_universe = pd.read_excel(engine = 'openpyxl', io = str_path_universe, sheet_name = 'Switchers', header = 0, parse_dates = True, index_col = [0, 1],


In [8]:
### OECD GREEN GROWTH: REQUESTS SESSION INITIALIZING

### OECD endpoints and the Green Growth dataset code:
str_oecd_base_url = 'https://stats.oecd.org/sdmx-json/data/'
str_oecd_structure_url = 'https://stats.oecd.org/restsdmx/sdmx.ashx/GetDataStructure/'
str_gg_dataset_add = 'GREEN_GROWTH'
### One session is shared by the OECD requests:
request_session = requests.Session()
### Browser-like User-Agent header (for avoiding data request errors):
dict_oecd_headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'}
request_session.headers.update(dict_oecd_headers)

In [9]:
### OECD GREEN GROWTH: REQUEST PARAMETERS PREPARATION:

### Green growth climate variables (OECD variable code -> readable name used later as the 'Variable' label):
dict_variables = {}
dict_variables['PM_PWM'] = 'Mean population exposure to PM2.5'
dict_variables['NRGS'] = 'Total energy supply'
dict_variables['CO2_PBEM'] = 'Production-based CO2 emissions'
dict_variables['RE_TPES'] = 'Renewable energy supply, % total energy supply'
dict_variables['MWAS_RECO'] = 'Municipal waste recycled or composted, % treated waste'
### Variable codes are joined with '+' for the request URL:
str_variables = '+'.join(dict_variables.keys())
### Countries parameter preparation:
### 'HKG' and 'TWN' are left out of the request. Filtering (instead of list.remove) does not fail
### when one of these codes is absent from the universe:
set_oecd_excluded = {'HKG', 'TWN'}
list_oecd_reporters = [iter_reporter for iter_reporter in ser_ison_long.index if iter_reporter not in set_oecd_excluded]
str_oecd_reporters = '+'.join(list_oecd_reporters)

In [10]:
### OECD GREEN GROWTH: CLIMATE CHANGE REQUEST CONSTRUCTING & PERFORMING

### Request key: reporters and variables blocks separated by '.':
str_gg_climate_request_params = '.'.join([str_oecd_reporters, str_variables])
### Full URL: dataset code, request key and the years range of the research period:
str_gg_climate_request = str_oecd_base_url + str_gg_dataset_add + '/' + str_gg_climate_request_params + '/all?startTime=' + str(date_start.year) + \
                         '&endTime=' + str(date_end.year) + '&detail=DataOnly'
### Explicit timeout (seconds), so an unresponsive server cannot hang the kernel indefinitely:
obj_gg_climate_response = request_session.get(str_gg_climate_request, timeout = 120)
### Failing on HTTP errors here, instead of on JSON parsing of an error page:
obj_gg_climate_response.raise_for_status()
obj_gg_climate_dataset = obj_gg_climate_response.json()

In [11]:
### OECD GREEN GROWTH: CLIMATE CHANGE INDEX DATA COLLECTING:

### Dates: each id of the first observation dimension is parsed and moved to the business year end:
list_idx_dates = [pd.to_datetime(tup_date['id']) + pd.offsets.BYearEnd()
                  for tup_date in obj_gg_climate_dataset['structure']['dimensions']['observation'][0]['values']]
### Parameters: ids of every series dimension, one list per dimension:
list_idx_library = [[tup_parameter['id'] for tup_parameter in iter_position['values']]
                    for iter_position in obj_gg_climate_dataset['structure']['dimensions']['series']]
### Result: dates go last, after the series dimensions:
list_idx_library.append(list_idx_dates)
### Converting to dictionaries (position number as string -> value) for future replacing:
list_idx_dict = [{str(int_position): iter_value for int_position, iter_value in enumerate(iter_list)}
                 for iter_list in list_idx_library]

In [12]:
### OECD GREEN GROWTH: CLIMATE CHANGE DATASET RESAMPLING

### Series of the first returned dataset, keyed by colon-separated dimension positions:
dict_datasets_source = obj_gg_climate_dataset['dataSets'][0]['series']
### Parameters and date indexes integration:
### the observation position is appended to the series key; the first element of each observation is taken as the value:
dict_datasets_res = {
    str_series_key + ':' + str_observation_key: list_observation[0]
    for str_series_key, dict_series in dict_datasets_source.items()
    for str_observation_key, list_observation in dict_series['observations'].items()
}

In [13]:
### OECD GREEN GROWTH: CLIMATE CHANGE DATASET REINDEXATION

### Flat series keyed by colon-separated position strings:
ser_gg_climate_raw = pd.Series(dict_datasets_res)
### One index level per colon-separated position:
ser_gg_climate_raw.index = pd.MultiIndex.from_arrays(list(zip(*ser_gg_climate_raw.index.str.split(':'))))
int_levels_number = ser_gg_climate_raw.index.nlevels
### Levels become columns 'level_0', 'level_1', ... and the values go to column 0:
df_gg_climate_data = ser_gg_climate_raw.reset_index()
### Replacing numbers with parameter values.
### Columns are reassigned instead of being replaced in place: an in-place replace on a selected column
### does not update the frame under pandas Copy-on-Write:
for iter_level in range(int_levels_number):
    str_iter_column = 'level_' + str(iter_level)
    df_gg_climate_data[str_iter_column] = df_gg_climate_data[str_iter_column].replace(list_idx_dict[iter_level])
    ### Replacing long ISO names with short ISO names:
    if (iter_level == 0):
        df_gg_climate_data[str_iter_column] = df_gg_climate_data[str_iter_column].replace(dict(zip(ser_ison_long.index, ser_ison_long)))
    ### Variables renaming:
    elif (iter_level == 1):
        df_gg_climate_data[str_iter_column] = df_gg_climate_data[str_iter_column].replace(dict_variables)
### Indexes defining (variable, date, country order) and sorting:
ser_gg_climate_data = df_gg_climate_data.set_index(['level_1', 'level_2', 'level_0']).squeeze()
ser_gg_climate_data.index.names = ['Variable', 'Date', 'Country']
ser_gg_climate_data = ser_gg_climate_data.sort_index()
ser_gg_climate_data.name = 'OECD Climate'

In [14]:
### OECD GREEN GROWTH: CLIMATE CHANGE SAVING (RESEARCH VERSION ONLY)

### Saving is switched off: everything after '#' is a comment, so the cell only displays the series.
### Remove the '#' to write the series to the HDF5 file:
ser_gg_climate_data#.to_hdf(path_or_buf = str_path_oecd_dataset, key = str_key_oecd_dataset, mode = 'w')

Variable                           Date        Country
Mean population exposure to PM2.5  1990-12-31  AE           42.26
                                               AR           16.01
                                               AT           21.56
                                               AU            7.60
                                               BD           52.99
                                                           ...   
Total energy supply                2021-12-31  SE           46.76
                                               SI            6.44
                                               SK           17.77
                                               TR          158.25
                                               US         2103.43
Name: OECD Climate, Length: 9877, dtype: float64

In [18]:
### WORLD BANK: WDI: REQUESTS SESSION INITIALIZING

### World Bank API v2 base URL (HTTPS, so requests and responses are not sent in clear text):
str_wdi_base_url = 'https://api.worldbank.org/v2/'
### JSON output with a page size that is expected to hold the whole response in a single page:
str_wdi_request_format = '?format=json&per_page=29999'
### Session initializing (a new session object: headers set on the OECD session are not carried over):
request_session = requests.Session()

In [19]:
### WORLD BANK: WDI: REQUEST PARAMETERS PREPARATION:

### World Bank indicator code -> readable name used as the 'Variable' label.
### NOTE: this rebinds dict_variables, which the OECD cells above also use; rerun the OECD parameters cell before rerunning the OECD reindexation cell:
dict_variables = {}
dict_variables['NY.ADJ.DRES.GN.ZS'] = 'Natural resources depletion (WB)'

In [20]:
### WORLD BANK: WDI: ALL VARIABLES: DATA EXTRACTING

### Results container:
dict_results = {}
### List of ISON countries converting:
str_reporters_all = ';'.join(sorted(ser_ison_long.index))
### Looping over variables:
for iter_var in dict_variables:
    ### URL for API request:
    str_iter_url = str_wdi_base_url + 'country/' + str_reporters_all + '/indicator/' + iter_var + \
                   str_wdi_request_format + '&date=' + str(date_start.year) + ':' + str(date_end.year)
    ### API response (explicit timeout in seconds; HTTP errors are raised immediately):
    obj_iter_dataset = request_session.get(str_iter_url, timeout = 120)
    obj_iter_dataset.raise_for_status()
    ### The response body is parsed once; the records are expected in its second element:
    list_iter_payload = obj_iter_dataset.json()
    if (not isinstance(list_iter_payload, list)) or (len(list_iter_payload) < 2) or (not list_iter_payload[1]):
        raise ValueError('World Bank API returned no data for indicator ' + iter_var + ': ' + str(list_iter_payload))
    ### Data converting from JSON to pandas (country id, year and value of each record):
    df_iter_dataset = pd.DataFrame([(dict_record['country']['id'], dict_record['date'], dict_record['value']) for dict_record in list_iter_payload[1]],
                                   columns = ['Country', 'Year', 'Value'])
    ### Year is moved to the business year end:
    df_iter_dataset['Date'] = pd.to_datetime(df_iter_dataset['Year']) + pd.offsets.BYearEnd()
    ### Adding data to container:
    dict_results[dict_variables[iter_var]] = df_iter_dataset.set_index(['Date', 'Country'])['Value'].sort_index()
### Data aggregating: variables as columns, then stacked to a (Variable, Date, Country) series.
### The explicit dropna keeps missing observations out regardless of how the installed pandas version treats NaN in stack():
ser_wb_api_data = pd.concat(dict_results, axis = 1).stack().dropna().reorder_levels([2, 0, 1])
ser_wb_api_data.index = ser_wb_api_data.index.set_names('Variable', level = 0)

In [30]:
### WORLD BANK: WDI: DATA SAVING (RESEARCH VERSION ONLY)

### Saving is switched off: everything after '#' is a comment, so the cell only displays the series.
### Remove the '#' to write the series to the HDF5 file:
ser_wb_api_data#.to_hdf(path_or_buf = str_path_wb_api_dataset, key = str_key_wb_api_dataset, mode = 'w')

Timestamp('2023-12-29 00:00:00')

In [None]:
### TEMP

### Scratch checks of date arithmetic; nothing is assigned, so the cell has no effect on the datasets.
### Only the value of the last expression is displayed by the notebook.
### First date of the 'Date' level shifted two months forward:
ser_wb_api_data.index.levels[1][0] + pd.DateOffset(months=2)
### Business month end on or after 2024-01-01:
pd.tseries.offsets.BMonthEnd().rollforward(pd.to_datetime('2024-01-01'))
### Business month end on or before 2024-01-01:
pd.tseries.offsets.BMonthEnd().rollback(pd.to_datetime('2024-01-01'))

In [33]:
### TEMP



Country                     ZM
Year                      1989
Value                14.040456
Date       1989-12-29 00:00:00
Name: 2855, dtype: object

In [10]:
### WORLD BANK: WHAT A WASTE

### Dataset URL: 'https://datacatalogfiles.worldbank.org/ddh-published/0039597/DR0049199/country_level_data_0.csv?versionId=2023-01-18T19:20:07.1683081Z'

In [29]:
### WORLD BANK: WHAT A WASTE: LOADING & REINDEXATION

### World Bank "What A Waste" Global Database country-level dataset (local CSV copy):
str_path_what_waste_source = 'Data_Files/Source_Files/country_level_data_0.csv'
### CSV loading (with the default NA markers switched off, 'NA' is the only string read as missing):
df_what_waste_raw = pd.read_csv(str_path_what_waste_source, header = [0], sep = ',', na_values = 'NA', keep_default_na = False)
### Only the long ISO code and the two treatment shares are kept:
list_what_waste_columns = ['iso3c', 'waste_treatment_recycling_percent', 'waste_treatment_compost_percent']
df_what_waste_raw = df_what_waste_raw[list_what_waste_columns]
### Data reindexation: long ISO codes are translated to short ones through the country codes table:
df_what_waste_ison = df_what_waste_raw.merge(df_country_codes, left_on = 'iso3c', right_on = 'ISO LONG')
df_what_waste_ison = df_what_waste_ison.drop(['iso3c', 'ISO LONG'], axis = 1).set_index('ISO SHORT')
df_what_waste_ison.index.names = ['Country']
### Limiting to the countries of the membership history:
idx_ison_countries = ser_ison_membership.index.get_level_values('Country').unique()
df_what_waste_ison = df_what_waste_ison.reindex(idx_ison_countries).sort_index()
### Columns become the 'Variable' level:
ser_what_waste_ison = df_what_waste_ison.stack().swaplevel().sort_index()
ser_what_waste_ison.index.names = ['Variable', 'Country']
### The whole cross-section is stamped with a single date, placed between the variable and country levels:
ser_what_waste_ison = pd.concat({pd.Timestamp('2018-12-31'): ser_what_waste_ison}, names = ['Date'])
ser_what_waste_ison = ser_what_waste_ison.reorder_levels([1, 0, 2]).sort_index()

In [30]:
### WORLD BANK: WHAT A WASTE: SAVING

### Saving is switched off: everything after '#' is a comment, so the cell only displays the series.
### Remove the '#' to write the series to the HDF5 file:
ser_what_waste_ison#.to_hdf(path_or_buf = str_path_what_waste_dataset, key = str_key_what_waste_dataset, mode = 'w')

Variable                           Date        Country
waste_treatment_compost_percent    2018-12-31  AE          9.000
                                               AT         31.240
                                               BD          5.250
                                               BE         19.137
                                               BG         10.320
                                                           ...  
waste_treatment_recycling_percent  2018-12-31  UA          3.200
                                               UG          6.000
                                               US         34.600
                                               VN         23.000
                                               ZA         28.000
Length: 120, dtype: float64

In [9]:
### YALE UNIVERSITY: BIODIVERSITY & HABITAT

### Data Source: https://sedac.ciesin.columbia.edu/data/set/epi-environmental-performance-index-2022/data-download
### BDH Data is taken as a column from original EPI Results

In [15]:
### YALE UNIVERSITY: BIODIVERSITY & HABITAT: DATA LOADING

### Data loading (sheet_name = None reads every sheet into a dictionary keyed by sheet name; sheets have no header row):
dict_bio_raw = pd.read_excel(engine = 'openpyxl', io = str_path_bdh_source, sheet_name = None, header = None, na_values = ['NA', '..'], keep_default_na = False)
### Data aggregating:
dict_epi_container = {}
for iter_year in dict_bio_raw:
    df_bio_iter_raw = dict_bio_raw[iter_year]
    ### Long ISO codes (first sheet column) are translated to short ones through the country codes table:
    ser_bio_iter_res = df_bio_iter_raw.merge(df_country_codes, left_on = 0, right_on = 'ISO LONG').drop([0, 'ISO LONG'], axis = 1).set_index('ISO SHORT').squeeze()
    ### The sheet name is parsed as a date and moved to the business year end. Timestamp keys keep the date level
    ### as datetimes (no datetime.date objects), and reindex gives NaN instead of KeyError for universe countries
    ### that are absent from the sheet:
    dict_epi_container[pd.to_datetime(iter_year) + pd.offsets.BYearEnd()] = ser_bio_iter_res.reindex(ser_ison_status.index)
### The explicit dropna keeps missing observations out regardless of how the installed pandas version treats NaN in stack():
ser_bio_res = pd.concat(dict_epi_container, axis = 1, sort = False).stack().dropna().swaplevel().sort_index().astype('float32')
### Data reindexation:
ser_bio_res.index.names = ['Date', 'Country']
### Business-year-end range as an offset object (the 'BY' string alias is deprecated in recent pandas releases).
### ffill carries each observation forward; bfill then copies the earliest observation back to the start date:
idx_bio_dates = pd.date_range(date_start, date_end, freq = pd.offsets.BYearEnd())
ser_bio_filled = ser_bio_res.unstack('Country').reindex(idx_bio_dates).ffill().bfill().stack('Country').dropna().sort_index()
ser_bio_filled.index.names = ['Date', 'Country']
ser_bio_filled = pd.concat({'Biodiversity': ser_bio_filled}, names = ['Variable']).sort_index()

In [16]:
### TEMP

### Scratch display of the biodiversity series before the reindexation to the full dates range:
ser_bio_res

Date        Country
2006-12-29  AE         55.599998
            AR         49.799999
            AT         28.799999
            AU         49.599998
            BD         25.299999
                         ...    
2022-12-30  UG         75.800003
            US         60.599998
            VN         27.900000
            ZA         54.700001
            ZM         91.000000
Length: 737, dtype: float32

In [35]:
### YALE UNIVERSITY: BIODIVERSITY & HABITAT: SAVING

### Saving is switched off: everything after '#' is a comment, so the cell only displays the series.
### Remove the '#' to write the series to the HDF5 file:
ser_bio_filled#.to_hdf(path_or_buf = str_path_bdh_dataset, key = str_key_bdh_dataset, mode = 'w')

Variable      Date        Country
Biodiversity  1989-12-29  AE         55.599998
                          AR         49.799999
                          AT         28.799999
                          AU         49.599998
                          BD         25.299999
                                       ...    
              2022-12-30  UG         75.800003
                          US         60.599998
                          VN         27.900000
                          ZA         54.700001
                          ZM         91.000000
Length: 2890, dtype: float32