# Calculate indicators

This notebook enables the user to use GDP, population, or other sets of indicators to calculate combined indicators such as per capita emissions or emissions / final energy use.

It should be used on data that is already pre-processed for this toolset to ensure efficiency and avoid errors.

One cell allows the user to adjust the units of the indicator. This should be done with care and the default option is not to do it! 

In [1]:
# import modules

# system
import sys, os, re
import time
import glob

# analytic
import pandas as pd
#import numpy as np

# open climate data
#import countrygroups

# plotting
#import seaborn
import matplotlib.pyplot as plt

# global stocktake tools
import gst_tools.gst_utils as utils


In [2]:
from a_parameters import *

In [3]:
# Look up the display name, processed-data file name and source label of the first dataset
# from the entity / sector / scenario / version parameters.
variable_name_to_display, proc_data_fname, source_name = utils.get_primap_variable_and_and_file_name(
    gas_names[raw_entity],
    raw_sector,
    raw_scenario,
    version,
)

# First dataset (numerator), e.g. 'PRIMAP-HISTCR_v2.3.1_CH4_total_excl_LULUCF.csv'
# or 'R-Andrew-2018_cement-CO2.csv'.
data_set_1 = proc_data_fname

# Second dataset (denominator) and its short name, e.g. 'UN-2017-population.csv'.
data_set_2, dset2_name = utils.define_dataset(
    population_fname,
    gdp_fname,
    other_fname,
    population_dset_name,
    gdp_dset_name,
    other_dset_name,
    population,
    gdp,
)

# Prefix of the output data file, built from the source names of the original data
# (e.g. 'R-Andrew-2018-cement-CO2-UN-population'). The entities are added automatically
# when the output file name is assembled later in the notebook, and no file type ending
# is needed here. The prefix is also stored in a text file.
new_source_name = source_name + '_' + dset2_name
with open('gst_tools/combined_source.txt', 'w') as f:
    f.write(new_source_name)



In [4]:
# get and clean data

# The secondary dataset has to be known before its path is built: os.path.join
# raises an unhelpful TypeError on a None file name, so check for it first.
if data_set_2 is None:
    raise ValueError('The secondary dataset is not defined. Please check.')

fname_in1 = os.path.join('proc-data', data_set_1)
#EPO#fname_in2 = os.path.join('proc-data', data_set_2)
fname_in2 = os.path.join('gst_tools', 'data', data_set_2)

# read in the data
var1 = pd.read_csv(fname_in1)
# WARNING: the header structure of the secondary file is fixed here. header=2 tells
# pandas to take the column names from row index 2 (0-based) of the file; adjust this
# value if your file is laid out differently.
var2 = pd.read_csv(fname_in2, header=2)

# EPO: make country column and variable name columns coincide with those of var1
var2 = var2.rename(columns={'Country Code': 'country', 'Indicator Name': 'variable'})

# EPO: attach the unit of the secondary dataset to every row
if population == True:
    secondary_unit = 'Pers'
elif gdp == True:
    secondary_unit = 'USD'
else:
    secondary_unit = other_dset_unit
var2['unit'] = [secondary_unit] * len(var2)

# make sure that the same countries and years are available
var1, var2 = utils.ensure_common_years(var1, var2)
var1, var2 = utils.ensure_common_countries(var1, var2)

# check the data format
check1 = utils.verify_data_format(var1)
check2 = utils.verify_data_format(var2)

if not check1 or not check2:
    # NOTE: the metadata names below stay undefined in this case, so the
    # following cells cannot run until the input data is fixed.
    print('One of the dataframes is not correct! Please check and try again!')
else:
    # get metadata for later use and checking (first distinct value of each column)
    var1_name = var1['variable'].unique()[0]
    var2_name = var2['variable'].unique()[0]

    var1_unit = var1['unit'].unique()[0]
    var2_unit = var2['unit'].unique()[0]


Common countries are: 
['ERI', 'NGA', 'PAN', 'TTO', 'NIC', 'PAK', 'AUT', 'AZE', 'TLS', 'OMN', 'SVK', 'HTI', 'CHL', 'KAZ', 'LIE', 'THA', 'CMR', 'GNQ', 'DZA', 'BRB', 'DOM', 'MOZ', 'HND', 'NAM', 'AFG', 'JOR', 'CPV', 'COD', 'MLT', 'RUS', 'SWZ', 'SYR', 'ATG', 'BOL', 'NER', 'LVA', 'SWE', 'ALB', 'FSM', 'RWA', 'LKA', 'COM', 'NZL', 'MDV', 'JAM', 'SYC', 'WSM', 'BRN', 'BWA', 'MDG', 'MYS', 'IRN', 'ARG', 'DEU', 'TKM', 'LAO', 'PRK', 'GRC', 'FRA', 'TON', 'ZWE', 'PHL', 'POL', 'VEN', 'DMA', 'MWI', 'KNA', 'CYP', 'KEN', 'GHA', 'DJI', 'PER', 'COL', 'MHL', 'NOR', 'MMR', 'GBR', 'BRA', 'SDN', 'MLI', 'CAN', 'GRD', 'QAT', 'FJI', 'MEX', 'MNG', 'VUT', 'ZMB', 'KHM', 'SLB', 'CUB', 'AND', 'BHS', 'SAU', 'BIH', 'BTN', 'ARM', 'SVN', 'BEL', 'ECU', 'PRY', 'UZB', 'MCO', 'SLE', 'LTU', 'GNB', 'MRT', 'HUN', 'SSD', 'MDA', 'NRU', 'ETH', 'STP', 'PNG', 'MUS', 'IRQ', 'GTM', 'CIV', 'TZA', 'BDI', 'SUR', 'BLZ', 'SRB', 'ISR', 'USA', 'TUV', 'EST', 'NPL', 'CRI', 'TCD', 'KGZ', 'BLR', 'GEO', 'YEM', 'LBR', 'LCA', 'SEN', 'LSO', 'DNK', 'MA

In [5]:
# combine data...

# For all of these, it is always var1 divided by var2, and we want to ensure that the division is aligned on countries.
# Everything else should be constant across the table.

def prep_df_for_division(df):
    """Reduce a data frame to its year columns, indexed by country.

    The 'country' column becomes the index, and every remaining column whose
    label (converted to a string) does not consist of 4 to 7 digits is dropped.
    The input frame itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing a 'country' column.

    Returns
    -------
    pandas.DataFrame
        New frame indexed by 'country' that holds only the year columns.
    """
    year_label = re.compile(r"[0-9]{4,7}$")

    indexed = df.set_index('country')

    # Everything that does not look like a year is metadata and gets removed.
    metadata_cols = list(
        {label for label in indexed.columns if year_label.match(str(label)) is None}
    )

    return indexed.drop(metadata_cols, axis='columns')
    
# strip original metadata: keep only the year columns, indexed by country
var1 = prep_df_for_division(var1)
var2 = prep_df_for_division(var2)

# calculate new variables (var1 divided by var2, aligned on index and columns)
new_df = var1.div(var2)

# generate new metadata; the combined variable name is also stored in a text file
new_variable_name = var1_name + '-per-' + var2_name
with open('gst_tools/name_relative_variable.txt', 'w') as f:
    f.write(new_variable_name)

# attach the variable name and the automatically generated unit,
# then turn the country index back into a regular column
new_df = new_df.assign(
    variable=new_variable_name,
    unit=var1_unit + ' / ' + var2_unit,
).reset_index()

# reorganise dataframe
new_df = utils.check_column_order(new_df)


In [6]:
# Take a look at your new data frame: leaving `new_df` as the last expression
# of the cell makes the notebook display it as the cell output.

new_df

Unnamed: 0,country,unit,variable,1990,1991,1992,1993,1994,1995,1996,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,AFG,Gg CO2 / yr / Pers,Total CO2 emissions (excl. LULUCF)-per-Populat...,0.000226,0.000199,0.000110,0.000096,0.000084,0.000076,0.000068,...,0.000296,0.000412,0.000337,0.000274,0.000244,0.000239,0.000201,0.000189,0.000190,0.000257
1,AGO,Gg CO2 / yr / Pers,Total CO2 emissions (excl. LULUCF)-per-Populat...,0.000912,0.000914,0.000916,0.000918,0.000712,0.001334,0.001361,...,0.001006,0.001020,0.001055,0.001011,0.001410,0.001044,0.001051,0.000993,0.000896,0.000848
2,ALB,Gg CO2 / yr / Pers,Total CO2 emissions (excl. LULUCF)-per-Populat...,0.001299,0.001852,0.001241,0.001215,0.001309,0.001280,0.001259,...,0.002025,0.002292,0.002203,0.002276,0.002527,0.002159,0.002159,0.002492,0.002327,0.002484
3,AND,Gg CO2 / yr / Pers,Total CO2 emissions (excl. LULUCF)-per-Populat...,0.007503,0.007359,0.007235,0.007151,0.007004,0.007297,0.007706,...,0.006583,0.006269,0.006345,0.006339,0.006274,0.006372,0.006520,0.006572,0.006415,0.006092
4,ARE,Gg CO2 / yr / Pers,Total CO2 emissions (excl. LULUCF)-per-Populat...,0.025705,0.027205,0.025963,0.028070,0.029376,0.027494,0.027450,...,0.018012,0.018107,0.021769,0.021418,0.020729,0.022779,0.021686,0.018024,0.017651,0.017399
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
189,WSM,Gg CO2 / yr / Pers,Total CO2 emissions (excl. LULUCF)-per-Populat...,0.000639,0.000659,0.000671,0.000664,0.000682,0.000700,0.000736,...,0.001011,0.001083,0.001052,0.001043,0.001092,0.001220,0.001270,0.001254,0.001285,0.001791
190,YEM,Gg CO2 / yr / Pers,Total CO2 emissions (excl. LULUCF)-per-Populat...,0.000829,0.000758,0.000780,0.000646,0.000706,0.000758,0.000769,...,0.000967,0.000958,0.000956,0.001261,0.001189,0.000619,0.000486,0.000456,0.000712,0.000706
191,ZAF,Gg CO2 / yr / Pers,Total CO2 emissions (excl. LULUCF)-per-Populat...,0.007446,0.007529,0.006775,0.007039,0.007346,0.007650,0.007599,...,0.008728,0.008865,0.008385,0.008475,0.008892,0.008305,0.008255,0.008279,0.007890,0.007941
192,ZMB,Gg CO2 / yr / Pers,Total CO2 emissions (excl. LULUCF)-per-Populat...,0.000310,0.000297,0.000293,0.000292,0.000275,0.000240,0.000201,...,0.000195,0.000205,0.000249,0.000260,0.000277,0.000275,0.000289,0.000386,0.000405,0.000397


In [7]:
# If desired, you can set the unit of the new data here.

# ****** BE CAREFUL!!! *******
# Original guidance: this option is meant to make the unit name nicer for plots, an
# 'org_unit' column is added to the dataframe for safety, and the recommended setting
# is convert_unit = False.
#
# `convert_unit` is not assigned in this cell (the line below is commented out), so it
# has to be defined already -- presumably by the star import from a_parameters; TODO confirm.
#
# NOTE(review): as written, utils.convert_to_kt is applied when convert_unit is False and
# new_df is passed on unchanged otherwise. That reads as the opposite of the flag's name;
# confirm the intended meaning before changing either the flag or this condition.
# NOTE(review): the recorded output of this cell reports a conversion factor of 1000, so
# the call appears to rescale the values and not only rename the unit -- verify in gst_utils.

#convert_unit = False

final_df = (
    utils.convert_to_kt(new_df, population, gdp, other_unit=other_dset_unit)
    if convert_unit == False
    else new_df
)

*******************
['Converting unit from "Gg CO2 / yr / Pers" to "ktCO2/capita" using a conversion factor of 1000']
*******************


In [8]:
## write the data to file

# First ensure that the 'country' and 'unit' columns are present in the data.
# If they are, the data can be written to file.
# adapted by EPO (change from new_df to final_df)
if 'country' not in final_df.columns or 'unit' not in final_df.columns:

    print('Missing required information! Please check your input data and processing!')

else:

    # make sure the output folder exists
    os.makedirs('proc-data', exist_ok=True)

    # define filename as composite of variable and source name
    fname_out = new_source_name + '_' + new_variable_name + '.csv'
    fullfname_out = os.path.join('proc-data', fname_out)

    # Check if the file already exists. os.path.exists is used instead of glob so that
    # characters such as '[', '*' or '?' in the name are not treated as wildcards.
    if os.path.exists(fullfname_out):
        print('WARNING: This file already exists! adding a date stamp to the file name.')
        fname_out = new_source_name + '_' + new_variable_name + '_' + time.strftime("%Y%m%d-%H") + '.csv'
        # The full path must be rebuilt from the stamped name; otherwise the
        # existing file would be overwritten despite the warning.
        fullfname_out = os.path.join('proc-data', fname_out)

    # record the name of the file that is actually written
    with open('gst_tools/name_relative_dset.txt', 'w') as f:
        f.write(fname_out)

    # write to csv in proc data folder
    final_df.to_csv(fullfname_out, index=False)

    # celebrate success
    print('Processed data written to file!')
    print(fullfname_out)
    

Processed data written to file!
proc-data\PRIMAP-histcr_v2.3.1_Population_World_Bank_250522_Total CO2 emissions (excl. LULUCF)-per-Population, total.csv
