# Data processing

This notebook can be used for processing various data formats found in this folder. 

Some of these datasets are already somewhat edited by hand before being edited here 
(e.g. reformatting to consistent columns). But as minimally as possible.

In [21]:
# import modules

# system
import re
import sys
import os

# data handling
import pandas as pd
import numpy as np

# open climate data packages
from countrygroups import UNFCCC, EUROPEAN_UNION, ANNEX_ONE, NON_ANNEX_ONE
from shortcountrynames import to_name

# global stocktake tools
import gst_tools.gst_utils as utils

In [29]:
# 1 EIA Energy data

# !!! Currently retains all available countries because EIA data has country names and no ISO codes !!!

# some of the data from the EIA has been pre-processed to an easy to read .csv file. However, a 
# bit more processing is needed to generate separate .csv files for each variable. That is 
# performed by this section of the notebook. It does not need to be repeated but is retained
# here for documentation. 

raw_data_file = "EIA-International_data-energy-production-consumption-by-country.csv"

# first available year is 1980, but more data available later
start_year = 1990

# Based on countrygroups package, select the group of countries you would like to extract. 
# Note that the raw data may also include groups.
needed_countries = UNFCCC
new_source_name = 'EIA'

# get the data
fname = os.path.join('', 'input-data', raw_data_file)
print('reading ' + fname)
raw_data = pd.read_csv(fname)
new_data = raw_data.dropna()

# rename some columns
#new_data = raw_data.rename(columns={'countryISO': 'country'})

# reduce the countries or regions to only those desired
# and tell the user which ones are being removed
all_countries = new_data['country'].unique()
# removed_countries = list(set(all_countries) - set(needed_countries))
# if removed_countries:
#     print('Some countries being trimmed from dataset:')
#     for country in removed_countries:
#         print('   ' + country)
#     print('---------')
# new_data = new_data.loc[new_data['country'].isin(needed_countries)]

# # tell the user if any of the needed countries are missing and, if yes, which ones:
# missing_countries = list(set(needed_countries) - set(new_data['country'].unique()))
# if missing_countries:
#     print('Not all countries requested were available in the raw data. You are missing the following:')
#     for country in missing_countries:
#         print('   ' + to_name(country))
#     print('---------')
    
# Check for available variables and sectors
variables = new_data['variable'].unique()
fuels = new_data['fuel'].unique()
  
# make a new file with each one...

for var in variables:
    for fuel in fuels:
        
        print('getting data for ' + var + ' and ' + fuel)
        
        data_selected = new_data.loc[(new_data['variable'] == var) &
                                     (new_data['fuel'] == fuel)]
        
        # Check the data format
        if not utils.verify_data_format(data_selected):

            print('WARNING: The data is not correctly formatted! Please check your input data and processing!')

        else:

            # define the variable name
            new_variable_name = (var + '-' + fuel)
            new_variable_name = new_variable_name.replace(' ', '-').lower()
            data_selected['variable'] = new_variable_name
            
            # make nans were appropriate
            data_selected = data_selected.replace('(s)','nan')
            data_selected = data_selected.replace('--','nan')
            
            data_selected = utils.change_first_year(data_selected, start_year)
            
            # make column names strings
            data_selected.columns = data_selected.columns.astype(str)
            
            # define filename as composite of variable and source name
            fname_out = new_source_name + '_' + new_variable_name + '.csv' 
            fullfname_out = os.path.join('proc-data', fname_out)

            # check folder exists
            if not os.path.exists('proc-data'):
                os.makedirs('proc-data')

            # write to csv in proc data folder
            data_selected.to_csv(fullfname_out, index=False)

            # celebrate success 
            print('Processed data written to file! - ' + fullfname_out)


# reduce to only required years
#new_data = utils.change_first_year(new_data, start_year)

# make the columns strings
data_selected

reading input-data/EIA-International_data-energy-production-consumption-by-country.csv
getting data for Primary Energy Production and total
First year of data available is now 1990
Last year of data available is 2016
Processed data written to file! - proc-data/EIA_primary-energy-production-total.csv
getting data for Primary Energy Production and Coal
First year of data available is now 1990
Last year of data available is 2016
Processed data written to file! - proc-data/EIA_primary-energy-production-coal.csv
getting data for Primary Energy Production and Natural Gas


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


First year of data available is now 1990
Last year of data available is 2016
Processed data written to file! - proc-data/EIA_primary-energy-production-natural-gas.csv
getting data for Primary Energy Production and Petroleum and Other Liquids
First year of data available is now 1990
Last year of data available is 2016
Processed data written to file! - proc-data/EIA_primary-energy-production-petroleum-and-other-liquids.csv
getting data for Primary Energy Production and Nuclear Renewables and Other
First year of data available is now 1990
Last year of data available is 2016
Processed data written to file! - proc-data/EIA_primary-energy-production-nuclear-renewables-and-other.csv
getting data for Energy Consumption and total
First year of data available is now 1990
Last year of data available is 2016
Processed data written to file! - proc-data/EIA_energy-consumption-total.csv
getting data for Energy Consumption and Coal
First year of data available is now 1990
Last year of data available i

Unnamed: 0,country,fuel,unit,variable,1990,1991,1992,1993,1994,1995,...,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016
0,Afghanistan,Nuclear Renewables and Other,Quad Btu,energy-consumption-nuclear-renewables-and-other,0.008320341,0.007438198,0.005281766,0.005068733,0.005081109,0.005348473,...,0.010254709,0.007526483,0.012348795,0.013538529,0.013050802,0.017607649,0.020017682,0.021619521,0.023682696,0.025484871
1,Albania,Nuclear Renewables and Other,Quad Btu,energy-consumption-nuclear-renewables-and-other,0.030112915,0.033504808,0.031299817,0.032912066,0.038238214,0.04356612,...,0.037947871,0.045885187,0.056312723,0.071539694,0.050085449,0.055620137,0.069903128,0.054906608,0.060006991,0.06529268
2,Algeria,Nuclear Renewables and Other,Quad Btu,energy-consumption-nuclear-renewables-and-other,0.001354841,0.000444964,,,,0.001074909,...,0.007107806,0.00692524,0.009379231,0.010603904,0.018068068,0.01970028,0.012129965,0.009894547,0.009096526,0.007811325
3,American Samoa,Nuclear Renewables and Other,Quad Btu,energy-consumption-nuclear-renewables-and-other,0.000210538,0.000210538,0.000445519,0.000216196,3.58E-06,,...,,,,,,,,,,
4,Angola,Nuclear Renewables and Other,Quad Btu,energy-consumption-nuclear-renewables-and-other,0.007608634,0.007441112,0.008668242,0.009123161,0.009101856,0.008994519,...,0.023996338,0.030374466,0.029724455,0.035553859,0.038647984,0.03571138,0.044773298,0.047514011,0.048014759,0.054847771
5,Antarctica,Nuclear Renewables and Other,Quad Btu,energy-consumption-nuclear-renewables-and-other,,,,,,0.000189188,...,0.000365764,0.000168117,0.000229959,0.000229943,0.000229783,0.000196199,0.00019569,0.000195674,0.000200238,0.000212286
6,Antigua and Barbuda,Nuclear Renewables and Other,Quad Btu,energy-consumption-nuclear-renewables-and-other,,,,,0.000367219,,...,0.000342398,,1.30E-05,,,,,0.000145155,0.000383474,0.000474791
7,Argentina,Nuclear Renewables and Other,Quad Btu,energy-consumption-nuclear-renewables-and-other,0.270693778,0.263602727,0.290530959,0.337920825,0.377380664,0.364811824,...,0.432644028,0.414872446,0.452081241,0.453575624,0.427355476,0.398128183,0.432961263,0.427982177,0.447092139,0.41590747
8,Armenia,Nuclear Renewables and Other,Quad Btu,energy-consumption-nuclear-renewables-and-other,,,0.031473295,0.043610028,0.036930478,0.020433803,...,0.045755222,0.044875459,0.046518071,0.049215228,0.047301718,0.040730889,0.041899632,0.041737598,0.046469965,0.044061497
9,Aruba,Nuclear Renewables and Other,Quad Btu,energy-consumption-nuclear-renewables-and-other,,,0.000371014,,0.000446587,,...,,,0.000474918,0.001281778,0.001544212,0.001231637,0.001646844,0.001907898,0.00142454,0.001354786


In [23]:
fuels


array(['Production', 'Coal', 'Natural Gas', 'Petroleum and Other Liquids',
       'Nuclear Renewables and Other', 'total'], dtype=object)