# PRIMAP data processor

Unlike the 'PRIMAP-hist' data processor, this script operates on data exported from the PRIMAP emissions module using the `write_csv_bulkplus_py` function. It is intended primarily for users of the PRIMAP emissions module who want to make PRIMAP data accessible to this tool. It was used to pre-process some of the standard datasets available for the gst (global stocktake) toolset.

Primarily, it:
* checks for data completeness and removes unwanted countries
* renames the countryISO column to 'country'
* simplifies and standardises the filename
* reduces the number of years contained (where applicable)
* checks the formatting of the data

In [11]:
# import modules

# system
import re
import sys
import os

# data handling
import pandas as pd
import numpy as np

# open climate data packages
from countrygroups import UNFCCC, EUROPEAN_UNION, ANNEX_ONE, NON_ANNEX_ONE
from shortcountrynames import to_name

# global stocktake tools
import gst_tools.gst_utils as utils

In [25]:
# user options
# These settings drive the cells below: which raw file is read, how the
# processed file is named, which countries are kept and from which year on.

# Name of the raw PRIMAP export; it is read from the 'input-data/PRIMAP' folder.
raw_data_file = "WDI2017P_POP_04-Apr-2019.csv"

# choose something useful! These will be used to generate the new filename.
# The processed file is written as '<new_source_name>_<new_variable_name>.csv'.
new_variable_name = 'population'
new_source_name = 'WDI2017'

# Based on countrygroups package, select the group of countries you would like to extract. 
# Note that the raw data also includes groups.
# Rows whose 'country' code is not in this collection are dropped.
needed_countries = UNFCCC

# First year of data needed for further plotting
# (passed to utils.change_first_year to reduce the years kept).
start_year = 1990 
 

In [26]:
# load the raw PRIMAP export from the input folder
raw_data_folder = os.path.join('input-data', 'PRIMAP')
fname = os.path.join(raw_data_folder, raw_data_file)
print(f'reading {fname}')
raw_data = pd.read_csv(fname)

# Standardise the country-code column name, then keep only the rows whose
# code belongs to the requested countries or regions.
new_data = (
    raw_data
    .rename(columns={'countryISO': 'country'})
    .loc[lambda df: df['country'].isin(needed_countries)]
)

# report any requested countries that the raw data did not contain
missing_countries = list(
    set(needed_countries) - set(new_data['country'].unique())
)
if missing_countries:
    print('Not all countries requested were available in the raw data. You are missing the following:')
    for country in missing_countries:
        print(f'   {to_name(country)}')
    print('---------')

# drop the years before the first one needed
new_data = utils.change_first_year(new_data, start_year)

# column labels must all be strings
new_data.columns = new_data.columns.astype(str)

# show the result in the notebook
new_data

reading input-data/PRIMAP/WDI2017P_POP_04-Apr-2019.csv
Not all countries requested were available in the raw data. You are missing the following:
   European Union
   Cook Islands
   Niue
---------
First year of data available is now 1990
Last year of data available is 2015


Unnamed: 0,GWP,category,categoryCode,categorySet,country,countryName,scenario,source,unit,variable,...,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
0,,Demography,DEMOGR,misc,AFG,Afghanistan,HISTORY,WDI2017P,Pers,POP,...,25180000.0,25880000.0,26530000.0,27210000.0,27960000.0,28810000.0,29730000.0,30680000.0,31630000.0,32530000.0
1,,Demography,DEMOGR,misc,AGO,Angola,HISTORY,WDI2017P,Pers,POP,...,18540000.0,19180000.0,19840000.0,20520000.0,21220000.0,21940000.0,22690000.0,23450000.0,24230000.0,25020000.0
2,,Demography,DEMOGR,misc,ALB,Albania,HISTORY,WDI2017P,Pers,POP,...,2993000.0,2970000.0,2947000.0,2928000.0,2913000.0,2905000.0,2900000.0,2897000.0,2894000.0,2889000.0
3,,Demography,DEMOGR,misc,AND,Andorra,HISTORY,WDI2017P,Pers,POP,...,83370.0,84880.0,85620.0,85470.0,84420.0,82330.0,79320.0,75900.0,72790.0,70470.0
4,,Demography,DEMOGR,misc,ARE,United arab emirates,HISTORY,WDI2017P,Pers,POP,...,5171000.0,6010000.0,6900000.0,7705000.0,8329000.0,8735000.0,8953000.0,9040000.0,9086000.0,9157000.0
5,,Demography,DEMOGR,misc,ARG,Argentina,HISTORY,WDI2017P,Pers,POP,...,39560000.0,39970000.0,40380000.0,40800000.0,41220000.0,41660000.0,42100000.0,42540000.0,42980000.0,43420000.0
6,,Demography,DEMOGR,misc,ARM,Armenia,HISTORY,WDI2017P,Pers,POP,...,3002000.0,2988000.0,2975000.0,2966000.0,2963000.0,2968000.0,2978000.0,2992000.0,3006000.0,3018000.0
7,,Demography,DEMOGR,misc,ATG,Antigua and barbuda,HISTORY,WDI2017P,Pers,POP,...,83470.0,84400.0,85350.0,86300.0,87230.0,88150.0,89070.0,89980.0,90900.0,91820.0
8,,Demography,DEMOGR,misc,AUS,Australia,HISTORY,WDI2017P,Pers,POP,...,20700000.0,20830000.0,21250000.0,21690000.0,22030000.0,22340000.0,22730000.0,23120000.0,23460000.0,23790000.0
9,,Demography,DEMOGR,misc,AUT,Austria,HISTORY,WDI2017P,Pers,POP,...,8269000.0,8295000.0,8321000.0,8343000.0,8363000.0,8392000.0,8430000.0,8479000.0,8542000.0,8638000.0


In [27]:
## write the data to file

# The data is only written if it passes the format check. Per the original
# author's note, the years, unit, 'country' and variable must all be present
# in the data before it can be written.

# folder that receives the processed files
proc_data_folder = 'proc-data'

# Check the data format
if not utils.verify_data_format(new_data):

    print('WARNING: The data is not correctly formatted! Please check your input data and processing!')

else:

    # define filename as composite of source and variable name
    fname_out = new_source_name + '_' + new_variable_name + '.csv'
    fullfname_out = os.path.join(proc_data_folder, fname_out)

    # Make sure the output folder exists. exist_ok=True avoids the race
    # between checking for the folder and creating it.
    os.makedirs(proc_data_folder, exist_ok=True)

    # write to csv in proc data folder
    new_data.to_csv(fullfname_out, index=False)

    # celebrate success
    print('Processed data written to file! - ' + fullfname_out)
    

Processed data written to file! - proc-data/WDI2017_population.csv
