In [1]:
import pandas as pd
import numpy as np
import re
import os
from ddf_utils.str import to_concept_id
from ddf_utils.index import create_datapackage

In [2]:
# Source workbooks passed to read_cleanup(): both sexes (t), male (m), female (f).
source_t = 'source/WPP2015_INT_F03_1_POPULATION_BY_AGE_ANNUAL_BOTH_SEXES.XLS'
source_m = 'source/WPP2015_INT_F03_2_POPULATION_BY_AGE_ANNUAL_MALE.XLS'
source_f = 'source/WPP2015_INT_F03_3_POPULATION_BY_AGE_ANNUAL_FEMALE.XLS'
# Directory every generated csv is written to; also handed to create_datapackage().
out_dir = '../../'

In [3]:
def read_cleanup(source, gender):
    """read the 'ESTIMATES' and 'MEDIUM VARIANT' tabs of one workbook and tidy them.

    For each tab: skip the first 16 rows, treat '…' as missing, drop the
    'Index' and 'Notes' columns, rename the open-ended age groups
    ('80+' -> '80plus', '100+' -> '100plus') and insert a 'Gender' column
    as the fifth column.

    source: path of the excel workbook.
    gender: value written into the 'Gender' column of every row.

    returns a tuple (estimates dataframe, medium variant dataframe).
    """
    def _load_sheet(sheet):
        # `sheet_name` is the keyword pandas supports; the older `sheetname`
        # spelling was deprecated and later removed.
        frame = pd.read_excel(source, sheet_name=sheet, skiprows=16, na_values='…')

        frame = frame.drop(['Index', 'Notes'], axis=1)

        # the same mapping is applied to both tabs so that an open-ended group
        # can never keep a '+' in one tab only; rename() ignores absent labels.
        frame = frame.rename(columns={'80+': '80plus',
                                      '100+': '100plus'})

        # remember the column order before adding Gender, then place Gender
        # right after the first four columns.
        leading = list(frame.columns[:4])
        trailing = list(frame.columns[4:])
        frame['Gender'] = gender

        return frame[leading + ['Gender'] + trailing]

    data_est = _load_sheet('ESTIMATES')
    data_var = _load_sheet('MEDIUM VARIANT')

    return (data_est, data_var)

In [4]:
def extract_concepts(data):
    """extract concepts from one of the dataframes.

    data: a dataframe whose first five columns become concepts (the callers
    in this file pass a frame returned by read_cleanup). The area-name and
    reference-date columns are renamed to 'Name' and 'Year' before their
    concept ids are computed; 'Population' and 'Age' are appended.

    returns a dataframe with columns concept, concept_type and name, one row
    per concept. Raises IndexError if fewer than five columns are available,
    because the concept types are assigned by row position.
    """
    data_ = data.rename(columns={
        'Major area, region, country or area *': 'Name',
        'Reference date (as of 1 July)': 'Year'
    })

    concept_name = list(data_.columns[:5])
    concept_name.append('Population')
    concept_name.append('Age')
    concepts = list(map(to_concept_id, concept_name))

    # every concept starts out as a plain string concept.
    cdf = pd.DataFrame({'concept': concepts,
                        'concept_type': 'string',
                        'name': concept_name},
                       columns=['concept', 'concept_type', 'name'])

    # write through one .iloc[row, column] call on the frame itself: the
    # chained form cdf[col].iloc[i] = x may assign to a temporary copy.
    type_col = cdf.columns.get_loc('concept_type')
    name_col = cdf.columns.get_loc('name')

    # population
    cdf.iloc[5, type_col] = 'measure'

    # entity domains
    cdf.iloc[[2, 4, 6], type_col] = 'entity_domain'

    # year: keep the concept id computed from 'Year' but show the source label.
    cdf.iloc[3, type_col] = 'time'
    cdf.iloc[3, name_col] = 'Reference date (as of 1 July)'

    return cdf

In [5]:
def extract_entities_country(data_est, data_var):
    """extract country entities from source.

    data_est is data from estimates tab.
    data_var is from medium variant tab.

    we assume that both tab should have same entities. When they do not, a
    warning is printed; in every case the result is the union of the
    entities found in the two tabs, without duplicates.

    The columns are located by their concept id, and the input dataframes
    are left untouched (their column labels are not rewritten).

    returns a dataframe with columns name and country_code.
    raises KeyError if a tab lacks the area-name or country-code column.
    """
    name_id = 'major_area_region_country_or_area'
    code_id = 'country_code'

    def _entities(data):
        # map labels to concept ids on the side instead of renaming `data`,
        # so the caller's frame keeps its original column labels.
        ids = [to_concept_id(col) for col in data.columns]
        missing = [wanted for wanted in (name_id, code_id) if wanted not in ids]
        if missing:
            raise KeyError('columns not found: {}'.format(missing))

        picked = data.iloc[:, [ids.index(name_id), ids.index(code_id)]].copy()
        picked.columns = ['name', code_id]
        return picked.drop_duplicates()

    entity = _entities(data_est)
    entity_2 = _entities(data_var)

    ent = pd.concat([entity, entity_2]).drop_duplicates()

    # both inputs are already de-duplicated, so the union has the same size as
    # each of them only when the two tabs hold exactly the same entities.
    # Comparing the two lengths alone would miss equally sized, different sets.
    if len(ent) != len(entity) or len(ent) != len(entity_2):
        print('Warning: entities not same in the excel tabs.')

    return ent


def extract_entities_gender():
    """build the gender entity table by hand; the source carries no gender metadata.

    returns a dataframe with columns gender and name holding the male and
    female entities, in that order.
    """
    gender_ids = ['male', 'female']
    gender_names = ['Male', 'Female']

    return pd.DataFrame({'gender': gender_ids, 'name': gender_names},
                        columns=['gender', 'name'])


def extract_entities_age(data_est):
    """extract ages from estimates tab of source data.

    Every column label from the sixth column onwards is taken as an age
    entity. The display name is 'Age ' followed by the label; the label is
    converted to text for that purpose only, so numeric labels do not raise
    a TypeError. The 'age' column keeps the labels as they are.

    returns a dataframe with columns age and name.
    """
    ages = list(data_est.columns[5:])
    names = ['Age ' + str(age) for age in ages]

    return pd.DataFrame({'age': ages, 'name': names}, columns=['age', 'name'])


def extract_datapoints(dflist):
    """make datapoint file with all dataframe in dflist.

    Each dataframe is reshaped from one column per age to one row per
    (country code, year, gender, age); cells without a value produce no row.
    The pieces are concatenated, the columns are renamed to concept ids, and
    the rows are sorted by country_code, year, age and gender. Fully
    identical rows are dropped.

    dflist: list of dataframes holding the 'Variant', area name,
    'Country code', reference date and 'Gender' columns followed by one
    column per age.

    returns a dataframe with columns country_code, year, gender, age and
    population, where age is an ordered categorical whose order is the order
    in which the age labels first appear.
    """

    to_concat = []

    for df in dflist:
        e = df.drop(['Variant', 'Major area, region, country or area *'], axis=1)
        e = e.set_index([
            'Country code', 'Reference date (as of 1 July)', 'Gender'])
        e.columns.name = 'Age'
        # dropna() is explicit so that missing cells are left out whether or
        # not the installed pandas drops them inside stack().
        df_new = e.stack().dropna().rename('Population').reset_index()
        to_concat.append(df_new)

    df_all = pd.concat(to_concat, ignore_index=True)
    df_all = df_all.rename(columns={'Reference date (as of 1 July)': 'Year'})
    df_all.columns = list(map(to_concept_id, df_all.columns))

    # make age column sort correctly by changing to categorical dtype.
    # pd.Categorical is used because astype('category', categories=...) is no
    # longer accepted by pandas.
    df_all['age'] = pd.Categorical(df_all['age'],
                                   categories=list(df_all['age'].unique()),
                                   ordered=True)

    df_all = df_all.sort_values(by=['country_code', 'year', 'age', 'gender'])

    # the only duplicates are in year 2015. There are both estimated and observed data.
    # But both are same so we can drop them.
    df_all = df_all.drop_duplicates()
    # assert not np.any(df_all.duplicated(['country_code', 'year', 'age', 'gender']))

    return df_all


In [6]:
def _write_datapoints_by_country(datapoints, name_template):
    """write one datapoint csv per country code into out_dir.

    datapoints: dataframe with a 'country_code' column.
    name_template: file name containing one '{}' placeholder, which is
    filled with the country code.

    Rows are written in the order they have in `datapoints`. Iterating the
    groupby replaces the former `.ix[idxs]` lookups (`.ix` no longer exists
    in pandas) and yields the same rows in the same order.
    """
    for geo, rows in datapoints.groupby(by='country_code'):
        path = os.path.join(out_dir, name_template.format(geo))
        rows.to_csv(path, index=False, float_format='%.15g')


if __name__ == '__main__':

    print('reading source data...')
    print('\tboth sexes...')
    est_t, var_t = read_cleanup(source_t, 'both_sexes')
    print('\tmale...')
    est_m, var_m = read_cleanup(source_m, 'male')
    print('\tfemale...')
    est_f, var_f = read_cleanup(source_f, 'female')

    print('creating datapoint file...')
    # male and female rows go into the files keyed by gender.
    df_mf = extract_datapoints([est_m, var_m, est_f, var_f])
    _write_datapoints_by_country(
        df_mf,
        'ddf--datapoints--population--by--country_code-{}--year--gender--age.csv')

    df_t = extract_datapoints([est_t, var_t])
    df_t = df_t.drop('gender', axis=1)  # we don't need gender = both sexes in datapoint
    _write_datapoints_by_country(
        df_t,
        'ddf--datapoints--population--by--country_code-{}--year--age.csv')

    print('creating concepts files...')
    concepts = extract_concepts(est_t)
    path = os.path.join(out_dir, 'ddf--concepts.csv')
    concepts.to_csv(path, index=False)

    print('creating entities files...')
    country = extract_entities_country(est_t, var_t)
    path = os.path.join(out_dir, 'ddf--entities--country_code.csv')
    country.to_csv(path, index=False)

    gender = extract_entities_gender()
    path = os.path.join(out_dir, 'ddf--entities--gender.csv')
    gender.to_csv(path, index=False)

    age = extract_entities_age(est_t)
    path = os.path.join(out_dir, 'ddf--entities--age.csv')
    age.to_csv(path, index=False)

    print('creating index files...')
    create_datapackage(out_dir)


reading source data...
	both sexes...
	male...
	female...
creating datapoint file...


NameError: name 'out_dir' is not defined

Unnamed: 0,concept,concept_type,name
0,variant,string,Variant
1,name,string,Name
2,country_code,entity_domain,Country code
3,reference_date_as_of_1_july,string,Reference date (as of 1 July)
4,gender,entity_domain,Gender
5,population,measure,Population
6,age,entity_domain,Age


Unnamed: 0,Variant,"Major area, region, country or area *",Country code,Reference date (as of 1 July),Gender,0,1,2,3,4,...,91,92,93,94,95,96,97,98,99,100plus
0,Estimates,WORLD,900,1950,both_sexes,78116.685,71707.672,66449.346,62228.019,58930.171,...,,,,,,,,,,
1,Estimates,WORLD,900,1951,both_sexes,81973.089,76250.302,70474.481,65672.988,61757.344,...,,,,,,,,,,
2,Estimates,WORLD,900,1952,both_sexes,85085.460,79060.671,74351.359,69205.529,64859.079,...,,,,,,,,,,
3,Estimates,WORLD,900,1953,both_sexes,87298.857,81532.381,76324.468,72491.108,67974.254,...,,,,,,,,,,
4,Estimates,WORLD,900,1954,both_sexes,88613.869,83346.474,78486.514,74028.307,70707.145,...,,,,,,,,,,
5,Estimates,WORLD,900,1955,both_sexes,89073.802,84597.955,80339.688,76310.740,72522.845,...,,,,,,,,,,
6,Estimates,WORLD,900,1956,both_sexes,88381.637,87292.271,83245.950,79325.051,75551.995,...,,,,,,,,,,
7,Estimates,WORLD,900,1957,both_sexes,87532.337,85311.474,85756.890,82114.467,78508.670,...,,,,,,,,,,
8,Estimates,WORLD,900,1958,both_sexes,86844.894,85699.623,84085.182,84314.323,81070.277,...,,,,,,,,,,
9,Estimates,WORLD,900,1959,both_sexes,86773.089,86344.658,85329.798,83802.875,82850.644,...,,,,,,,,,,


Unnamed: 0,variant,country_code,year,gender,age,population
545618,Estimates,4,1950,both_sexes,80plus,10.557
545618,Estimates,4,1950,female,80plus,4.590
545618,Estimates,4,1950,male,80plus,5.967
545699,Estimates,4,1951,both_sexes,80plus,12.149
545699,Estimates,4,1951,female,80plus,5.298
545699,Estimates,4,1951,male,80plus,6.852
545780,Estimates,4,1952,both_sexes,80plus,13.410
545780,Estimates,4,1952,female,80plus,5.886
545780,Estimates,4,1952,male,80plus,7.524
545861,Estimates,4,1953,both_sexes,80plus,14.137


545538    False
545538    False
545538    False
545539    False
545539    False
545539    False
545540    False
545540    False
545540    False
545541    False
545541    False
545541    False
545542    False
545542    False
545542    False
545543    False
545543    False
545543    False
545544    False
545544    False
545544    False
545545    False
545545    False
545545    False
545546    False
545546    False
545546    False
545547    False
545547    False
545547    False
          ...  
807788     True
807788     True
807788     True
807789     True
807789     True
807789     True
807790     True
807790     True
807790     True
807791     True
807791     True
807791     True
807792     True
807792     True
807792     True
807793     True
807793     True
807793     True
807794     True
807794     True
807794     True
807795     True
807795     True
807795     True
807796     True
807796     True
807796     True
807797     True
807797     True
807797     True
Name: age, dtype: bool

array(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12',
       '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23',
       '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34',
       '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45',
       '46', '47', '48', '49', '50', '51', '52', '53', '54', '55', '56',
       '57', '58', '59', '60', '61', '62', '63', '64', '65', '66', '67',
       '68', '69', '70', '71', '72', '73', '74', '75', '76', '77', '78',
       '79', '80plus', '80', '81', '82', '83', '84', '85', '86', '87',
       '88', '89', '90', '91', '92', '93', '94', '95', '96', '97', '98',
       '99', '100plus'], dtype=object)

Index(['0', '1', '10', '100plus', '11', '12', '13', '14', '15', '16',
       ...
       '90', '91', '92', '93', '94', '95', '96', '97', '98', '99'],
      dtype='object', length=102)

Index(['Variant', 'Major area, region, country or area *', 'Country code',
       'Reference date (as of 1 July)', 'Gender'],
      dtype='object')

In [78]:
# Classify the rows of conc_df by position. Each write goes through a single
# .iloc[rows, column] call on the frame: the chained form
# conc_df['concept_type'].iloc[...] = x may assign to a temporary copy.
conc_df['concept_type'] = 'string'

conc_df.iloc[[2, 4, 5], conc_df.columns.get_loc('concept_type')] = 'entity_domain'

conc_df.iloc[3, conc_df.columns.get_loc('concept_type')] = 'time'

conc_df.iloc[6, conc_df.columns.get_loc('concept_type')] = 'measure'

Unnamed: 0,concept,concept_type,name
0,variant,string,Variant
1,name,string,Name
2,country_code,entity_domain,Country code
3,year,time,Reference date (as of 1 July)
4,gender,entity_domain,Gender
5,age,entity_domain,age
6,population,measure,Population


[0, 1, 2, 3, 4, ..., 96, 97, 98, 99, 100plus]
Length: 102
Categories (102, object): [0 < 1 < 2 < 3 ... 97 < 98 < 99 < 100plus]

Unnamed: 0,age,name
0,0,
1,1,
2,2,
3,3,
4,4,
5,5,
6,6,
7,7,
8,8,
9,9,


TypeError: Categorical cannot perform the operation +

NameError: name 'data_est' is not defined

NameError: name 'data_est' is not defined

NameError: name 'data_var' is not defined

True

Unnamed: 0,variant,major_area_region_country_or_area,country_code,year,total_population_aged_0_number,total_population_aged_1_number,total_population_aged_2_number,total_population_aged_3_number,total_population_aged_4_number,total_population_aged_5_number,...,total_population_aged_91_number,total_population_aged_92_number,total_population_aged_93_number,total_population_aged_94_number,total_population_aged_95_number,total_population_aged_96_number,total_population_aged_97_number,total_population_aged_98_number,total_population_aged_99_number,total_population_aged_100_number
0,Estimates,WORLD,900,1950,78116.685,71707.672,66449.346,62228.019,58930.171,56442.125,...,,,,,,,,,,
1,Estimates,WORLD,900,1951,81973.089,76250.302,70474.481,65672.988,61757.344,58639.277,...,,,,,,,,,,
2,Estimates,WORLD,900,1952,85085.46,79060.671,74351.359,69205.529,64859.079,61248.694,...,,,,,,,,,,
3,Estimates,WORLD,900,1953,87298.857,81532.381,76324.468,72491.108,67974.254,64081.777,...,,,,,,,,,,
4,Estimates,WORLD,900,1954,88613.869,83346.474,78486.514,74028.307,70707.145,66815.402,...,,,,,,,,,,


Unnamed: 0,name,concept,concept_type,unit
0,Variant,variant,string,
0,unit,Unit,string,
1,name,Name,string,
1,"Major area, region, country or area *",major_area_region_country_or_area,string,
2,Country code,country_code,entity_domain,
3,Reference date (as of 1 July),year,time,
4,Total Population aged 0 (Number),total_population_aged_0_number,measure,thousands
5,Total Population aged 1 (Number),total_population_aged_1_number,measure,thousands
6,Total Population aged 2 (Number),total_population_aged_2_number,measure,thousands
7,Total Population aged 3 (Number),total_population_aged_3_number,measure,thousands


241

241

Unnamed: 0,key,value,file
0,concept,concept_type,ddf--concepts.csv
1,concept,name,ddf--concepts.csv
0,"country_code,year,gender,age",population,ddf--datapoints--population--by--country_code-...
0,age,name,ddf--entities--age.csv
0,country_code,name,ddf--entities--country_code.csv
0,gender,name,ddf--entities--gender.csv
