In [180]:
import pandas as pd
import numpy as np
from openpyxl import load_workbook
#from xlrd import open_workbook
import re
import os
import time
from os import listdir
from os.path import isfile, join
from ddf_utils.str import to_concept_id
from ddf_utils.datapackage import create_datapackage

In [181]:
# Root folder that generated datapoint files are written under
# (relative to the notebook's working directory).
out_dir = '../'

# Module-level accumulators intended for building concept and entity data.
# None of the cells shown here read or append to them yet.
variants, agegroups = [], []
age1YrInterval, ageBroad, age5YrInterval = [], [], []
ref_AreaCode = []

In [182]:
# method to create directory if it does not exist
def createDirectory(Directory):
    """Ensure the folder ``'../' + Directory.lower()`` exists.

    Parameters
    ----------
    Directory : str
        Folder path, interpreted relative to the parent of the current
        working directory. It is lower-cased before use and any missing
        intermediate folders are created as well.

    Raises
    ------
    FileExistsError
        If the target path already exists but is not a directory.
    """
    # exist_ok=True replaces the separate os.path.exists() probe: the call is
    # idempotent and cannot fail because the folder appeared between the
    # check and the creation.
    os.makedirs('../' + Directory.lower(), exist_ok=True)

In [187]:
def load_Files_new(source, variant, gender, TypeBy, NewFormatInfo):
    """Read one worksheet and rename its columns to the target indicator names.

    Parameters
    ----------
    source
        Workbook to read; forwarded unchanged to ``pd.read_excel``.
    variant : str
        Name of the worksheet to load.
    gender, TypeBy
        Accepted for call compatibility; not used by this function.
    NewFormatInfo : pandas.DataFrame
        Three-column table whose rows are
        ``(IndicatorInitial, IndicatorDest, DestFolder)``. A row whose
        ``IndicatorInitial`` contains ``';'`` marks a male/female column pair.

    Returns
    -------
    pandas.DataFrame
        The sheet without its ``Index`` and ``Notes`` columns, with the
        ``Variant`` labels normalised and the columns renamed.
    """
    # The sheet name is passed positionally because the keyword was renamed
    # (sheetname -> sheet_name) between pandas releases; the position is stable.
    data = pd.read_excel(source, variant, skiprows=16, na_values='…')
    data = data.drop(['Index', 'Notes'], axis=1)

    # Lower-case the variant label and turn spaces/hyphens into underscores.
    # Series.replace() only matches whole cell values, so the substring
    # substitution has to go through the .str accessor.
    data['Variant'] = (
        data['Variant']
        .str.lower()
        .str.replace(' ', '_')
        .str.replace('-', '_')
    )

    # rename country column and country code column
    data = data.rename(columns={
        'Major area, region, country or area *': 'Ref_Area',
        'Country code': 'Ref_Area_Code',
        'Reference date (1 January - 31 December)': 'Year'
    })

    # Column pairs that are split by sex, keyed by the destination indicator.
    sex_split_renames = {
        "TotalDeaths": {
            "Male deaths (thousands)": "TotalDeaths_Male",
            "Female deaths (thousands)": "TotalDeaths_Female",
        },
        "LifeExpectancyAtBirth": {
            "Life expectancy at birth, males (years)": "LifeExpectancyAtBirth_Male",
            "Life expectancy at birth, females (years)": "LifeExpectancyAtBirth_Female",
        },
    }

    # update columns name from NewFormatInfo file
    for IndicatorInitial, IndicatorDest, DestFolder in NewFormatInfo.values:
        if ";" in IndicatorInitial:
            # Sex-split rows for any other indicator are left untouched.
            mapping = sex_split_renames.get(IndicatorDest)
            if mapping is not None:
                data = data.rename(columns=mapping)
        else:
            data = data.rename(columns={IndicatorInitial: IndicatorDest})

    return data

In [188]:
def GetDataFromWorkBook(source, gender, indicator, TypeBy, NewFormatInfo):
    """Load every data sheet of a workbook into a list of DataFrames.

    Parameters
    ----------
    source
        Path (or other object accepted by ``pd.ExcelFile``) of the workbook.
    gender, TypeBy, NewFormatInfo
        Forwarded unchanged to ``load_Files_new``.
    indicator
        Accepted for call compatibility; not used by this function.

    Returns
    -------
    list of pandas.DataFrame
        One frame per worksheet except ``'NOTES'``, in workbook order, each
        with its ``Ref_Area`` column dropped.
    """
    all_variants = []

    # Open the workbook once and hand the open handle to the loader, so the
    # file is not re-opened for every sheet; the context manager closes it
    # even when loading a sheet fails.
    with pd.ExcelFile(source) as workbook:
        for sheetName in workbook.sheet_names:
            # ignore NOTES sheet since we are not collecting metadata yet
            if sheetName == 'NOTES':
                continue
            mydata = load_Files_new(workbook, sheetName, gender, TypeBy, NewFormatInfo)
            all_variants.append(mydata.drop(['Ref_Area'], axis=1))

    return all_variants

In [189]:
def createDataFiles_new(ds, NewFormatInfo, Ref_Area_List):
    """Write one CSV of datapoints per indicator and per area code.

    Parameters
    ----------
    ds : pandas.DataFrame
        Combined data of all sheets. Its column names are converted with
        ``to_concept_id`` and its ``variant`` values get spaces/hyphens
        replaced by underscores; both changes are made in place on ``ds``.
    NewFormatInfo : pandas.DataFrame
        Three-column table whose rows are
        ``(IndicatorInitial, IndicatorDest, DestFolder)``. ``DestFolder`` is a
        template in which ``newformat`` is replaced by the lower-cased
        indicator and ``ref_area_code`` by the entity kind of each area.
    Ref_Area_List : pandas.DataFrame
        Area metadata with a ``Code`` column and the flag columns
        ``is--world``, ``is--region``, ``is--continent``, ``is--country``.

    Raises
    ------
    IndexError
        If an area code present in the data has no row in ``Ref_Area_List``.
    """
    key_columns = ['variant', 'ref_area_code', 'year']
    # (flag column, entity kind) pairs; the first flag equal to 1 wins.
    entity_kinds = [
        ('is--world', 'global'),
        ('is--region', 'region'),
        ('is--continent', 'continent'),
        ('is--country', 'country'),
    ]
    sex_split_indicators = ('TotalDeaths', 'LifeExpectancyAtBirth')

    # Normalising the frame does not depend on the indicator, so do it once
    # instead of once per NewFormatInfo row.
    ds.columns = list(map(to_concept_id, ds.columns))
    ds['variant'] = [x.replace(' ', '_').replace('-', '_') for x in ds['variant']]

    for IndicatorInitial, IndicatorDest, DestFolder in NewFormatInfo.values:
        measure = IndicatorDest.lower()
        Directory = DestFolder.lower().replace('newformat', measure)
        # Kept as a template: it must not be overwritten per area, otherwise
        # every later file inherits the first area's entity kind in its name.
        file_template = Directory.replace('ref_area_code', 'ref_area_code-{}') + '.csv'

        if ';' in IndicatorInitial and IndicatorDest in sex_split_indicators:
            # Stack the male and female columns into one measure column plus
            # a 'gender' column. The value column carries the indicator's own
            # name for both indicators.
            frames = []
            for gender in ('male', 'female'):
                part = ds[key_columns + [measure + '_' + gender]].copy()
                part.columns = key_columns + [measure]
                part['gender'] = gender
                frames.append(part)
            # A fresh index keeps the male and female rows distinguishable;
            # with repeated labels a label-based lookup returns every row twice.
            ds_all = pd.concat(frames, ignore_index=True)
        else:
            ds_all = ds[key_columns + [measure]].copy()

        # One output file per area code. Iterating the groupby yields each
        # area's rows directly, with no label-based lookup.
        for geo, myDS in ds_all.groupby('ref_area_code'):
            area = Ref_Area_List.loc[Ref_Area_List['Code'] == geo]
            if area.empty:
                raise IndexError(
                    'area code {!r} has no row in Ref_Area_List'.format(geo))
            flags = area.iloc[0]

            # An area with no flag set keeps the generic 'ref_area_code' key
            # rather than inheriting the previous area's folder.
            entity = next(
                (kind for flag, kind in entity_kinds if flags[flag] == 1),
                'ref_area_code',
            )

            NewDirectory = Directory.replace('ref_area_code', entity)
            file_name = file_template.replace('ref_area_code', entity).format(geo)
            myDS = myDS.rename(columns={'ref_area_code': entity})

            createDirectory(NewDirectory)
            path = os.path.join(out_dir, NewDirectory + '/' + file_name)

            myDS.to_csv(path, index=False, float_format='%.15g')
        


In [190]:
def callDataPointFiles(metadata_df, cdf):
    """Main entry point: export datapoint files for every included workbook.

    Reads the area metadata and the indicator mapping from the ``source``
    folder, then, for each row of ``metadata_df`` whose ``Include`` value is
    1, loads all sheets of the workbook, concatenates them into one frame
    and passes it to ``createDataFiles_new``. The total elapsed time is
    printed at the end.

    Parameters
    ----------
    metadata_df : pandas.DataFrame
        One row per source workbook. Each row must unpack into exactly nine
        values: FileName, TypeBY, SEX, Indicator, Directory, Include, name,
        description, url.
    cdf
        Not used by this function.

    Returns
    -------
    The ``cdf`` argument, unchanged.
    """
    # Keep spreadsheet columns A:G and A:C. The leading columns are selected
    # after reading because the read_excel keyword for column ranges was
    # renamed (parse_cols -> usecols) between pandas releases.
    Ref_Area_List = pd.read_excel('source/countrymetadata.xlsx').iloc[:, :7]
    NewFormatInfo = pd.read_excel('source/NewFormatInfo.xlsx').iloc[:, :3]

    MainStart = time.time()
    for row in metadata_df.values:
        FileName, TypeBY, SEX, Indicator, Directory, Include, name, description, url = row

        # Skip excluded rows before touching any other field, so a blank cell
        # in a row that is not processed cannot abort the run.
        if Include != 1:
            continue

        # load the file: one frame per sheet, stacked into a single dataset
        sheets = GetDataFromWorkBook("source/" + FileName, SEX, Indicator, TypeBY, NewFormatInfo)
        ds = pd.concat(sheets, ignore_index=True)
        createDataFiles_new(ds, NewFormatInfo, Ref_Area_List)

    FinalEnd = time.time()
    print('Total Time For All Files: ' + str(FinalEnd - MainStart))
    return cdf

In [191]:
# Keep spreadsheet columns A:F. The leading columns are selected after reading
# because the read_excel keyword for column ranges was renamed
# (parse_cols -> usecols) between pandas releases.
metadata_df = pd.read_excel('source/metadata.xlsx').iloc[:, :6]

# callDataPointFiles unpacks nine values per row, so pad the six spreadsheet
# columns with the three descriptive fields.
metadata_df['name'] = ''
metadata_df['description'] = ''
metadata_df['sourceurl'] = 'https://esa.un.org/unpd/wpp/Download/Standard/Population/'

# No cell shown here assigns `cdf` before this call, so passing it raised a
# NameError on a fresh kernel. callDataPointFiles only hands its second
# argument back, so start from None.
cdf = callDataPointFiles(metadata_df, None)



Total Time For All Files: 25.225170135498047
