In [1]:
%%bash
ls ../../data

WDICountry.csv
WDIData.csv
WDISeries.csv


In [2]:
import os
from IPython.display import display, HTML
import pandas as pd

#Locating where pyspark is installed
import findspark
findspark.init()
import pyspark

#Settings for PySpark to work
driver_memory = '4g'
num_executors = 2
executor_memory = '1g'
#pyspark_submit_args = ' --driver-memory ' + driver_memory + ' --executor-memory ' + executor_memory + ' --num-executors ' + num_executors + ' pyspark-shell'
pyspark_submit_args = ' --driver-memory ' + driver_memory + ' pyspark-shell'

#Setting the required parameters to start up PySpark
os.environ["PYSPARK_SUBMIT_ARGS"] = pyspark_submit_args

#Import Modules Needed for PySpark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [3]:
#Helper for pretty formatting for Spark DataFrames
def showDF(df, limitRows =  20, truncate = True):
    if(truncate):
        pd.set_option('display.max_colwidth', 50)
    else:
        pd.set_option('display.max_colwidth', -1)
    pd.set_option('display.max_rows', limitRows)
    display(df.limit(limitRows).toPandas())
    pd.reset_option('display.max_rows')

In [4]:
#Creating a spark session
spark = SparkSession.builder.appName("Data Transformation").getOrCreate()

In [5]:
#Read the file into a Spark Data Frame
country = spark.read.format("csv") \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .load("../../data/WDICountry.csv")

country.count()

263

In [6]:
series = spark.read.format("csv") \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .load("../../data/WDISeries.csv")

series.count()

1593

In [7]:
indicators = spark.read.format("csv") \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .load("../../data/WDIData.csv")

indicators.count()

420024

In [8]:
countryDim = country \
    .select("2-alpha code", "Country Code", "Short Name", "Long Name", "Region", "Income Group") \
    .withColumnRenamed("2-alpha code", "country_iso_code") \
    .withColumnRenamed("Country Code", "wb_country_code") \
    .withColumnRenamed("Short Name", "country_name") \
    .withColumnRenamed("Long Name", "country_long_name") \
    .withColumnRenamed("Region", "region") \
    .withColumnRenamed("Income Group", "income_group")
    
showDF(countryDim)

Unnamed: 0,country_iso_code,wb_country_code,country_name,country_long_name,region,income_group
0,AW,ABW,Aruba,Aruba,Latin America & Caribbean,High income
1,AF,AFG,Afghanistan,Islamic State of Afghanistan,South Asia,Low income
2,AO,AGO,Angola,People's Republic of Angola,Sub-Saharan Africa,Lower middle income
3,AL,ALB,Albania,Republic of Albania,Europe & Central Asia,Upper middle income
4,AD,AND,Andorra,Principality of Andorra,Europe & Central Asia,High income
5,1A,ARB,Arab World,Arab World,,
6,AE,ARE,United Arab Emirates,United Arab Emirates,Middle East & North Africa,High income
7,AR,ARG,Argentina,Argentine Republic,,
8,AM,ARM,Armenia,Republic of Armenia,Europe & Central Asia,Lower middle income
9,AS,ASM,American Samoa,American Samoa,East Asia & Pacific,Upper middle income


In [9]:
#Lets you create a view that you can use in SQL queries
countryDim.createOrReplaceTempView("country")

In [10]:
# Alternative SQL query 
transformQuery = """
select 
    country_iso_code
    , wb_country_code
    , country_name as name
    , country_long_name as long_name
    , region
    , income_group
from 
    country
"""

showDF(spark.sql(transformQuery))

Unnamed: 0,country_iso_code,wb_country_code,name,long_name,region,income_group
0,AW,ABW,Aruba,Aruba,Latin America & Caribbean,High income
1,AF,AFG,Afghanistan,Islamic State of Afghanistan,South Asia,Low income
2,AO,AGO,Angola,People's Republic of Angola,Sub-Saharan Africa,Lower middle income
3,AL,ALB,Albania,Republic of Albania,Europe & Central Asia,Upper middle income
4,AD,AND,Andorra,Principality of Andorra,Europe & Central Asia,High income
5,1A,ARB,Arab World,Arab World,,
6,AE,ARE,United Arab Emirates,United Arab Emirates,Middle East & North Africa,High income
7,AR,ARG,Argentina,Argentine Republic,,
8,AM,ARM,Armenia,Republic of Armenia,Europe & Central Asia,Lower middle income
9,AS,ASM,American Samoa,American Samoa,East Asia & Pacific,Upper middle income


In [11]:
showDF( \
    countryDim \
       .select(col("country_iso_code"), \
               length(col("country_iso_code")).alias("column_length")) \
       .groupBy("column_length") \
       .agg(count("*").alias("cnt")) \
       .filter("cnt > 1") \
)

Unnamed: 0,column_length,cnt
0,2,262


In [12]:
#You can also do this in SQL
countryCodeLengthQuery = """
select 
    length(country_iso_code) as column_length
    , count(1) as cnt
from 
    country
group by 
    length(country_iso_code)
having 
    count(1) > 1
"""

showDF(spark.sql(countryCodeLengthQuery))

Unnamed: 0,column_length,cnt
0,2,262


In [14]:
# Check duplicate
showDF(countryDim.groupBy("country_iso_code").agg(count("*").alias("cnt")).filter("cnt > 1"))

showDF(countryDim.groupBy("wb_country_code").agg(count("*").alias("cnt")).filter("cnt > 1"))

showDF(countryDim.groupBy("country_name").agg(count("*").alias("cnt")).filter("cnt > 1"))

Unnamed: 0,country_iso_code,cnt


Unnamed: 0,wb_country_code,cnt


Unnamed: 0,country_name,cnt


In [15]:
# Fix issue with data quality
countryDimFinal = countryDim.filter("country_iso_code is not null")

showDF(countryDimFinal)

Unnamed: 0,country_iso_code,wb_country_code,country_name,country_long_name,region,income_group
0,AW,ABW,Aruba,Aruba,Latin America & Caribbean,High income
1,AF,AFG,Afghanistan,Islamic State of Afghanistan,South Asia,Low income
2,AO,AGO,Angola,People's Republic of Angola,Sub-Saharan Africa,Lower middle income
3,AL,ALB,Albania,Republic of Albania,Europe & Central Asia,Upper middle income
4,AD,AND,Andorra,Principality of Andorra,Europe & Central Asia,High income
5,1A,ARB,Arab World,Arab World,,
6,AE,ARE,United Arab Emirates,United Arab Emirates,Middle East & North Africa,High income
7,AR,ARG,Argentina,Argentine Republic,,
8,AM,ARM,Armenia,Republic of Armenia,Europe & Central Asia,Lower middle income
9,AS,ASM,American Samoa,American Samoa,East Asia & Pacific,Upper middle income


In [16]:
countryDimFinal.count()

262

In [17]:
# Here we are going to write the country dimension to an output csv file
countryDimFinal \
    .coalesce(1) \
    .write.csv('../../output/CountryDim', mode='overwrite', header='true')

In [18]:
%%bash
cat ../../output/CountryDim/*csv | head

country_iso_code,wb_country_code,country_name,country_long_name,region,income_group
AW,ABW,Aruba,Aruba,Latin America & Caribbean,High income
AF,AFG,Afghanistan,Islamic State of Afghanistan,South Asia,Low income
AO,AGO,Angola,People's Republic of Angola,Sub-Saharan Africa,Lower middle income
AL,ALB,Albania,Republic of Albania,Europe & Central Asia,Upper middle income
AD,AND,Andorra,Principality of Andorra,Europe & Central Asia,High income
1A,ARB,Arab World,Arab World,,
AE,ARE,United Arab Emirates,United Arab Emirates,Middle East & North Africa,High income
AR,ARG,Argentina,Argentine Republic,,
AM,ARM,Armenia,Republic of Armenia,Europe & Central Asia,Lower middle income


In [19]:
# Series dataset 
# Name in CSV	Column Name
# Series Code	indicator_code
# Indicator Name	indicator_name
# Periodicity	periodicity
# Aggregation Method	aggregation_method
seriesDim = series \
    .select("Series Code", "Indicator Name", "Short Definition", "Periodicity", "Aggregation Method") \
    .withColumnRenamed("Series Code", "indicator_code") \
    .withColumnRenamed("Indicator Name", "indicator_name") \
    .withColumnRenamed("Periodicity", "periodicity") \
    .withColumnRenamed("Aggregation Method", "aggregation_method") \
    .filter(col("periodicity") == "Annual") 
    
showDF(seriesDim)

seriesDim.count()

Unnamed: 0,indicator_code,indicator_name,Short Definition,periodicity,aggregation_method
0,AG.AGR.TRAC.NO,"Agricultural machinery, tractors",,Annual,Sum
1,AG.CON.FERT.PT.ZS,Fertilizer consumption (% of fertilizer produc...,,Annual,Weighted average
2,AG.CON.FERT.ZS,Fertilizer consumption (kilograms per hectare ...,,Annual,Weighted average
3,AG.LND.AGRI.K2,Agricultural land (sq. km),,Annual,Sum
4,AG.LND.AGRI.ZS,Agricultural land (% of land area),,Annual,Weighted average
5,AG.LND.ARBL.HA,Arable land (hectares),,Annual,
6,AG.LND.ARBL.HA.PC,Arable land (hectares per person),,Annual,Weighted Average
7,AG.LND.ARBL.ZS,Arable land (% of land area),,Annual,Weighted average
8,AG.LND.CREL.HA,Land under cereal production (hectares),,Annual,Sum
9,AG.LND.CROP.ZS,Permanent cropland (% of land area),,Annual,Weighted average


1587

### Complex Transformations
##### Problem Statement
We want to measure the cellular and broadband penetration in comparison to the population demographics for every country.  It'll also be helpful to get some insights on annual global aggregates.

###### Our dataset has multiple types of metrics.  The only ones that we care about are simple aggregates.

In [20]:
simpleAggIndicators = seriesDim \
    .filter("lower(aggregation_method) = 'sum'") \
    .select("indicator_code", "indicator_name") \
    .orderBy("indicator_code")

showDF(simpleAggIndicators, limitRows = 500, truncate = False)

Unnamed: 0,indicator_code,indicator_name
0,AG.AGR.TRAC.NO,"Agricultural machinery, tractors"
1,AG.LND.AGRI.K2,Agricultural land (sq. km)
2,AG.LND.CREL.HA,Land under cereal production (hectares)
3,AG.LND.EL5M.RU.K2,Rural land area where elevation is below 5 meters (sq. km)
4,AG.LND.EL5M.UR.K2,Urban land area where elevation is below 5 meters (sq. km)
5,AG.LND.FRST.K2,Forest area (sq. km)
6,AG.LND.TOTL.K2,Land area (sq. km)
7,AG.LND.TOTL.RU.K2,Rural land area (sq. km)
8,AG.LND.TOTL.UR.K2,Urban land area (sq. km)
9,AG.PRD.CREL.MT,Cereal production (metric tons)


##### Only keep the indicators that are relevant to requirements i.e. Population indicators and Cellular and Broadband penetration

In [21]:
targetIndicators = simpleAggIndicators \
    .filter("lower(indicator_name) like '%population%total%' " + 
            " or lower(indicator_name) like '%cellular%' " +
            " or lower(indicator_name) like '%broadband%'") \
    .filter("lower(indicator_name) not like '%refugee%'")

showDF(targetIndicators)

Unnamed: 0,indicator_code,indicator_name
0,IT.CEL.SETS,Mobile cellular subscriptions
1,IT.NET.BBND,Fixed broadband subscriptions
2,SP.POP.0014.TO,"Population ages 0-14, total"
3,SP.POP.1564.TO,"Population ages 15-64, total"
4,SP.POP.65UP.TO,"Population ages 65 and above, total"
5,SP.POP.TOTL,"Population, total"


##### Now that we have identified the various indicators of interest, we can continue with getting the metrics for these indicators

In [22]:
# Keep the columns that are relevant for further transformations
indicatorsData = indicators \
    .withColumnRenamed("Indicator Code", "indicator_code") \
    .withColumnRenamed("Country Code", "wb_country_code") \
    .drop("Indicator Name") \
    .drop("Country Name") \
    .drop("_c62")

In [23]:
#Keep only the indicators that we care about
targetIndicatorsData = indicatorsData.join(targetIndicators \
                                         , indicatorsData.indicator_code == targetIndicators.indicator_code) \
    .drop(targetIndicators.indicator_code)

In [24]:
showDF(targetIndicatorsData)

Unnamed: 0,wb_country_code,indicator_code,1960,1961,1962,1963,1964,1965,1966,1967,...,2009,2010,2011,2012,2013,2014,2015,2016,2017,indicator_name
0,ARB,IT.NET.BBND,,,,,,,,,...,5845325.0,7123797.0,8570787.0,10323590.0,11799990.0,13834070.0,16604450.0,18526990.0,,Fixed broadband subscriptions
1,ARB,IT.CEL.SETS,0.0,,,,,0.0,,,...,265270800.0,312247800.0,351958000.0,381641900.0,407704500.0,415029400.0,417195600.0,412315300.0,,Mobile cellular subscriptions
2,ARB,SP.POP.0014.TO,40064255.0,41518715.0,42987169.0,44458639.0,45910606.0,47331020.0,48923976.0,50456294.0,...,118075200.0,119921700.0,122165900.0,124673800.0,127344100.0,129987000.0,132465400.0,135007700.0,,"Population ages 0-14, total"
3,ARB,SP.POP.1564.TO,49179481.0,50158016.0,51208328.0,52348368.0,53610121.0,55012095.0,56343748.0,57859656.0,...,215639500.0,221771600.0,227485600.0,232951100.0,238215400.0,243405400.0,248614900.0,253566100.0,,"Population ages 15-64, total"
4,ARB,SP.POP.65UP.TO,3247196.0,3367766.0,3486797.0,3604069.0,3719175.0,3831873.0,3962869.0,4090982.0,...,14430470.0,14815540.0,15244330.0,15682050.0,16142610.0,16650640.0,17224680.0,17878930.0,,"Population ages 65 and above, total"
5,ARB,SP.POP.TOTL,92490932.0,95044497.0,97682294.0,100411076.0,103239902.0,106174988.0,109230593.0,112406932.0,...,348145100.0,356508900.0,364895900.0,373307000.0,381702100.0,390043000.0,398305000.0,406452700.0,,"Population, total"
6,CSS,IT.NET.BBND,,,,,,,,,...,434162.0,473138.0,498285.0,627251.0,674419.0,753368.0,897134.0,981886.0,,Fixed broadband subscriptions
7,CSS,IT.CEL.SETS,0.0,,,,,0.0,,,...,7646708.0,7935923.0,7534114.0,7322041.0,7738401.0,7952805.0,8158698.0,8487436.0,,Mobile cellular subscriptions
8,CSS,SP.POP.0014.TO,1766884.0,1816372.0,1864013.0,1908732.0,1949651.0,1986537.0,2027526.0,2065067.0,...,1828309.0,1804177.0,1783918.0,1761456.0,1738418.0,1717638.0,1700765.0,1687271.0,,"Population ages 0-14, total"
9,CSS,SP.POP.1564.TO,2151060.0,2173846.0,2198714.0,2225619.0,2253715.0,2281832.0,2303230.0,2324023.0,...,4477802.0,4533534.0,4585728.0,4638828.0,4691000.0,4739145.0,4781112.0,4816676.0,,"Population ages 15-64, total"


#### The output that we see currently isn't the most ideal from a modelling perspective.  
A well modeled dataset, should allow for data to be easily augmented.  e.g. instead of having a column for each year we would ideally prefer a row for each year to be able to add more rows in the future similar to the output of the following code block.

In [25]:
indicatorsSample = targetIndicatorsData \
    .select(col("wb_country_code")
            , col("indicator_code")
            , lit("1960").alias("year")
            , col("1960").alias("indicator_value")) \
    .filter("indicator_value >= 0.0")

showDF(indicatorsSample)

Unnamed: 0,wb_country_code,indicator_code,year,indicator_value
0,ARB,IT.CEL.SETS,1960,0.0
1,ARB,SP.POP.0014.TO,1960,40064255.0
2,ARB,SP.POP.1564.TO,1960,49179481.0
3,ARB,SP.POP.65UP.TO,1960,3247196.0
4,ARB,SP.POP.TOTL,1960,92490932.0
5,CSS,IT.CEL.SETS,1960,0.0
6,CSS,SP.POP.0014.TO,1960,1766884.0
7,CSS,SP.POP.1564.TO,1960,2151060.0
8,CSS,SP.POP.65UP.TO,1960,169157.0
9,CSS,SP.POP.TOTL,1960,4198307.0


##### Let us start by getting the list of years that we have metrics for

In [26]:
yearList = [x for x in targetIndicatorsData.schema.names \
             if x != 'wb_country_code' and x != 'indicator_code' and x != 'indicator_name'] 

print(yearList)

['1960', '1961', '1962', '1963', '1964', '1965', '1966', '1967', '1968', '1969', '1970', '1971', '1972', '1973', '1974', '1975', '1976', '1977', '1978', '1979', '1980', '1981', '1982', '1983', '1984', '1985', '1986', '1987', '1988', '1989', '1990', '1991', '1992', '1993', '1994', '1995', '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017']


In [27]:
#Cheat for creating a dataframe with no rows 
indicatorsDF = indicatorsSample.filter('1 = 0')

#Iterate through the list of years and store the rows in the DataFrame we created above
for indicatorYear in yearList:
    print("Processing indicators for " + indicatorYear)
    yearIndicatorDF = targetIndicatorsData \
        .select(col("wb_country_code")
                , col("indicator_code")
                , lit(indicatorYear).alias("year")
                , col(indicatorYear).alias("indicator_value")) \
        .filter("indicator_value >= 0")
    indicatorsDF = indicatorsDF.union(yearIndicatorDF)    

Processing indicators for 1960
Processing indicators for 1961
Processing indicators for 1962
Processing indicators for 1963
Processing indicators for 1964
Processing indicators for 1965
Processing indicators for 1966
Processing indicators for 1967
Processing indicators for 1968
Processing indicators for 1969
Processing indicators for 1970
Processing indicators for 1971
Processing indicators for 1972
Processing indicators for 1973
Processing indicators for 1974
Processing indicators for 1975
Processing indicators for 1976
Processing indicators for 1977
Processing indicators for 1978
Processing indicators for 1979
Processing indicators for 1980
Processing indicators for 1981
Processing indicators for 1982
Processing indicators for 1983
Processing indicators for 1984
Processing indicators for 1985
Processing indicators for 1986
Processing indicators for 1987
Processing indicators for 1988
Processing indicators for 1989
Processing indicators for 1990
Processing indicators for 1991
Processi

#### Let's cache the dataset to iterate over it

In [28]:
# You can iterate over a dataframe that is already computed by caching it onces and using it repeatedly
indicatorsDF.cache()

#Force the data to be cached
indicatorsDF.count()

70303

In [29]:
#Check the indicator counts per year
#showDF(indicatorsDF.groupBy('year').agg(count("*")).orderBy("year"), limitRows=100)

#### Getting yearly indicator totals

In [30]:
yearPivot = indicatorsDF.groupBy('year').pivot('indicator_code').sum('indicator_value') 

In [31]:
showDF(yearPivot.orderBy('year'))

Unnamed: 0,year,IT.CEL.SETS,IT.NET.BBND,SP.POP.0014.TO,SP.POP.1564.TO,SP.POP.65UP.TO,SP.POP.TOTL
0,1960,0.0,,11640430000.0,17646560000.0,1419169000.0,30720520000.0
1,1961,,,11860660000.0,17807070000.0,1451321000.0,31133690000.0
2,1962,,,12153580000.0,18033400000.0,1483492000.0,31685500000.0
3,1963,,,12498410000.0,18341100000.0,1517104000.0,32372040000.0
4,1964,,,12817220000.0,18681780000.0,1549219000.0,33064040000.0
5,1965,0.0,,13104550000.0,19071430000.0,1581083000.0,33773270000.0
6,1966,,,13437310000.0,19431870000.0,1634468000.0,34520250000.0
7,1967,,,13717670000.0,19843420000.0,1687816000.0,35265910000.0
8,1968,,,13969240000.0,20297060000.0,1741146000.0,36024840000.0
9,1969,,,14225160000.0,20782730000.0,1795141000.0,36820820000.0


In [32]:
yearPivot.printSchema()

root
 |-- year: string (nullable = false)
 |-- IT.CEL.SETS: double (nullable = true)
 |-- IT.NET.BBND: double (nullable = true)
 |-- SP.POP.0014.TO: double (nullable = true)
 |-- SP.POP.1564.TO: double (nullable = true)
 |-- SP.POP.65UP.TO: double (nullable = true)
 |-- SP.POP.TOTL: double (nullable = true)



In [33]:
yearPivotDF = yearPivot.orderBy('year') \
    .withColumnRenamed('IT.CEL.SETS', 'cellular_subscriptions') \
    .withColumnRenamed('IT.NET.BBND', 'broadband_subscriptions') \
    .withColumnRenamed('SP.POP.0014.TO', 'population_age_0_to_14') \
    .withColumnRenamed('SP.POP.1564.TO', 'population_age_15_64') \
    .withColumnRenamed('SP.POP.65UP.TO', 'population_age_65_and_above') \
    .withColumnRenamed('SP.POP.TOTL', 'population')

#### Data Quality Checkpoint 

In [34]:
# You can iterate over a dataframe that is already computed by caching it onces and using it repeatedly
yearPivotDF.cache()

#Forces the data to be cached
yearPivotDF.count()

57

In [35]:
yearPivotDF.filter('population_age_0_to_14 < 0').count()

0

In [36]:
yearPivotDF.filter('population_age_15_64 < 0').count()

0

In [37]:
yearPivotDF.filter('population_age_0_to_14 < 0').count()

0

In [38]:
yearPivotDF.filter('population_age_65_and_above < 0').count()

0

In [39]:
yearPivotDF.filter('population < 0').count()

0

In [40]:
yearPivotDF.filter('cellular_subscriptions < 0').count()

0

In [41]:
yearPivotDF.filter('broadband_subscriptions < 0').count()

0

In [42]:
yearPivotDF.filter('population_age_0_to_14 > population').count()

0

In [43]:
yearPivotDF.filter('population_age_15_64 > population').count()

0

In [44]:
yearPivotDF.filter('population_age_65_and_above > population').count()

0

In [45]:
yearPivotDF.filter('(population_age_0_to_14 + population_age_15_64 + population_age_65_and_above) > population').count()

0

In [46]:
#Write the yearly totals to a CSV File
yearPivotDF \
    .select(col('year')
            , col('population').cast(DecimalType(38, 2))
            , col('population_age_0_to_14').cast(DecimalType(38, 2))
            , col('population_age_15_64').cast(DecimalType(38, 2))
            , col('population_age_65_and_above').cast(DecimalType(38, 2))
            , col('broadband_subscriptions').cast(DecimalType(38, 2))
            , col('cellular_subscriptions').cast(DecimalType(38, 2))) \
    .coalesce(1) \
    .write.csv('../../output/YearlyStats', mode='overwrite', header='true')

#### Getting yearly regional totals

In [47]:
regionalIndicators = indicatorsDF.join(countryDimFinal
                                       , indicatorsDF.wb_country_code == countryDim.wb_country_code
                                       , "inner") \
    .select(countryDim.region
            , indicatorsDF.wb_country_code
            , indicatorsDF.year
            , indicatorsDF.indicator_code
            , indicatorsDF.indicator_value)

In [48]:
showDF(regionalIndicators)

Unnamed: 0,region,wb_country_code,year,indicator_code,indicator_value
0,,ARB,1960,IT.CEL.SETS,0.0
1,,ARB,1960,SP.POP.0014.TO,40064255.0
2,,ARB,1960,SP.POP.1564.TO,49179481.0
3,,ARB,1960,SP.POP.65UP.TO,3247196.0
4,,ARB,1960,SP.POP.TOTL,92490932.0
5,,CSS,1960,IT.CEL.SETS,0.0
6,,CSS,1960,SP.POP.0014.TO,1766884.0
7,,CSS,1960,SP.POP.1564.TO,2151060.0
8,,CSS,1960,SP.POP.65UP.TO,169157.0
9,,CSS,1960,SP.POP.TOTL,4198307.0


In [49]:
regionalPivot = regionalIndicators.groupBy('region', 'year').pivot('indicator_code').sum('indicator_value')

In [50]:
showDF(regionalPivot.orderBy('region', 'year'), limitRows=100)

Unnamed: 0,region,year,IT.CEL.SETS,IT.NET.BBND,SP.POP.0014.TO,SP.POP.1564.TO,SP.POP.65UP.TO,SP.POP.TOTL
0,,1960,0.0,,10529180000.0,15913790000.0,1270274000.0,27726200000.0
1,,1961,,,10728830000.0,16057890000.0,1299015000.0,28098950000.0
2,,1962,,,10995080000.0,16261950000.0,1327810000.0,28598410000.0
3,,1963,,,11308760000.0,16539850000.0,1357916000.0,29220450000.0
4,,1964,,,11598830000.0,16847780000.0,1386660000.0,29847550000.0
5,,1965,0.0,,11860240000.0,17200400000.0,1415175000.0,30490450000.0
6,,1966,,,12163110000.0,17526690000.0,1463169000.0,31167950000.0
7,,1967,,,12418380000.0,17899580000.0,1511139000.0,31844460000.0
8,,1968,,,12647600000.0,18311040000.0,1559115000.0,32533460000.0
9,,1969,,,12880630000.0,18751020000.0,1607638000.0,33255360000.0


In [51]:
#Write the regional-yearly totals to a CSV File
regionalPivot.filter('region is not null') \
    .orderBy('region','year') \
    .withColumnRenamed('IT.CEL.SETS', 'cellular_subscriptions') \
    .withColumnRenamed('IT.NET.BBND', 'broadband_subscriptions') \
    .withColumnRenamed('SP.POP.0014.TO', 'population_age_0_to_14') \
    .withColumnRenamed('SP.POP.1564.TO', 'population_age_15_64') \
    .withColumnRenamed('SP.POP.65UP.TO', 'population_age_65_and_above') \
    .withColumnRenamed('SP.POP.TOTL', 'population') \
    .select(col('region')
            , col('year')
            , col('population').cast(DecimalType(38, 2))
            , col('population_age_0_to_14').cast(DecimalType(38, 2))
            , col('population_age_15_64').cast(DecimalType(38, 2))
            , col('population_age_65_and_above').cast(DecimalType(38, 2))
            , col('broadband_subscriptions').cast(DecimalType(38, 2))
            , col('cellular_subscriptions').cast(DecimalType(38, 2))) \
    .coalesce(1) \
    .write.csv('../../output/RegionalStats', mode='overwrite', header='true')

In [52]:
countryIndicators = indicatorsDF.join(countryDimFinal
                                       , indicatorsDF.wb_country_code == countryDim.wb_country_code
                                       , "inner") \
    .select(indicatorsDF.wb_country_code
            , countryDim.country_iso_code
            , countryDim.country_name
            , indicatorsDF.year
            , indicatorsDF.indicator_code
            , indicatorsDF.indicator_value)

showDF(countryIndicators)

Unnamed: 0,wb_country_code,country_iso_code,country_name,year,indicator_code,indicator_value
0,ARB,1A,Arab World,1960,IT.CEL.SETS,0.0
1,ARB,1A,Arab World,1960,SP.POP.0014.TO,40064255.0
2,ARB,1A,Arab World,1960,SP.POP.1564.TO,49179481.0
3,ARB,1A,Arab World,1960,SP.POP.65UP.TO,3247196.0
4,ARB,1A,Arab World,1960,SP.POP.TOTL,92490932.0
5,CSS,S3,Caribbean small states,1960,IT.CEL.SETS,0.0
6,CSS,S3,Caribbean small states,1960,SP.POP.0014.TO,1766884.0
7,CSS,S3,Caribbean small states,1960,SP.POP.1564.TO,2151060.0
8,CSS,S3,Caribbean small states,1960,SP.POP.65UP.TO,169157.0
9,CSS,S3,Caribbean small states,1960,SP.POP.TOTL,4198307.0


In [53]:
countryPivot = countryIndicators.groupBy('country_iso_code', 'country_name', 'year') \
    .pivot('indicator_code').sum('indicator_value')

In [54]:
showDF(countryPivot.orderBy('country_iso_code', 'country_name', 'year'), limitRows=100)

Unnamed: 0,country_iso_code,country_name,year,IT.CEL.SETS,IT.NET.BBND,SP.POP.0014.TO,SP.POP.1564.TO,SP.POP.65UP.TO,SP.POP.TOTL
0,1A,Arab World,1960,0.0,,40064260.0,49179480.0,3247196.0,92490930.0
1,1A,Arab World,1961,,,41518720.0,50158020.0,3367766.0,95044500.0
2,1A,Arab World,1962,,,42987170.0,51208330.0,3486797.0,97682290.0
3,1A,Arab World,1963,,,44458640.0,52348370.0,3604069.0,100411100.0
4,1A,Arab World,1964,,,45910610.0,53610120.0,3719175.0,103239900.0
5,1A,Arab World,1965,0.0,,47331020.0,55012100.0,3831873.0,106175000.0
6,1A,Arab World,1966,,,48923980.0,56343750.0,3962869.0,109230600.0
7,1A,Arab World,1967,,,50456290.0,57859660.0,4090982.0,112406900.0
8,1A,Arab World,1968,,,51945970.0,59517130.0,4217070.0,115680200.0
9,1A,Arab World,1969,,,53411640.0,61262880.0,4342025.0,119016500.0


In [55]:
#Write the regional-yearly totals to a CSV File
countryPivot.filter('country_iso_code is not null') \
    .orderBy('country_iso_code','country_name', 'year') \
    .withColumnRenamed('IT.CEL.SETS', 'cellular_subscriptions') \
    .withColumnRenamed('IT.NET.BBND', 'broadband_subscriptions') \
    .withColumnRenamed('SP.POP.0014.TO', 'population_age_0_to_14') \
    .withColumnRenamed('SP.POP.1564.TO', 'population_age_15_64') \
    .withColumnRenamed('SP.POP.65UP.TO', 'population_age_65_and_above') \
    .withColumnRenamed('SP.POP.TOTL', 'population') \
    .select(col('country_iso_code')
            , col('country_name')
            , col('year')
            , col('population').cast(DecimalType(38, 2))
            , col('population_age_0_to_14').cast(DecimalType(38, 2))
            , col('population_age_15_64').cast(DecimalType(38, 2))
            , col('population_age_65_and_above').cast(DecimalType(38, 2))
            , col('broadband_subscriptions').cast(DecimalType(38, 2))
            , col('cellular_subscriptions').cast(DecimalType(38, 2))) \
    .coalesce(1) \
    .write.csv('../../output/CountryStats', mode='overwrite', header='true')

In [56]:
recentIndicators = indicatorsData \
    .select("wb_country_code", "indicator_code", "2016") \
    .filter(col('indicator_code').isin('IC.REG.COST.PC.ZS', 'IC.REG.DURS', 'IC.REG.PROC', \
        'NY.GNP.ATLS.CD', 'NY.GDP.MKTP.KD', 'NY.GDP.PCAP.KD', 'IQ.CPA.BREG.XQ', 'IC.BUS.EASE.XQ')) \
    .withColumnRenamed("2016", "indicator_value") \
    .withColumn("indicator_value", col("indicator_value").cast(DecimalType(38, 2)))

showDF(recentIndicators)

Unnamed: 0,wb_country_code,indicator_code,indicator_value
0,ARB,IC.REG.COST.PC.ZS,36.47
1,ARB,IQ.CPA.BREG.XQ,2.8
2,ARB,IC.BUS.EASE.XQ,
3,ARB,NY.GDP.MKTP.KD,2616760000000.0
4,ARB,NY.GDP.PCAP.KD,6438.05
5,ARB,NY.GNP.ATLS.CD,2676040000000.0
6,ARB,IC.REG.PROC,8.32
7,ARB,IC.REG.DURS,23.17
8,CSS,IC.REG.COST.PC.ZS,19.9
9,CSS,IQ.CPA.BREG.XQ,3.9


In [None]:
# Start by matching up indicators that might have descriptions matching what Kat is looking for. e.g.
# showDF(seriesDim.filter("indicator_name like '%GNI%'").orderBy("indicator_code"), limitRows = 500)

In [57]:
businessIndexIndicators = indicatorsData \
    .select("wb_country_code", "indicator_code", "2017") \
    .filter(col('indicator_code').isin('IC.BUS.EASE.XQ')) \
    .withColumnRenamed("2017", "indicator_value") \
    .withColumn("indicator_value", col("indicator_value").cast(DecimalType(38, 2)))

In [58]:
allIndicators = recentIndicators.union(businessIndexIndicators)

In [59]:
countryBusinessStartupPivot = allIndicators.join(countryDimFinal
                                       , recentIndicators.wb_country_code == countryDim.wb_country_code
                                       , "inner") \
    .select(countryDimFinal.country_iso_code
            , countryDimFinal.country_name
            , recentIndicators.indicator_code
            , recentIndicators.indicator_value) \
    .groupBy('country_iso_code', 'country_name').pivot('indicator_code').sum('indicator_value') \
    .withColumnRenamed('country_iso_code', 'Country ISO Code') \
    .withColumnRenamed('country_name', 'Country Name') \
    .withColumnRenamed('NY.GNP.ATLS.CD', 'GNI') \
    .withColumnRenamed('IC.REG.DURS', 'Startup Time') \
    .withColumnRenamed('IC.REG.PROC', 'Startup Procedures') \
    .withColumnRenamed('IC.REG.COST.PC.ZS', 'Startup Cost Pct of GNI') \
    .withColumnRenamed('NY.GDP.MKTP.KD', 'GDP') \
    .withColumnRenamed('NY.GDP.PCAP.KD', 'GDP Per Capita') \
    .withColumnRenamed('IQ.CPA.BREG.XQ', 'Business Regulation') \
    .withColumnRenamed('IC.BUS.EASE.XQ', 'Ease of business') \
    .withColumn('Startup Cost', (col('GNI') * col('Startup Cost Pct of GNI') / lit(100.0)).cast(DecimalType(38, 2))) \
    .filter(col('GNI') > 0) \
    .filter(col('Startup Time').isNotNull()) \
    .filter(col('Startup Procedures').isNotNull()) \
    .filter(col('Startup Cost').isNotNull()) 

In [60]:
showDF(countryBusinessStartupPivot, limitRows = 500)

Unnamed: 0,Country ISO Code,Country Name,Ease of business,Startup Cost Pct of GNI,Startup Time,Startup Procedures,Business Regulation,GDP,GDP Per Capita,GNI,Startup Cost
0,BJ,Benin,151.0,3.8,8.5,6.0,3.0,9103831278.0,837.34,8938879520.0,339677421.76
1,XC,Euro area,,3.83,9.55,5.26,,13377100000000.0,39256.27,12312600000000.0,471572580000.0
2,KZ,Kazakhstan,36.0,0.3,9.0,5.0,,188309000000.0,10582.5,156812000000.0,470436000.0
3,JM,Jamaica,70.0,4.3,10.0,2.0,,13801803130.0,4790.04,13349534142.0,574029968.11
4,NO,Norway,8.0,0.9,4.0,4.0,,472766000000.0,90288.82,429276000000.0,3863484000.0
5,IR,Iran,124.0,1.2,15.5,9.0,,540581000000.0,6733.91,438869000000.0,5266428000.0
6,AG,Antigua and Barbuda,107.0,9.4,22.0,9.0,,1344373698.0,13315.51,1369440563.0,128727412.92
7,CG,Congo,179.0,61.2,50.0,11.0,2.5,14342385958.0,2798.07,8770006555.0,5367244011.66
8,AR,Argentina,117.0,10.8,24.0,13.0,,445227000000.0,10154.0,524974000000.0,56697192000.0
9,HT,Haiti,181.0,219.3,97.0,12.0,2.0,7910618370.0,729.27,8488280524.0,18614799189.13


In [61]:
countryBusinessStartupPivot \
    .select("Country ISO Code", "Country Name", "GDP", "GDP Per Capita", "GNI", \
            "Startup Cost", "Startup Cost Pct of GNI", "Startup Time", "Startup Procedures", \
            "Business Regulation", "Ease of business") \
    .coalesce(1) \
    .write.csv('../../output/BusinessStartupData', mode='overwrite', header='true')