In [1]:
from pyspark.sql import SparkSession,Window
from pyspark.sql.types import *
from pyspark.sql.functions import *
from datetime import datetime

# Single Spark session shared by every cell below; getOrCreate() hands back
# an existing session instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("World Bank Indicators")
    .getOrCreate()
)

In [2]:
# Path of the raw CSV, relative to the notebook's working directory.
dataFile = "./data/World_Bank_Indicators.csv"

# Column names, in schema order. Every column is declared as a nullable
# string here; the Date and numeric columns are converted in a later cell.
columnNames = [
    "Country",
    "Date",
    "Railways",
    "Passenger_cars",
    "Mobile_Users",
    "Internet_Users",
    "Mortality",
    "Health_exp_per_capita",
    "Health_exp",
    "Total_Population",
    "Urban_Population",
    "Birth_Rate",
    "Female_Life_Exp",
    "Male Life Exp",
    "Life_Exp",
    "Age_0_14",
    "Age_15_64",
    "Age_65+",
    "GDP",
    "GDP_per_capita",
]
schemaStruct = StructType(
    [StructField(columnName, StringType(), True) for columnName in columnNames]
)

data = spark.read.csv(dataFile, schema=schemaStruct)

In [3]:
# Every column is still a string at this point, so nulls are replaced with the
# text "0". Once the columns are cast to long, a missing measurement is
# therefore indistinguishable from a real zero in min()/max() aggregations.
data1 = data.fillna("0")

In [4]:
def udf_to_long(col_str):
    """Build a Column expression that removes commas and casts the text to long.

    Parameters
    ----------
    col_str : str or Column
        Column name (or Column) holding number-like text such as "1,234".

    Returns
    -------
    Column
        The same values with every ',' removed, cast to 'long'.
    """
    without_commas = regexp_replace(col_str, ',', '')
    return without_commas.cast('long')


# NOTE(review): udf_to_long builds a Column expression on the driver; it is not
# a row-level Python function. Registering it as a SQL UDF (declared here as
# returning StringType) looks unintended and is never used by the cells below.
# Confirm whether this registration is needed.
spark.udf.register('udf_to_long', udf_to_long, StringType())

<function __main__.udf_to_long(col_str)>

In [5]:
def udf_to_date(date_str):
    """Build a Column expression that parses day/month/year text into a date.

    Parameters
    ----------
    date_str : str or Column
        Column name (or Column) holding text such as "7/1/2000".

    Returns
    -------
    Column
        The parsed value as a date column.
    """
    # Pattern letters are case-sensitive: 'M' is month-of-year, whereas 'm' is
    # minute-of-hour. With 'm' the month is never parsed and every date falls
    # in January.
    # NOTE(review): this keeps the day-first reading; if the file is actually
    # month-first, the pattern should be "M/d/yyyy". Confirm against the data.
    return to_date(date_str, "d/M/yyyy")

In [6]:
# Scratch check of the helper. A bare Python string is treated as a column
# *name*, so this only builds an unevaluated expression (note the backticks in
# the repr below); wrap the value in lit() to parse a literal date string.
udf_to_date('7/1/2000')

Column<b"to_date(`7/1/2000`, 'd/m/yyyy')">

In [7]:
# NOTE(review): udf_to_date builds a Column expression on the driver; it is not
# a row-level Python function. Registering it as a SQL UDF (declared here as
# returning StringType) looks unintended and is never used by the cells below.
# Confirm whether this registration is needed.
spark.udf.register('udf_to_date',udf_to_date,StringType())

<function __main__.udf_to_date(date_str)>

In [8]:
# Spot-check: raw Date text next to its parsed value.
(data1
 .select("Date", udf_to_date("Date"))
 .show()
)

+--------+---------------------------+
|    Date|to_date(`Date`, 'd/m/yyyy')|
+--------+---------------------------+
|7/1/2000|                 2000-01-07|
|7/1/2001|                 2001-01-07|
|7/1/2002|                 2002-01-07|
|7/1/2003|                 2003-01-07|
|7/1/2004|                 2004-01-07|
|7/1/2005|                 2005-01-07|
|7/1/2006|                 2006-01-07|
|7/1/2007|                 2007-01-07|
|7/1/2008|                 2008-01-07|
|7/1/2009|                 2009-01-07|
|7/1/2010|                 2010-01-07|
|7/1/2000|                 2000-01-07|
|7/1/2001|                 2001-01-07|
|7/1/2002|                 2002-01-07|
|7/1/2003|                 2003-01-07|
|7/1/2004|                 2004-01-07|
|7/1/2005|                 2005-01-07|
|7/1/2006|                 2006-01-07|
|7/1/2007|                 2007-01-07|
|7/1/2008|                 2008-01-07|
+--------+---------------------------+
only showing top 20 rows



In [9]:
# Columns holding numeric text, listed in the order they are converted.
numericColumns = [
    'Railways',
    'Passenger_cars',
    'Mobile_Users',
    'Internet_Users',
    'Mortality',
    'Health_exp_per_capita',
    'Health_exp',
    'Urban_Population',
    'Total_Population',
    'Birth_Rate',
    'Female_Life_Exp',
    'Male Life Exp',
    'Life_Exp',
    'Age_0_14',
    'Age_15_64',
    'Age_65+',
    'GDP',
    'GDP_per_capita',
]

# Parse Date first, then overwrite each numeric column with its long value.
# withColumn on an existing name replaces that column, so the column order of
# data1 is preserved.
newdata = data1.withColumn('Date', udf_to_date("Date"))
for columnName in numericColumns:
    newdata = newdata.withColumn(columnName, udf_to_long(columnName))
           

In [10]:
# Peek at a single converted row.
newdata.show(1)

+-----------+----------+--------+--------------+------------+--------------+---------+---------------------+----------+----------------+----------------+----------+---------------+-------------+--------+--------+---------+-------+---+--------------+
|    Country|      Date|Railways|Passenger_cars|Mobile_Users|Internet_Users|Mortality|Health_exp_per_capita|Health_exp|Total_Population|Urban_Population|Birth_Rate|Female_Life_Exp|Male Life Exp|Life_Exp|Age_0_14|Age_15_64|Age_65+|GDP|GDP_per_capita|
+-----------+----------+--------+--------------+------------+--------------+---------+---------------------+----------+----------------+----------------+----------+---------------+-------------+--------+--------+---------+-------+---+--------------+
|Afghanistan|2000-01-07|       0|             0|           0|             0|      151|                   11|         8|        25950816|         5527524|        51|             45|           45|      45|      48|       50|      2|  0|             0|


## Q1. Highest urban population - Country having the highest urban population

In [12]:
# Largest urban population recorded for each country across all rows.
urbanPeakByCountry = (
    newdata
    .select('Country', 'Urban_Population')
    .groupBy('Country')
    .agg(max('Urban_Population').alias('Total_Urban_Population'))
)

# Only the top-ranked country is of interest.
urbanPeakByCountry.orderBy('Total_Urban_Population', ascending=False).show(1)

+-------+----------------------+
|Country|Total_Urban_Population|
+-------+----------------------+
|  China|             600683425|
+-------+----------------------+
only showing top 1 row



## Q2. Most populous countries - List of countries in descending order of their population

In [13]:
# Largest total population recorded for each country across all rows.
populationPeakByCountry = (
    newdata
    .select('Country', 'Total_Population')
    .groupBy('Country')
    .agg(max('Total_Population').alias('Total_Population'))
)

# Five most populous countries, untruncated.
populationPeakByCountry.orderBy('Total_Population', ascending=False).show(5, False)

+-------------+----------------+
|Country      |Total_Population|
+-------------+----------------+
|China        |1337825000      |
|India        |1224614327      |
|United States|309349689       |
|Indonesia    |239870937       |
|Brazil       |194946470       |
+-------------+----------------+
only showing top 5 rows



## Q3. Highest population growth - Country with highest % population growth in past decade

In [14]:
# Attach each country's highest and lowest recorded population to every one of
# its rows, using a window partitioned by country.
byCountry = Window.partitionBy("Country")
peakPopulation = max("Total_Population").over(byCountry)
lowestPopulation = min("Total_Population").over(byCountry)

modDf = (
    newdata
    .withColumn("max_pop", peakPopulation)
    .withColumn("min_pop", lowestPopulation)
)

In [15]:
# Percentage spread between a country's lowest and highest population.
# NOTE(review): this is a max/min spread, not the change from the first to the
# last year, and a min_pop of 0 (nulls were filled with "0") divides by zero.
# Confirm this is the intended definition of growth.
populationGrowthPct = expr("((max_pop-min_pop)/min_pop)*100")
modDf = modDf.withColumn('Population_Growth', populationGrowthPct)

In [16]:
# Population_Growth is constant within a country, so max() just collapses the
# rows to one per country before ranking.
growthByCountry = modDf.groupBy("Country").agg(max("Population_Growth"))
growthByCountry.orderBy("max(Population_Growth)", ascending=False).show(1)

+-------+----------------------+
|Country|max(Population_Growth)|
+-------+----------------------+
|  Qatar|     197.6177623752659|
+-------+----------------------+
only showing top 1 row



## Q4. Highest GDP growth - List of Countries with highest GDP growth from 2009 to 2010 in descending order

In [17]:
# Inspect the 2009 and 2010 rows for one country before computing GDP growth.
(modDf
 .filter(expr("year(Date) == 2009 or year(Date) == 2010"))
 .filter("Country = 'China'")
 .show()
)

+-------+----------+--------+--------------+------------+--------------+---------+---------------------+----------+----------------+----------------+----------+---------------+-------------+--------+--------+---------+-------+-------------+--------------+----------+----------+-----------------+
|Country|      Date|Railways|Passenger_cars|Mobile_Users|Internet_Users|Mortality|Health_exp_per_capita|Health_exp|Total_Population|Urban_Population|Birth_Rate|Female_Life_Exp|Male Life Exp|Life_Exp|Age_0_14|Age_15_64|Age_65+|          GDP|GDP_per_capita|   max_pop|   min_pop|Population_Growth|
+-------+----------+--------+--------------+------------+--------------+---------+---------------------+----------+----------------+----------------+----------+---------------+-------------+--------+--------+---------+-------+-------------+--------------+----------+----------+-----------------+
|  China|2009-01-07|  787890|            34|   747214000|            29|       20|                  191|        

In [18]:
# Keep only the 2009 and 2010 rows, then attach each country's highest and
# lowest GDP over those rows to every one of its rows.
inYears2009To2010 = expr("year(Date) = 2009 or year(Date) = 2010")
byCountry = Window.partitionBy("Country")

modnewDF = (
    modDf
    .filter(inYears2009To2010)
    .withColumn("max_GDP", max("GDP").over(byCountry))
    .withColumn("min_GDP", min("GDP").over(byCountry))
)

In [19]:
# GDP growth is the signed change from 2009 to 2010. A max-minus-min
# difference is never negative, so a country whose GDP fell would be ranked as
# if it had grown by that amount.
byCountry = Window.partitionBy("Country")

# max() ignores nulls, so each expression picks out that year's GDP for the
# country; it is null when the country has no row for that year.
gdp2010 = max(expr("CASE WHEN year(Date) = 2010 THEN GDP END")).over(byCountry)
gdp2009 = max(expr("CASE WHEN year(Date) = 2009 THEN GDP END")).over(byCountry)

modnewDF = modnewDF.withColumn("GDP_Growth", gdp2010 - gdp2009)

In [20]:
# GDP_Growth is constant within a country, so distinct() leaves one row per
# country; rank them from largest to smallest.
gdpGrowthByCountry = modnewDF.select("Country", "GDP_Growth").distinct()
gdpGrowthByCountry.orderBy("GDP_Growth", ascending=False).show(truncate=False)

+------------------+------------+
|Country           |GDP_Growth  |
+------------------+------------+
|China             |939273064064|
|United States     |583500000000|
|Brazil            |521373825603|
|Japan             |453274928126|
|Iran, Islamic Rep.|331014973186|
|India             |323266546576|
|Russian Federation|264867473958|
|Canada            |239462442466|
|Australia         |207425653765|
|Korea, Rep.       |180829700030|
|Indonesia         |168446881442|
|Mexico            |153516134331|
|Turkey            |116590470733|
|United Kingdom    |80512352076 |
|South Africa      |80510778707 |
|Saudi Arabia      |74098666667 |
|Spain             |72293506497 |
|France            |70657737599 |
|Italy             |67508282591 |
|Venezuela, RB     |64388531931 |
+------------------+------------+
only showing top 20 rows



## Q5. Internet usage growth - Country where Internet usage has grown the most in the past decade

In [21]:
# Attach each country's highest and lowest Internet_Users value to every one of
# its rows, using a window partitioned by country.
byCountry = Window.partitionBy("Country")
peakInternetUsers = max("Internet_Users").over(byCountry)
lowestInternetUsers = min("Internet_Users").over(byCountry)

modnewDF1 = (
    modDf
    .withColumn("max_Internet_Users", peakInternetUsers)
    .withColumn("min_Internet_Users", lowestInternetUsers)
)

In [22]:
# Spread between a country's lowest and highest Internet_Users value.
# NOTE(review): this is a max/min spread, not the change from the first to the
# last year. Confirm this is the intended definition of growth.
internetUsersSpread = expr("max_Internet_Users-min_Internet_Users")
modnewDF1 = modnewDF1.withColumn("Internet_Users_Growth", internetUsersSpread)

In [23]:
# Internet_Users_Growth is constant within a country, so distinct() leaves one
# row per country; show only the top one.
internetGrowthByCountry = modnewDF1.select("Country", "Internet_Users_Growth").distinct()
internetGrowthByCountry.orderBy("Internet_Users_Growth", ascending=False).show(1, truncate=False)

+-------+---------------------+
|Country|Internet_Users_Growth|
+-------+---------------------+
|Qatar  |82                   |
+-------+---------------------+
only showing top 1 row



## Q6. Youngest Country - Yearly distribution of youngest Countries

In [26]:
# Highest Age_15_64 value recorded for each country, top country only.
# NOTE(review): the heading asks for the youngest countries per year, but this
# ranks by the 15-64 column over all years; Age_0_14 grouped by year may be
# what was intended. Confirm.
workingAgePeakByCountry = (
    modnewDF1
    .select("Country", "Age_15_64")
    .groupBy("Country")
    .agg(max("Age_15_64"))
)
workingAgePeakByCountry.orderBy("max(Age_15_64)", ascending=False).show(1, truncate=False)

+-------+--------------+
|Country|max(Age_15_64)|
+-------+--------------+
|Qatar  |85            |
+-------+--------------+
only showing top 1 row

