# Joining and Pivoting DataFrames

In [0]:
from pyspark.sql.functions import *

In [0]:
# DBFS locations of the two sample input files read by the cells below.
path_countries = 'dbfs:/FileStore/sample_data/countries.txt'
path_regions = 'dbfs:/FileStore/sample_data/country_regions.csv'

In [0]:
# Read the countries file as CSV: tab-separated, first row holds the column
# names, and Spark infers the column types from the data.
countries_reader = spark.read.format('csv').options(
    header='true',
    inferschema='true',
    sep='\t',
)
countries_df = countries_reader.load(path_countries)

In [0]:
# Read the regions lookup file as CSV: comma-separated, first row holds the
# column names, and Spark infers the column types from the data.
# `path_regions` is already defined in the paths cell near the top of the
# notebook, so it is not redefined here.
regions_df = (
    spark.read
    .format('csv')
    .option('header', 'true')
    .option('inferschema', 'true')
    .option('sep', ',')
    .load(path_regions)
)

In [0]:
# Preview at most ten rows of the regions lookup.
regions_df.limit(10).display()

ID,NAME
10,America
20,Europe
30,Asia
40,Oceania
50,Africa



When pivoting a DataFrame, you apply the `pivot` method after the `groupBy` method but before the aggregation.

In [0]:
# Total population for every (region_id, sub_region_id) pair, largest first.
(
    countries_df
    .groupBy('region_id', 'sub_region_id')
    .agg(sum('population').alias('population'))
    .orderBy(desc('population'))
    .display()
)

region_id,sub_region_id,population
30,30.0,1918211381
30,60.0,1672611098
50,160.0,1066287151
30,100.0,662011806
10,10.0,648120719
10,80.0,366600964
20,140.0,293444913
30,170.0,275324813
50,40.0,241780768
20,150.0,195522410


In [0]:
# Pivot on region_id: every distinct region_id becomes its own column holding
# the summed population. Because region_id is also a grouping key here, each
# row is populated in only one of the pivoted columns, so region_id can be
# dropped from the groupBy method.
(
    countries_df
    .groupBy('region_id', 'sub_region_id')
    .pivot('region_id')
    .agg(sum('population'))
    .display()
)

region_id,sub_region_id,10,20,30,40,50
10,80.0,366600964.0,,,,
30,100.0,,,662011806.0,,
50,160.0,,,,,1066287151.0
20,150.0,,195522410.0,,,
30,170.0,,,275324813.0,,
40,,,,,1106.0,
50,40.0,,,,,241780768.0
40,130.0,,,,10918517.0,
20,70.0,,152446923.0,,,
40,50.0,,,,543786.0,


In [0]:
# Pivots are good for creating matrices: here the rows are sub_region_id, the
# columns are the distinct region_id values, and each cell is the summed
# population for that combination.
(
    countries_df
    .groupBy('sub_region_id')
    .pivot('region_id')
    .agg(sum('population'))
    .display()
)

sub_region_id,10,20,30,40,50
140.0,,293444913.0,,,
,,,,1106.0,
40.0,,,,,241780768.0
20.0,,,,679839.0,
120.0,,105791728.0,,,
100.0,,,662011806.0,,
130.0,,,,10918517.0,
10.0,648120719.0,,,,
50.0,,,,543786.0,
80.0,366600964.0,,,,


In [0]:
# Attach each country's region name with a left join on the region id, keeping
# only the three columns used by the cells that follow. Columns are selected
# through their parent DataFrames because both inputs have a `name` column.
# The chain is wrapped in parentheses rather than using backslash
# continuations, so it no longer ends in a dangling `\`.
population_df = (
    countries_df
    .join(regions_df, countries_df['region_id'] == regions_df['id'], how='left')
    .select(
        countries_df['name'].alias('country_name'),
        regions_df['name'].alias('region_name'),
        countries_df['population'],
    )
)

population_df.display()

country_name,region_name,population
Afghanistan,Asia,38041754
Albania,Europe,2880917
Algeria,Africa,43053054
American Samoa,Oceania,55312
Andorra,Europe,77142
Angola,Africa,31825295
Anguilla,America,14869
Antarctica,Oceania,1106
Antigua and Barbuda,America,97118
Argentina,America,44780677


In [0]:
# Total population per region name, largest first.
(
    population_df
    .groupBy('region_name')
    .agg(sum('population').alias('population'))
    .orderBy(desc('population'))
    .display()
)


region_name,population
Asia,4601371198
Africa,1308067919
America,1014721683
Europe,747205974
Oceania,42133644


In [0]:
# One row per country and one column per region name; each cell holds the
# summed population for that country/region combination. Sorted by country.
(
    population_df
    .groupBy('country_name')
    .pivot('region_name')
    .sum('population')
    .sort('country_name')
    .display()
)


country_name,Africa,America,Asia,Europe,Oceania
Afghanistan,,,38041754.0,,
Albania,,,,2880917.0,
Algeria,43053054.0,,,,
American Samoa,,,,,55312.0
Andorra,,,,77142.0,
Angola,31825295.0,,,,
Anguilla,,14869.0,,,
Antarctica,,,,,1106.0
Antigua and Barbuda,,97118.0,,,
Argentina,,44780677.0,,,


In [0]:
# Unpivot
#
# Rebuild the wide "one column per region" frame, then fold the region columns
# back into long (country_name, region_name, population) rows with the SQL
# stack() generator: stack(n, 'label1', col1, 'label2', col2, ...) emits n rows
# per input row, one per (label, column) pair.
region_names = ['Africa', 'America', 'Asia', 'Europe', 'Oceania']

unpivot_expr = (
    "stack(5, "
    "'Africa', Africa, 'America', America, 'Asia', Asia, "
    "'Europe', Europe, 'Oceania', Oceania"
    ") as (region_name, population)"
)

(
    population_df
    .groupBy('country_name')
    # Explicit pivot values pin the column names that stack() refers to.
    .pivot('region_name', region_names)
    .sum('population')
    .select('country_name', expr(unpivot_expr))
    # The pivot leaves a null in every region a country does not belong to;
    # drop those rows so only real (country, region) pairs remain.
    .where(col('population').isNotNull())
    .orderBy('country_name')
    .display()
)

In [0]:
# Total population per (region_name, country_name) pair, largest first.
(
    population_df
    .groupBy('region_name', 'country_name')
    .agg(sum('population').alias('population'))
    .orderBy(desc('population'))
    .display()
)


region_name,country_name,population
Asia,China,1433783686
Asia,India,1366417754
America,United States of America,329064917
Asia,Indonesia,270625568
Asia,Pakistan,216565318
America,Brazil,211049527
Africa,Nigeria,200963599
Asia,Bangladesh,163046161
Europe,Russian Federation,145872256
America,Mexico,127575529


In [0]:
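# Use union to append the rows of one DataFrame to another. Both DataFrames must have the same number of columns; union matches columns by position, not by name (use unionByName to match by name).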

In [0]:
# Row count before the union, as a baseline for the comparison further down.
countries_df.count()

249

In [0]:
# Append countries_df to itself; union keeps duplicate rows, so the row count doubles.
# NOTE(review): this reassigns countries_df, so the cell is not idempotent --
# every re-run doubles the row count again.
countries_df = countries_df.union(countries_df)

In [0]:
# Row count after the union, for comparison with the count taken before it.
countries_df.count()

498

In [0]:
# Print the column names, types and nullability of countries_df.
countries_df.printSchema()

root
 |-- COUNTRY_ID: integer (nullable = true)
 |-- NAME: string (nullable = true)
 |-- NATIONALITY: string (nullable = true)
 |-- COUNTRY_CODE: string (nullable = true)
 |-- ISO_ALPHA2: string (nullable = true)
 |-- CAPITAL: string (nullable = true)
 |-- POPULATION: integer (nullable = true)
 |-- AREA_KM2: double (nullable = true)
 |-- REGION_ID: integer (nullable = true)
 |-- SUB_REGION_ID: integer (nullable = true)
 |-- INTERMEDIATE_REGION_ID: integer (nullable = true)
 |-- ORGANIZATION_REGION_ID: integer (nullable = true)

