# Joining and Pivoting DataFrames

In [0]:
from pyspark.sql.functions import *

In [0]:
# Locations of the sample inputs on DBFS.
# The countries file is tab-separated (despite the .txt extension);
# the regions file is a regular comma-separated CSV.
path_countries = 'dbfs:/FileStore/sample_data/countries.txt'
path_regions = "dbfs:/FileStore/sample_data/country_regions.csv"

In [0]:
# Load the tab-separated countries file: the first row supplies the column
# names and Spark infers each column's type from the data.
countries_df = (
    spark.read
    .format('csv')
    .options(header='true', inferschema='true', sep='\t')
    .load(path_countries)
)

In [0]:
# Load the comma-separated regions lookup (columns ID and NAME).
# `path_regions` is defined once in the paths cell near the top of the
# notebook; it is deliberately not redefined here so there is a single
# source of truth for the file location.
regions_df = (
    spark.read
    .format('csv')
    .option('header', 'true')
    .option('inferschema', 'true')
    .option('sep', ',')
    .load(path_regions)
)

In [0]:
# Preview the regions lookup table (at most ten rows).
regions_df.limit(10).display()

ID,NAME
10,America
20,Europe
30,Asia
40,Oceania
50,Africa



When pivoting a DataFrame, you apply the `pivot` method after the `groupBy` method but before the aggregation.

In [0]:
# Total population for every (region, sub-region) pair, ten largest first.
# `sum` and `col` come from the star import of pyspark.sql.functions, so
# `sum` here is Spark's aggregate function rather than Python's builtin.
(
    countries_df
    .groupBy('region_id', 'sub_region_id')
    .agg(sum(col('population')).alias('population'))
    .orderBy('population', ascending=False)
    .limit(10)
    .display()
)

region_id,sub_region_id,population
30,30,3836422762
30,60,3345222196
50,160,2132574302
30,100,1324023612
10,10,1296241438
10,80,733201928
20,140,586889826
30,170,550649626
50,40,483561536
20,150,391044820


In [0]:
# Pivots are good for creating matrices: here each row is a sub-region,
# each column is a distinct region_id, and each cell holds the summed
# population for that pair (null where the pair has no rows).
(
    countries_df
    .groupBy('sub_region_id')
    .pivot('region_id')
    .agg(sum(col('population')))
    .limit(10)
    .display()
)

sub_region_id,10,20,30,40,50
140.0,,586889826.0,,,
,,,,2212.0,
40.0,,,,,483561536.0
20.0,,,,1359678.0,
120.0,,211583456.0,,,
100.0,,,1324023612.0,,
130.0,,,,21837034.0,
10.0,1296241438.0,,,,
50.0,,,,1087572.0,
80.0,733201928.0,,,,


In [0]:
# Attach the region name to every country. The left join keeps every
# country row; a country whose region_id matches no region gets a null
# region_name. Both inputs have a `name` column, so each one is referenced
# through its parent DataFrame and aliased to an unambiguous name.
population_df = (
    countries_df
    .join(regions_df, on=countries_df['region_id'] == regions_df['id'], how='left')
    .select(
        countries_df['name'].alias('country_name'),
        regions_df['name'].alias('region_name'),
        countries_df['population'],
    )
)

# Preview with limit() so display() receives a DataFrame, like the other
# preview cells, instead of a Python list of Rows collected by head().
display(population_df.limit(10))

country_name,region_name,population
Afghanistan,Asia,38041754
Albania,Europe,2880917
Algeria,Africa,43053054
American Samoa,Oceania,55312
Andorra,Europe,77142
Angola,Africa,31825295
Anguilla,America,14869
Antarctica,Oceania,1106
Antigua and Barbuda,America,97118
Argentina,America,44780677


In [0]:
# Total population per region, largest region first.
(
    population_df
    .groupBy('region_name')
    .agg(sum('population').alias('population'))
    .orderBy('population', ascending=False)
    .display()
)


region_name,population
Asia,4601371198
Africa,1308067919
America,1014721683
Europe,747205974
Oceania,42133644


In [0]:
 population_df \
    .groupBy('country_name') \
    .pivot('region_name') \
    .sum('population') \
    .orderBy('country_name') \
    .limit(10) \
    .display()



country_name,Africa,America,Asia,Europe,Oceania
Afghanistan,,,38041754.0,,
Albania,,,,2880917.0,
Algeria,43053054.0,,,,
American Samoa,,,,,55312.0
Andorra,,,,77142.0,
Angola,31825295.0,,,,
Anguilla,,14869.0,,,
Antarctica,,,,,1106.0
Antigua and Barbuda,,97118.0,,,
Argentina,,44780677.0,,,


In [0]:
# Ten most populous countries, each shown with the region it belongs to.
(
    population_df
    .groupBy('region_name', 'country_name')
    .agg(sum('population').alias('population'))
    .orderBy('population', ascending=False)
    .limit(10)
    .display()
)


region_name,country_name,population
Asia,China,1433783686
Asia,India,1366417754
America,United States of America,329064917
Asia,Indonesia,270625568
Asia,Pakistan,216565318
America,Brazil,211049527
Africa,Nigeria,200963599
Asia,Bangladesh,163046161
Europe,Russian Federation,145872256
America,Mexico,127575529


In [0]:
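# Use union to append one DataFrame to another. Both must have the same number of columns;
# union matches columns by position, not by name (use unionByName to match by name).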

[0;31m---------------------------------------------------------------------------[0m
[0;31mParseException[0m                            Traceback (most recent call last)
File [0;32m<command-1657774773984061>, line 4[0m
[1;32m      1[0m [38;5;66;03m# Unpivot[39;00m
[1;32m      3[0m population_df \
[0;32m----> 4[0m     [38;5;241m.[39mselect([38;5;124m'[39m[38;5;124mcountry_name[39m[38;5;124m'[39m, expr([38;5;124m"[39m[38;5;124m"[39m))

File [0;32m/databricks/spark/python/pyspark/sql/utils.py:264[0m, in [0;36mtry_remote_functions.<locals>.wrapped[0;34m(*args, **kwargs)[0m
[1;32m    262[0m     [38;5;28;01mreturn[39;00m [38;5;28mgetattr[39m(functions, f[38;5;241m.[39m[38;5;18m__name__[39m)([38;5;241m*[39margs, [38;5;241m*[39m[38;5;241m*[39mkwargs)
[1;32m    263[0m [38;5;28;01melse[39;00m:
[0;32m--> 264[0m     [38;5;28;01mreturn[39;00m f([38;5;241m*[39margs, [38;5;241m*[39m[38;5;241m*[39mkwargs)

File [0;32m/databricks/spark/pytho

In [0]:
# Baseline row count, taken before the union below so its effect can be compared.
countries_df.count()

249

In [0]:
# Append the DataFrame to itself; union keeps duplicates, so the row count doubles.
# NOTE(review): this rebinds `countries_df`, so the cell is not idempotent --
# every re-run doubles the rows again, and the cells above that use
# `countries_df` would see the duplicated data if re-executed afterwards.
countries_df = countries_df.union(countries_df)

In [0]:
# Row count after the self-union: twice the earlier count, because union does not de-duplicate.
countries_df.count()

498

In [0]:
# Print each column's name, inferred type and nullability for the unioned DataFrame.
countries_df.printSchema()

root
 |-- COUNTRY_ID: integer (nullable = true)
 |-- NAME: string (nullable = true)
 |-- NATIONALITY: string (nullable = true)
 |-- COUNTRY_CODE: string (nullable = true)
 |-- ISO_ALPHA2: string (nullable = true)
 |-- CAPITAL: string (nullable = true)
 |-- POPULATION: integer (nullable = true)
 |-- AREA_KM2: double (nullable = true)
 |-- REGION_ID: integer (nullable = true)
 |-- SUB_REGION_ID: integer (nullable = true)
 |-- INTERMEDIATE_REGION_ID: integer (nullable = true)
 |-- ORGANIZATION_REGION_ID: integer (nullable = true)

