In [0]:
# here countries and happiness_scores are the global tables, created in UI in default hive database
happy_countries_df = spark.sql('SELECT * FROM countries')
happy_countries_df.display()

Country_id,Country,Region
C001,Denmark,Western Europe
C002,Switzerland,Western Europe
C003,Iceland,Western Europe
C004,Norway,Western Europe
C005,Finland,Western Europe
C011,Israel,Middle East and Northern Africa
C012,Austria,Western Europe
C013,United States,North America
C014,Costa Rica,Latin America and Caribbean
C015,Puerto Rico,Latin America and Caribbean


In [0]:
happy_scores_df = spark.sql('SELECT * FROM happiness_scores')
happy_scores_df.display()

Country_id,Happiness Rank,Happiness Score,Economy (GDP per Capita)
C006,6,7.404,1.44015
C007,7,7.339,1.46468
C008,8,7.334,1.36066
C009,9,7.313,1.44443
C010,10,7.291,1.45181
C011,11,7.267,1.33766
C012,12,7.119,1.45038
C013,13,7.104,1.50796
C014,14,7.087,1.06879
C015,15,7.039,1.35943


In [0]:
# inner join

happy_countries_df.join(happy_scores_df, 
                        happy_countries_df.Country_id == happy_scores_df.Country_id, 
                       'inner').display()

Country_id,Country,Region,Country_id.1,Happiness Rank,Happiness Score,Economy (GDP per Capita)
C011,Israel,Middle East and Northern Africa,C011,11,7.267,1.33766
C012,Austria,Western Europe,C012,12,7.119,1.45038
C013,United States,North America,C013,13,7.104,1.50796
C014,Costa Rica,Latin America and Caribbean,C014,14,7.087,1.06879
C015,Puerto Rico,Latin America and Caribbean,C015,15,7.039,1.35943
C016,Germany,Western Europe,C016,16,6.994,1.44787
C017,Brazil,Latin America and Caribbean,C017,17,6.952,1.08754
C018,Belgium,Western Europe,C018,18,6.929,1.42539
C019,Ireland,Western Europe,C019,19,6.907,1.48341
C020,Luxembourg,Western Europe,C020,20,6.871,1.69752


In [0]:
# the above result includes twice the column used for joining
# it's better to explicitly specify which columns to return in output

happy_countries_df.join(happy_scores_df, 
                       ['country_id'],
                       'inner').display()

Country_id,Country,Region,Happiness Rank,Happiness Score,Economy (GDP per Capita)
C011,Israel,Middle East and Northern Africa,11,7.267,1.33766
C012,Austria,Western Europe,12,7.119,1.45038
C013,United States,North America,13,7.104,1.50796
C014,Costa Rica,Latin America and Caribbean,14,7.087,1.06879
C015,Puerto Rico,Latin America and Caribbean,15,7.039,1.35943
C016,Germany,Western Europe,16,6.994,1.44787
C017,Brazil,Latin America and Caribbean,17,6.952,1.08754
C018,Belgium,Western Europe,18,6.929,1.42539
C019,Ireland,Western Europe,19,6.907,1.48341
C020,Luxembourg,Western Europe,20,6.871,1.69752


In [0]:
# Join using SQL query
spark.sql("""select countries.country_id, country, 'happiness rank' 
             from countries join happiness_scores on countries.country_id = happiness_scores.country_id""").display()

country_id,country,happiness rank
C011,Israel,happiness rank
C012,Austria,happiness rank
C013,United States,happiness rank
C014,Costa Rica,happiness rank
C015,Puerto Rico,happiness rank
C016,Germany,happiness rank
C017,Brazil,happiness rank
C018,Belgium,happiness rank
C019,Ireland,happiness rank
C020,Luxembourg,happiness rank


In [0]:
# left (outer) join
happy_countries_df.join(happy_scores_df, ['country_id'], 'leftouter').display()

Country_id,Country,Region,Happiness Rank,Happiness Score,Economy (GDP per Capita)
C001,Denmark,Western Europe,,,
C002,Switzerland,Western Europe,,,
C003,Iceland,Western Europe,,,
C004,Norway,Western Europe,,,
C005,Finland,Western Europe,,,
C011,Israel,Middle East and Northern Africa,11.0,7.267,1.33766
C012,Austria,Western Europe,12.0,7.119,1.45038
C013,United States,North America,13.0,7.104,1.50796
C014,Costa Rica,Latin America and Caribbean,14.0,7.087,1.06879
C015,Puerto Rico,Latin America and Caribbean,15.0,7.039,1.35943


In [0]:
# right (outer) join
happy_countries_df.join(happy_scores_df, ['country_id'], 'rightouter').display()

Country_id,Country,Region,Happiness Rank,Happiness Score,Economy (GDP per Capita)
C006,,,6,7.404,1.44015
C007,,,7,7.339,1.46468
C008,,,8,7.334,1.36066
C009,,,9,7.313,1.44443
C010,,,10,7.291,1.45181
C011,Israel,Middle East and Northern Africa,11,7.267,1.33766
C012,Austria,Western Europe,12,7.119,1.45038
C013,United States,North America,13,7.104,1.50796
C014,Costa Rica,Latin America and Caribbean,14,7.087,1.06879
C015,Puerto Rico,Latin America and Caribbean,15,7.039,1.35943


In [0]:
# fullouter join
happy_countries_df.join(happy_scores_df, ['country_id'], 'fullouter').display()

Country_id,Country,Region,Happiness Rank,Happiness Score,Economy (GDP per Capita)
C001,Denmark,Western Europe,,,
C002,Switzerland,Western Europe,,,
C003,Iceland,Western Europe,,,
C004,Norway,Western Europe,,,
C005,Finland,Western Europe,,,
C006,,,6.0,7.404,1.44015
C007,,,7.0,7.339,1.46468
C008,,,8.0,7.334,1.36066
C009,,,9.0,7.313,1.44443
C010,,,10.0,7.291,1.45181


In [0]:
# left semi-join. Returns all records from left table that have a match in right table
happy_countries_df.join(happy_scores_df, ['country_id'], 'leftsemi').display()

Country_id,Country,Region
C011,Israel,Middle East and Northern Africa
C012,Austria,Western Europe
C013,United States,North America
C014,Costa Rica,Latin America and Caribbean
C015,Puerto Rico,Latin America and Caribbean
C016,Germany,Western Europe
C017,Brazil,Latin America and Caribbean
C018,Belgium,Western Europe
C019,Ireland,Western Europe
C020,Luxembourg,Western Europe


In [0]:
# left anti-join. Returns all records from left table that do not have a match in right table
happy_countries_df.join(happy_scores_df, ['country_id'], 'leftanti').display()

Country_id,Country,Region
C001,Denmark,Western Europe
C002,Switzerland,Western Europe
C003,Iceland,Western Europe
C004,Norway,Western Europe
C005,Finland,Western Europe
