In [None]:
import os
import sys

# Point PySpark at the interpreter running this kernel.
# PYSPARK_PYTHON has to name a Python *executable*; the previous value was the
# Python39 installation directory on one specific machine, which cannot be
# launched and does not exist elsewhere.
os.environ['PYSPARK_PYTHON'] = sys.executable

In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf

In [3]:
import findspark

# Raw string: in a regular literal "\P" and "\S" are invalid escape sequences,
# which Python flags with a warning. The path value itself is unchanged.
findspark.init(r'C:\Program Files\Spark')

In [4]:
# Build (or reuse) the session for this notebook and keep a handle on its context.
sp = (
    SparkSession.builder
    .appName("Spark Joins")
    .getOrCreate()
)
sc = sp.sparkContext

In [5]:
# One tuple per candidate: id, name, graduate_program (compared against
# graduateProgram ids in the joins below) and a list of spark_status values.
candidate_rows = [
    (0, "Matei Zaharia", 1, [100]),
    (1, "Bill Chambers", 0, [500, 250, 100]),
    (2, "JOSHUA U", 1, [250, 100]),
]
candidate = sp.createDataFrame(candidate_rows).toDF(
    "id", "name", "graduate_program", "spark_status"
)

In [6]:
# One tuple per graduate program: id, degree, department, school.
graduate_program_rows = [
    (0, "Masters", "School of Information", "UC Berkeley"),
    (2, "Masters", "EECS", "UC Berkeley"),
    (1, "Ph.D.", "EECS", "UC Berkeley"),
]
graduateProgram = sp.createDataFrame(graduate_program_rows).toDF(
    "id", "degree", "department", "school"
)

In [7]:
# One tuple per status: id, status label.
spark_status_rows = [
    (500, "Vice President"),
    (250, "PMC Member"),
    (100, "Contributor"),
]
sparkStatus = sp.createDataFrame(spark_status_rows).toDF("id", "status")

In [8]:
# Expose each DataFrame to SQL under the same name as its Python variable.
for view_name, frame in (
    ("candidate", candidate),
    ("graduateProgram", graduateProgram),
    ("sparkStatus", sparkStatus),
):
    frame.createOrReplaceTempView(view_name)

## Inner Joins

In [9]:
# Join condition reused by the join cells below: a candidate row matches a
# graduateProgram row when candidate.graduate_program equals graduateProgram.id.
joinExpression = candidate["graduate_program"] == graduateProgram['id']

Keys that do not exist in both DataFrames will not show in the resulting DataFrame.

In [10]:
# Condition comparing candidate names with school names. It is only defined
# here; no join visible in this part of the notebook uses it.
wrongJoinExpression = candidate["name"] == graduateProgram["school"]


Inner joins are the default join, so we just need to specify our left DataFrame and join the right in the
JOIN expression:

In [11]:
# No join type is passed, so the default (inner) join is used.
a = candidate.join(graduateProgram, on=joinExpression)

In [12]:
# Print the joined rows as a text table.
a.show()

+---+-------------+----------------+---------------+---+-------+--------------------+-----------+
| id|         name|graduate_program|   spark_status| id| degree|          department|     school|
+---+-------------+----------------+---------------+---+-------+--------------------+-----------+
|  1|Bill Chambers|               0|[500, 250, 100]|  0|Masters|School of Informa...|UC Berkeley|
|  0|Matei Zaharia|               1|          [100]|  1|  Ph.D.|                EECS|UC Berkeley|
|  2|     JOSHUA U|               1|     [250, 100]|  1|  Ph.D.|                EECS|UC Berkeley|
+---+-------------+----------------+---------------+---+-------+--------------------+-----------+



We can also specify this explicitly by passing, in a third parameter, the joinType:

In [17]:
# Same join as above, with the join type spelled out explicitly.
joinType = "inner"
(
    candidate
    .join(graduateProgram, on=joinExpression, how=joinType)
    .show()
)

+---+-------------+----------------+---------------+---+-------+--------------------+-----------+
| id|         name|graduate_program|   spark_status| id| degree|          department|     school|
+---+-------------+----------------+---------------+---+-------+--------------------+-----------+
|  1|Bill Chambers|               0|[500, 250, 100]|  0|Masters|School of Informa...|UC Berkeley|
|  0|Matei Zaharia|               1|          [100]|  1|  Ph.D.|                EECS|UC Berkeley|
|  2|     JOSHUA U|               1|     [250, 100]|  1|  Ph.D.|                EECS|UC Berkeley|
+---+-------------+----------------+---------------+---+-------+--------------------+-----------+



## Outer Joins
- Outer joins evaluate the keys in both of the DataFrames or tables and include (and join together) the rows that evaluate to true or false.
- If there is no equivalent row in either the left or right DataFrame, Spark will insert null:

In [19]:
# Full outer join: rows without a match on either side are kept and padded with null.
joinType = "outer"
(
    candidate
    .join(graduateProgram, on=joinExpression, how=joinType)
    .show()
)

+----+-------------+----------------+---------------+---+-------+--------------------+-----------+
|  id|         name|graduate_program|   spark_status| id| degree|          department|     school|
+----+-------------+----------------+---------------+---+-------+--------------------+-----------+
|   1|Bill Chambers|               0|[500, 250, 100]|  0|Masters|School of Informa...|UC Berkeley|
|   0|Matei Zaharia|               1|          [100]|  1|  Ph.D.|                EECS|UC Berkeley|
|   2|     JOSHUA U|               1|     [250, 100]|  1|  Ph.D.|                EECS|UC Berkeley|
|null|         null|            null|           null|  2|Masters|                EECS|UC Berkeley|
+----+-------------+----------------+---------------+---+-------+--------------------+-----------+



## Left Outer Joins
- Left outer joins evaluate the keys in both of the DataFrames or tables and include all rows from the left DataFrame as well as any rows in the right DataFrame that have a match in the left DataFrame.
- If there is no equivalent row in the right DataFrame, Spark will insert `null`

In [20]:
# graduateProgram is the left side here, so every program row is kept.
joinType = "left_outer"
(
    graduateProgram
    .join(candidate, on=joinExpression, how=joinType)
    .show()
)

+---+-------+--------------------+-----------+----+-------------+----------------+---------------+
| id| degree|          department|     school|  id|         name|graduate_program|   spark_status|
+---+-------+--------------------+-----------+----+-------------+----------------+---------------+
|  0|Masters|School of Informa...|UC Berkeley|   1|Bill Chambers|               0|[500, 250, 100]|
|  1|  Ph.D.|                EECS|UC Berkeley|   0|Matei Zaharia|               1|          [100]|
|  1|  Ph.D.|                EECS|UC Berkeley|   2|     JOSHUA U|               1|     [250, 100]|
|  2|Masters|                EECS|UC Berkeley|null|         null|            null|           null|
+---+-------+--------------------+-----------+----+-------------+----------------+---------------+



## Right Outer Joins
- Right outer joins evaluate the keys in both of the DataFrames or tables and include all rows from the right DataFrame as well as any rows in the left DataFrame that have a match in the right DataFrame.
- If there is no equivalent row in the left DataFrame, Spark will insert `null`.

In [21]:
# graduateProgram is the right side here, so every program row is kept.
joinType = "right_outer"
(
    candidate
    .join(graduateProgram, on=joinExpression, how=joinType)
    .show()
)

+----+-------------+----------------+---------------+---+-------+--------------------+-----------+
|  id|         name|graduate_program|   spark_status| id| degree|          department|     school|
+----+-------------+----------------+---------------+---+-------+--------------------+-----------+
|   1|Bill Chambers|               0|[500, 250, 100]|  0|Masters|School of Informa...|UC Berkeley|
|   0|Matei Zaharia|               1|          [100]|  1|  Ph.D.|                EECS|UC Berkeley|
|   2|     JOSHUA U|               1|     [250, 100]|  1|  Ph.D.|                EECS|UC Berkeley|
|null|         null|            null|           null|  2|Masters|                EECS|UC Berkeley|
+----+-------------+----------------+---------------+---+-------+--------------------+-----------+



## Left Semi Joins
- Semi joins are a bit of a departure from the other joins.
- They do not actually include any values from the right DataFrame. They only compare values to see if the value exists in the second DataFrame. If the value exists, those rows will be kept in the results, even if there are duplicate keys in the left DataFrame.
- Think of left semi joins as filters on a DataFrame, as opposed to the function of a conventional join

In [22]:
# Keep only the graduateProgram rows that have a matching candidate;
# no candidate columns appear in the result.
joinType = "left_semi"
(
    graduateProgram
    .join(candidate, on=joinExpression, how=joinType)
    .show()
)

+---+-------+--------------------+-----------+
| id| degree|          department|     school|
+---+-------+--------------------+-----------+
|  0|Masters|School of Informa...|UC Berkeley|
|  1|  Ph.D.|                EECS|UC Berkeley|
+---+-------+--------------------+-----------+



In [24]:
# Append one extra row that reuses id 0, giving the left side a duplicate key.
duplicate_program = sp.createDataFrame(
    [(0, "Masters", "Duplicated Row", "Duplicated School")]
)
gradProgram2 = graduateProgram.union(duplicate_program)

In [25]:
# Make gradProgram2 queryable from SQL under the view name "gradProgram2".
gradProgram2.createOrReplaceTempView("gradProgram2")

In [27]:
# joinType is not reassigned in this cell: it still holds "left_semi" from the
# semi-join cell above, so that cell must have been run first.
(
    gradProgram2
    .join(candidate, on=joinExpression, how=joinType)
    .show()
)

+---+-------+--------------------+-----------------+
| id| degree|          department|           school|
+---+-------+--------------------+-----------------+
|  0|Masters|School of Informa...|      UC Berkeley|
|  1|  Ph.D.|                EECS|      UC Berkeley|
|  0|Masters|      Duplicated Row|Duplicated School|
+---+-------+--------------------+-----------------+



In [30]:
# `%SQL` is not an IPython magic, so the original cell raised a SyntaxError.
# Run the query through the SparkSession instead; it reads the "gradProgram2"
# and "candidate" temp views registered earlier in the notebook.
sp.sql("""
    SELECT *
    FROM gradProgram2
    LEFT SEMI JOIN candidate
        ON gradProgram2.id = candidate.graduate_program
""").show()

## Left Anti Joins
- Left anti joins are the opposite of left semi joins. Like left semi joins, they do not actually include any values from the right DataFrame.
- However, rather than keeping the values that exist in the second DataFrame, they keep only the values that do not have a corresponding key in the second DataFrame.
- Think of anti joins as a NOT IN SQL-style filter.

In [31]:
# Keep only the graduateProgram rows that have no matching candidate.
joinType = "left_anti"
(
    graduateProgram
    .join(candidate, on=joinExpression, how=joinType)
    .show()
)

+---+-------+----------+-----------+
| id| degree|department|     school|
+---+-------+----------+-----------+
|  2|Masters|      EECS|UC Berkeley|
+---+-------+----------+-----------+



## Natural Joins
- Natural joins make implicit guesses at the columns on which we would like to join.
- They find matching columns and return the results.
- Left, right, and outer natural joins are all supported.


### Warning                                                                                                                 
Implicit is always dangerous! The following query will give us incorrect results because the two DataFrames/tables share a column name (id), but it means different things in the two datasets. We should always use this join with caution.
                                                                                                                             
    -- in SQL                                                                                                                
    SELECT * FROM graduateProgram NATURAL JOIN candidate                                                                     


## Cross (Cartesian) Joins
- Cross joins in simplest terms are inner joins that do not specify a predicate.
- Cross joins will join every single row in the left DataFrame to every single row in the right DataFrame. This will cause an absolute explosion in the number of rows contained in the resulting DataFrame.

In [32]:
# "cross" is requested, but a join condition is still supplied; the output
# below contains only the rows that satisfy joinExpression.
joinType = "cross"
(
    graduateProgram
    .join(candidate, on=joinExpression, how=joinType)
    .show()
)

+---+-------+--------------------+-----------+---+-------------+----------------+---------------+
| id| degree|          department|     school| id|         name|graduate_program|   spark_status|
+---+-------+--------------------+-----------+---+-------------+----------------+---------------+
|  0|Masters|School of Informa...|UC Berkeley|  1|Bill Chambers|               0|[500, 250, 100]|
|  1|  Ph.D.|                EECS|UC Berkeley|  0|Matei Zaharia|               1|          [100]|
|  1|  Ph.D.|                EECS|UC Berkeley|  2|     JOSHUA U|               1|     [250, 100]|
+---+-------+--------------------+-----------+---+-------------+----------------+---------------+



-- in SQL
SELECT * FROM graduateProgram CROSS JOIN candidate
   ON graduateProgram.id = candidate.graduate_program

If we truly intend to have a cross join, we can call that out explicitly: