# Spark DataFrame Join

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

## Employee and Department

In [None]:
# `spark` is not defined anywhere else in this notebook. Obtain (or reuse) a
# session here so the notebook also runs on a fresh kernel where no session
# has been pre-injected by the environment.
spark = SparkSession.builder.getOrCreate()

# Employee rows; each tuple follows the column order given in `empColumns`.
# NOTE(review): -1 in superior_emp_id / salary looks like a "no value"
# placeholder -- confirm.
emp = [
    (1, "Smith", -1, "2018", "10", "M", 3000),
    (2, "Rose", 1, "2010", "20", "M", 4000),
    (3, "Williams", 1, "2010", "10", "M", 1000),
    (4, "Jones", 2, "2005", "10", "F", 2000),
    (5, "Brown", 2, "2010", "40", "", -1),
    (6, "Brown", 2, "2010", "50", "", -1),
]

# NOTE(review): emp_dept_id holds strings ("10", "20", ...) while the
# department ids in the department data are integers, so the joins in this
# notebook presumably rely on Spark's implicit type coercion -- confirm.
empColumns = [
    "emp_id",
    "name",
    "superior_emp_id",
    "year_joined",
    "emp_dept_id",
    "gender",
    "salary",
]

empDF = spark.createDataFrame(data=emp, schema=empColumns)
empDF.printSchema()
empDF.show(truncate=False)

In [None]:
# Department lookup data as (dept_name, dept_id) pairs.
# Department 30 (Sales) is not referenced by any employee row, and the employee
# data references a department "50" that is absent here; this gives the outer,
# semi and anti joins below something to show.
deptColumns = ["dept_name", "dept_id"]
dept = [("Finance", 10), ("Marketing", 20), ("Sales", 30), ("IT", 40)]

deptDF = spark.createDataFrame(dept, schema=deptColumns)
deptDF.printSchema()
deptDF.show(truncate=False)

## Inner

In [None]:
# Inner join: keep only employee/department pairs whose keys match.
empDF.join(
    deptDF,
    on=empDF.emp_dept_id == deptDF.dept_id,
    how="inner",
).show(truncate=False)

## Outer

In [None]:
# Outer join: keep all rows from both sides; columns of the side without a
# match are filled with nulls.
empDF.join(
    deptDF,
    on=empDF.emp_dept_id == deptDF.dept_id,
    how="outer",
).show(truncate=False)

## Full, FullOuter

In [None]:
# Same join as the "outer" cell, requested with the "full" join type name.
empDF.join(
    deptDF,
    on=empDF.emp_dept_id == deptDF.dept_id,
    how="full",
).show(truncate=False)

In [None]:
# Same join again, requested with the "fullouter" join type name.
empDF.join(
    deptDF,
    on=empDF.emp_dept_id == deptDF.dept_id,
    how="fullouter",
).show(truncate=False)

## Left, LeftOuter

In [None]:
# Left join: keep every employee; department columns are null when the
# employee's department id has no match.
empDF.join(
    deptDF,
    on=empDF.emp_dept_id == deptDF.dept_id,
    how="left",
).show(truncate=False)

In [None]:
# Same left join, requested with the "leftouter" join type name.
# The expression is already wrapped in parentheses, so no backslash line
# continuation is needed (a trailing "\" breaks if whitespace follows it).
(
    empDF.join(deptDF, empDF.emp_dept_id == deptDF.dept_id, "leftouter")
        .show(truncate=False)
)

## Right, RightOuter

In [None]:
# Right join: keep every department; employee columns are null when no
# employee references the department.
empDF.join(
    deptDF,
    on=empDF.emp_dept_id == deptDF.dept_id,
    how="right",
).show(truncate=False)

In [None]:
# Same right join, requested with the "rightouter" join type name.
empDF.join(
    deptDF,
    on=empDF.emp_dept_id == deptDF.dept_id,
    how="rightouter",
).show(truncate=False)

## Left Semi

In [None]:
# Left semi join: return employees that have a matching department, keeping
# only the employee (left-side) columns.
empDF.join(
    deptDF,
    on=empDF.emp_dept_id == deptDF.dept_id,
    how="leftsemi",
).show(truncate=False)

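The statement above is equivalent to the following one, provided `dept_id` is unique in `deptDF` (otherwise the left outer join would repeat an employee once per matching department row, whereas a left semi join returns each employee at most once).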

In [None]:
# Emulate the left semi join: left outer join, keep only rows where a
# department was found, then project back to the employee columns.
empDF.join(
    deptDF,
    on=empDF.emp_dept_id == deptDF.dept_id,
    how="leftouter",
).where(
    deptDF.dept_id.isNotNull()
).select(
    *empDF.columns
).show(truncate=False)

## Left Anti

In [None]:
# Left anti join: return employees that have NO matching department, keeping
# only the employee (left-side) columns.
empDF.join(
    deptDF,
    on=empDF.emp_dept_id == deptDF.dept_id,
    how="leftanti",
).show(truncate=False)

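The statement above is equivalent to the following one, because a left outer join leaves the department columns null for employees without a matching department: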

In [None]:
# Emulate the left anti join: left outer join, keep only rows where no
# department was found, then project back to the employee columns.
empDF.join(
    deptDF,
    on=empDF.emp_dept_id == deptDF.dept_id,
    how="leftouter",
).where(
    deptDF.dept_id.isNull()
).select(
    *empDF.columns
).show(truncate=False)

## Aliases

If we reference DataFrame columns with `col()`, using `alias()` helps us distinguish columns that share the same name, for example when joining a DataFrame with itself.

In [None]:
# Self-join of the employee table: "emp1" is the employee, "emp2" is the
# superior. The aliases let col() tell apart the identically named columns
# coming from the two sides of the join.
(
    empDF.alias("emp1")
    .join(
        empDF.alias("emp2"),
        on=F.col("emp1.superior_emp_id") == F.col("emp2.emp_id"),
        how="inner",
    )
    .select(
        F.col("emp1.emp_id"),
        F.col("emp1.name"),
        # Rename the superior's columns so the output column names are unique.
        F.col("emp2.emp_id").alias("superior_emp_id"),
        F.col("emp2.name").alias("superior_emp_name"),
    )
    .show(truncate=False)
)