<a href="https://colab.research.google.com/github/lucprosa/dataeng-basic-course/blob/main/spark/examples/04-joins.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Joins
- inner join
- left/right join
- full join
- left anti join
- cartesian product
- union/unionAll
- minus (`exceptAll`)
- intersect

# Setting up PySpark

In [None]:
# Install PySpark into the environment of the running kernel
# (`%pip`, unlike `!pip`, targets the kernel's own environment).
%pip install pyspark



In [1]:
from pyspark.sql import SparkSession

# Build (or reuse) a single-threaded local Spark session for the examples;
# the Spark UI is pinned to port 4050.
spark = (
    SparkSession.builder
    .master('local')
    .appName('Spark Course')
    .config('spark.ui.port', '4050')
    .getOrCreate()
)

# SparkContext behind the session, used to parallelize the local sample data.
sc = spark.sparkContext

In [2]:
# Sample data for the join examples.
#
# `id` and `dpto` are integers in both datasets so that join keys compare
# like with like instead of relying on Spark's implicit string <-> number
# coercion. Integer ids also match the ERP dataset built later in the
# notebook, which is combined with this one via union / intersect / except.
#
# Each employee row is (id, name, dpto). "Jimmy" references department 5,
# which has no row in `dpto_data`, and department 4 ("Finance") has no
# employees; these two unmatched rows are what make the left / right / full /
# anti join results differ from the inner join.
employee_data = [
    (101, "Chloe", 3),
    (102, "Paul", 1),
    (103, "John", 1),
    (104, "Lisa", 2),
    (105, "Evan", 3),
    (106, "Amy", 3),
    (107, "Jimmy", 5),
]
# Each department row is (dpto, deptname).
dpto_data = [(1, "Engineering"), (2, "Sales"), (3, "Marketing"), (4, "Finance")]

employee_columns = ["id", "name", "dpto"]
dpto_columns = ["dpto", "deptname"]

# Distribute the local lists as RDDs and convert them to named-column DataFrames.
employee = sc.parallelize(employee_data).toDF(employee_columns)
dpto = sc.parallelize(dpto_data).toDF(dpto_columns)

In [3]:
# Employee DataFrame
employee.show()

+---+-----+----+
| id| name|dpto|
+---+-----+----+
|101|Chloe|   3|
|102| Paul|   1|
|103| John|   1|
|104| Lisa|   2|
|105| Evan|   3|
|106|  Amy|   3|
|107|Jimmy|   5|
+---+-----+----+



In [4]:
# Department DataFrame
dpto.show()

+----+-----------+
|dpto|   deptname|
+----+-----------+
|   1|Engineering|
|   2|      Sales|
|   3|  Marketing|
|   4|    Finance|
+----+-----------+



# Joins

In [5]:
# Inner join: keep only the rows whose `dpto` key exists on both sides.
# Joining on the column name produces a single `dpto` column in the result.
employee.join(dpto, on="dpto", how="inner").show()

# Same join written as an explicit equality condition; here both `dpto`
# columns (one from each DataFrame) are kept in the result.
employee.join(
    dpto,
    on=employee["dpto"] == dpto["dpto"],
    how="inner",
).show()

+----+---+-----+-----------+
|dpto| id| name|   deptname|
+----+---+-----+-----------+
|   1|102| Paul|Engineering|
|   1|103| John|Engineering|
|   2|104| Lisa|      Sales|
|   3|101|Chloe|  Marketing|
|   3|105| Evan|  Marketing|
|   3|106|  Amy|  Marketing|
+----+---+-----+-----------+

+---+-----+----+----+-----------+
| id| name|dpto|dpto|   deptname|
+---+-----+----+----+-----------+
|102| Paul|   1|   1|Engineering|
|103| John|   1|   1|Engineering|
|104| Lisa|   2|   2|      Sales|
|101|Chloe|   3|   3|  Marketing|
|105| Evan|   3|   3|  Marketing|
|106|  Amy|   3|   3|  Marketing|
+---+-----+----+----+-----------+



In [7]:
# Left join: keep every row from the left side (employee) and attach the
# matching department when one exists; unmatched rows get NULL on the right.
employee.join(dpto, on="dpto", how="left").show()

+----+---+-----+-----------+
|dpto| id| name|   deptname|
+----+---+-----+-----------+
|   5|107|Jimmy|       NULL|
|   1|102| Paul|Engineering|
|   1|103| John|Engineering|
|   3|101|Chloe|  Marketing|
|   3|105| Evan|  Marketing|
|   3|106|  Amy|  Marketing|
|   2|104| Lisa|      Sales|
+----+---+-----+-----------+



In [11]:
# Right join: keep every row from the right side (dpto) and attach the
# matching employees when they exist; unmatched rows get NULL on the left.
employee.join(dpto, on="dpto", how="right").show()

+----+----+-----+-----------+
|dpto|  id| name|   deptname|
+----+----+-----+-----------+
|   1| 103| John|Engineering|
|   1| 102| Paul|Engineering|
|   3| 106|  Amy|  Marketing|
|   3| 105| Evan|  Marketing|
|   3| 101|Chloe|  Marketing|
|   2| 104| Lisa|      Sales|
|   4|NULL| NULL|    Finance|
+----+----+-----+-----------+



In [9]:
# Full join: keep every row from both sides, filling the missing side with NULL.
# The join is written as an explicit condition, so both `dpto` columns appear.
employee.join(
    dpto,
    on=employee["dpto"] == dpto["dpto"],
    how="full",
).show()

+----+-----+----+----+-----------+
|  id| name|dpto|dpto|   deptname|
+----+-----+----+----+-----------+
| 102| Paul|   1|   1|Engineering|
| 103| John|   1|   1|Engineering|
| 104| Lisa|   2|   2|      Sales|
| 101|Chloe|   3|   3|  Marketing|
| 105| Evan|   3|   3|  Marketing|
| 106|  Amy|   3|   3|  Marketing|
|NULL| NULL|NULL|   4|    Finance|
| 107|Jimmy|   5|NULL|       NULL|
+----+-----+----+----+-----------+



In [10]:
# Left anti join: rows from the left side (employee) that have no match on the right.
employee.join(
    dpto,
    on=employee["dpto"] == dpto["dpto"],
    how="left_anti",
).show()

# "Right anti join": rows from dpto that have no match in employee.
# Expressed by swapping the two sides and using a left anti join again.
dpto.join(
    employee,
    on=employee["dpto"] == dpto["dpto"],
    how="left_anti",
).show()

+---+-----+----+
| id| name|dpto|
+---+-----+----+
|107|Jimmy|   5|
+---+-----+----+

+----+--------+
|dpto|deptname|
+----+--------+
|   4| Finance|
+----+--------+



### Using SQL

In [25]:
# Register both DataFrames as temporary views so they can be referenced by
# name from Spark SQL.
employee.createOrReplaceTempView("employee")
dpto.createOrReplaceTempView("dpto")

# Bring all the employees, with or without a department.
# `using (dpto)` joins on the shared column and returns it only once.
spark.sql("select * from employee left join dpto using (dpto)").show()

+----+---+-----+-----------+
|dpto| id| name|   deptname|
+----+---+-----+-----------+
|   5|107|Jimmy|       NULL|
|   1|102| Paul|Engineering|
|   1|103| John|Engineering|
|   3|101|Chloe|  Marketing|
|   3|105| Evan|  Marketing|
|   3|106|  Amy|  Marketing|
|   2|104| Lisa|      Sales|
+----+---+-----+-----------+



# Union / Minus / Intersect

In [12]:
# The HR dataset is the employee DataFrame under a second name (no copy is made).
employee_hr = employee

# The ERP dataset overlaps only partly with HR: some rows are identical,
# some exist only here, and "Amy" appears with a different department.
data = [
    (200, "George", 5),
    (201, "Anna", 5),
    (202, "Carl", 3),
    (101, "Chloe", 3),
    (103, "John", 1),
    (106, "Amy", 1),
]
employee_erp = spark.createDataFrame(data, schema=["id", "name", "dpto"])

print("HR database")
employee_hr.show()

print("ERP database")
employee_erp.show()

HR database
+---+-----+----+
| id| name|dpto|
+---+-----+----+
|101|Chloe|   3|
|102| Paul|   1|
|103| John|   1|
|104| Lisa|   2|
|105| Evan|   3|
|106|  Amy|   3|
|107|Jimmy|   5|
+---+-----+----+

ERP database
+---+------+----+
| id|  name|dpto|
+---+------+----+
|200|George|   5|
|201|  Anna|   5|
|202|  Carl|   3|
|101| Chloe|   3|
|103|  John|   1|
|106|   Amy|   1|
+---+------+----+



In [13]:
# Union: append the rows of the second DataFrame to the first.
# `unionByName` matches columns by name; like `union` / `unionAll` it keeps
# duplicate rows (add `.distinct()` for SQL-style UNION semantics).
print("Combine both dataframes")
employee_hr.unionByName(employee_erp).show()

# Intersect: rows that are identical (all columns) in both DataFrames.
print("Get values that are common in both dataframes")
employee_hr.intersect(employee_erp).show()

# Minus / except: rows of the first DataFrame that are not in the second.
# `exceptAll` preserves duplicates. A row differing in any column counts as
# different, which is why "Amy" (dpto 3 in HR, 1 in ERP) is returned.
print("Get only the difference - does not exist on the second dataframe")
employee_hr.exceptAll(employee_erp).show()

Combine both dataframes
+---+------+----+
| id|  name|dpto|
+---+------+----+
|101| Chloe|   3|
|102|  Paul|   1|
|103|  John|   1|
|104|  Lisa|   2|
|105|  Evan|   3|
|106|   Amy|   3|
|107| Jimmy|   5|
|200|George|   5|
|201|  Anna|   5|
|202|  Carl|   3|
|101| Chloe|   3|
|103|  John|   1|
|106|   Amy|   1|
+---+------+----+

Get values that are common in both dataframes
+---+-----+----+
| id| name|dpto|
+---+-----+----+
|101|Chloe|   3|
|103| John|   1|
+---+-----+----+

Get only the difference - does not exist on the second dataframe
+---+-----+----+
| id| name|dpto|
+---+-----+----+
|106|  Amy|   3|
|105| Evan|   3|
|104| Lisa|   2|
|102| Paul|   1|
|107|Jimmy|   5|
+---+-----+----+



In [24]:
# Show the full HR dataset, then only the employees of department 3.
employee_hr.show()
# `filter` and `where` are used with the same condition here and return the same rows.
employee_hr.filter(employee_hr["dpto"] == 3).show()
employee_hr.where(employee_hr["dpto"] == 3).show()

+---+-----+----+
| id| name|dpto|
+---+-----+----+
|101|Chloe|   3|
|102| Paul|   1|
|103| John|   1|
|104| Lisa|   2|
|105| Evan|   3|
|106|  Amy|   3|
|107|Jimmy|   5|
+---+-----+----+

+---+-----+----+
| id| name|dpto|
+---+-----+----+
|101|Chloe|   3|
|105| Evan|   3|
|106|  Amy|   3|
+---+-----+----+

+---+-----+----+
| id| name|dpto|
+---+-----+----+
|101|Chloe|   3|
|105| Evan|   3|
|106|  Amy|   3|
+---+-----+----+



# Questions

In [30]:
# Q1
# Implement Cartesian Product using dataframe and SQL
# Use employee and dpto

# DataFrame API: pair every employee row with every department row.
employee.crossJoin(dpto).show()
# SQL equivalent; uses the "employee" and "dpto" temp views, which must already be registered.
spark.sql("select * from employee cross join dpto").show()

+---+-----+----+----+-----------+
| id| name|dpto|dpto|   deptname|
+---+-----+----+----+-----------+
|101|Chloe|   3|   1|Engineering|
|101|Chloe|   3|   2|      Sales|
|101|Chloe|   3|   3|  Marketing|
|101|Chloe|   3|   4|    Finance|
|102| Paul|   1|   1|Engineering|
|102| Paul|   1|   2|      Sales|
|102| Paul|   1|   3|  Marketing|
|102| Paul|   1|   4|    Finance|
|103| John|   1|   1|Engineering|
|103| John|   1|   2|      Sales|
|103| John|   1|   3|  Marketing|
|103| John|   1|   4|    Finance|
|104| Lisa|   2|   1|Engineering|
|104| Lisa|   2|   2|      Sales|
|104| Lisa|   2|   3|  Marketing|
|104| Lisa|   2|   4|    Finance|
|105| Evan|   3|   1|Engineering|
|105| Evan|   3|   2|      Sales|
|105| Evan|   3|   3|  Marketing|
|105| Evan|   3|   4|    Finance|
+---+-----+----+----+-----------+
only showing top 20 rows

+---+-----+----+----+-----------+
| id| name|dpto|dpto|   deptname|
+---+-----+----+----+-----------+
|101|Chloe|   3|   1|Engineering|
|101|Chloe|   3|   2| 

In [32]:
# Display both input DataFrames again for reference while answering the questions.
dpto.show()
employee.show()

+----+-----------+
|dpto|   deptname|
+----+-----------+
|   1|Engineering|
|   2|      Sales|
|   3|  Marketing|
|   4|    Finance|
+----+-----------+

+---+-----+----+
| id| name|dpto|
+---+-----+----+
|101|Chloe|   3|
|102| Paul|   1|
|103| John|   1|
|104| Lisa|   2|
|105| Evan|   3|
|106|  Amy|   3|
|107|Jimmy|   5|
+---+-----+----+



In [31]:
# Q2
# Implement "Left Anti Join" using SQL
# Use employee and dpto
#
# DataFrame API version first (the SQL version is in the following cell):
# employees whose `dpto` has no matching row in dpto.
employee.join(dpto, on="dpto", how="left_anti").show()

+----+---+-----+
|dpto| id| name|
+----+---+-----+
|   5|107|Jimmy|
+----+---+-----+



In [33]:
# Q2 in SQL: employees whose `dpto` has no matching row in the dpto view.
# Uses the "employee" and "dpto" temp views, which must already be registered.
spark.sql("select * from employee left anti join dpto using (dpto)").show()


+----+---+-----+
|dpto| id| name|
+----+---+-----+
|   5|107|Jimmy|
+----+---+-----+



In [59]:


from pyspark.sql.functions import  col, upper

# Upper-case every employee name with the built-in `upper` column function
# (no Python lambda is involved); `withColumn` replaces the existing "name"
# column in the returned DataFrame and leaves `employee` itself unchanged.
employee.withColumn("name", upper(col("name"))).show()

+---+-----+----+
| id| name|dpto|
+---+-----+----+
|101|CHLOE|   3|
|102| PAUL|   1|
|103| JOHN|   1|
|104| LISA|   2|
|105| EVAN|   3|
|106|  AMY|   3|
|107|JIMMY|   5|
+---+-----+----+



In [60]:
# NOTE(review): leftover shell command unrelated to the joins lesson; it
# reports "fatal: not a git repository" when the notebook's working directory
# is not inside a git checkout. Consider removing this cell.
! git status


fatal: not a git repository (or any of the parent directories): .git
