In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.4.0.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25ldone
[?25h  Created wheel for pyspark: filename=pyspark-3.4.0-py2.py3-none-any.whl size=311317145 sha256=46a857c4e46bea88427735319dd7a7ed37f580be65d2b3efd1d00e2328219ca2
  Stored in directory: /root/.cache/pip/wheels/81/1b/c2/e41f4bddafe5564d11b2414a62dd7f5d75fcf65b2d7b7805c4
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.0
[0m

## Ways of Joins:

In [2]:
#Creating the Spark Session

from pyspark.sql import SparkSession
from pyspark.sql.functions import *

spark = SparkSession. \
    builder. \
    appName('Join_Data'). \
    getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/04/18 00:55:46 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# Before we can Join

We have to explore the dataset that can work well to explain the different Joins. 

How to do this when we don't know how Joins work? Simple, start with a Toy Data. 

In [3]:
#Creating database so the Spark SQL tables can be created. 
spark.sql("CREATE DATABASE join_db")
spark.sql("USE join_db")

DataFrame[]

In [4]:
SQL = spark.sql
SQL("SET spark.sql.shuffle.partitions = 2;")

DataFrame[key: string, value: string]

In [6]:
#Creating Toy Table 1

SQL("CREATE TABLE employee (id INT, name STRING, deptno INT) USING CSV")

AnalysisException: [TABLE_OR_VIEW_ALREADY_EXISTS] Cannot create table or view `spark_catalog`.`join_db`.`employee` because it already exists.
Choose a different name, drop or replace the existing object, or add the IF NOT EXISTS clause to tolerate pre-existing objects.

In [7]:
SQL("CREATE TABLE department (deptno INT, deptname STRING) USING CSV")

DataFrame[]

In [8]:
SQL("SHOW TABLES").show()

+---------+----------+-----------+
|namespace| tableName|isTemporary|
+---------+----------+-----------+
|  join_db|department|      false|
|  join_db|  employee|      false|
+---------+----------+-----------+



In [10]:
SQL("""INSERT INTO employee (id, name, deptno)
        VALUES(105,'Chloe',5),
        (103,'Paul',3),
        (101,'John',1),
        (102,'Lisa',2),
        (104,'Evan',4),
        (106,'Amy',6)""")

DataFrame[]

In [11]:
SQL("""INSERT INTO department(deptno, deptname)
        VALUES(3,'Engineering'),
        (2,'Sales'),
        (1,'Marketing')""")

DataFrame[]

### Inner Join

In [12]:
SQL("""SELECT id, name, employee.deptno, deptname
        FROM employee INNER JOIN department 
        ON employee.deptno = department.deptno""").show()

+---+----+------+-----------+
| id|name|deptno|   deptname|
+---+----+------+-----------+
|103|Paul|     3|Engineering|
|101|John|     1|  Marketing|
|102|Lisa|     2|      Sales|
+---+----+------+-----------+



### Full Join

In [13]:
SQL("""SELECT id, name, employee.deptno, deptname
        FROM employee FULL JOIN department 
        ON employee.deptno = department.deptno""").show()

+---+-----+------+-----------+
| id| name|deptno|   deptname|
+---+-----+------+-----------+
|101| John|     1|  Marketing|
|102| Lisa|     2|      Sales|
|103| Paul|     3|Engineering|
|104| Evan|     4|       null|
|105|Chloe|     5|       null|
|106|  Amy|     6|       null|
+---+-----+------+-----------+



### Right Join

In [14]:
SQL("""SELECT id, name, employee.deptno, deptname
        FROM employee RIGHT JOIN department 
        ON employee.deptno = department.deptno""").show()

+---+----+------+-----------+
| id|name|deptno|   deptname|
+---+----+------+-----------+
|103|Paul|     3|Engineering|
|101|John|     1|  Marketing|
|102|Lisa|     2|      Sales|
+---+----+------+-----------+



### Left Join

In [15]:
SQL("""SELECT id, name, employee.deptno, deptname
        FROM employee LEFT JOIN department 
        ON employee.deptno = department.deptno""").show()

+---+-----+------+-----------+
| id| name|deptno|   deptname|
+---+-----+------+-----------+
|103| Paul|     3|Engineering|
|101| John|     1|  Marketing|
|104| Evan|     4|       null|
|106|  Amy|     6|       null|
|105|Chloe|     5|       null|
|102| Lisa|     2|      Sales|
+---+-----+------+-----------+



### Semi Join

In [19]:
SQL("""SELECT *
        FROM employee SEMI JOIN department 
        ON employee.deptno = department.deptno""").show()

+---+----+------+
| id|name|deptno|
+---+----+------+
|103|Paul|     3|
|101|John|     1|
|102|Lisa|     2|
+---+----+------+



### Anti Join

In [20]:
SQL("""SELECT *
        FROM employee ANTI JOIN department 
        ON employee.deptno = department.deptno""").show()

+---+-----+------+
| id| name|deptno|
+---+-----+------+
|104| Evan|     4|
|106|  Amy|     6|
|105|Chloe|     5|
+---+-----+------+



### Cross Join

In [23]:
SQL("""SELECT id, name, employee.deptno, deptname 
        FROM employee CROSS JOIN department;""").show()

+---+-----+------+-----------+
| id| name|deptno|   deptname|
+---+-----+------+-----------+
|103| Paul|     3|Engineering|
|103| Paul|     3|  Marketing|
|103| Paul|     3|      Sales|
|101| John|     1|Engineering|
|101| John|     1|  Marketing|
|101| John|     1|      Sales|
|104| Evan|     4|Engineering|
|104| Evan|     4|  Marketing|
|104| Evan|     4|      Sales|
|106|  Amy|     6|Engineering|
|106|  Amy|     6|  Marketing|
|106|  Amy|     6|      Sales|
|105|Chloe|     5|Engineering|
|105|Chloe|     5|  Marketing|
|105|Chloe|     5|      Sales|
|102| Lisa|     2|Engineering|
|102| Lisa|     2|  Marketing|
|102| Lisa|     2|      Sales|
+---+-----+------+-----------+

