In [None]:
"""
A PySpark join combines two DataFrames, and by chaining joins you can combine multiple DataFrames.
It supports all the basic join types available in traditional SQL:
INNER, LEFT OUTER, RIGHT OUTER, LEFT ANTI, LEFT SEMI, CROSS, and SELF JOIN.
PySpark joins are wide transformations that involve shuffling data across the network.
"""


In [None]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr
# Obtain the notebook's SparkSession: getOrCreate() returns the already-running
# session if there is one, otherwise it starts a new one with this app name.
spark = (
    SparkSession.builder
    .appName('SparkByExamples.com')
    .getOrCreate()
)


In [None]:
# Bare expression: the notebook renders the SparkSession object as this cell's output.
spark


In [None]:
# Employee sample rows; each tuple follows the column order given in empColumns.
# Note that year_joined and emp_dept_id are stored as strings, and that the
# last two rows use an empty gender and a salary of -1.
emp = [
    (1, "Smith", -1, "2018", "10", "M", 3000),
    (2, "Rose", 1, "2010", "20", "M", 4000),
    (3, "Williams", 1, "2010", "10", "M", 1000),
    (4, "Jones", 2, "2005", "10", "F", 2000),
    (5, "Brown", 2, "2010", "40", "", -1),
    (6, "Brown", 2, "2010", "50", "", -1),
]
empColumns = [
    "emp_id",
    "name",
    "superior_emp_id",
    "year_joined",
    "emp_dept_id",
    "gender",
    "salary",
]
empDF = spark.createDataFrame(emp, empColumns)

# Render the employee DataFrame as a pandas table.
empDF.toPandas()


In [None]:
# Department sample rows as (dept_name, dept_id) pairs; dept_id is an integer.
dept = [
    ("Finance", 10),
    ("Marketing", 20),
    ("Sales", 30),
    ("IT", 40),
]
deptColumns = ["dept_name", "dept_id"]
deptDF = spark.createDataFrame(dept, deptColumns)

# Render the department DataFrame as a pandas table.
deptDF.toPandas()


In [6]:
# Full outer join of employees and departments, run once per join-type name
# ("outer", "full", "fullouter") to show that each call prints the joined table.
full_join_condition = empDF["emp_dept_id"] == deptDF["dept_id"]
for full_join_type in ("outer", "full", "fullouter"):
    joined_full = empDF.join(deptDF, full_join_condition, full_join_type)
    joined_full.show(truncate=False)


+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|emp_id|name    |superior_emp_id|year_joined|emp_dept_id|gender|salary|dept_name|dept_id|
+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|1     |Smith   |-1             |2018       |10         |M     |3000  |Finance  |10     |
|3     |Williams|1              |2010       |10         |M     |1000  |Finance  |10     |
|4     |Jones   |2              |2005       |10         |F     |2000  |Finance  |10     |
|2     |Rose    |1              |2010       |20         |M     |4000  |Marketing|20     |
|null  |null    |null           |null       |null       |null  |null  |Sales    |30     |
|5     |Brown   |2              |2010       |40         |      |-1    |IT       |40     |
|6     |Brown   |2              |2010       |50         |      |-1    |null     |null   |
+------+--------+---------------+-----------+-----------+------+------+---------+-------+



In [7]:
# Left outer join of employees and departments, run once per join-type name
# ("left", "leftouter"); every employee row is kept, with nulls where no
# department matches (see emp_id 6 in the output).
left_join_condition = empDF.emp_dept_id == deptDF.dept_id
for left_join_type in ("left", "leftouter"):
    joined_left = empDF.join(deptDF, left_join_condition, left_join_type)
    joined_left.show()


+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|emp_id|    name|superior_emp_id|year_joined|emp_dept_id|gender|salary|dept_name|dept_id|
+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|     1|   Smith|             -1|       2018|         10|     M|  3000|  Finance|     10|
|     3|Williams|              1|       2010|         10|     M|  1000|  Finance|     10|
|     4|   Jones|              2|       2005|         10|     F|  2000|  Finance|     10|
|     2|    Rose|              1|       2010|         20|     M|  4000|Marketing|     20|
|     5|   Brown|              2|       2010|         40|      |    -1|       IT|     40|
|     6|   Brown|              2|       2010|         50|      |    -1|     null|   null|
+------+--------+---------------+-----------+-----------+------+------+---------+-------+

+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|emp_id| 

In [8]:
"""
Note: In standard SQL, UNION eliminates duplicates while UNION ALL merges two datasets including duplicate records.
In PySpark, however, union() and unionAll() behave the same (both keep duplicate rows), so use the DataFrame dropDuplicates() or distinct() function to remove duplicate rows.
"""

'\nNote: In standard SQL, UNION eliminates duplicates while UNION ALL merges two datasets including duplicate records.\nIn PySpark, however, union() and unionAll() behave the same (both keep duplicate rows), so use the DataFrame dropDuplicates() or distinct() function to remove duplicate rows.\n'

In [9]:
"""
In Spark or PySpark, let’s see how to merge/union two DataFrames with a different number of columns (different schema).
Since Spark 3.1, you can easily achieve this using the unionByName() transformation by passing allowMissingColumns=True.
In older versions, this parameter is not available.
"""

'\nIn Spark or PySpark, let’s see how to merge/union two DataFrames with a different number of columns (different schema).\nSince Spark 3.1, you can easily achieve this using the unionByName() transformation by passing allowMissingColumns=True.\nIn older versions, this parameter is not available.\n'

In [27]:
# First DataFrame (dataDF): columns Name, Dept and Age.
data = [
    ("James", "Sales", 34),
    ("Michael", "Sales", 56),
    ("Robert", "Sales", 30),
    ("Maria", "Finance", 24),
]
dataDF = spark.createDataFrame(data, ['Name', 'Dept', 'Age'])

# Second DataFrame (dataDF1): columns Name, Dept, City and Salary.
# It shares only Name and Dept with dataDF, so the two schemas differ.
data1 = [
    ("James", "Sales", "NY", 9000),
    ("Maria", "Finance", "CA", 9000),
    ("Jen", "Finance", "NY", 7900),
    ("Jeff", "Marketing", "CA", 8000),
]
dataDF1 = spark.createDataFrame(data1, ['Name', 'Dept', 'City', 'Salary'])


In [28]:
# Convert dataDF to a pandas DataFrame so the notebook renders it as a table.
dataDF.toPandas()

Unnamed: 0,Name,Dept,Age
0,James,Sales,34
1,Michael,Sales,56
2,Robert,Sales,30
3,Maria,Finance,24


In [29]:
# Convert dataDF1 to a pandas DataFrame so the notebook renders it as a table.
dataDF1.toPandas()

Unnamed: 0,Name,Dept,City,Salary
0,James,Sales,NY,9000
1,Maria,Finance,CA,9000
2,Jen,Finance,NY,7900
3,Jeff,Marketing,CA,8000


In [34]:
# Union the two DataFrames by column name. allowMissingColumns=True lets the
# schemas differ: columns absent from one side come back empty for its rows
# (Age for dataDF1's rows, City/Salary for dataDF's rows, as the output shows).
(
    dataDF
    .unionByName(dataDF1, allowMissingColumns=True)
    .toPandas()
)

Unnamed: 0,Name,Dept,Age,City,Salary
0,James,Sales,34.0,,
1,Michael,Sales,56.0,,
2,Robert,Sales,30.0,,
3,Maria,Finance,24.0,,
4,James,Sales,,NY,9000.0
5,Maria,Finance,,CA,9000.0
6,Jen,Finance,,NY,7900.0
7,Jeff,Marketing,,CA,8000.0


Unnamed: 0,Name,Dept,Age
0,James,Sales,34
1,Maria,Finance,24
