In [None]:
PySpark union() and unionAll() transformations are used to merge two or more DataFrames of the same schema or structure.
In standard SQL, UNION eliminates duplicates while UNION ALL merges two datasets including duplicate records.
In PySpark, however, both behave the same (both keep duplicates), so use the DataFrame distinct() or dropDuplicates() function to remove duplicate rows.

union() can be performed on DataFrames that have the same schema and structure. If the schemas are different,
we may need to use unionByName() or change the DataFrames so that their schemas align before performing the union()
transformation.

union() retains NULL values from both DataFrames. If a column has a NULL value in one DataFrame and a non-NULL value
in the corresponding column of the other DataFrame, both values will be included in the result.

The union() transformation aligns columns based on their positions, not their names. If the two DataFrames list the same
columns in a different order, values end up under the wrong column; use unionByName() to match columns by name instead.

In [2]:
# Imports
import pyspark
from pyspark.sql import SparkSession

# A single local core is plenty for this small demonstration.
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName('SparkByExamples.com')
    .getOrCreate()
)

# First employee set; tuple fields follow the order given in `columns`.
simpleData = [
    ("James", "Sales", "NY", 90000, 34, 10000),
    ("Michael", "Sales", "NY", 86000, 56, 20000),
    ("Robert", "Sales", "CA", 81000, 30, 23000),
    ("Maria", "Finance", "CA", 90000, 24, 23000),
]
columns = ["employee_name", "department", "state", "salary", "age", "bonus"]

df = spark.createDataFrame(data=simpleData, schema=columns)
df.printSchema()
df.show(truncate=False)

# Second employee set; the James and Maria rows repeat rows of the first set.
simpleData2 = [
    ("James", "Sales", "NY", 90000, 34, 10000),
    ("Maria", "Finance", "CA", 90000, 24, 23000),
    ("Jen", "Finance", "NY", 79000, 53, 15000),
    ("Jeff", "Marketing", "CA", 80000, 25, 18000),
    ("Kumar", "Marketing", "NY", 91000, 50, 21000),
]
columns2 = ["employee_name", "department", "state", "salary", "age", "bonus"]

df2 = spark.createDataFrame(data=simpleData2, schema=columns2)
df2.printSchema()
df2.show(truncate=False)

# Plain union: rows of df followed by rows of df2.
unionDF = df.union(df2)
unionDF.show(truncate=False)

# Same union with distinct() applied to drop repeated rows.
disDF = df.union(df2).distinct()
disDF.show(truncate=False)

# unionAll() variant of the same merge, shown for comparison with union().
unionAllDF = df.unionAll(df2)
unionAllDF.show(truncate=False)

root
 |-- employee_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- state: string (nullable = true)
 |-- salary: long (nullable = true)
 |-- age: long (nullable = true)
 |-- bonus: long (nullable = true)

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|James        |Sales     |NY   |90000 |34 |10000|
|Michael      |Sales     |NY   |86000 |56 |20000|
|Robert       |Sales     |CA   |81000 |30 |23000|
|Maria        |Finance   |CA   |90000 |24 |23000|
+-------------+----------+-----+------+---+-----+

root
 |-- employee_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- state: string (nullable = true)
 |-- salary: long (nullable = true)
 |-- age: long (nullable = true)
 |-- bonus: long (nullable = true)

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----

In [None]:
# Sample employee records, one tuple per row; two rows carry None for gender.
employee_data = [
    (10, "raj kumar", "1900", "100", "M", 2000),
    (20, "kumar sani", "2000", "500", "M", 5000),
    (30, "Ajaykumar", "1980", "400", None, 3000),
    (40, "kumarh", "1990", "500", None, 4000),
    (50, "kumar", "2000", "600", "F", 6000),
]

# Column names, listed in the same positional order as the tuple fields.
empolyee_schema = [
    "emplyee_id",
    "name",
    "doj",
    "employee_dept_id",
    "gender",
    "salary",
]

df1 = spark.createDataFrame(employee_data, empolyee_schema)
display(df1)

emplyee_id,name,doj,employee_dept_id,gender,salary
10,raj kumar,1900,100,M,2000
20,kumar sani,2000,500,M,5000
30,Ajaykumar,1980,400,,3000
40,kumarh,1990,500,,4000
50,kumar,2000,600,F,6000


In [None]:
# Second batch of employee records; the first tuple is identical to the
# (10, "raj kumar", ...) record used for df1.
employee_data1 = [
    (10, "raj kumar", "1900", "100", "M", 2000),
    (20, "kumar sani", "3000", "500", "M", 6000),
    (100, "AjaySkumar", "1980", "500", None, 2000),
    (70, "kumarhari", "1990", "500", None, 4000),
    (80, "vetrikumar", "2000", "600", "F", 6000),
]

# Column names, listed in the same positional order as the tuple fields.
empolyee_schema = [
    "emplyee_id",
    "name",
    "doj",
    "employee_dept_id",
    "gender",
    "salary",
]

df2 = spark.createDataFrame(employee_data1, empolyee_schema)
display(df2)

emplyee_id,name,doj,employee_dept_id,gender,salary
10,raj kumar,1900,100,M,2000
20,kumar sani,3000,500,M,6000
100,AjaySkumar,1980,500,,2000
70,kumarhari,1990,500,,4000
80,vetrikumar,2000,600,F,6000


In [None]:
# Append df2's rows after df1's rows. union() does not de-duplicate, so the
# (10, raj kumar, ...) row that exists in both inputs shows up twice below.
df3 = df1.union(df2)
display(df3)

emplyee_id,name,doj,employee_dept_id,gender,salary
10,raj kumar,1900,100,M,2000
20,kumar sani,2000,500,M,5000
30,Ajaykumar,1980,400,,3000
40,kumarh,1990,500,,4000
50,kumar,2000,600,F,6000
10,raj kumar,1900,100,M,2000
20,kumar sani,3000,500,M,6000
100,AjaySkumar,1980,500,,2000
70,kumarhari,1990,500,,4000
80,vetrikumar,2000,600,F,6000


In [None]:
# Drop rows that are identical in every column: the repeated (10, raj kumar, ...)
# row collapses to one, while both "kumar sani" rows remain because their
# doj and salary values differ.
df4 = df3.dropDuplicates()
display(df4)

emplyee_id,name,doj,employee_dept_id,gender,salary
10,raj kumar,1900,100,M,2000
20,kumar sani,2000,500,M,5000
30,Ajaykumar,1980,400,,3000
40,kumarh,1990,500,,4000
50,kumar,2000,600,F,6000
20,kumar sani,3000,500,M,6000
100,AjaySkumar,1980,500,,2000
70,kumarhari,1990,500,,4000
80,vetrikumar,2000,600,F,6000


In [None]:
# unionAll() produces the same ten rows as df1.union(df2), duplicates included.
# NOTE(review): this rebinds df4, which an earlier cell assigned from
# df3.dropDuplicates(); anything reading df4 after this cell sees the
# unionAll() result instead — consider a separate name.
df4 = df1.unionAll(df2)
display(df4)

emplyee_id,name,doj,employee_dept_id,gender,salary
10,raj kumar,1900,100,M,2000
20,kumar sani,2000,500,M,5000
30,Ajaykumar,1980,400,,3000
40,kumarh,1990,500,,4000
50,kumar,2000,600,F,6000
10,raj kumar,1900,100,M,2000
20,kumar sani,3000,500,M,6000
100,AjaySkumar,1980,500,,2000
70,kumarhari,1990,500,,4000
80,vetrikumar,2000,600,F,6000


In [None]:
# Keep every df2 column except salary, leaving a five-column DataFrame.
df6 = df2.select(
    "emplyee_id",
    "name",
    "doj",
    "employee_dept_id",
    "gender",
)
display(df6)

emplyee_id,name,doj,employee_dept_id,gender
10,raj kumar,1900,100,M
20,kumar sani,3000,500,M
100,AjaySkumar,1980,500,
70,kumarhari,1990,500,
80,vetrikumar,2000,600,F


In [None]:
# Intentional failure: this call raises AnalysisException (see the traceback
# output). df1 has six columns while df6 has five, and union() requires both
# DataFrames to have the same number of columns.
df_merge = df1.union(df6)

[0;31m---------------------------------------------------------------------------[0m
[0;31mAnalysisException[0m                         Traceback (most recent call last)
File [0;32m<command-3468587347303643>:1[0m
[0;32m----> 1[0m df_merge [38;5;241m=[39m [43mdf1[49m[38;5;241;43m.[39;49m[43munion[49m[43m([49m[43mdf6[49m[43m)[49m

File [0;32m/databricks/spark/python/pyspark/instrumentation_utils.py:48[0m, in [0;36m_wrap_function.<locals>.wrapper[0;34m(*args, **kwargs)[0m
[1;32m     46[0m start [38;5;241m=[39m time[38;5;241m.[39mperf_counter()
[1;32m     47[0m [38;5;28;01mtry[39;00m:
[0;32m---> 48[0m     res [38;5;241m=[39m [43mfunc[49m[43m([49m[38;5;241;43m*[39;49m[43margs[49m[43m,[49m[43m [49m[38;5;241;43m*[39;49m[38;5;241;43m*[39;49m[43mkwargs[49m[43m)[49m
[1;32m     49[0m     logger[38;5;241m.[39mlog_success(
[1;32m     50[0m         module_name, class_name, function_name, time[38;5;241m.[39mperf_counter() [38;5;241