## Union
PySpark's DataFrame `union()` method merges two DataFrames that have the same structure/schema. If the schemas are not the same, it returns an error.
`union()` keeps duplicate records when it merges the two datasets. To remove the duplicate rows, apply the DataFrame `distinct()` function (or `dropDuplicates()`) to the result.

In [1]:
import pyspark
from pyspark.sql import SparkSession

# Obtain the Spark session for this notebook (created on first use).
spark = SparkSession.builder.appName('Union').getOrCreate()

# Employee records: (employee_name, department, state, salary, age, bonus).
simpleData = [
    ("James", "Sales", "NY", 90000, 34, 10000),
    ("Michael", "Sales", "NY", 86000, 56, 20000),
    ("Robert", "Sales", "CA", 81000, 30, 23000),
    ("Maria", "Finance", "CA", 90000, 24, 23000),
]

# Column names, in the same order as the tuple fields above.
columns = ["employee_name", "department", "state", "salary", "age", "bonus"]

# Build the first DataFrame, then inspect its inferred schema and contents.
df = spark.createDataFrame(simpleData, columns)
df.printSchema()
df.show(truncate=False)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/08/08 17:16:32 WARN Utils: Your hostname, javier-ubuntu, resolves to a loopback address: 127.0.1.1; using 172.17.0.1 instead (on interface docker0)
25/08/08 17:16:32 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/08/08 17:16:33 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/08/08 17:16:34 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


root
 |-- employee_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- state: string (nullable = true)
 |-- salary: long (nullable = true)
 |-- age: long (nullable = true)
 |-- bonus: long (nullable = true)



                                                                                

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|James        |Sales     |NY   |90000 |34 |10000|
|Michael      |Sales     |NY   |86000 |56 |20000|
|Robert       |Sales     |CA   |81000 |30 |23000|
|Maria        |Finance   |CA   |90000 |24 |23000|
+-------------+----------+-----+------+---+-----+



Create a second DataFrame with the same schema. It contains some new records as well as some records that already appear in the DataFrame above.

In [2]:
# Second batch of employee records; James and Maria repeat rows from simpleData.
simpleData2 = [
    ("James", "Sales", "NY", 90000, 34, 10000),
    ("Maria", "Finance", "CA", 90000, 24, 23000),
    ("Jen", "Finance", "NY", 79000, 53, 15000),
    ("Jeff", "Marketing", "CA", 80000, 25, 18000),
    ("Kumar", "Marketing", "NY", 91000, 50, 21000),
]

# Same column names and order as the first DataFrame.
columns2 = ["employee_name", "department", "state", "salary", "age", "bonus"]

# Build the second DataFrame, then inspect its inferred schema and contents.
df2 = spark.createDataFrame(simpleData2, columns2)
df2.printSchema()
df2.show(truncate=False)

root
 |-- employee_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- state: string (nullable = true)
 |-- salary: long (nullable = true)
 |-- age: long (nullable = true)
 |-- bonus: long (nullable = true)

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|James        |Sales     |NY   |90000 |34 |10000|
|Maria        |Finance   |CA   |90000 |24 |23000|
|Jen          |Finance   |NY   |79000 |53 |15000|
|Jeff         |Marketing |CA   |80000 |25 |18000|
|Kumar        |Marketing |NY   |91000 |50 |21000|
+-------------+----------+-----+------+---+-----+



### Merge two or more DataFrames using union
The DataFrame `union()` method merges two DataFrames and returns a new DataFrame containing all rows from both DataFrames, including any duplicates.

In [3]:
# Append df2's rows to df's rows. Rows present in both inputs
# (James and Maria here) appear twice in the result.
unionDF = df.union(df2)
unionDF.show(truncate=False)

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|James        |Sales     |NY   |90000 |34 |10000|
|Michael      |Sales     |NY   |86000 |56 |20000|
|Robert       |Sales     |CA   |81000 |30 |23000|
|Maria        |Finance   |CA   |90000 |24 |23000|
|James        |Sales     |NY   |90000 |34 |10000|
|Maria        |Finance   |CA   |90000 |24 |23000|
|Jen          |Finance   |NY   |79000 |53 |15000|
|Jeff         |Marketing |CA   |80000 |25 |18000|
|Kumar        |Marketing |NY   |91000 |50 |21000|
+-------------+----------+-----+------+---+-----+



                                                                                

### Merge without Duplicates
Since the `union()` method returns all rows, including duplicates, we use the `distinct()` function to keep just one record wherever duplicates exist.

In [4]:
# Union the two DataFrames, then keep a single copy of each duplicated row.
disDF = df.union(df2).distinct()
disDF.show(truncate=False)



+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|James        |Sales     |NY   |90000 |34 |10000|
|Michael      |Sales     |NY   |86000 |56 |20000|
|Robert       |Sales     |CA   |81000 |30 |23000|
|Maria        |Finance   |CA   |90000 |24 |23000|
|Jen          |Finance   |NY   |79000 |53 |15000|
|Jeff         |Marketing |CA   |80000 |25 |18000|
|Kumar        |Marketing |NY   |91000 |50 |21000|
+-------------+----------+-----+------+---+-----+



                                                                                

In [5]:
# Select every column explicitly by name and display the result.
disDF.select('employee_name','department','state','salary','age','bonus').show()

                                                                                

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|        James|     Sales|   NY| 90000| 34|10000|
|      Michael|     Sales|   NY| 86000| 56|20000|
|       Robert|     Sales|   CA| 81000| 30|23000|
|        Maria|   Finance|   CA| 90000| 24|23000|
|          Jen|   Finance|   NY| 79000| 53|15000|
|         Jeff| Marketing|   CA| 80000| 25|18000|
|        Kumar| Marketing|   NY| 91000| 50|21000|
+-------------+----------+-----+------+---+-----+



In [6]:
# Slice the column list to drop its last entry ('bonus'), then display.
disDF.select(disDF.columns[:-1]).show()

+-------------+----------+-----+------+---+
|employee_name|department|state|salary|age|
+-------------+----------+-----+------+---+
|        James|     Sales|   NY| 90000| 34|
|      Michael|     Sales|   NY| 86000| 56|
|       Robert|     Sales|   CA| 81000| 30|
|        Maria|   Finance|   CA| 90000| 24|
|          Jen|   Finance|   NY| 79000| 53|
|         Jeff| Marketing|   CA| 80000| 25|
|        Kumar| Marketing|   NY| 91000| 50|
+-------------+----------+-----+------+---+



                                                                                

In [7]:
# Passing the full column list selects every column.
disDF.select(disDF.columns).show()



+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|        James|     Sales|   NY| 90000| 34|10000|
|      Michael|     Sales|   NY| 86000| 56|20000|
|       Robert|     Sales|   CA| 81000| 30|23000|
|        Maria|   Finance|   CA| 90000| 24|23000|
|          Jen|   Finance|   NY| 79000| 53|15000|
|         Jeff| Marketing|   CA| 80000| 25|18000|
|        Kumar| Marketing|   NY| 91000| 50|21000|
+-------------+----------+-----+------+---+-----+



                                                                                

In [8]:
# Keep a named DataFrame holding every column except the last one ('bonus').
df_small = disDF.select(disDF.columns[:-1])
df_small.show()

+-------------+----------+-----+------+---+
|employee_name|department|state|salary|age|
+-------------+----------+-----+------+---+
|        James|     Sales|   NY| 90000| 34|
|      Michael|     Sales|   NY| 86000| 56|
|       Robert|     Sales|   CA| 81000| 30|
|        Maria|   Finance|   CA| 90000| 24|
|          Jen|   Finance|   NY| 79000| 53|
|         Jeff| Marketing|   CA| 80000| 25|
|        Kumar| Marketing|   NY| 91000| 50|
+-------------+----------+-----+------+---+



## PySpark Merge Two DataFrames with Different Columns

In [9]:
from pyspark.sql import SparkSession

# getOrCreate() returns the already-running session when one exists.
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()

# Create DataFrame df1 with columns name, dept & age
data = [
    ("James", "Sales", 34),
    ("Michael", "Sales", 56),
    ("Robert", "Sales", 30),
    ("Maria", "Finance", 24),
]
columns = ["name", "dept", "age"]
df1 = spark.createDataFrame(data, columns)
df1.printSchema()
df1.show()

# Create DataFrame df2 with columns name, dept, state & salary
# (this rebinds the df2 name used earlier in the notebook)
data2 = [
    ("James", "Sales", "NY", 9000),
    ("Maria", "Finance", "CA", 9000),
    ("Jen", "Finance", "NY", 7900),
    ("Jeff", "Marketing", "CA", 8000),
]
columns2 = ["name", "dept", "state", "salary"]
df2 = spark.createDataFrame(data2, columns2)
df2.printSchema()
df2.show()

root
 |-- name: string (nullable = true)
 |-- dept: string (nullable = true)
 |-- age: long (nullable = true)



25/08/08 17:16:47 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


+-------+-------+---+
|   name|   dept|age|
+-------+-------+---+
|  James|  Sales| 34|
|Michael|  Sales| 56|
| Robert|  Sales| 30|
|  Maria|Finance| 24|
+-------+-------+---+

root
 |-- name: string (nullable = true)
 |-- dept: string (nullable = true)
 |-- state: string (nullable = true)
 |-- salary: long (nullable = true)

+-----+---------+-----+------+
| name|     dept|state|salary|
+-----+---------+-----+------+
|James|    Sales|   NY|  9000|
|Maria|  Finance|   CA|  9000|
|  Jen|  Finance|   NY|  7900|
| Jeff|Marketing|   CA|  8000|
+-----+---------+-----+------+



In [10]:
# Align the two schemas so that every column exists in both DataFrames.
from pyspark.sql.functions import lit

# Add missing columns 'state' & 'salary' to df1.
# The list of names is computed once, before df1 is rebound inside the loop.
missing_in_df1 = [col_name for col_name in df2.columns if col_name not in df1.columns]
for col_name in missing_in_df1:
    # Cast the null literal to df2's type for this column so the new column
    # is typed (string/long) instead of an untyped NULL column.
    df1 = df1.withColumn(col_name, lit(None).cast(df2.schema[col_name].dataType))

df1.show()

# Add missing column 'age' to df2, typed like df1's 'age' column.
missing_in_df2 = [col_name for col_name in df1.columns if col_name not in df2.columns]
for col_name in missing_in_df2:
    df2 = df2.withColumn(col_name, lit(None).cast(df1.schema[col_name].dataType))

df2.show()

+-------+-------+---+-----+------+
|   name|   dept|age|state|salary|
+-------+-------+---+-----+------+
|  James|  Sales| 34| NULL|  NULL|
|Michael|  Sales| 56| NULL|  NULL|
| Robert|  Sales| 30| NULL|  NULL|
|  Maria|Finance| 24| NULL|  NULL|
+-------+-------+---+-----+------+

+-----+---------+-----+------+----+
| name|     dept|state|salary| age|
+-----+---------+-----+------+----+
|James|    Sales|   NY|  9000|NULL|
|Maria|  Finance|   CA|  9000|NULL|
|  Jen|  Finance|   NY|  7900|NULL|
| Jeff|Marketing|   CA|  8000|NULL|
+-----+---------+-----+------+----+



In [11]:
# Finally, union the two DataFrames df1 & df2, matching columns by name
# rather than by position (df1 and df2 list their columns in different orders).
merged_df=df1.unionByName(df2)
merged_df.show()

+-------+---------+----+-----+------+
|   name|     dept| age|state|salary|
+-------+---------+----+-----+------+
|  James|    Sales|  34| NULL|  NULL|
|Michael|    Sales|  56| NULL|  NULL|
| Robert|    Sales|  30| NULL|  NULL|
|  Maria|  Finance|  24| NULL|  NULL|
|  James|    Sales|NULL|   NY|  9000|
|  Maria|  Finance|NULL|   CA|  9000|
|    Jen|  Finance|NULL|   NY|  7900|
|   Jeff|Marketing|NULL|   CA|  8000|
+-------+---------+----+-----+------+

