In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit

# Start (or reuse) the Spark session used throughout this notebook.
spark = SparkSession.builder.appName("Union").getOrCreate()

# Field names, in the same order as the values inside each record below.
columns = [
    "employee_name",
    "department",
    "state",
    "salary",
    "age",
    "bonus",
    "afilliation",
]

# Sample employee records. The enclosing brackets already let the literal
# span several lines, so no backslash continuations are needed.
simpleData = [
    ("James", "Sales", "NY", 90000, 34, 10000, "R"),
    ("Michael", "Sales", "NY", 86000, 56, 20000, "D"),
    ("Robert", "Sales", "CA", 81000, 30, 23000, "R"),
    ("Maria", "Finance", "CA", 90000, 24, 23000, "D"),
    ("Jen", "Finance", "NY", 79000, 53, 15000, "R"),
    ("Jeff", "Marketing", "CA", 80000, 25, 18000, "L"),
    ("Kumar", "Marketing", "NY", 91000, 50, 21000, "R"),
]

df_full = spark.createDataFrame(simpleData, schema=columns)
df_full.printSchema()
df_full.show(truncate=False)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/08/10 15:31:09 WARN Utils: Your hostname, javier-ubuntu, resolves to a loopback address: 127.0.1.1; using 10.0.0.205 instead (on interface wlx0013eff3e14d)
25/08/10 15:31:09 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/08/10 15:31:10 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/08/10 15:31:10 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/08/10 15:31:10 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


root
 |-- employee_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- state: string (nullable = true)
 |-- salary: long (nullable = true)
 |-- age: long (nullable = true)
 |-- bonus: long (nullable = true)
 |-- afilliation: string (nullable = true)



                                                                                

+-------------+----------+-----+------+---+-----+-----------+
|employee_name|department|state|salary|age|bonus|afilliation|
+-------------+----------+-----+------+---+-----+-----------+
|James        |Sales     |NY   |90000 |34 |10000|R          |
|Michael      |Sales     |NY   |86000 |56 |20000|D          |
|Robert       |Sales     |CA   |81000 |30 |23000|R          |
|Maria        |Finance   |CA   |90000 |24 |23000|D          |
|Jen          |Finance   |NY   |79000 |53 |15000|R          |
|Jeff         |Marketing |CA   |80000 |25 |18000|L          |
|Kumar        |Marketing |NY   |91000 |50 |21000|R          |
+-------------+----------+-----+------+---+-----+-----------+



In [2]:
# Keep every column of df_full except the last two.
regular_columns = df_full.columns[:-2]
regular_df = df_full.select(regular_columns)
regular_df.show()

+-------------+----------+-----+------+---+
|employee_name|department|state|salary|age|
+-------------+----------+-----+------+---+
|        James|     Sales|   NY| 90000| 34|
|      Michael|     Sales|   NY| 86000| 56|
|       Robert|     Sales|   CA| 81000| 30|
|        Maria|   Finance|   CA| 90000| 24|
|          Jen|   Finance|   NY| 79000| 53|
|         Jeff| Marketing|   CA| 80000| 25|
|        Kumar| Marketing|   NY| 91000| 50|
+-------------+----------+-----+------+---+



Create a DataFrame with fewer columns to test the union.

In [3]:
# Keep every column of df_full except the last four.
smaller_columns = df_full.columns[:-4]
smaller_df = df_full.select(smaller_columns)
smaller_df.show()

+-------------+----------+-----+
|employee_name|department|state|
+-------------+----------+-----+
|        James|     Sales|   NY|
|      Michael|     Sales|   NY|
|       Robert|     Sales|   CA|
|        Maria|   Finance|   CA|
|          Jen|   Finance|   NY|
|         Jeff| Marketing|   CA|
|        Kumar| Marketing|   NY|
+-------------+----------+-----+



In [4]:
# Column names found in one frame but not the other, checked in both directions:
# first regular_df minus smaller_df, then smaller_df minus regular_df.
for left, right in ((regular_df, smaller_df), (smaller_df, regular_df)):
    print(list(set(left.columns) - set(right.columns)))

['salary', 'age']
[]


In [5]:
# First: columns regular_df has that df_full lacks.
# Second: columns df_full has that smaller_df lacks.
for left, right in ((regular_df, df_full), (df_full, smaller_df)):
    print(list(set(left.columns) - set(right.columns)))

[]
['afilliation', 'bonus', 'age', 'salary']


In [6]:
# Columns of df_full that regular_df does not carry.
only_in_full = set(df_full.columns) - set(regular_df.columns)
print(list(only_in_full))

['afilliation', 'bonus']


In [7]:
# Pad a copy of smaller_df with NULL columns for everything regular_df has
# that it lacks. A new name is used so smaller_df itself is not rebound and
# the cells that display or compare it keep giving the same result on re-run.
smaller_padded_df = smaller_df
for column in regular_df.columns:
    if column not in smaller_df.columns:
        smaller_padded_df = smaller_padded_df.withColumn(column, lit(None))

# union() pairs columns by position, not by name, so put the padded frame
# into regular_df's column order before combining.
merged_df = regular_df.union(smaller_padded_df.select(regular_df.columns))

merged_df.show()

+-------------+----------+-----+------+----+
|employee_name|department|state|salary| age|
+-------------+----------+-----+------+----+
|        James|     Sales|   NY| 90000|  34|
|      Michael|     Sales|   NY| 86000|  56|
|       Robert|     Sales|   CA| 81000|  30|
|        Maria|   Finance|   CA| 90000|  24|
|          Jen|   Finance|   NY| 79000|  53|
|         Jeff| Marketing|   CA| 80000|  25|
|        Kumar| Marketing|   NY| 91000|  50|
|        James|     Sales|   NY|  NULL|NULL|
|      Michael|     Sales|   NY|  NULL|NULL|
|       Robert|     Sales|   CA|  NULL|NULL|
|        Maria|   Finance|   CA|  NULL|NULL|
|          Jen|   Finance|   NY|  NULL|NULL|
|         Jeff| Marketing|   CA|  NULL|NULL|
|        Kumar| Marketing|   NY|  NULL|NULL|
+-------------+----------+-----+------+----+



In [8]:
# Project df_full onto regular_df's columns so both sides of the union
# share the same column layout.
full_as_regular_df = df_full.select(regular_df.columns)
merged2_df = regular_df.union(full_as_regular_df)

merged2_df.show()

+-------------+----------+-----+------+---+
|employee_name|department|state|salary|age|
+-------------+----------+-----+------+---+
|        James|     Sales|   NY| 90000| 34|
|      Michael|     Sales|   NY| 86000| 56|
|       Robert|     Sales|   CA| 81000| 30|
|        Maria|   Finance|   CA| 90000| 24|
|          Jen|   Finance|   NY| 79000| 53|
|         Jeff| Marketing|   CA| 80000| 25|
|        Kumar| Marketing|   NY| 91000| 50|
|        James|     Sales|   NY| 90000| 34|
|      Michael|     Sales|   NY| 86000| 56|
|       Robert|     Sales|   CA| 81000| 30|
|        Maria|   Finance|   CA| 90000| 24|
|          Jen|   Finance|   NY| 79000| 53|
|         Jeff| Marketing|   CA| 80000| 25|
|        Kumar| Marketing|   NY| 91000| 50|
+-------------+----------+-----+------+---+



In [9]:
# Display regular_df again, after it has been used as the left side of the
# unions above.
regular_df.show()

+-------------+----------+-----+------+---+
|employee_name|department|state|salary|age|
+-------------+----------+-----+------+---+
|        James|     Sales|   NY| 90000| 34|
|      Michael|     Sales|   NY| 86000| 56|
|       Robert|     Sales|   CA| 81000| 30|
|        Maria|   Finance|   CA| 90000| 24|
|          Jen|   Finance|   NY| 79000| 53|
|         Jeff| Marketing|   CA| 80000| 25|
|        Kumar| Marketing|   NY| 91000| 50|
+-------------+----------+-----+------+---+



In [10]:
def multi_column_union(df1:pyspark.sql.dataframe.DataFrame, df2:pyspark.sql.dataframe.DataFrame) -> pyspark.sql.dataframe.DataFrame:
    """Append the rows of ``df2`` to ``df1``, keeping ``df1``'s column layout.

    ``df2`` is first projected onto ``df1.columns`` (same names, same order):

    * columns present in both frames are taken from ``df2`` by name;
    * columns that only ``df1`` has are filled with NULL, cast to the type
      ``df1`` declares for that column;
    * columns that only ``df2`` has are dropped.

    The projection is required because ``DataFrame.union`` pairs columns by
    position, not by name, so a ``df2`` with the same names in a different
    order would otherwise be merged into the wrong columns.

    Args:
        df1: Frame whose columns (names, order, types) define the result.
        df2: Frame whose rows are appended to those of ``df1``.

    Returns:
        ``df1`` itself when ``df2`` has no columns at all; otherwise the
        union of ``df1`` with the projected ``df2`` (duplicate rows are kept).
    """
    # A frame without columns has nothing that could be aligned with df1.
    if not df2.columns:
        return df1

    df2_names = set(df2.columns)
    # One expression per df1 column, in df1's order: the df2 column when it
    # exists, otherwise a typed NULL placeholder carrying the same name.
    aligned = [
        df2[name] if name in df2_names
        else lit(None).cast(df1.schema[name].dataType).alias(name)
        for name in df1.columns
    ]
    return df1.union(df2.select(*aligned))


In [11]:
# Both inputs are the same frame, so the two column sets are identical.
kk = multi_column_union(df1=regular_df, df2=regular_df)
kk.show()

+-------------+----------+-----+------+---+
|employee_name|department|state|salary|age|
+-------------+----------+-----+------+---+
|        James|     Sales|   NY| 90000| 34|
|      Michael|     Sales|   NY| 86000| 56|
|       Robert|     Sales|   CA| 81000| 30|
|        Maria|   Finance|   CA| 90000| 24|
|          Jen|   Finance|   NY| 79000| 53|
|         Jeff| Marketing|   CA| 80000| 25|
|        Kumar| Marketing|   NY| 91000| 50|
|        James|     Sales|   NY| 90000| 34|
|      Michael|     Sales|   NY| 86000| 56|
|       Robert|     Sales|   CA| 81000| 30|
|        Maria|   Finance|   CA| 90000| 24|
|          Jen|   Finance|   NY| 79000| 53|
|         Jeff| Marketing|   CA| 80000| 25|
|        Kumar| Marketing|   NY| 91000| 50|
+-------------+----------+-----+------+---+



In [12]:
# Second input drops the last four columns of df_full, so it lacks the
# salary and age columns that regular_df has.
fewer_columns_df = df_full.select(df_full.columns[:-4])
kk = multi_column_union(regular_df, fewer_columns_df)
kk.show()

+-------------+----------+-----+------+----+
|employee_name|department|state|salary| age|
+-------------+----------+-----+------+----+
|        James|     Sales|   NY| 90000|  34|
|      Michael|     Sales|   NY| 86000|  56|
|       Robert|     Sales|   CA| 81000|  30|
|        Maria|   Finance|   CA| 90000|  24|
|          Jen|   Finance|   NY| 79000|  53|
|         Jeff| Marketing|   CA| 80000|  25|
|        Kumar| Marketing|   NY| 91000|  50|
|        James|     Sales|   NY|  NULL|NULL|
|      Michael|     Sales|   NY|  NULL|NULL|
|       Robert|     Sales|   CA|  NULL|NULL|
|        Maria|   Finance|   CA|  NULL|NULL|
|          Jen|   Finance|   NY|  NULL|NULL|
|         Jeff| Marketing|   CA|  NULL|NULL|
|        Kumar| Marketing|   NY|  NULL|NULL|
+-------------+----------+-----+------+----+



In [13]:
# Second input is the full frame, which has columns regular_df does not.
kk = multi_column_union(df1=regular_df, df2=df_full)
kk.show()

+-------------+----------+-----+------+---+
|employee_name|department|state|salary|age|
+-------------+----------+-----+------+---+
|        James|     Sales|   NY| 90000| 34|
|      Michael|     Sales|   NY| 86000| 56|
|       Robert|     Sales|   CA| 81000| 30|
|        Maria|   Finance|   CA| 90000| 24|
|          Jen|   Finance|   NY| 79000| 53|
|         Jeff| Marketing|   CA| 80000| 25|
|        Kumar| Marketing|   NY| 91000| 50|
|        James|     Sales|   NY| 90000| 34|
|      Michael|     Sales|   NY| 86000| 56|
|       Robert|     Sales|   CA| 81000| 30|
|        Maria|   Finance|   CA| 90000| 24|
|          Jen|   Finance|   NY| 79000| 53|
|         Jeff| Marketing|   CA| 80000| 25|
|        Kumar| Marketing|   NY| 91000| 50|
+-------------+----------+-----+------+---+



In [14]:
# The slice asks to drop eight trailing names from a seven-name list, so it
# comes back empty and the selection has no columns.
no_columns = df_full.columns[:-8]
df_full.select(no_columns).show()

++
||
++
||
||
||
||
||
||
||
++



In [15]:
# Second input is a selection with an empty column list.
zero_column_df = df_full.select(df_full.columns[:-8])
kk = multi_column_union(regular_df, zero_column_df)
kk.show()

+-------------+----------+-----+------+---+
|employee_name|department|state|salary|age|
+-------------+----------+-----+------+---+
|        James|     Sales|   NY| 90000| 34|
|      Michael|     Sales|   NY| 86000| 56|
|       Robert|     Sales|   CA| 81000| 30|
|        Maria|   Finance|   CA| 90000| 24|
|          Jen|   Finance|   NY| 79000| 53|
|         Jeff| Marketing|   CA| 80000| 25|
|        Kumar| Marketing|   NY| 91000| 50|
+-------------+----------+-----+------+---+

