# Merge two DataFrames with different schemas

In [None]:
# Start (or reuse) the Spark session that the rest of the notebook runs on

from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .master("spark://spark-master:7077")
    .appName("Merge Data Frames")
    .getOrCreate()
)

# Bare last expression so the notebook renders the session summary
spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/10/21 15:57:22 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [None]:
# First sample DataFrame: columns ID, NAME, AGE and DOB
rows_1 = [
    ["C101", "Akshay", 21, "22-10-2001"],
    ["C102", "Sivay", 20, "07-09-2000"],
    ["C103", "Aslam", 23, "04-05-1998"],
]
cols_1 = ["ID", "NAME", "AGE", "DOB"]

# Only column names are supplied; Spark infers the column types from the row values
df_1 = spark.createDataFrame(rows_1, cols_1)

# Inspect the inferred schema, then the rows without truncating cell values
df_1.printSchema()
df_1.show(n=10, truncate=False)

root
 |-- ID: string (nullable = true)
 |-- NAME: string (nullable = true)
 |-- AGE: long (nullable = true)
 |-- DOB: string (nullable = true)



[Stage 1:>                                                          (0 + 1) / 1]

+----+------+---+----------+
|ID  |NAME  |AGE|DOB       |
+----+------+---+----------+
|C101|Akshay|21 |22-10-2001|
|C102|Sivay |20 |07-09-2000|
|C103|Aslam |23 |04-05-1998|
+----+------+---+----------+



                                                                                

In [None]:
# Second sample DataFrame: shares ID and NAME with the first one, but carries
# ADDRESS and an array-valued SUBJECTS column instead of AGE and DOB
rows_2 = [
    ["C106", "Suku", "Indore", ["Maths", "English"]],
    ["C110", "Jack", "Mumbai", ["Maths", "English", "Science"]],
    ["C113", "Gopi", "Rajkot", ["Social Science"]],
]
cols_2 = ["ID", "NAME", "ADDRESS", "SUBJECTS"]

# Only column names are supplied; Spark infers the column types from the row values
df_2 = spark.createDataFrame(rows_2, cols_2)

# Inspect the inferred schema, then the rows without truncating cell values
df_2.printSchema()
df_2.show(n=10, truncate=False)

root
 |-- ID: string (nullable = true)
 |-- NAME: string (nullable = true)
 |-- ADDRESS: string (nullable = true)
 |-- SUBJECTS: array (nullable = true)
 |    |-- element: string (containsNull = true)

+----+----+-------+-------------------------+
|ID  |NAME|ADDRESS|SUBJECTS                 |
+----+----+-------+-------------------------+
|C106|Suku|Indore |[Maths, English]         |
|C110|Jack|Mumbai |[Maths, English, Science]|
|C113|Gopi|Rajkot |[Social Science]         |
+----+----+-------+-------------------------+



In [4]:
# Let's try a union while the two schemas still differ.
# unionByName matches columns by name, and df_2 has no AGE or DOB column, so
# Spark raises an AnalysisException. The error is the point of this cell, so it
# is caught and printed; that way "Restart & Run All" can continue past it.
from pyspark.sql.utils import AnalysisException

try:
    df = df_1.unionByName(df_2)
except AnalysisException as exc:
    print(f"AnalysisException: {exc}")

AnalysisException: Cannot resolve column name "AGE" among (ID, NAME, ADDRESS, SUBJECTS).

In [None]:
# Before the DataFrames can be merged, each one must gain the columns that
# exist only in the other one.
from pyspark.sql.functions import lit

# Snapshot both schemas first, so each loop sees the original column types of
# the *other* DataFrame even though df_1 / df_2 are rebound inside the loops.
schema_1 = df_1.schema
schema_2 = df_2.schema

# Add the columns that only df_2 has to df_1.
# The null filler is cast to the column's real type: a bare lit(None) would
# create an untyped (NullType / "void") column instead of e.g. array<string>.
for field in schema_2.fields:
    if field.name not in df_1.columns:
        df_1 = df_1.withColumn(field.name, lit(None).cast(field.dataType))

# Add the columns that only df_1 has to df_2, typed the same way.
for field in schema_1.fields:
    if field.name not in df_2.columns:
        df_2 = df_2.withColumn(field.name, lit(None).cast(field.dataType))

# NOTE: on Spark 3.1+, df_1.unionByName(df_2, allowMissingColumns=True) fills
# missing columns with nulls automatically; the manual approach is shown here
# to make the mechanics explicit.

# View the DataFrames
df_1.show()
df_2.show()

+----+------+---+----------+-------+--------+
|  ID|  NAME|AGE|       DOB|ADDRESS|SUBJECTS|
+----+------+---+----------+-------+--------+
|C101|Akshay| 21|22-10-2001|   null|    null|
|C102| Sivay| 20|07-09-2000|   null|    null|
|C103| Aslam| 23|04-05-1998|   null|    null|
+----+------+---+----------+-------+--------+

+----+----+-------+--------------------+----+----+
|  ID|NAME|ADDRESS|            SUBJECTS| AGE| DOB|
+----+----+-------+--------------------+----+----+
|C106|Suku| Indore|    [Maths, English]|null|null|
|C110|Jack| Mumbai|[Maths, English, ...|null|null|
|C113|Gopi| Rajkot|    [Social Science]|null|null|
+----+----+-------+--------------------+----+----+



In [6]:
# Both DataFrames now expose the same set of column names, so unionByName can
# pair the columns up by name even though their column order differs
df = df_1.unionByName(other=df_2)

# Show the merged schema, then all rows without truncating cell values
df.printSchema()
df.show(n=10, truncate=False)

root
 |-- ID: string (nullable = true)
 |-- NAME: string (nullable = true)
 |-- AGE: long (nullable = true)
 |-- DOB: string (nullable = true)
 |-- ADDRESS: string (nullable = true)
 |-- SUBJECTS: array (nullable = true)
 |    |-- element: string (containsNull = true)

+----+------+----+----------+-------+-------------------------+
|ID  |NAME  |AGE |DOB       |ADDRESS|SUBJECTS                 |
+----+------+----+----------+-------+-------------------------+
|C101|Akshay|21  |22-10-2001|null   |null                     |
|C102|Sivay |20  |07-09-2000|null   |null                     |
|C103|Aslam |23  |04-05-1998|null   |null                     |
|C106|Suku  |null|null      |Indore |[Maths, English]         |
|C110|Jack  |null|null      |Mumbai |[Maths, English, Science]|
|C113|Gopi  |null|null      |Rajkot |[Social Science]         |
+----+------+----+----------+-------+-------------------------+



In [7]:
# Stop the Spark session created at the top of the notebook
spark.stop()