In [30]:
# To download dependencies, execute the following command in the terminal:
# ❯ spark-submit --packages com.databricks:spark-xml_2.12:0.13.0 xml.py

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [31]:
# Configure the session builder step by step: application name first, then
# the spark.jars.packages setting carrying the spark-xml Maven coordinate.
session_builder = SparkSession.builder.appName("ReadXML")
session_builder = session_builder.config(
    "spark.jars.packages", "com.databricks:spark-xml_2.12:0.13.0"
)

# getOrCreate() hands back an existing session when there is one, otherwise
# it starts a new session with the settings above.
spark = session_builder.getOrCreate()

In [32]:
# Input settings used by the read step below:
#   xml_file_path - location of the XML document to load
#   rowTag        - name of the XML element to extract as rows
xml_file_path, rowTag = "./input/books.xml", "book"

In [33]:
# Configure a reader for the spark-xml data source, passing the configured
# row tag as the "rowTag" option.
xml_reader = spark.read.format('com.databricks.spark.xml').option("rowTag", rowTag)

# Load the XML file into a DataFrame.
df = xml_reader.load(xml_file_path)

In [34]:
# Emit a progress message before inspecting the loaded data.
print("Reading XML...")
# Print the DataFrame schema (column names, types and nullability) as a tree.
df.printSchema()

Reading XML...
root
 |-- _id: string (nullable = true)
 |-- author: string (nullable = true)
 |-- description: string (nullable = true)
 |-- genre: string (nullable = true)
 |-- price: double (nullable = true)
 |-- publish_date: date (nullable = true)
 |-- title: string (nullable = true)



In [35]:
# Display the first 20 rows (the show() default) without truncating long cell values.
df.show(n=20, truncate=False)

+-----+--------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [36]:
# Persist the DataFrame as the Parquet-backed table "xml_tb".
# mode("overwrite") keeps this cell re-runnable: with the default save mode
# the write raises an error when the table already exists.
(
    df.write
    .format("parquet")
    .mode("overwrite")
    .saveAsTable("xml_tb")
)

                                                                                

In [37]:
# Write the DataFrame as Parquet files under the "xml_tb_parquet" path.
# mode("overwrite") keeps this cell re-runnable: with the default save mode
# the write raises an error when the output path already exists.
(
    df.write
    .format("parquet")
    .mode("overwrite")
    .save("xml_tb_parquet")
)

In [39]:
# Run SHOW TABLES through Spark SQL and print the resulting listing.
spark.sql("show tables").show()

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
|  default|   xml_tb|      false|
+---------+---------+-----------+



In [40]:
# Query the xml_tb table with Spark SQL and print at most ten rows.
(
    spark
    .sql("select * from xml_tb limit 10")
    .show()
)

+-----+--------------------+--------------------+---------------+-----+------------+--------------------+
|  _id|              author|         description|          genre|price|publish_date|               title|
+-----+--------------------+--------------------+---------------+-----+------------+--------------------+
|bk101|Gambardella, Matthew|\n\n\n         An...|       Computer|44.95|  2000-10-01|XML Developer's G...|
|bk102|          Ralls, Kim|A former architec...|        Fantasy| 5.95|  2000-12-16|       Midnight Rain|
|bk103|         Corets, Eva|After the collaps...|        Fantasy| 5.95|  2000-11-17|     Maeve Ascendant|
|bk104|         Corets, Eva|In post-apocalyps...|        Fantasy| 5.95|  2001-03-10|     Oberon's Legacy|
|bk105|         Corets, Eva|The two daughters...|        Fantasy| 5.95|  2001-09-10|  The Sundered Grail|
|bk106|    Randall, Cynthia|When Carla meets ...|        Romance| 4.95|  2000-09-02|         Lover Birds|
|bk107|      Thurman, Paula|A deep sea diver .

In [29]:
# Stop the SparkSession once the notebook's work is done; cells that use
# `spark` need a new session after this point.
spark.stop()