# DataFrameReader and DataFrameWriter

In [1]:
# Prerequisites
from pyspark.sql import SparkSession
from pyspark.sql.functions import * 
from pyspark.sql.types import *

In [2]:
# Create the notebook's SparkSession (or reuse one that already exists) on the
# "local" master, then report which Spark version is in use.
spark = (
    SparkSession.builder
    .master("local")
    .appName("hello_dataframereads_dataframewriter")
    .getOrCreate()
)
print("Spark Version: ", spark.version)

Spark Version:  3.5.0


### Read Parquet Data into DataFrame

In [3]:
# Load the Parquet data set into a DataFrame, preview it, and report its row count.
data_file_path = "data/2010-summary.parquet"
df_parquet = spark.read.parquet(data_file_path)  # shorthand for format("parquet").load(...)
df_parquet.show(truncate=False)
print("Total Number of records: ", df_parquet.count())

+--------------------------------+-------------------+-----+
|DEST_COUNTRY_NAME               |ORIGIN_COUNTRY_NAME|count|
+--------------------------------+-------------------+-----+
|United States                   |Romania            |1    |
|United States                   |Ireland            |264  |
|United States                   |India              |69   |
|Egypt                           |United States      |24   |
|Equatorial Guinea               |United States      |1    |
|United States                   |Singapore          |25   |
|United States                   |Grenada            |54   |
|Costa Rica                      |United States      |477  |
|Senegal                         |United States      |29   |
|United States                   |Marshall Islands   |44   |
|Guyana                          |United States      |17   |
|United States                   |Sint Maarten       |53   |
|Malta                           |United States      |1    |
|Bolivia                

In [4]:
# Print the column names, types and nullability of the Parquet-backed DataFrame.
df_parquet.printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: long (nullable = true)



### Read Parquet Data into SQL Table

In [5]:
# Register the Parquet data as a temporary view so it can be queried with SQL.
# CREATE OR REPLACE makes the cell safe to re-run.
create_flights_view_sql = """CREATE OR REPLACE TEMPORARY VIEW flights_table
          USING parquet
          OPTIONS (
            path "data/2010-summary.parquet/")
          """
spark.sql(create_flights_view_sql)

# Query the view and preview the result without truncating column values.
flights_table_df = spark.sql("SELECT * FROM flights_table")
flights_table_df.show(truncate=False)

+--------------------------------+-------------------+-----+
|DEST_COUNTRY_NAME               |ORIGIN_COUNTRY_NAME|count|
+--------------------------------+-------------------+-----+
|United States                   |Romania            |1    |
|United States                   |Ireland            |264  |
|United States                   |India              |69   |
|Egypt                           |United States      |24   |
|Equatorial Guinea               |United States      |1    |
|United States                   |Singapore          |25   |
|United States                   |Grenada            |54   |
|Costa Rica                      |United States      |477  |
|Senegal                         |United States      |29   |
|United States                   |Marshall Islands   |44   |
|Guyana                          |United States      |17   |
|United States                   |Sint Maarten       |53   |
|Malta                           |United States      |1    |
|Bolivia                

### Save to a SQL Table

In [6]:
# Save the Parquet-backed DataFrame as a SQL table, replacing any existing
# table of the same name.
df_parquet.write.saveAsTable("international_flights_table", mode="overwrite")

### Read JSON Data into DataFrame

In [7]:
# Load every file matched by data/flights_json/* into one DataFrame.
# No schema is supplied, so the column types are left to the JSON reader.
data_file_path = "data/flights_json/*"
df_json = spark.read.format("json").load(data_file_path)
df_json.show(truncate=False)
# Label spelling fixed ("NUmber" -> "Number") to match the Parquet cell.
print("Total Number of records: ", df_json.count())

+------------------------+-------------------+-----+
|DEST_COUNTRY_NAME       |ORIGIN_COUNTRY_NAME|count|
+------------------------+-------------------+-----+
|United States           |Romania            |15   |
|United States           |Croatia            |1    |
|United States           |Ireland            |344  |
|Egypt                   |United States      |15   |
|United States           |India              |62   |
|United States           |Singapore          |1    |
|United States           |Grenada            |62   |
|Costa Rica              |United States      |588  |
|Senegal                 |United States      |40   |
|Moldova                 |United States      |1    |
|United States           |Sint Maarten       |325  |
|United States           |Marshall Islands   |39   |
|Guyana                  |United States      |64   |
|Malta                   |United States      |1    |
|Anguilla                |United States      |41   |
|Bolivia                 |United States      |

### Read JSON into SQL Table

In [8]:
# Register the JSON files as a temporary view so they can be queried with SQL.
# CREATE OR REPLACE makes the cell safe to re-run.
create_json_view_sql = """CREATE OR REPLACE TEMPORARY VIEW flights_table_from_json
          USING json
          OPTIONS (
            path "data/flights_json/*")
          """
spark.sql(create_json_view_sql)

# Query the view and preview the result without truncating column values.
flights_from_json_df = spark.sql("SELECT * FROM flights_table_from_json")
flights_from_json_df.show(truncate=False)

+------------------------+-------------------+-----+
|DEST_COUNTRY_NAME       |ORIGIN_COUNTRY_NAME|count|
+------------------------+-------------------+-----+
|United States           |Romania            |15   |
|United States           |Croatia            |1    |
|United States           |Ireland            |344  |
|Egypt                   |United States      |15   |
|United States           |India              |62   |
|United States           |Singapore          |1    |
|United States           |Grenada            |62   |
|Costa Rica              |United States      |588  |
|Senegal                 |United States      |40   |
|Moldova                 |United States      |1    |
|United States           |Sint Maarten       |325  |
|United States           |Marshall Islands   |39   |
|Guyana                  |United States      |64   |
|Malta                   |United States      |1    |
|Anguilla                |United States      |41   |
|Bolivia                 |United States      |

### Write DataFrame into JSON file

In [23]:
# Write the JSON-sourced DataFrame back out as JSON, replacing any previous
# output at the target path.
df_json.write.json("data/international_flights_json", mode="overwrite")

### Read CSV Data into DataFrame

In [9]:
# Load every file matched by data/flights_csv/* into one DataFrame using an
# explicit schema instead of schema inference.
data_file_path = "data/flights_csv/*"

# DDL-style schema string; these are the column names the DataFrame exposes.
schema = "DEST_COUNTRY STRING, ORIGIN_COUNTRY STRING, COUNT INT"

df_csv = (
    spark.read.format("csv")
    .option("header", "true")       # First line of each file is a header row, not data
    .schema(schema)
    .option("mode", "FAILFAST")     # Raise an exception as soon as a malformed record is met
    .option("nullValue", "")        # Treat empty fields as null
    .load(data_file_path)
)

df_csv.show(truncate=False)
# Label spelling fixed ("NUmber" -> "Number") to match the Parquet cell.
print("Total Number of records: ", df_csv.count())

+--------------------------------+----------------+-----+
|DEST_COUNTRY                    |ORIGIN_COUNTRY  |COUNT|
+--------------------------------+----------------+-----+
|United States                   |Romania         |1    |
|United States                   |Ireland         |264  |
|United States                   |India           |69   |
|Egypt                           |United States   |24   |
|Equatorial Guinea               |United States   |1    |
|United States                   |Singapore       |25   |
|United States                   |Grenada         |54   |
|Costa Rica                      |United States   |477  |
|Senegal                         |United States   |29   |
|United States                   |Marshall Islands|44   |
|Guyana                          |United States   |17   |
|United States                   |Sint Maarten    |53   |
|Malta                           |United States   |1    |
|Bolivia                         |United States   |46   |
|Anguilla     

### Read CSV Data into SQL Table

In [10]:
# Register the CSV files as a temporary view so they can be queried with SQL.
# Unlike the DataFrame read, this view takes its column names from the header
# row and infers the column types (inferSchema).
create_csv_view_sql = """CREATE OR REPLACE TEMPORARY VIEW flights_table_from_csv
          USING csv
          OPTIONS (
            path "data/flights_csv/*", 
            header "true",
            inferSchema "true",
            mode "FAILFAST")
          """
spark.sql(create_csv_view_sql)

# Query the view and preview the result without truncating column values.
flights_from_csv_df = spark.sql("SELECT * FROM flights_table_from_csv")
flights_from_csv_df.show(truncate=False)

+--------------------------------+-------------------+-----+
|DEST_COUNTRY_NAME               |ORIGIN_COUNTRY_NAME|count|
+--------------------------------+-------------------+-----+
|United States                   |Romania            |1    |
|United States                   |Ireland            |264  |
|United States                   |India              |69   |
|Egypt                           |United States      |24   |
|Equatorial Guinea               |United States      |1    |
|United States                   |Singapore          |25   |
|United States                   |Grenada            |54   |
|Costa Rica                      |United States      |477  |
|Senegal                         |United States      |29   |
|United States                   |Marshall Islands   |44   |
|Guyana                          |United States      |17   |
|United States                   |Sint Maarten       |53   |
|Malta                           |United States      |1    |
|Bolivia                

### Write DataFrame into CSV file

In [22]:
# Write the CSV-sourced DataFrame back out as CSV, replacing any previous
# output at the target path.
(
    df_csv.write.format("csv")
    .mode("overwrite")
    # The CSV writer emits no header row by default. Emit one so the column
    # names survive and the output can be read back with header "true", the
    # way this notebook reads CSV.
    .option("header", "true")
    .save("data/international_flights_csv")
)