In [2]:
import findspark
# Initialise findspark before the first pyspark import (which happens in the next cell).
# Presumably this is what makes the local Spark installation importable — confirm against the findspark docs.
findspark.init()

In [3]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType, StringType, StructField, StructType

In [4]:
# Build (or reuse) the Spark session for this notebook: application "ch09" on
# a YARN master in client deploy mode, with 2g of memory configured for both
# the executors and the driver.
spark = (
    SparkSession.builder
    .appName("ch09")
    .master("yarn")
    .config("spark.submit.deployMode", "client")
    .config("spark.executor.memory", "2g")
    .config("spark.driver.memory", "2g")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/15 15:29:08 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/05/15 15:29:13 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.


In [5]:
# Display the SparkContext that backs the session created above.
spark.sparkContext

#### CSV File

In [23]:
# Reading CSV: load the 2015 flight summary, treating the first line as the
# header and letting Spark infer the column types. FAILFAST mode is requested
# so malformed records abort the read.
df_csv1 = (
    spark.read
    .format("csv")
    .options(mode="FAILFAST", header=True, inferSchema=True)
    .load("hdfs:///data/flight-data/csv/2015-summary.csv")
)


In [24]:
# Print the column names, types and nullability of df_csv1.
df_csv1.printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: integer (nullable = true)



In [28]:
# Reading CSV with a predefined (manual) schema instead of inferSchema.
# StructType / StructField / IntegerType / StringType come from
# pyspark.sql.types, imported in the imports cell at the top of the notebook.

# Three nullable columns: two string country names and an integer count.
flightSchema = StructType([
    StructField("DEST_COUNTRY_NAME", StringType(), True),
    StructField("ORIGIN_COUNTRY_NAME", StringType(), True),
    StructField("count", IntegerType(), True),
])

# No "codec"/"compression" option here: it is a write-side setting and has no
# effect when reading, so passing it only misleads the reader.
df_csv2 = (
    spark.read
    .format("csv")
    .schema(flightSchema)
    .option("mode", "FAILFAST")
    .option("sep", ",")
    .option("header", True)
    .load("hdfs:///data/flight-data/csv/2015-summary.csv")
)

df_csv2.show(3)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|   15|
|    United States|            Croatia|    1|
|    United States|            Ireland|  344|
+-----------------+-------------------+-----+
only showing top 3 rows



In [27]:
# Number of partitions of the RDD underlying df_csv2.
df_csv2.rdd.getNumPartitions()

1

In [41]:
# Writing CSV files: repartition df_csv2 into 5 partitions and write them as
# CSV with a header row, every field quoted and "," as separator. Any previous
# output at the target path is overwritten.
(
    df_csv2.repartition(5)
    .write
    .format("csv")
    .mode("overwrite")
    .options(header=True, quoteAll=True, sep=",")
    .save("hdfs:///tmp/flight-data.csv")
)
    

                                                                                

In [42]:
# List the contents of the CSV output directory on HDFS.
!hdfs dfs -ls /tmp/flight-data.csv

Found 6 items
-rw-r--r--   2 hadoop supergroup          0 2025-05-15 16:09 /tmp/flight-data.csv/_SUCCESS
-rw-r--r--   2 hadoop supergroup       1745 2025-05-15 16:08 /tmp/flight-data.csv/part-00000-bd023d73-9883-4272-b0b8-e3ed86df0708-c000.csv
-rw-r--r--   2 hadoop supergroup       1787 2025-05-15 16:08 /tmp/flight-data.csv/part-00001-bd023d73-9883-4272-b0b8-e3ed86df0708-c000.csv
-rw-r--r--   2 hadoop supergroup       1761 2025-05-15 16:08 /tmp/flight-data.csv/part-00002-bd023d73-9883-4272-b0b8-e3ed86df0708-c000.csv
-rw-r--r--   2 hadoop supergroup       1760 2025-05-15 16:08 /tmp/flight-data.csv/part-00003-bd023d73-9883-4272-b0b8-e3ed86df0708-c000.csv
-rw-r--r--   2 hadoop supergroup       1765 2025-05-15 16:09 /tmp/flight-data.csv/part-00004-bd023d73-9883-4272-b0b8-e3ed86df0708-c000.csv


In [39]:
!hdfs dfs -cat /tmp/flight-data.csv/part-00000-414cc8a4-10eb-44d6-bf5b-aa8965565ec6-c000.csv | head -n 5

"DEST_COUNTRY_NAME","ORIGIN_COUNTRY_NAME","count"
"Greece","United States","30"
"United States","Bermuda","193"
"United States","Portugal","134"
"United States","Trinidad and Tobago","217"


#### JSON Files

In [50]:
# Reading JSON: load the 2010 flight summary using the predefined flightSchema,
# with FAILFAST mode requested so malformed records abort the read.
df_json1 = (
    spark.read
    .format("json")
    .schema(flightSchema)
    .option("mode", "FAILFAST")
    .load("hdfs:///data/flight-data/json/2010-summary.json")
)

df_json1.show(5)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|    1|
|    United States|            Ireland|  264|
|    United States|              India|   69|
|            Egypt|      United States|   24|
|Equatorial Guinea|      United States|    1|
+-----------------+-------------------+-----+
only showing top 5 rows



In [53]:
# Writing to JSON files: repartition df_json1 into 5 partitions and write them
# as JSON, overwriting any previous output at the target path.
(
    df_json1.repartition(5)
    .write
    .format("json")
    .mode("overwrite")
    .save("hdfs:///tmp/flight-data.json")
)
    

In [54]:
# List the contents of the JSON output directory on HDFS.
!hdfs dfs -ls /tmp/flight-data.json

Found 6 items
-rw-r--r--   2 hadoop supergroup          0 2025-05-15 16:20 /tmp/flight-data.json/_SUCCESS
-rw-r--r--   2 hadoop supergroup       4296 2025-05-15 16:20 /tmp/flight-data.json/part-00000-292f6bb8-8b89-4acb-be5e-a36354c881c8-c000.json
-rw-r--r--   2 hadoop supergroup       4326 2025-05-15 16:20 /tmp/flight-data.json/part-00001-292f6bb8-8b89-4acb-be5e-a36354c881c8-c000.json
-rw-r--r--   2 hadoop supergroup       4238 2025-05-15 16:20 /tmp/flight-data.json/part-00002-292f6bb8-8b89-4acb-be5e-a36354c881c8-c000.json
-rw-r--r--   2 hadoop supergroup       4251 2025-05-15 16:20 /tmp/flight-data.json/part-00003-292f6bb8-8b89-4acb-be5e-a36354c881c8-c000.json
-rw-r--r--   2 hadoop supergroup       4242 2025-05-15 16:20 /tmp/flight-data.json/part-00004-292f6bb8-8b89-4acb-be5e-a36354c881c8-c000.json


#### Parquet Files 

In [61]:
# Reading Parquet: no schema is supplied; the columns shown below come from
# the file itself. mergeSchema asks Spark to reconcile schemas across part
# files.
# The "mode" option (FAILFAST/PERMISSIVE) is a CSV/JSON parse setting and is
# not a Parquet option, so it is intentionally not passed here.
df_parqut = (
    spark.read
    .format("parquet")
    .option("mergeSchema", True)
    .load("hdfs:///data/flight-data/parquet/2010-summary.parquet")
)

df_parqut.show(5)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|    1|
|    United States|            Ireland|  264|
|    United States|              India|   69|
|            Egypt|      United States|   24|
|Equatorial Guinea|      United States|    1|
+-----------------+-------------------+-----+
only showing top 5 rows



In [62]:
# Writing to Parquet: repartition into 5 partitions and write them as
# snappy-compressed Parquet, overwriting any previous output.
# The Parquet writer reads the codec from the "compression" option; "codec" is
# not a recognised key, so with it the request was silently ignored and the
# output was snappy only because that is the default.
(
    df_parqut.repartition(5)
    .write
    .format("parquet")
    .mode("overwrite")
    .option("compression", "snappy")
    .save("hdfs:///tmp/flight-data.parquet")
)

In [59]:
# List the contents of the Parquet output directory on HDFS.
!hdfs dfs -ls hdfs:///tmp/flight-data.parquet

Found 6 items
-rw-r--r--   2 hadoop supergroup          0 2025-05-15 16:36 hdfs:///tmp/flight-data.parquet/_SUCCESS
-rw-r--r--   2 hadoop supergroup       2128 2025-05-15 16:36 hdfs:///tmp/flight-data.parquet/part-00000-2c662e19-e737-4fa3-b7a4-efe04ef7edee-c000.snappy.parquet
-rw-r--r--   2 hadoop supergroup       2139 2025-05-15 16:36 hdfs:///tmp/flight-data.parquet/part-00001-2c662e19-e737-4fa3-b7a4-efe04ef7edee-c000.snappy.parquet
-rw-r--r--   2 hadoop supergroup       2075 2025-05-15 16:36 hdfs:///tmp/flight-data.parquet/part-00002-2c662e19-e737-4fa3-b7a4-efe04ef7edee-c000.snappy.parquet
-rw-r--r--   2 hadoop supergroup       2104 2025-05-15 16:36 hdfs:///tmp/flight-data.parquet/part-00003-2c662e19-e737-4fa3-b7a4-efe04ef7edee-c000.snappy.parquet
-rw-r--r--   2 hadoop supergroup       2087 2025-05-15 16:36 hdfs:///tmp/flight-data.parquet/part-00004-2c662e19-e737-4fa3-b7a4-efe04ef7edee-c000.snappy.parquet


#### ORC File

In [64]:
# Reading ORC: no schema is supplied; the columns shown below come from the
# file itself.
# The "mode" option (permissive/FAILFAST) is a CSV/JSON parse setting and is
# not an ORC option, so it is intentionally not passed here.
df_orc = (
    spark.read
    .format("orc")
    .load("hdfs:///data/flight-data/orc/2010-summary.orc")
)

df_orc.show(2)

[Stage 46:>                                                         (0 + 1) / 1]

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|    1|
|    United States|            Ireland|  264|
+-----------------+-------------------+-----+
only showing top 2 rows



                                                                                

In [65]:
# Writing to ORC: write df_orc with its current partitioning (no repartition),
# overwriting any previous output at the target path.
(
    df_orc.write
    .format("orc")
    .mode("overwrite")
    .save("hdfs:///tmp/flight-data.orc")
)

                                                                                

In [66]:
# List the contents of the ORC output directory on HDFS.
!hdfs dfs -ls hdfs:///tmp/flight-data.orc

Found 2 items
-rw-r--r--   2 hadoop supergroup          0 2025-05-15 16:46 hdfs:///tmp/flight-data.orc/_SUCCESS
-rw-r--r--   2 hadoop supergroup       3929 2025-05-15 16:46 hdfs:///tmp/flight-data.orc/part-00000-f282f10c-f9ac-48d4-a58f-03c625aa12f2-c000.snappy.orc


#### SQL Databases