In [1]:
import pyspark
from pyspark import SparkContext


# Initialize the Spark context. getOrCreate() reuses a context that is already
# running in this kernel, so re-executing the cell does not fail with
# "Cannot run multiple SparkContexts at once" the way a bare SparkContext() does.
sc = SparkContext.getOrCreate()

# Create (or reuse) the Spark SQL session through the public builder API;
# the builder attaches to the active SparkContext, i.e. `sc` above.
spark = pyspark.sql.SparkSession.builder.getOrCreate()

In [2]:
import os

# ROOT_PATH is the parent of the current working directory.
ROOT_PATH = os.path.join(os.getcwd(), "..")

# The input and output folders both sit directly under ROOT_PATH.
DATA_PATH, OUTPUT_PATH = (
    os.path.join(ROOT_PATH, folder) for folder in ("data", "output")
)

## Read a CSV with Spark without a schema

In [6]:
# Load bookcontents.csv with the reader's defaults: no header handling and no
# schema, so Spark assigns generic column names (_c0, _c1, ...).
book_csv_path = os.path.join(DATA_PATH, "bookcontents.csv")
bookDF = spark.read.csv(book_csv_path)

# Print the column names and types Spark assigned
bookDF.printSchema()

# Display the first rows of the DataFrame (show() prints up to 20 by default)
bookDF.show()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)

+-------+--------------------+----+
|    _c0|                 _c1| _c2|
+-------+--------------------+----+
|Chapter|                Name|Page|
|      1|        Introduction|  11|
|      2|Basic Engineering...|  19|
|      3|Advanced Engineer...|  28|
|      4|     Hands On Course|  60|
|      5|        Case Studies|  62|
|      6|Best Practices Cl...|  73|
|      7|130+ Data Sources...|  77|
|      8|1001 Interview Qu...|  82|
|      9|Recommended Books...|  87|
+-------+--------------------+----+



## Read a CSV with Spark using schema inference

In [11]:
# Load bookcontentsNoHeader.csv and ask Spark to infer each column's data type
# from the file contents (inferSchema) rather than supplying a schema.
# NOTE(review): the output recorded below still lists every column as string —
# re-run and check the file contents to confirm the inference result.
no_header_csv_path = os.path.join(DATA_PATH, "bookcontentsNoHeader.csv")
inferring_reader = spark.read.option("inferSchema", "true")
bookDF = inferring_reader.csv(no_header_csv_path)

# Print the resulting schema
bookDF.printSchema()

# Display the first rows of the DataFrame
bookDF.show()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)

+---+--------------------+---+
|_c0|                 _c1|_c2|
+---+--------------------+---+
|  1|        Introduction| 11|
|  2|Basic Engineering...| 19|
|  3|Advanced Engineer...| 28|
|  4|     Hands On Course| 60|
|  5|        Case Studies| 62|
|  6|Best Practices Cl...| 73|
|  7|130+ Data Sources...| 77|
|  8|1001 Interview Qu...| 82|
|  9|Recommended Books...| 87|
+---+--------------------+---+



## Read a CSV whose first row is the header, using an explicit schema

In [20]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Explicit schema applied while parsing the file.
# Each entry is (column name, data type, nullable flag).
# Note: the printSchema() output recorded below reports nullable = true for
# every column, even though Chapter and Name are declared non-nullable here.
schema = StructType([
    StructField(column, data_type, nullable)
    for column, data_type, nullable in (
        ("Chapter", StringType(), False),
        ("Name", StringType(), False),
        ("Page", IntegerType(), True),
    )
])

# Read the CSV, treating the first row as the header and applying the schema
bookDF = (
    spark.read
    .option("header", "true")
    .schema(schema)
    .csv(os.path.join(DATA_PATH, "bookcontents.csv"))
)

# Print the schema of the parsed DataFrame
bookDF.printSchema()

# Display the first rows of the DataFrame
bookDF.show()

root
 |-- Chapter: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Page: integer (nullable = true)

+-------+--------------------+----+
|Chapter|                Name|Page|
+-------+--------------------+----+
|      1|        Introduction|  11|
|      2|Basic Engineering...|  19|
|      3|Advanced Engineer...|  28|
|      4|     Hands On Course|  60|
|      5|        Case Studies|  62|
|      6|Best Practices Cl...|  73|
|      7|130+ Data Sources...|  77|
|      8|1001 Interview Qu...|  82|
|      9|Recommended Books...|  87|
+-------+--------------------+----+



## Write Dataframe to Parquet

In [21]:
# Persist the DataFrame as Parquet, partitioned by the Page and Name columns.
# "overwrite" mode replaces whatever already exists at the target path.
(
    bookDF.write
    .mode("overwrite")
    .partitionBy("Page", "Name")
    .parquet(os.path.join(OUTPUT_PATH, "book.parquet"))
)

## Read a single partition from Parquet

In [25]:
# Location of the Page=11 partition directory inside the Parquet dataset
page_11_path = os.path.join(OUTPUT_PATH, "book.parquet", "Page=11")

# Register that partition as a temporary view through the DataFrame API.
# The path is passed as a plain argument instead of being formatted into SQL
# text, so quotes or backslashes in it (e.g. Windows separators produced by
# os.path.join) are not reinterpreted by the SQL parser.
spark.read.parquet(page_11_path).createOrReplaceTempView("book_page_11_view")

# Query the view back into a DataFrame
book_page_11_df = spark.sql("SELECT * FROM book_page_11_view")

In [26]:
# Display the rows selected from book_page_11_view (the Page=11 partition)
book_page_11_df.show()

+-------+------------+
|Chapter|        Name|
+-------+------------+
|      1|Introduction|
+-------+------------+

