# Spark DataFrame ReadWrite

In [None]:
from pyspark.sql import functions as F
from pyspark.sql import types as T

## CSV Format

### DataFrameReader: `read`
Read data from a CSV file without assuming that it has a header row.

In [None]:
# Plain read: no reader options are set, so the header line is not treated specially.
df = (
    spark.read
    .csv("s3a://datalake/examples/zipcodes.csv")
)

In [None]:
# Print the column names and types Spark assigned when reading without the header option.
df.printSchema()

Display the first rows to check whether the first line of the file is a header.

In [None]:
# Peek at the first two rows, printed vertically (one field per line).
df.show(2, vertical=True)

In [None]:
# Same file as before, this time with the header option switched on.
df2 = (
    spark.read
    .option("header", True)
    .csv("s3a://datalake/examples/zipcodes.csv")
)

In [None]:
# Print the schema obtained with the header option enabled.
df2.printSchema()

More options to control how to read a CSV data file

In [None]:
# Several reader options can be supplied at once through options().
df3 = (
    spark.read
    .options(header="True", delimiter=",")
    .csv("s3a://datalake/examples/zipcodes.csv")
)

In [None]:
# Print the schema obtained when header and delimiter are set via options().
df3.printSchema()

Load data with `StructType` schema.

In [None]:
# Explicit schema for the zipcodes data.
# Each entry is (column name, Spark type, nullable); every column is nullable.
schema = T.StructType([
    T.StructField("RecordNumber", T.IntegerType(), True),
    T.StructField("Zipcode", T.IntegerType(), True),
    T.StructField("ZipCodeType", T.StringType(), True),
    T.StructField("City", T.StringType(), True),
    T.StructField("State", T.StringType(), True),
    T.StructField("LocationType", T.StringType(), True),
    T.StructField("Lat", T.DoubleType(), True),
    T.StructField("Long", T.DoubleType(), True),
    T.StructField("Xaxis", T.IntegerType(), True),
    T.StructField("Yaxis", T.DoubleType(), True),
    T.StructField("Zaxis", T.DoubleType(), True),
    T.StructField("WorldRegion", T.StringType(), True),
    T.StructField("Country", T.StringType(), True),
    T.StructField("LocationText", T.StringType(), True),
    T.StructField("Location", T.StringType(), True),
    T.StructField("Decommisioned", T.BooleanType(), True),
    T.StructField("TaxReturnsFiled", T.StringType(), True),
    T.StructField("EstimatedPopulation", T.IntegerType(), True),
    T.StructField("TotalWages", T.IntegerType(), True),
    T.StructField("Notes", T.StringType(), True),
])

An alternative to calling `csv()` directly is to use `format()` together with `load()`. This is the generic way to load any data source.

In [None]:
# Generic reader form: name the source with format(), then load() the path.
# The explicit schema defined earlier in the notebook is applied to the file.
df_with_schema = (
    spark.read.format("csv")
    .schema(schema)
    .option("header", True)
    .load("s3a://datalake/examples/zipcodes.csv")
)

In [None]:
# Print the schema of the DataFrame that was loaded with the explicit schema.
df_with_schema.printSchema()

### DataFrameWriter: `write`

Test writing the output after the data has been cast with the schema.

In [None]:
# Write the typed DataFrame back out as CSV, including a header row.
# mode("overwrite") replaces any previous output at this path; with the
# default save mode the write raises once the path already exists, which
# would make this cell fail on every re-run.
(
    df_with_schema.write
    .mode("overwrite")
    .option("header", True)
    .csv("s3a://datalake/spark_output/zipcodes123")
)

## JSON

### One document per line

If a JSON file contains one document per line, we can read it directly without any options.

In [None]:
# Read the JSON file with no reader options set.
df = (
    spark.read
    .json("s3a://datalake/examples/zipcodes.json")
)

In [None]:
# Print the schema of the DataFrame read from the JSON file.
df.printSchema()

In [None]:
# First two rows, printed vertically and without truncating long values.
df.show(2, truncate=False, vertical=True)

### A document spanning multiple lines

If a JSON file contains multiple items in an array and each item spans several lines, use the `multiline` option to read it.

In [None]:
# The multiline option lets a single JSON record span several lines of the file.
df = spark.read.option("multiline", "true").json(
    "s3a://datalake/examples/multiline-zipcode.json"
)

In [None]:
# Display the rows parsed from the multiline JSON file, using show()'s default arguments.
df.show()

### read multiple json files

Pass a list of file names to the `json` method.

In [None]:
# json() accepts a list of paths; both files are loaded into one DataFrame.
df2 = spark.read.json(
    [
        "s3a://datalake/examples/zipcode2.json",
        "s3a://datalake/examples/zipcode1.json",
    ]
)

In [None]:
# First two rows of the DataFrame built from the two listed files, printed vertically.
df2.show(2, vertical=True)

Sometimes it is easier to use the wildcard `*` to match all files in a path.

In [None]:
# Glob pattern: every path matching examples/*.json is read into one DataFrame.
# NOTE(review): this pattern would also match multiline-zipcode.json, which
# another cell reads with the multiline option — confirm that is intended.
df3 = (
    spark.read
    .json("s3a://datalake/examples/*.json")
)

In [None]:
# First two rows of the wildcard read, printed vertically.
df3.show(2, vertical=True)

As when creating a DataFrame normally, we can specify a schema for the loaded JSON documents.

In [None]:
# Explicit schema for the JSON zipcode documents, built field by field.
# Arguments to add() are (column name, Spark type, nullable).
schema = (
    T.StructType()
    .add("RecordNumber", T.IntegerType(), True)
    .add("Zipcode", T.IntegerType(), True)
    .add("ZipCodeType", T.StringType(), True)
    .add("City", T.StringType(), True)
    .add("State", T.StringType(), True)
    .add("LocationType", T.StringType(), True)
    .add("Lat", T.DoubleType(), True)
    .add("Long", T.DoubleType(), True)
    .add("Xaxis", T.IntegerType(), True)
    .add("Yaxis", T.DoubleType(), True)
    .add("Zaxis", T.DoubleType(), True)
    .add("WorldRegion", T.StringType(), True)
    .add("Country", T.StringType(), True)
    .add("LocationText", T.StringType(), True)
    .add("Location", T.StringType(), True)
    .add("Decommisioned", T.BooleanType(), True)
    .add("TaxReturnsFiled", T.StringType(), True)
    .add("EstimatedPopulation", T.IntegerType(), True)
    .add("TotalWages", T.IntegerType(), True)
    .add("Notes", T.StringType(), True)
)

In [None]:
# Read the JSON file with the explicit schema applied.
df_with_schema = spark.read.schema(schema).json(
    "s3a://datalake/examples/zipcodes.json"
)

In [None]:
# Print the schema of the JSON DataFrame that was loaded with the explicit schema.
df_with_schema.printSchema()

In [None]:
# First two rows of the schema-typed JSON DataFrame, printed vertically.
df_with_schema.show(2, vertical=True)