In [None]:
PySpark can automatically infer the schema of CSV files when the inferSchema option is enabled (it is off by default), eliminating the need for manual schema definition in many cases.
Users have the flexibility to define custom schemas for CSV files, specifying data types and column names as needed.
PySpark offers options for handling headers in CSV files, allowing users to either use the header row for column names or treat it as an ordinary data row.
PySpark also provides robust error-handling mechanisms for dealing with malformed or corrupted CSV files, helping to ensure data integrity.

In [7]:
# Import
from pyspark.sql import SparkSession

# Create SparkSession
spark = SparkSession.builder.master("local[1]").appName('SparkByExamples.com').getOrCreate()

# Location of the input file, defined once so that every read below stays in
# sync and the path only has to be changed in one place.
ZIPCODES_CSV = "/home/jovyan/work/data/zipcodes.csv"

# Read CSV file with default options.
# With no options set, PySpark reads all columns as a string (StringType) and
# the columns get generated names (_c0, _c1, ...).
df = spark.read.csv(ZIPCODES_CSV)
df.printSchema()

# Same read, using format().load()
df = spark.read.format("csv").load(ZIPCODES_CSV)

# Use header record for column names
df2 = spark.read.option("header", True).csv(ZIPCODES_CSV)

# Using delimiter option
df3 = spark.read.options(delimiter=',').csv(ZIPCODES_CSV)

# Using inferSchema and delimiter. inferSchema defaults to False; when set to
# True, column types are inferred automatically from the data.
# NOTE(review): "header" is not enabled in the inferSchema reads below, so the
# first line is treated as data. If that line holds column names, inference
# may still produce string columns -- confirm whether header=True is wanted.
df4 = spark.read.options(inferSchema='True', delimiter=',').csv(ZIPCODES_CSV)

# Define read options as a dict ...
options = {
    "inferSchema": "True",
    "delimiter": ","
}

# ... and read a CSV file with the specified options
df4 = spark.read.options(**options).csv(ZIPCODES_CSV)

# Chaining multiple option() calls gives the same configuration
df4 = (
    spark.read
    .option("inferSchema", True)
    .option("delimiter", ",")
    .csv(ZIPCODES_CSV)
)

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)
 |-- _c6: string (nullable = true)
 |-- _c7: string (nullable = true)
 |-- _c8: string (nullable = true)
 |-- _c9: string (nullable = true)
 |-- _c10: string (nullable = true)
 |-- _c11: string (nullable = true)
 |-- _c12: string (nullable = true)
 |-- _c13: string (nullable = true)
 |-- _c14: string (nullable = true)
 |-- _c15: string (nullable = true)
 |-- _c16: string (nullable = true)
 |-- _c17: string (nullable = true)
 |-- _c18: string (nullable = true)
 |-- _c19: string (nullable = true)

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)
 |-- _c6: string (nullable = true)
 |-- _c7: string (nullable = true)

In [12]:
# Imports
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.types import ArrayType, DoubleType, BooleanType

# Using custom schema: one add(name, type, nullable) call per CSV column,
# in file order. Every column is declared nullable.
# NOTE(review): Xaxis is declared IntegerType while Yaxis/Zaxis are DoubleType --
# confirm this matches the data in the file.
schema = (
    StructType()
    .add("RecordNumber", IntegerType(), True)
    .add("Zipcode", IntegerType(), True)
    .add("ZipCodeType", StringType(), True)
    .add("City", StringType(), True)
    .add("State", StringType(), True)
    .add("LocationType", StringType(), True)
    .add("Lat", DoubleType(), True)
    .add("Long", DoubleType(), True)
    .add("Xaxis", IntegerType(), True)
    .add("Yaxis", DoubleType(), True)
    .add("Zaxis", DoubleType(), True)
    .add("WorldRegion", StringType(), True)
    .add("Country", StringType(), True)
    .add("LocationText", StringType(), True)
    .add("Location", StringType(), True)
    .add("Decommisioned", BooleanType(), True)
    .add("TaxReturnsFiled", StringType(), True)
    .add("EstimatedPopulation", IntegerType(), True)
    .add("TotalWages", IntegerType(), True)
    .add("Notes", StringType(), True)
)

# Read the CSV with the explicit schema instead of the all-string default
df_with_schema = (
    spark.read.format("csv")
    .option("header", True)
    .schema(schema)
    .load("/home/jovyan/work/data/zipcodes.csv")
)

df_with_schema.printSchema()

# Saving modes
# overwrite – Overwrite the existing file if already exists.
# append – New rows are appended to the existing rows.
# ignore – When this option is used, it ignores the writing operation when the file already exists.
# error – This option returns an error when the file already exists. This is a default option.

# Include the header row when writing to file.
# The save mode is set explicitly: with the default ("error") this cell would
# fail on every re-run once the output directory exists.
(
    df2.write
    .option("header", True)
    .mode("overwrite")
    .csv("/home/jovyan/work/data/raw/spark_output/zipcodes123")
)

# Write without a header, replacing any previous output
df2.write.mode('overwrite').csv("/tmp/spark_output/zipcodes")

# You can also use format().save() instead of csv()
df2.write.format("csv").mode('overwrite').save("/home/jovyan/work/data/raw/spark_output/zipcodes")

root
 |-- RecordNumber: integer (nullable = true)
 |-- Zipcode: integer (nullable = true)
 |-- ZipCodeType: string (nullable = true)
 |-- City: string (nullable = true)
 |-- State: string (nullable = true)
 |-- LocationType: string (nullable = true)
 |-- Lat: double (nullable = true)
 |-- Long: double (nullable = true)
 |-- Xaxis: integer (nullable = true)
 |-- Yaxis: double (nullable = true)
 |-- Zaxis: double (nullable = true)
 |-- WorldRegion: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- LocationText: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- Decommisioned: boolean (nullable = true)
 |-- TaxReturnsFiled: string (nullable = true)
 |-- EstimatedPopulation: integer (nullable = true)
 |-- TotalWages: integer (nullable = true)
 |-- Notes: string (nullable = true)



In [None]:
Commonly used options:
header: When reading, specifies whether the first line of the CSV file is used as column names; when writing, specifies whether the column names are written as the first line. Example: option("header", "true").
delimiter: Specifies the delimiter to use between fields in the CSV file. Example: option("delimiter", ",").
quote: Specifies the character used for quoting fields in the CSV file. Example: option("quote", "\"").
escape: Specifies the escape character used in the CSV file. Example: option("escape", "\\").
nullValue: Specifies the string to represent null values in the CSV file. Example: option("nullValue", "NA").
dateFormat: Specifies the date format to use for date columns. Example: option("dateFormat", "yyyy-MM-dd").
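mode: Specifies the save mode for the output. Options include "overwrite", "append", "ignore", and "error" (the default). Unlike the other entries in this list, the save mode is set with the writer's mode() method rather than with option(). Example: df.write.mode("overwrite").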
compression: Specifies the compression codec to use for the output file. Example: option("compression", "gzip").