In [18]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from termcolor import cprint 

# Reuse the active SparkSession if one exists; otherwise create one
# under the application name 'read_csv'.
spark = (
    SparkSession.builder
    .appName('read_csv')
    .getOrCreate()
)

### 1. PySpark Read CSV File into DataFrame
Using `csv("path")` or `format("csv").load("path")` of DataFrameReader, you can read a CSV file into a PySpark DataFrame. These methods take the file path to read from as an argument. When you use the `format("csv")` method, you can also specify the data source by its fully qualified name, but for built-in sources you can simply use their short names (csv, json, parquet, jdbc, text, etc.).

In [19]:

# Read the file with the csv() shortcut and no options set, then print the
# resulting column names and types.
df = (
    spark.read
    .csv("./resources/zipcodes.csv")
)
df.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)
 |-- _c6: string (nullable = true)
 |-- _c7: string (nullable = true)
 |-- _c8: string (nullable = true)
 |-- _c9: string (nullable = true)
 |-- _c10: string (nullable = true)
 |-- _c11: string (nullable = true)
 |-- _c12: string (nullable = true)
 |-- _c13: string (nullable = true)
 |-- _c14: string (nullable = true)
 |-- _c15: string (nullable = true)
 |-- _c16: string (nullable = true)
 |-- _c17: string (nullable = true)
 |-- _c18: string (nullable = true)
 |-- _c19: string (nullable = true)



In [20]:
# Same file, this time naming the data source through format() and reading
# it with the generic load() call.
df = (
    spark.read
    .format("csv")
    .load("./resources/zipcodes.csv")
)
df.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)
 |-- _c6: string (nullable = true)
 |-- _c7: string (nullable = true)
 |-- _c8: string (nullable = true)
 |-- _c9: string (nullable = true)
 |-- _c10: string (nullable = true)
 |-- _c11: string (nullable = true)
 |-- _c12: string (nullable = true)
 |-- _c13: string (nullable = true)
 |-- _c14: string (nullable = true)
 |-- _c15: string (nullable = true)
 |-- _c16: string (nullable = true)
 |-- _c17: string (nullable = true)
 |-- _c18: string (nullable = true)
 |-- _c19: string (nullable = true)



#### 1.1 Using Header Record For Column Names
If your input file has a header row with column names, you need to explicitly set the `header` option to `True`; otherwise the header row is read as data and the columns are named `_c0`, `_c1`, and so on.

In [21]:
# Treat the first line of the file as column names, passing the header
# option as a keyword argument of csv().
df2 = spark.read.csv("./resources/zipcodes.csv", header=True)
# Display only the first two rows.
df2.show(2)

+------------+-------+-----------+-------------------+-----+--------------+-----+------+-----+-----+-----+-----------+-------+--------------------+--------------------+--------------------+---------------+-------------------+----------+-----+
|RecordNumber|Zipcode|ZipCodeType|               City|State|  LocationType|  Lat|  Long|Xaxis|Yaxis|Zaxis|WorldRegion|Country|        LocationText|            Location|       Decommisioned|TaxReturnsFiled|EstimatedPopulation|TotalWages|Notes|
+------------+-------+-----------+-------------------+-----+--------------+-----+------+-----+-----+-----+-----------+-------+--------------------+--------------------+--------------------+---------------+-------------------+----------+-----+
|           1|    704|   STANDARD|        PARC PARQUE|   PR|NOT ACCEPTABLE|17.96|-66.22| null|-0.87|  0.3|         NA|     US|         Parc Parque|                  PR|NA-US-PR-PARC PARQUE|          false|               null|      null| null|
|           2|    704|   STA

#### 1.2 Read Multiple CSV Files
Using the `read.csv()` method you can also read multiple CSV files in one call: pass the file paths as a Python list of strings.

In [22]:
# df = spark.read.csv(["path1", "path2", "path3"])

#### 1.3 Read all CSV Files in a Directory
We can read all CSV files from a directory into a DataFrame just by passing the directory as the path to the `csv()` method.

In [23]:
# df = spark.read.csv("Folder path")

### 2. Options While Reading CSV File
PySpark CSV dataset provides multiple options to work with CSV files.

You can either chain `option(key, value)` calls to set multiple options, or use the alternative `options(**options)` method to set them all at once.

#### 2.1 delimiter
The `delimiter` option specifies the column delimiter of the CSV file. By default it is the comma (`,`) character, but it can be set to any character, such as pipe (`|`), tab (`\t`), or space.

In [24]:
# Set both reader options in one options() call: use the first line as the
# header and split columns on commas.
df3 = (
    spark.read
    .options(header=True, delimiter=',')
    .csv("./resources/zipcodes.csv")
)

#### 2.2 inferSchema
The default value of this option is `False`. When set to `True`, Spark automatically infers column types based on the data. Note that this requires reading the data one more time to infer the schema.

In [25]:
# Enable header together with inferSchema. Without header=True the
# column-name row is read as a data row, so every column contains at least
# one string value and inference falls back to string for all of them.
df4 = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .option("delimiter", ",")
    .csv("./resources/zipcodes.csv")
)

### 3. Reading CSV files with a user-specified custom schema
If you know the schema of the file ahead of time and do not want to use the `inferSchema` option for column names and types, specify user-defined column names and types with the `schema` option.

In [26]:
from pyspark.sql.types import StructType,StructField, StringType, IntegerType 
from pyspark.sql.types import ArrayType, DoubleType, BooleanType

# Explicit schema: one nullable field per column, listed in the same order
# as the columns of the file's header row.
schema = StructType([
    StructField("RecordNumber", IntegerType(), True),
    StructField("Zipcode", IntegerType(), True),
    StructField("ZipCodeType", StringType(), True),
    StructField("City", StringType(), True),
    StructField("State", StringType(), True),
    StructField("LocationType", StringType(), True),
    StructField("Lat", DoubleType(), True),
    StructField("Long", DoubleType(), True),
    StructField("Xaxis", IntegerType(), True),
    StructField("Yaxis", DoubleType(), True),
    StructField("Zaxis", DoubleType(), True),
    StructField("WorldRegion", StringType(), True),
    StructField("Country", StringType(), True),
    StructField("LocationText", StringType(), True),
    StructField("Location", StringType(), True),
    StructField("Decommisioned", BooleanType(), True),
    StructField("TaxReturnsFiled", StringType(), True),
    StructField("EstimatedPopulation", IntegerType(), True),
    StructField("TotalWages", IntegerType(), True),
    StructField("Notes", StringType(), True),
])

# Read the CSV with the header row skipped and the schema above applied.
df_with_schema = (
    spark.read
    .format("csv")
    .option("header", True)
    .schema(schema)
    .load("./resources/zipcodes.csv")
)

# Show up to five rows of a subset of the columns.
df_with_schema.select(
    "RecordNumber",
    "Zipcode",
    "ZipCodeType",
    "City",
    "State",
    "Lat",
    "Long",
).show(5)

+------------+-------+-----------+-------------------+-----+-----+------+
|RecordNumber|Zipcode|ZipCodeType|               City|State|  Lat|  Long|
+------------+-------+-----------+-------------------+-----+-----+------+
|           1|    704|   STANDARD|        PARC PARQUE|   PR|17.96|-66.22|
|           2|    704|   STANDARD|PASEO COSTA DEL SUR|   PR|17.96|-66.22|
|          10|    709|   STANDARD|       BDA SAN LUIS|   PR|18.14|-66.26|
|       61391|  76166|     UNIQUE|  CINGULAR WIRELESS|   TX|32.72|-97.31|
+------------+-------+-----------+-------------------+-----+-----+------+



#### 3.1 Limit the number of rows

In [27]:
# Load the CSV with the explicit schema, then cap the result at four rows.
df_with_schema = spark.read.load(
    "./resources/zipcodes.csv",
    format="csv",
    schema=schema,
    header=True,
).limit(4)
df_with_schema.show()

+------------+-------+-----------+-------------------+-----+--------------+-----+------+-----+-----+-----+-----------+-------+--------------------+--------------------+-------------+---------------+-------------------+----------+-----+
|RecordNumber|Zipcode|ZipCodeType|               City|State|  LocationType|  Lat|  Long|Xaxis|Yaxis|Zaxis|WorldRegion|Country|        LocationText|            Location|Decommisioned|TaxReturnsFiled|EstimatedPopulation|TotalWages|Notes|
+------------+-------+-----------+-------------------+-----+--------------+-----+------+-----+-----+-----+-----------+-------+--------------------+--------------------+-------------+---------------+-------------------+----------+-----+
|           1|    704|   STANDARD|        PARC PARQUE|   PR|NOT ACCEPTABLE|17.96|-66.22| NULL|-0.87|  0.3|         NA|     US|         Parc Parque|                  PR|         NULL|          false|               NULL|      NULL| null|
|           2|    704|   STANDARD|PASEO COSTA DEL SUR|  

### 4. Write PySpark DataFrame to CSV file
Use the `write` attribute of a PySpark DataFrame to get a `DataFrameWriter`, then call its `csv()` method to write the DataFrame out in CSV format.

In [28]:
# Write df2 (read with header=True) so the header line carries the real
# column names. `df` was loaded without the header option, so its columns are
# _c0.._c19 and the file's original header row would be written out as data.
# mode("overwrite") keeps this cell re-runnable: the default save mode raises
# an error once ./spark_output/zipcodes already exists.
df2.write.mode("overwrite").option("header", True).csv("./spark_output/zipcodes")

#### 4.1 Saving modes
PySpark DataFrameWriter also has a method mode() to specify saving mode.

* overwrite – mode is used to overwrite the existing file.

* append – To add the data to the existing file.

* ignore – Ignores write operation when the file already exists.

* error – the default mode; raises an error when the output path already exists.

In [29]:
# Replace whatever already exists at the output path, using the csv() shortcut.
(
    df2.write
    .mode('overwrite')
    .csv("./spark_output/zipcodes")
)
# The same write through the generic API: name the format, then call save().
(
    df2.write
    .format("csv")
    .mode('overwrite')
    .save("./spark_output/zipcodes")
)