In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import StructType,StructField, StringType, IntegerType, FloatType
import pyspark.sql.functions as F

In [8]:

# Create (or reuse) the notebook's SparkSession with a local[*] master.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName("Iniciando com Spark")
    .getOrCreate()
)

### Lendo CSV

Opções mais comuns:
* header
* inferSchema
* sep
* encoding

### DataFrameReader

```
spark.read.format(format).option(args).load(file/path)
```

### Opção de leitura 1

In [9]:
# Reading option 1: pass the CSV options as keyword arguments to load().
path_countries = '../../datalake/transient/company/countries/'
df_countries = (
    spark.read
    .format('csv')
    .load(path_countries, header='True', sep=",", quote="'")
)

In [10]:
# Preview the first two rows.
df_countries.show(n=2)

+----------+------------+---------+
|country_id|country_name|region_id|
+----------+------------+---------+
|        AR|   Argentina|        2|
|        AU|   Australia|        3|
+----------+------------+---------+
only showing top 2 rows



### Opção de leitura 2

In [11]:
# Reading option 2: set each CSV option with a chained .option() call.
path_countries = '../../datalake/transient/company/countries/'

df_countries = (
    spark.read.format("csv")
    .option("header", True)
    .option("sep", ",")
    .option("quote", "'")
    .option("inferSchema", True)
    .load(path_countries)
)

In [12]:
# Preview the first two rows without truncating cell contents.
df_countries.show(n=2, truncate=False)

+----------+------------+---------+
|country_id|country_name|region_id|
+----------+------------+---------+
|AR        |Argentina   |2        |
|AU        |Australia   |3        |
+----------+------------+---------+
only showing top 2 rows



### Opção de leitura 3

In [13]:
# Reading option 3: collect the CSV options in a dict and declare the schema
# explicitly as a DDL string (every column is read as a string).
path_countries = '../../datalake/transient/company/countries/'
options_dict = dict(sep=',', header='true', quote="'")

csv_schema_reader = None  # placeholder removed below; kept out of the namespace
del csv_schema_reader

df = (
    spark.read
    .format('csv')
    .options(**options_dict)
    .schema('country_id string, country_name string, region_id string')
    .load(path_countries)
)

# Show only the first two rows.
df.limit(2).show()

+----------+------------+---------+
|country_id|country_name|region_id|
+----------+------------+---------+
|        AR|   Argentina|        2|
|        AU|   Australia|        3|
+----------+------------+---------+



### Escrevendo CSV

```
df.write.format(format).option(args).save(file/path)
```

In [15]:
# Write the countries DataFrame back out as CSV.
# NOTE: Spark creates a *directory* named 'countries.csv' holding part files,
# not a single CSV file.
path_countries = '../../datalake/transient/wrote/csv/company/countries/'
(
    df.write
    .format('csv')
    # Without an explicit mode Spark uses 'errorifexists', so re-running this
    # cell would fail as soon as the output path exists. 'overwrite' keeps the
    # notebook re-runnable and matches the Parquet write further down.
    .mode('overwrite')
    .save(path_countries + 'countries.csv', header=True, sep=',', quote='\'')
)

### Lendo Json

In [32]:
# Read the invoice JSON files with the reader's default JSON options.
path_json = '../../datalake/transient/json/invoices/'
df_json = (
    spark.read
    .format('json')
    .load(path_json)
)

In [34]:
# Display up to 20 rows (show()'s default) without truncating cell contents.
df_json.show(n=20, truncate=False)

+------------------+------------------+---------+-------------+--------------+------------+---------------------------------------------------------------------------------------+-------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------+-------------+-------------+------+------------------+-------+-------------+-----------+
|CESS              |CGST              |CashierID|CreatedTime  |CustomerCardNo|CustomerType|DeliveryAddress                                                                        |DeliveryType |InvoiceLineItems                                                                                                                                                                |InvoiceNumber|NumberOfItems|PaymentMethod|PosID |SGST              |StoreID|TaxableAmount|TotalAmount|
+------------------+------------------+---------+-----

In [58]:
# Read the zipcodes JSON with the "multiline" option enabled.
path_json = '../../datalake/transient/json/zipcodes/'
df_zip = (
    spark.read
    .format('json')
    .option("multiline", "true")
    .load(path_json)
)

In [59]:
# Display up to 20 rows (show()'s default).
df_zip.show(n=20)

+-------------------+------------+-----+-----------+-------+
|               City|RecordNumber|State|ZipCodeType|Zipcode|
+-------------------+------------+-----+-----------+-------+
|PASEO COSTA DEL SUR|           2|   PR|   STANDARD|    704|
|       BDA SAN LUIS|          10|   PR|   STANDARD|    709|
+-------------------+------------+-----+-----------+-------+



### Escrevendo JSON

In [62]:
# Write the zipcodes DataFrame out as JSON under the 'wrote' area.
path_json = '../../datalake/transient/wrote/json/zipcodes/'
(
    df_zip.write
    .format('json')
    # Without an explicit mode Spark uses 'errorifexists', so re-running this
    # cell would fail once the output directory exists. 'overwrite' keeps the
    # notebook re-runnable and matches the Parquet write further down.
    .mode('overwrite')
    .save(path_json)
)

### Lendo e Escrevendo Parquet

* Armazenamento colunar, com operações bem mais eficientes;
* Salva os metadados, como os tipos das colunas; não é necessário especificar schemas para arquivos Parquet;
* Suporta dados estruturados de forma aninhada (listas);
* Processamento de dados particionados com volume na casa dos gigabytes por arquivo;
* Compressão de dados na escrita, de forma a ocupar menos espaço;
* Integrado com AWS Athena, Amazon Redshift Spectrum, Google BigQuery, Google Dataproc e outras ferramentas.

### mode:

* append: os novos arquivos são adicionados aos já existentes
* ignore: se já existirem dados, a escrita é ignorada silenciosamente (nenhum erro é retornado)
* overwrite: sobrescreve os dados já existentes
* error / errorifexists (default): retorna erro se já existirem dados

In [63]:
# Source (invoice JSON) and destination (bronze invoices) paths.
path_json = '../../datalake/transient/json/invoices/'
path_dest = '../../datalake/bronze/invoices/'

# Load the invoices from JSON.
df_json = (
    spark.read
    .format('json')
    .load(path_json)
)

# Persist them as Parquet, replacing whatever is already at the destination.
(
    df_json.write
    .format('parquet')
    .mode('overwrite')
    .save(path_dest)
)

In [64]:
# Load the Parquet data stored at path_dest into a DataFrame.
df_parquet = (
    spark.read
    .format('parquet')
    .load(path_dest)
)

In [65]:
# Print the schema (column names, types, nullability) of the Parquet-backed DataFrame.
df_parquet.printSchema()

root
 |-- CESS: double (nullable = true)
 |-- CGST: double (nullable = true)
 |-- CashierID: string (nullable = true)
 |-- CreatedTime: long (nullable = true)
 |-- CustomerCardNo: string (nullable = true)
 |-- CustomerType: string (nullable = true)
 |-- DeliveryAddress: struct (nullable = true)
 |    |-- AddressLine: string (nullable = true)
 |    |-- City: string (nullable = true)
 |    |-- ContactNumber: string (nullable = true)
 |    |-- PinCode: string (nullable = true)
 |    |-- State: string (nullable = true)
 |-- DeliveryType: string (nullable = true)
 |-- InvoiceLineItems: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- ItemCode: string (nullable = true)
 |    |    |-- ItemDescription: string (nullable = true)
 |    |    |-- ItemPrice: double (nullable = true)
 |    |    |-- ItemQty: long (nullable = true)
 |    |    |-- TotalValue: double (nullable = true)
 |-- InvoiceNumber: string (nullable = true)
 |-- NumberOfItems: long (nullable = t