# Boilerplate - inicialização padrão

In [33]:
import findspark
findspark.init()
from pyspark import SparkContext, SparkConf, SQLContext
from pyspark.sql import SparkSession

In [35]:
# Spark configuration for this notebook; only the application name is set.
conf = SparkConf().setAppName("pyspark-bolsa-familia")
# getOrCreate() reuses a SparkContext that is already alive in this kernel, so
# re-executing the cell does not raise
# "ValueError: Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)
# Obtain the session through the builder for the same reason: an existing
# session is returned instead of a second one being constructed.
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# Caso de Uso: Análise Pagamentos Bolsa Família

#### Adicionando arquivo "202001-bolsa-familia-pgto-sample.csv" no HDFS

In [4]:
! hadoop fs -put datasets/202001-bolsa-familia-pgto-sample.csv

#### Verificando a adição no HDFS

In [13]:
# List the HDFS directory /user/root to confirm the uploaded CSV is present.
! hadoop fs -ls hdfs://node-master:9000/user/root

Found 2 items
drwxr-xr-x   - root supergroup          0 2023-02-17 20:05 hdfs://node-master:9000/user/root/.sparkStaging
-rw-r--r--   2 root supergroup    5604051 2023-02-17 20:07 hdfs://node-master:9000/user/root/202001-bolsa-familia-pgto-sample.csv


In [11]:
# Run a filesystem check on the uploaded file; -files and -blocks make the
# report include the file entry and its block list.
! hdfs fsck hdfs://node-master:9000/user/root/202001-bolsa-familia-pgto-sample.csv -files -blocks

Connecting to namenode via http://node-master:50070/fsck?ugi=root&files=1&blocks=1&path=%2Fuser%2Froot%2F202001-bolsa-familia-pgto-sample.csv
FSCK started by root (auth:SIMPLE) from /172.20.0.4 for path /user/root/202001-bolsa-familia-pgto-sample.csv at Fri Feb 17 20:15:45 GMT 2023
/user/root/202001-bolsa-familia-pgto-sample.csv 5604051 bytes, 1 block(s):  OK
0. BP-1939772364-172.20.0.4-1676664136898:blk_1073741830_1006 len=5604051 repl=2

Status: HEALTHY
 Total size:	5604051 B
 Total dirs:	0
 Total files:	1
 Total symlinks:		0
 Total blocks (validated):	1 (avg. block size 5604051 B)
 Minimally replicated blocks:	1 (100.0 %)
 Over-replicated blocks:	0 (0.0 %)
 Under-replicated blocks:	0 (0.0 %)
 Mis-replicated blocks:		0 (0.0 %)
 Default replication factor:	2
 Average block replication:	2.0
 Corrupt blocks:		0
 Missing replicas:		0 (0.0 %)
 Number of data-nodes:		2
 Number of racks:		1
FSCK ended at Fri Feb 17 20:15:45 GMT 2023 in 2 milliseconds


The filesystem under path '/user/root/

#### Carregando DataFrame a partir de um caminho no HDFS

In [14]:
# Full HDFS URI of the sample CSV; used by the spark.read calls below.
HDFS_PATH_BOLSA_FAMILIA = "hdfs://node-master:9000/user/root/202001-bolsa-familia-pgto-sample.csv"

#### Encoding errado => UTF-8

In [61]:
dfe = spark.read\
          .options(delimiter=";", header=True, encoding="utf-8")\
          .csv(HDFS_PATH_BOLSA_FAMILIA)

In [62]:
# Show the first row of the UTF-8 read to inspect the column names.
dfe.head()

Row(M�S COMPET�NCIA='202001', M�S REFER�NCIA='201901', UF='MG', C�DIGO MUNIC�PIO SIAFI='4123', NOME MUNIC�PIO='BELO HORIZONTE', CPF FAVORECIDO='***.361.206-**', NIS FAVORECIDO='12581466091', NOME FAVORECIDO='ADRIANA RANGEL SANSAO', VALOR PARCELA='253,00')

#### Encoding certo => ISO-8859-1

In [63]:
# Load the same CSV again, this time decoding it as ISO-8859-1, which yields
# correctly accented column names.
df = (
    spark.read
    .option("delimiter", ";")
    .option("header", True)
    .option("encoding", "iso-8859-1")
    .csv(HDFS_PATH_BOLSA_FAMILIA)
)

In [64]:
# Show the first row of the ISO-8859-1 read to confirm the headers decode correctly.
df.head()

Row(MÊS COMPETÊNCIA='202001', MÊS REFERÊNCIA='201901', UF='MG', CÓDIGO MUNICÍPIO SIAFI='4123', NOME MUNICÍPIO='BELO HORIZONTE', CPF FAVORECIDO='***.361.206-**', NIS FAVORECIDO='12581466091', NOME FAVORECIDO='ADRIANA RANGEL SANSAO', VALOR PARCELA='253,00')

#### Registrando DataFrame como Tabela

In [65]:
# Expose the DataFrame to Spark SQL under the name "beneficiario".
# createOrReplaceTempView is the supported replacement for registerTempTable,
# which has been deprecated since Spark 2.0. It also replaces any existing
# view of the same name, so the cell can be re-run safely.
df.createOrReplaceTempView("beneficiario")

#### Consultando dados com o comando `spark.sql`

In [75]:
# Count all rows in the "beneficiario" table; the result is a DataFrame with a
# single count(0) column.
total = spark.sql("""
    select count(0) from beneficiario
""")

In [76]:
# Evaluating the DataFrame by itself displays only its column name and type,
# not the row data.
total

DataFrame[count(0): bigint]

In [77]:
# show() prints the query result as a text table.
total.show()

+--------+
|count(0)|
+--------+
|   49999|
+--------+



In [86]:
# Number of rows per UF, aliased as "total", sorted from the largest count to
# the smallest.
por_uf = spark.sql("""
    select UF, count(0) total
    from beneficiario
    group by UF
    order by total desc
""")

In [88]:
# Print only the first 10 rows of the per-UF ranking.
por_uf.show(10)

+---+-----+
| UF|total|
+---+-----+
| SP| 7411|
| BA| 7356|
| RJ| 4085|
| PA| 3602|
| MA| 3440|
| CE| 3338|
| MG| 3239|
| AM| 2646|
| AL| 2389|
| PE| 2321|
+---+-----+
only showing top 10 rows

