## Configuração do ambiente para utilização do Spark

In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

# Fazendo download
!wget -q https://archive.apache.org/dist/spark/spark-3.1.2/spark-3.1.2-bin-hadoop2.7.tgz

# Descompactando os arquivos
!tar xf spark-3.1.2-bin-hadoop2.7.tgz

# Importando a biblioteca os
import os

# Definindo a variável de ambiente do Java
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

# Definindo a variável de ambiente do Spark
os.environ["SPARK_HOME"] = "/content/spark-3.1.2-bin-hadoop2.7"


# instalando a findspark
!pip install -q findspark

# Importando a findspark
import findspark

# Iniciando o findspark
findspark.init()

import os
import sys

from pyspark.sql import SparkSession

# Make the Spark driver and workers use the same interpreter as this kernel.
# These variables are read when the Spark context starts, so they have to be
# set BEFORE the session is created; setting them afterwards has no effect on
# the session that already exists.
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

# Create the session, or reuse the one already running in this kernel.
spark = (
    SparkSession.builder
    .appName("Minha Primeira Aplicação no Pyspark")
    .getOrCreate()
)

## Leitura dos dados

In [2]:
# Read the bureau_balance CSV file, taking column names from the header row.
# No schema and no inferSchema option is passed, so every column is loaded
# as a string.
caminho_bureau_balance = "/content/drive/MyDrive/bases_dados_projeto_credito/bureau_balance.csv"
dados = spark.read.csv(caminho_bureau_balance, header=True)

# Preview the first rows.
dados.show()

+------------+--------------+------+
|SK_ID_BUREAU|MONTHS_BALANCE|STATUS|
+------------+--------------+------+
|     5715448|             0|     C|
|     5715448|            -1|     C|
|     5715448|            -2|     C|
|     5715448|            -3|     C|
|     5715448|            -4|     C|
|     5715448|            -5|     C|
|     5715448|            -6|     C|
|     5715448|            -7|     C|
|     5715448|            -8|     C|
|     5715448|            -9|     0|
|     5715448|           -10|     0|
|     5715448|           -11|     X|
|     5715448|           -12|     X|
|     5715448|           -13|     X|
|     5715448|           -14|     0|
|     5715448|           -15|     0|
|     5715448|           -16|     0|
|     5715448|           -17|     0|
|     5715448|           -18|     0|
|     5715448|           -19|     0|
+------------+--------------+------+
only showing top 20 rows



## Sumarização na visão cliente

## Criação de flags para nos auxiliar na visão temporal dos dados

In [17]:
# Expose the raw frame to Spark SQL under the view name "dados".
dados.createOrReplaceTempView("dados")

# Look-back windows, in months. Each one produces a 0/1 column named
# ultimos_<n>_meses that is 1 when MONTHS_BALANCE >= -n.
JANELAS_MESES = (3, 6, 12, 24, 36)

# Generate one CASE expression per window instead of hand-copying five
# near-identical blocks; column names and order match the window tuple.
flags_janela = ",\n".join(
    f"    CASE WHEN MONTHS_BALANCE >= -{meses} THEN 1 ELSE 0 END AS ultimos_{meses}_meses"
    for meses in JANELAS_MESES
)

# Original columns plus the window flags, sorted by SK_ID_BUREAU.
df_temp_01 = spark.sql(f"""
SELECT
    *,
{flags_janela}
FROM dados
ORDER BY `SK_ID_BUREAU`
""")

# Register the result so that later cells can query it by name.
df_temp_01.createOrReplaceTempView("df_temp_01")
df_temp_01.show()

+------------+--------------+------+---------------+---------------+----------------+----------------+----------------+
|SK_ID_BUREAU|MONTHS_BALANCE|STATUS|ultimos_3_meses|ultimos_6_meses|ultimos_12_meses|ultimos_24_meses|ultimos_36_meses|
+------------+--------------+------+---------------+---------------+----------------+----------------+----------------+
|     5001709|           -18|     C|              0|              0|               0|               1|               1|
|     5001709|           -10|     C|              0|              0|               1|               1|               1|
|     5001709|           -17|     C|              0|              0|               0|               1|               1|
|     5001709|            -5|     C|              0|              1|               1|               1|               1|
|     5001709|            -9|     C|              0|              0|               1|               1|               1|
|     5001709|           -13|     C|    

In [20]:
# (Re-)register the window-flag frame so this cell can be run on its own.
df_temp_01.createOrReplaceTempView("df_temp_01")

# STATUS codes that get a 0/1 indicator column named STATUS_<code>, in the
# column order the rest of the notebook expects.
# NOTE(review): there is no indicator for STATUS = '4' -- confirm whether that
# value occurs in the data and should be flagged as well.
VALORES_STATUS = ("C", "3", "0", "2", "X", "5", "1")

# Generate one CASE expression per status code instead of hand-copying seven
# near-identical blocks. Single quotes are the standard SQL string literal.
flags_status = ",\n".join(
    f"    CASE WHEN STATUS = '{valor}' THEN 1 ELSE 0 END AS STATUS_{valor}"
    for valor in VALORES_STATUS
)

# All columns of df_temp_01 plus the status indicators, sorted by SK_ID_BUREAU.
df_temp_02 = spark.sql(f"""
SELECT
    *,
{flags_status}
FROM df_temp_01
ORDER BY `SK_ID_BUREAU`
""")

# Register under its own name. The previous code registered this frame as
# "df_temp_01", which silently replaced the earlier view and never created
# a "df_temp_02" view.
df_temp_02.createOrReplaceTempView("df_temp_02")
df_temp_02.show()

+------------+--------------+------+---------------+---------------+----------------+----------------+----------------+--------+--------+--------+--------+--------+--------+--------+
|SK_ID_BUREAU|MONTHS_BALANCE|STATUS|ultimos_3_meses|ultimos_6_meses|ultimos_12_meses|ultimos_24_meses|ultimos_36_meses|STATUS_C|STATUS_3|STATUS_0|STATUS_2|STATUS_X|STATUS_5|STATUS_1|
+------------+--------------+------+---------------+---------------+----------------+----------------+----------------+--------+--------+--------+--------+--------+--------+--------+
|     5001709|           -18|     C|              0|              0|               0|               1|               1|       1|       0|       0|       0|       0|       0|       0|
|     5001709|           -10|     C|              0|              0|               1|               1|               1|       1|       0|       0|       0|       0|       0|       0|
|     5001709|           -17|     C|              0|              0|               0|

## Sumarizar na visão cliente (Automatizada)

In [21]:
# NOTE(review): this import shadows the Python builtins round, sum, max and
# min for the rest of the notebook. It is kept unchanged because other cells
# may rely on these names; only col, when and count are used below.
from pyspark.sql.functions import col, round, sum, avg, max, min, when, count

# 0/1 status indicator columns to be counted.
colunas_agregacao_total = ['STATUS_C','STATUS_3','STATUS_0','STATUS_2','STATUS_X','STATUS_5','STATUS_1']

# 0/1 time-window columns that restrict which rows are counted.
colunas_flags = ['ultimos_3_meses','ultimos_6_meses','ultimos_12_meses','ultimos_24_meses','ultimos_36_meses']

# One aggregate per (window, status) pair: the number of rows that fall in the
# window AND carry that status.
#
# The previous expression, count(when(flag == 1, col(status))), counted every
# row in the window: the status indicators are 0 or 1 and never NULL, and
# count() only skips NULLs, so all status columns of a window were identical.
# when() without otherwise() yields NULL for non-matching rows, so requiring
# both flags to be 1 makes count() see only the matching rows.
# The round(..., 2) wrapper was dropped: the count is already an integer.
expressoes_agregacao = tuple(
    count(when((col(flag) == 1) & (col(coluna) == 1), True))
    .alias(f"QTD_{coluna.upper()}_{flag.upper()}")
    for flag in colunas_flags
    for coluna in colunas_agregacao_total
)

# Collapse to one row per SK_ID_BUREAU and sort for display.
df_temp_03 = (
    df_temp_02
    .groupBy("SK_ID_BUREAU")
    .agg(*expressoes_agregacao)
    .orderBy("SK_ID_BUREAU")
)

# Show the resulting DataFrame.
df_temp_03.show()

+------------+----------------------------+----------------------------+----------------------------+----------------------------+----------------------------+----------------------------+----------------------------+----------------------------+----------------------------+----------------------------+----------------------------+----------------------------+----------------------------+----------------------------+-----------------------------+-----------------------------+-----------------------------+-----------------------------+-----------------------------+-----------------------------+-----------------------------+-----------------------------+-----------------------------+-----------------------------+-----------------------------+-----------------------------+-----------------------------+-----------------------------+-----------------------------+-----------------------------+-----------------------------+-----------------------------+-----------------------------+----------

## Salvar a tabela sumarizada

In [22]:
# Collapse the summary into a single partition before writing.
df_temp_03 = df_temp_03.repartition(1)

# Write it as CSV with a header row, replacing any previous output at the
# same path.
(
    df_temp_03.write
    .mode("overwrite")
    .option("header", True)
    .csv("bureau_balance_agg.csv")
)