In [1]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import to_date, lit, date_format

# Stop any session left over from a previous run of this cell, so the builder
# below does not simply return the already-running session.
try:
    spark.stop()
except NameError:
    # Fresh kernel: `spark` has not been defined yet, nothing to stop.
    pass
except Exception as exc:
    # Best effort only: a failed stop must not block creating the new session,
    # but it should not disappear silently either.
    print(f"Ignoring error while stopping the previous Spark session: {exc}")

# Build (or fetch) the session against the master at spark://spark-master:7077,
# requesting 2g of memory and 2 cores per executor.
spark = (
    SparkSession.builder
    .appName("Bureau-Balance")
    .master("spark://spark-master:7077")
    .config("spark.executor.memory", "2g")
    .config("spark.executor.cores", "2")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/07/21 12:03:09 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [7]:

# Explicit schema instead of inferSchema=True: inference makes Spark scan the
# whole CSV once just to guess the column types before the real read.
# NOTE(review): types chosen from the values visible in this notebook
# (integer ids / month offsets, STATUS mixing letters and digits) -- confirm
# with bureau_bl.printSchema() against a run that uses inference.
BUREAU_BL_SCHEMA = "SK_ID_BUREAU INT, MONTHS_BALANCE INT, STATUS STRING"

bureau_bl = spark.read.csv(
    "/data/raw/bureau_balance.csv",
    header=True,
    schema=BUREAU_BL_SCHEMA,
)

# Make the raw data available to Spark SQL under the name "bureau_bl".
bureau_bl.createOrReplaceTempView("bureau_bl")

# Row and column counts
num_rows = bureau_bl.count()
num_columns = len(bureau_bl.columns)

print(f'Quantidade de linhas: {num_rows}')
print(f'Quantidade de variaveis (colunas): {num_columns}')

bureau_bl.show(5, truncate=False)


                                                                                

Quantidade de linhas: 27299925
Quantidade de variaveis (colunas): 3
+------------+--------------+------+
|SK_ID_BUREAU|MONTHS_BALANCE|STATUS|
+------------+--------------+------+
|5715448     |0             |C     |
|5715448     |-1            |C     |
|5715448     |-2            |C     |
|5715448     |-3            |C     |
|5715448     |-4            |C     |
+------------+--------------+------+
only showing top 5 rows



## Criando variáveis de flag temporal

In [15]:

# Look-back windows, in months, for the temporal flags U3M, U6M, U9M, U12M.
janelas_meses = (3, 6, 9, 12)

# One 0/1 flag per window: 1 when MONTHS_BALANCE >= -N, otherwise 0.
# NOTE(review): if MONTHS_BALANCE is 0 for the most recent month, `>= -N`
# covers N + 1 month values (0 through -N) -- confirm this is the intended
# meaning of "last N months".
flags_sql = ",\n".join(
    f"    CASE WHEN MONTHS_BALANCE >= -{n} THEN 1 ELSE 0 END AS U{n}M"
    for n in janelas_meses
)

# No ORDER BY inside the query: the result is registered as a view and later
# aggregated, so a global sort baked into it would only add cost.
df_temp_01 = spark.sql(f"""
SELECT
    *,
{flags_sql}
FROM bureau_bl
""")
df_temp_01.createOrReplaceTempView("df_temp_01")

# Sort only for the preview, so the sample still starts at the lowest id.
df_temp_01.orderBy("SK_ID_BUREAU").show(5)





+------------+--------------+------+---+---+---+----+
|SK_ID_BUREAU|MONTHS_BALANCE|STATUS|U3M|U6M|U9M|U12M|
+------------+--------------+------+---+---+---+----+
|     5001709|             0|     C|  1|  1|  1|   1|
|     5001709|            -5|     C|  0|  1|  1|   1|
|     5001709|            -1|     C|  1|  1|  1|   1|
|     5001709|            -2|     C|  1|  1|  1|   1|
|     5001709|            -3|     C|  1|  1|  1|   1|
+------------+--------------+------+---+---+---+----+
only showing top 5 rows



                                                                                

In [12]:
# Total number of rows in df_temp_01; substituted into the STATUS distribution
# query as the denominator of the percentage column.
qtd_linhas = df_temp_01.count()

                                                                                

In [13]:

# Distribution of STATUS: number of rows per value and its share (in %) of all
# rows. qtd_linhas is formatted into the query text as the denominator.
consulta_status = """
            Select
                STATUS,
                count(*) as VOLUME,
                round(100*(count(*)/{}),2) as VOL_PERCENT
            from 
                df_temp_01
            group by 
                STATUS
            order by 
                VOLUME desc
""".format(qtd_linhas)

# Show up to 50 rows without truncating cell contents.
distribuicao_status = spark.sql(consulta_status)
distribuicao_status.show(n=50, truncate=False)




+------+--------+-----------+
|STATUS|VOLUME  |VOL_PERCENT|
+------+--------+-----------+
|C     |13646993|49.99      |
|0     |7499507 |27.47      |
|X     |5810482 |21.28      |
|1     |242347  |0.89       |
|5     |62406   |0.23       |
|2     |23419   |0.09       |
|3     |8924    |0.03       |
|4     |5847    |0.02       |
+------+--------+-----------+



                                                                                

In [16]:

# STATUS values that get their own 0/1 indicator column (STATUS_C, STATUS_0,
# STATUS_X, STATUS_1).
status_monitorados = ("C", "0", "X", "1")

# Single-quoted literals: standard SQL string syntax, so the comparison cannot
# be misread as a column reference under stricter quoting settings.
flags_status_sql = ",\n".join(
    f"    CASE WHEN STATUS = '{valor}' THEN 1 ELSE 0 END AS STATUS_{valor}"
    for valor in status_monitorados
)

# No ORDER BY inside the query: the rows are aggregated per SK_ID_BUREAU
# afterwards, so sorting here would only add cost.
df_temp_02 = spark.sql(f"""
SELECT
    *,
{flags_status_sql}
FROM df_temp_01
""")

# Register under its own name. Reusing "df_temp_01" would overwrite the view
# this query reads from, so re-running the cell would select from its own
# output and duplicate the STATUS_* columns.
df_temp_02.createOrReplaceTempView("df_temp_02")

# Sort only for the preview, so the sample still starts at the lowest id.
df_temp_02.orderBy("SK_ID_BUREAU").show(5)



+------------+--------------+------+---+---+---+----+--------+--------+--------+--------+
|SK_ID_BUREAU|MONTHS_BALANCE|STATUS|U3M|U6M|U9M|U12M|STATUS_C|STATUS_0|STATUS_X|STATUS_1|
+------------+--------------+------+---+---+---+----+--------+--------+--------+--------+
|     5001709|             0|     C|  1|  1|  1|   1|       1|       0|       0|       0|
|     5001709|            -5|     C|  0|  1|  1|   1|       1|       0|       0|       0|
|     5001709|            -1|     C|  1|  1|  1|   1|       1|       0|       0|       0|
|     5001709|            -2|     C|  1|  1|  1|   1|       1|       0|       0|       0|
|     5001709|            -3|     C|  1|  1|  1|   1|       1|       0|       0|       0|
+------------+--------------+------+---+---+---+----+--------+--------+--------+--------+
only showing top 5 rows



                                                                                

## Criando variáveis de primeira camada

In [17]:
# Namespaced import: pulling round/sum/max/min directly from
# pyspark.sql.functions would shadow the Python builtins of the same name for
# every later cell.
from pyspark.sql import functions as F

# Status indicator columns to aggregate and the temporal window flags.
colunas_agregacao_total = ['STATUS_C','STATUS_0','STATUS_X','STATUS_1']
colunas_flags = ['U3M','U6M', 'U9M', 'U12M']

# Build four aggregations (total, mean, max, min) for every (window, status) pair.
expressoes_agregacao = []
for flag in colunas_flags:
    for coluna in colunas_agregacao_total:
        sufixo = f"{coluna.upper()}_{flag.upper()}_BUREAU_BL"

        # Indicator value for rows inside the window, NULL for rows outside it
        # (when() without otherwise() yields NULL), so avg/max/min skip them.
        na_janela = F.when(F.col(flag) == 1, F.col(coluna))

        expressoes_agregacao.extend([
            # Total occurrences of the status inside the window. Both columns
            # are 0/1, so their product is 1 only for in-window rows with that
            # status. Counting non-null values instead would return the number
            # of in-window rows regardless of status, giving the same value
            # for every status column.
            F.sum(F.col(flag) * F.col(coluna)).alias(f"QT_TT_{sufixo}"),
            # Share of in-window rows with the status, rounded to 2 decimals.
            F.round(F.avg(na_janela), 2).alias(f"QT_MED_{sufixo}"),
            # max/min of a 0/1 integer need no rounding.
            F.max(na_janela).alias(f"QT_MAX_{sufixo}"),
            F.min(na_janela).alias(f"QT_MIN_{sufixo}"),
        ])

expressoes_agregacao = tuple(expressoes_agregacao)

# Apply the aggregation expressions: one output row per SK_ID_BUREAU.
df_temp_03 = df_temp_02.groupBy("SK_ID_BUREAU").agg(*expressoes_agregacao).orderBy("SK_ID_BUREAU")


# Number and names of the created variables (everything except the key column).
nomes_cols = df_temp_03.columns
nomes_cols_novas = nomes_cols[1:]
print('Quantidade Total de Variáveis Criadas:', len(df_temp_03.columns) - 1)
print('Nomes das Variáveis Criadas:', nomes_cols_novas)
print('')
print('')

# Number of rows in the DataFrame.
num_rows_df = df_temp_03.count()

# Number of columns in the DataFrame.
num_columns_df = len(df_temp_03.columns)

# Print the row and column counts.
print(f'Quantidade de linhas do DataFrame: {num_rows_df}')
print(f'Quantidade de colunas do DataFrame: {num_columns_df}')
print('')
print('')

# Show the new DataFrame with the created variables.
df_temp_03.show(5, False)

Quantidade Total de Variáveis Criadas: 64
Nomes das Variáveis Criadas: ['QT_TT_STATUS_C_U3M_BUREAU_BL', 'QT_MED_STATUS_C_U3M_BUREAU_BL', 'QT_MAX_STATUS_C_U3M_BUREAU_BL', 'QT_MIN_STATUS_C_U3M_BUREAU_BL', 'QT_TT_STATUS_0_U3M_BUREAU_BL', 'QT_MED_STATUS_0_U3M_BUREAU_BL', 'QT_MAX_STATUS_0_U3M_BUREAU_BL', 'QT_MIN_STATUS_0_U3M_BUREAU_BL', 'QT_TT_STATUS_X_U3M_BUREAU_BL', 'QT_MED_STATUS_X_U3M_BUREAU_BL', 'QT_MAX_STATUS_X_U3M_BUREAU_BL', 'QT_MIN_STATUS_X_U3M_BUREAU_BL', 'QT_TT_STATUS_1_U3M_BUREAU_BL', 'QT_MED_STATUS_1_U3M_BUREAU_BL', 'QT_MAX_STATUS_1_U3M_BUREAU_BL', 'QT_MIN_STATUS_1_U3M_BUREAU_BL', 'QT_TT_STATUS_C_U6M_BUREAU_BL', 'QT_MED_STATUS_C_U6M_BUREAU_BL', 'QT_MAX_STATUS_C_U6M_BUREAU_BL', 'QT_MIN_STATUS_C_U6M_BUREAU_BL', 'QT_TT_STATUS_0_U6M_BUREAU_BL', 'QT_MED_STATUS_0_U6M_BUREAU_BL', 'QT_MAX_STATUS_0_U6M_BUREAU_BL', 'QT_MIN_STATUS_0_U6M_BUREAU_BL', 'QT_TT_STATUS_X_U6M_BUREAU_BL', 'QT_MED_STATUS_X_U6M_BUREAU_BL', 'QT_MAX_STATUS_X_U6M_BUREAU_BL', 'QT_MIN_STATUS_X_U6M_BUREAU_BL', 'QT_TT_STAT

25/07/21 12:14:56 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


Quantidade de linhas do DataFrame: 817395
Quantidade de colunas do DataFrame: 65




[Stage 30:>                                                         (0 + 6) / 6]

+------------+----------------------------+-----------------------------+-----------------------------+-----------------------------+----------------------------+-----------------------------+-----------------------------+-----------------------------+----------------------------+-----------------------------+-----------------------------+-----------------------------+----------------------------+-----------------------------+-----------------------------+-----------------------------+----------------------------+-----------------------------+-----------------------------+-----------------------------+----------------------------+-----------------------------+-----------------------------+-----------------------------+----------------------------+-----------------------------+-----------------------------+-----------------------------+----------------------------+-----------------------------+-----------------------------+-----------------------------+----------------------------+-----

                                                                                

In [19]:

# Persist the aggregated features as Parquet, replacing any previous output at
# the same path.
(
    df_temp_03.write
    .mode("overwrite")
    .parquet('/data/books/bureau_balance')
)



                                                                                

In [20]:
# Shut down the Spark session; this is the last step of the notebook, after
# the Parquet output has been written.
spark.stop()