In [1]:
# Instalação e configuração de variaveis ambiente para utilizar Google Drive
# Se utilizar o Colab ajustar para True para instalação dos pre-requisitos
colab = True

if colab==True:
    from google.colab import drive
    drive.mount('/content/drive')

    # Instalação de requisitos
    !apt-get update # Update apt-get repository.
    !apt-get install openjdk-8-jdk-headless -qq > /dev/null # Install Java.
    !wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz # Download Apache Sparks.
    !tar xf spark-3.1.1-bin-hadoop3.2.tgz # Unzip the tgz file.
    !pip install -q findspark # Install findspark. Adds PySpark to the System path during runtime.

    # Importando a biblioteca os
    import os
    # Definindo a variável de ambiente do Java
    os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
    # Definindo a variável de ambiente do Spark
    #os.environ["SPARK_HOME"] = "/content/spark-3.1.2-bin-hadoop2.7"
    os.environ["SPARK_HOME"] = "/content/spark-3.1.1-bin-hadoop3.2"

Mounted at /content/drive
Hit:1 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:2 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [119 kB]
Get:3 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Hit:4 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:5 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:6 https://ppa.launchpadcontent.net/c2d4u.team/c2d4u4.0+/ubuntu jammy InRelease
Hit:7 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:8 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:10 http://security.ubuntu.com/ubuntu jammy-security InRelease [110 kB]
Get:11 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 Packages [1,920 kB]
Get:12 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 Packages [1,357 kB]
Fetched 3,510 kB in 1s (2,609 kB/s)
R

In [2]:
# Variaveis de configuração
# Diretorio base dos dados

# Local PC
#dir_base = "data/"
# Google Drive
dir_base = "/content/drive/MyDrive/jupyter/pcd_0124_analise_de_credito/data/"

In [3]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
spark = SparkSession \
    .builder \
    .appName("Análise de Crédito - Pos Cash Balance") \
    .getOrCreate()

import os
import sys

os.environ["PYSPARK_PYTHON"] = sys.executable
os.environ["PYSPARK_DRIVER_PYTHON"] = sys.executable

# Leitura dos dados - Post Cash Balance

In [4]:
data_pc = spark.read.csv(dir_base + "pos_cash_balance.csv", header = True)
data_pc.show(10)

+----------+----------+--------------+--------------+---------------------+--------------------+------+----------+
|SK_ID_PREV|SK_ID_CURR|MONTHS_BALANCE|CNT_INSTALMENT|CNT_INSTALMENT_FUTURE|NAME_CONTRACT_STATUS|SK_DPD|SK_DPD_DEF|
+----------+----------+--------------+--------------+---------------------+--------------------+------+----------+
|   1803195|    182943|           -31|          48.0|                 45.0|              Active|     0|         0|
|   1715348|    367990|           -33|          36.0|                 35.0|              Active|     0|         0|
|   1784872|    397406|           -32|          12.0|                  9.0|              Active|     0|         0|
|   1903291|    269225|           -35|          48.0|                 42.0|              Active|     0|         0|
|   2341044|    334279|           -35|          36.0|                 35.0|              Active|     0|         0|
|   2207092|    342166|           -32|          12.0|                 12.0|     

# Criação de flags para auxilio na visão temporal dos dados

In [5]:
# Verifica valor máximo da coluna e gera valores baseados no intervalo
max_value = 12 # No campo DAYS_CREDIT o valor é em dias
interval = 12 # Usar intervalos em dias
list_value = [3, 6] # Valores iniciais já adicionados na lista fora do intervalo

for i in range(0, max_value+1, interval):
    list_value.append(i)
list_value.remove(0)
column_name = 'MONTHS_BALANCE'

def case_when_flags_days(list_value, column_name):
    last = len(list_value)-1
    sql_case = ""
    list_columns_flag = []
    list_sql_case = []
    for i in list_value:
        sql_case += "CASE WHEN " + column_name + " >= -" + str(i) + " THEN 1 ELSE 0 END AS ultimos_" + str(i) + "_meses"
        list_columns_flag.append("ultimos_" + str(i) + "_meses")
        if i!=list_value[last]:
            sql_case += ", "

    list_sql_case.append(sql_case)
    list_sql_case.append(list_columns_flag)

    return list_sql_case

list_case = case_when_flags_days(list_value, column_name)
list_case_query = list_case[0]
list_flags_columns_m = list_case[1]

In [6]:
list_case_query

'CASE WHEN MONTHS_BALANCE >= -3 THEN 1 ELSE 0 END AS ultimos_3_meses, CASE WHEN MONTHS_BALANCE >= -6 THEN 1 ELSE 0 END AS ultimos_6_meses, CASE WHEN MONTHS_BALANCE >= -12 THEN 1 ELSE 0 END AS ultimos_12_meses'

In [7]:
# Query SQL com visão temporal
spark_query = f"""
SELECT *, {list_case_query}
    FROM data_pc
    ORDER BY SK_ID_PREV
"""

data_pc.createOrReplaceTempView("data_pc")

df_tmp_01 = spark.sql(spark_query)

df_tmp_01.createOrReplaceTempView("df_tmp_01")
df_tmp_01.show()

+----------+----------+--------------+--------------+---------------------+--------------------+------+----------+---------------+---------------+----------------+
|SK_ID_PREV|SK_ID_CURR|MONTHS_BALANCE|CNT_INSTALMENT|CNT_INSTALMENT_FUTURE|NAME_CONTRACT_STATUS|SK_DPD|SK_DPD_DEF|ultimos_3_meses|ultimos_6_meses|ultimos_12_meses|
+----------+----------+--------------+--------------+---------------------+--------------------+------+----------+---------------+---------------+----------------+
|   1000001|    158271|           -10|          12.0|                 12.0|              Active|     0|         0|              0|              0|               1|
|   1000001|    158271|            -8|           2.0|                  0.0|           Completed|     0|         0|              0|              0|               1|
|   1000001|    158271|            -9|          12.0|                 11.0|              Active|     0|         0|              0|              0|               1|
|   1000002|    

In [8]:
# Query SQL com visão temporal
spark_query = f"""
SELECT distinct NAME_CONTRACT_STATUS FROM data_pc
"""
data_pc.createOrReplaceTempView("data_pc")
df_tmp_teste = spark.sql(spark_query)
df_tmp_teste.createOrReplaceTempView("df_tmp_teste")
df_tmp_teste.show()


+--------------------+
|NAME_CONTRACT_STATUS|
+--------------------+
|              Demand|
|            Approved|
|           Completed|
|      Amortized debt|
|Returned to the s...|
|                 XNA|
|              Active|
|              Signed|
|            Canceled|
+--------------------+



In [9]:
# Gera Case When pra uma lista

spark_query = f"""
SELECT distinct NAME_CONTRACT_STATUS FROM data_pc
"""
data_pc.createOrReplaceTempView("data_pc")
df_tmp_teste = spark.sql(spark_query)
df_tmp_teste.createOrReplaceTempView("df_tmp_teste")

name_list = df_tmp_teste.select('NAME_CONTRACT_STATUS').rdd.flatMap(lambda x: x).collect()
contract_list_status = [ status.replace(" ", "_").upper() for status in name_list]
list_value = contract_list_status
column_name = 'NAME_CONTRACT_STATUS'

def case_when_flags_column(list_value, column_name):
    last = len(list_value)-1
    sql_case = ""
    list_columns_flag = []
    list_sql_case = []
    for i in list_value:
        sql_case += "CASE WHEN " + column_name + "='" + str(i) + "' THEN 1 ELSE 0 END AS " + column_name + "_" + str(i)
        list_columns_flag.append(column_name + "_" + str(i))
        if i!=list_value[last]:
            sql_case += ", "

    list_sql_case.append(sql_case)
    list_sql_case.append(list_columns_flag)

    return list_sql_case

list_case = case_when_flags_column(list_value, column_name)
list_case_query = list_case[0]
list_flags_columns = list_case[1]

print(list_case_query)
print(list_flags_columns)

CASE WHEN NAME_CONTRACT_STATUS='DEMAND' THEN 1 ELSE 0 END AS NAME_CONTRACT_STATUS_DEMAND, CASE WHEN NAME_CONTRACT_STATUS='APPROVED' THEN 1 ELSE 0 END AS NAME_CONTRACT_STATUS_APPROVED, CASE WHEN NAME_CONTRACT_STATUS='COMPLETED' THEN 1 ELSE 0 END AS NAME_CONTRACT_STATUS_COMPLETED, CASE WHEN NAME_CONTRACT_STATUS='AMORTIZED_DEBT' THEN 1 ELSE 0 END AS NAME_CONTRACT_STATUS_AMORTIZED_DEBT, CASE WHEN NAME_CONTRACT_STATUS='RETURNED_TO_THE_STORE' THEN 1 ELSE 0 END AS NAME_CONTRACT_STATUS_RETURNED_TO_THE_STORE, CASE WHEN NAME_CONTRACT_STATUS='XNA' THEN 1 ELSE 0 END AS NAME_CONTRACT_STATUS_XNA, CASE WHEN NAME_CONTRACT_STATUS='ACTIVE' THEN 1 ELSE 0 END AS NAME_CONTRACT_STATUS_ACTIVE, CASE WHEN NAME_CONTRACT_STATUS='SIGNED' THEN 1 ELSE 0 END AS NAME_CONTRACT_STATUS_SIGNED, CASE WHEN NAME_CONTRACT_STATUS='CANCELED' THEN 1 ELSE 0 END AS NAME_CONTRACT_STATUS_CANCELED
['NAME_CONTRACT_STATUS_DEMAND', 'NAME_CONTRACT_STATUS_APPROVED', 'NAME_CONTRACT_STATUS_COMPLETED', 'NAME_CONTRACT_STATUS_AMORTIZED_DEBT',

In [10]:
# Query SQL com visão temporal
spark_query = f"""
SELECT *, {list_case_query}
    FROM df_tmp_01
    ORDER BY SK_ID_PREV
"""

df_tmp_01.createOrReplaceTempView("df_tmp_01")

df_tmp_02 = spark.sql(spark_query)

df_tmp_02.createOrReplaceTempView("df_tmp_02")
df_tmp_02.show(10)

+----------+----------+--------------+--------------+---------------------+--------------------+------+----------+---------------+---------------+----------------+---------------------------+-----------------------------+------------------------------+-----------------------------------+------------------------------------------+------------------------+---------------------------+---------------------------+-----------------------------+
|SK_ID_PREV|SK_ID_CURR|MONTHS_BALANCE|CNT_INSTALMENT|CNT_INSTALMENT_FUTURE|NAME_CONTRACT_STATUS|SK_DPD|SK_DPD_DEF|ultimos_3_meses|ultimos_6_meses|ultimos_12_meses|NAME_CONTRACT_STATUS_DEMAND|NAME_CONTRACT_STATUS_APPROVED|NAME_CONTRACT_STATUS_COMPLETED|NAME_CONTRACT_STATUS_AMORTIZED_DEBT|NAME_CONTRACT_STATUS_RETURNED_TO_THE_STORE|NAME_CONTRACT_STATUS_XNA|NAME_CONTRACT_STATUS_ACTIVE|NAME_CONTRACT_STATUS_SIGNED|NAME_CONTRACT_STATUS_CANCELED|
+----------+----------+--------------+--------------+---------------------+--------------------+------+----------+

# Sumarizar na visão cliente (Automatizada)



In [11]:
# Importar bibliotecas
from pyspark.sql.functions import col, round, sum, avg, max, min, when, count

In [12]:
df_tmp_02.columns

['SK_ID_PREV',
 'SK_ID_CURR',
 'MONTHS_BALANCE',
 'CNT_INSTALMENT',
 'CNT_INSTALMENT_FUTURE',
 'NAME_CONTRACT_STATUS',
 'SK_DPD',
 'SK_DPD_DEF',
 'ultimos_3_meses',
 'ultimos_6_meses',
 'ultimos_12_meses',
 'NAME_CONTRACT_STATUS_DEMAND',
 'NAME_CONTRACT_STATUS_APPROVED',
 'NAME_CONTRACT_STATUS_COMPLETED',
 'NAME_CONTRACT_STATUS_AMORTIZED_DEBT',
 'NAME_CONTRACT_STATUS_RETURNED_TO_THE_STORE',
 'NAME_CONTRACT_STATUS_XNA',
 'NAME_CONTRACT_STATUS_ACTIVE',
 'NAME_CONTRACT_STATUS_SIGNED',
 'NAME_CONTRACT_STATUS_CANCELED']

In [13]:
# Definir as colunas para agregação
columns_agg_total = df_tmp_01.columns
columns_agg_total.remove('SK_ID_CURR')
columns_agg_total.remove('SK_ID_PREV')
columns_agg_total.remove('MONTHS_BALANCE')

columns_flags = list_flags_columns

suffix = "_POS_CASH"

expressions_agg = []

for flag in columns_flags:
    for column in columns_agg_total:
        if "DPD" in column:
            expressions_agg.append(round(max(when(col(flag)==1, col(column))), 2).alias(f'QT_MAX_{column.upper()}_{flag.upper()}{suffix}'))
            expressions_agg.append(round(min(when(col(flag)==1, col(column))), 2).alias(f'QT_MIN_{column.upper()}_{flag.upper()}{suffix}'))
        else:
            expressions_agg.append(round(sum(when(col(flag)==1, col(column))), 2).alias(f'VL_TOT_{column.upper()}_{flag.upper()}{suffix}'))
            expressions_agg.append(round(avg(when(col(flag)==1, col(column))), 2).alias(f'VL_MED_{column.upper()}_{flag.upper()}{suffix}'))
            expressions_agg.append(round(max(when(col(flag)==1, col(column))), 2).alias(f'VL_MAX_{column.upper()}_{flag.upper()}{suffix}'))
            expressions_agg.append(round(min(when(col(flag)==1, col(column))), 2).alias(f'VL_MIN_{column.upper()}_{flag.upper()}{suffix}'))

expressions_agg = tuple(expressions_agg)

#print(expressions_agg)

#aplicar as expressões de agregação
df_tmp_03 = df_tmp_02.groupBy("SK_ID_PREV").agg(*expressions_agg).orderBy("SK_ID_PREV")

In [14]:
#exibir o dados resultantes
df_tmp_03.show(10)

+----------+----------------------------------------------------------+----------------------------------------------------------+----------------------------------------------------------+----------------------------------------------------------+-----------------------------------------------------------------+-----------------------------------------------------------------+-----------------------------------------------------------------+-----------------------------------------------------------------+----------------------------------------------------------------+----------------------------------------------------------------+----------------------------------------------------------------+----------------------------------------------------------------+--------------------------------------------------+--------------------------------------------------+------------------------------------------------------+------------------------------------------------------+----------------

# Salvar a tabela sumarizada

In [15]:
df_tmp_04 = df_tmp_03.repartition(1)
df_tmp_04.write.mode("overwrite").csv(dir_base + "pos_cash_balance_agg.csv",header=True)