## Setup do ambiente para utilização do Spark

In [1]:
# Instalação e configuração de variaveis ambiente para utilizar Google Drive
# Se utilizar o Colab ajustar para True para instalação dos pre-requisitos
colab = True

if colab==True:
    from google.colab import drive
    drive.mount('/content/drive')

    # Instalação de requisitos
    !apt-get update # Update apt-get repository.
    !apt-get install openjdk-8-jdk-headless -qq > /dev/null # Install Java.
    !wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz # Download Apache Sparks.
    !tar xf spark-3.1.1-bin-hadoop3.2.tgz # Unzip the tgz file.
    !pip install -q findspark # Install findspark. Adds PySpark to the System path during runtime.

    # Importando a biblioteca os
    import os
    # Definindo a variável de ambiente do Java
    os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
    # Definindo a variável de ambiente do Spark
    #os.environ["SPARK_HOME"] = "/content/spark-3.1.2-bin-hadoop2.7"
    os.environ["SPARK_HOME"] = "/content/spark-3.1.1-bin-hadoop3.2"

Mounted at /content/drive
Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Get:2 http://security.ubuntu.com/ubuntu jammy-security InRelease [110 kB]
Hit:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [119 kB]
Hit:6 https://ppa.launchpadcontent.net/c2d4u.team/c2d4u4.0+/ubuntu jammy InRelease
Hit:7 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:8 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:9 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:11 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 Packages [1,357 kB]
Get:12 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 Packages [1,920 kB]
Fetched 3,510 kB in 2s (1,875 kB/s)
R

In [2]:
# Configuration variables
# Base directory where the input data lives and where the output is written

# Local PC
# dir_base = "data/"
# Google Drive (path below the mount point used in the setup cell)
dir_base = "/content/drive/MyDrive/jupyter/pcd_0124_analise_de_credito/data/"

In [3]:
import os
import sys

import findspark

# Make the Spark driver and workers use the same Python interpreter as this
# kernel. These variables are read when the Spark context starts, so they must
# be set BEFORE the session is created; setting them afterwards has no effect
# on an already running session.
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

# findspark locates the Spark installation and makes pyspark importable,
# which is why the pyspark import has to come after this call.
findspark.init()
from pyspark.sql import SparkSession

# Create the Spark session, or reuse the one already active in this kernel.
spark = (
    SparkSession.builder
    .appName("Análise de Crédito - Credit Card Balance")
    .getOrCreate()
)

# Leitura dos dados

In [4]:
# Read the credit card balance CSV; the first row holds the column names.
# inferSchema=True makes Spark detect numeric column types. Without it every
# column is loaded as a string, and string columns are compared
# lexicographically (e.g. "9" > "10") by max/min and ORDER BY.
data_cc = spark.read.csv(
    dir_base + "credit_card_balance.csv",
    header=True,
    inferSchema=True,
)

In [5]:
# Preview the loaded data
data_cc.show()

+----------+----------+--------------+-----------+-----------------------+------------------------+--------------------+--------------------------+------------------------+-----------------------+-------------------+-------------------------+------------------------+-------------+--------------------+------------------------+--------------------+--------------------------+------------------------+-------------------------+--------------------+------+----------+
|SK_ID_PREV|SK_ID_CURR|MONTHS_BALANCE|AMT_BALANCE|AMT_CREDIT_LIMIT_ACTUAL|AMT_DRAWINGS_ATM_CURRENT|AMT_DRAWINGS_CURRENT|AMT_DRAWINGS_OTHER_CURRENT|AMT_DRAWINGS_POS_CURRENT|AMT_INST_MIN_REGULARITY|AMT_PAYMENT_CURRENT|AMT_PAYMENT_TOTAL_CURRENT|AMT_RECEIVABLE_PRINCIPAL|AMT_RECIVABLE|AMT_TOTAL_RECEIVABLE|CNT_DRAWINGS_ATM_CURRENT|CNT_DRAWINGS_CURRENT|CNT_DRAWINGS_OTHER_CURRENT|CNT_DRAWINGS_POS_CURRENT|CNT_INSTALMENT_MATURE_CUM|NAME_CONTRACT_STATUS|SK_DPD|SK_DPD_DEF|
+----------+----------+--------------+-----------+------------------

# Criação de flags para auxílio na visão temporal dos dados

In [6]:
# Look-back windows, in months, used to build the time flags: two fixed short
# windows plus every multiple of `interval` up to `max_value`. The zero-length
# window produced by the start of the range is left out.
max_value = 12
interval = 12
list_value = [3, 6]
list_value.extend(
    months for months in range(0, max_value + 1, interval) if months != 0
)

def case_when_flags(list_value):
    """Build the SQL ``CASE WHEN`` flag expressions for a list of month windows.

    For every value ``n`` in ``list_value`` one clause is generated that yields
    1 when ``MONTHS_BALANCE >= -n`` and 0 otherwise, aliased as
    ``ultimos_<n>_meses``.

    Args:
        list_value: Iterable of window sizes (in months), e.g. ``[3, 6, 12]``.

    Returns:
        A two-element list ``[sql_case, list_columns_flag]`` where ``sql_case``
        is the comma-separated clauses as a single string (ready to be placed
        in a SELECT list) and ``list_columns_flag`` is the list of generated
        alias names, in the same order as ``list_value``. An empty input
        yields ``["", []]``.
    """
    list_columns_flag = ["ultimos_" + str(months) + "_meses" for months in list_value]

    # Each clause keeps its trailing space so the generated text is identical
    # to what callers already receive.
    clauses = [
        "CASE WHEN MONTHS_BALANCE >= -" + str(months) + " THEN 1 ELSE 0 END AS " + alias + " "
        for months, alias in zip(list_value, list_columns_flag)
    ]

    # Join by position rather than comparing each value with the last one:
    # the value comparison dropped separators whenever an earlier element
    # happened to equal the final element.
    sql_case = ", ".join(clauses)

    return [sql_case, list_columns_flag]

# Generate the flag expressions once, then unpack the SQL fragment and the
# list of flag column names.
list_case = case_when_flags(list_value)
list_case_query, list_flags_columns = list_case

In [7]:
# Display the generated CASE WHEN clauses for inspection
list_case_query

'CASE WHEN MONTHS_BALANCE >= -3 THEN 1 ELSE 0 END AS ultimos_3_meses , CASE WHEN MONTHS_BALANCE >= -6 THEN 1 ELSE 0 END AS ultimos_6_meses , CASE WHEN MONTHS_BALANCE >= -12 THEN 1 ELSE 0 END AS ultimos_12_meses '

In [8]:
# Register the DataFrame as a temp view so it can be queried with Spark SQL
data_cc.createOrReplaceTempView("data_cc")
# Select every original column plus the generated flag columns, ordered by SK_ID_PREV
spark_query = f"""
SELECT *, {list_case_query} FROM data_cc
ORDER BY `SK_ID_PREV`
"""
df_tmp_01 = spark.sql(spark_query)
# Register the flagged result as a temp view too, then preview it
df_tmp_01.createOrReplaceTempView("df_tmp_01")
df_tmp_01.show()

+----------+----------+--------------+-----------+-----------------------+------------------------+--------------------+--------------------------+------------------------+-----------------------+-------------------+-------------------------+------------------------+-------------+--------------------+------------------------+--------------------+--------------------------+------------------------+-------------------------+--------------------+------+----------+---------------+---------------+----------------+
|SK_ID_PREV|SK_ID_CURR|MONTHS_BALANCE|AMT_BALANCE|AMT_CREDIT_LIMIT_ACTUAL|AMT_DRAWINGS_ATM_CURRENT|AMT_DRAWINGS_CURRENT|AMT_DRAWINGS_OTHER_CURRENT|AMT_DRAWINGS_POS_CURRENT|AMT_INST_MIN_REGULARITY|AMT_PAYMENT_CURRENT|AMT_PAYMENT_TOTAL_CURRENT|AMT_RECEIVABLE_PRINCIPAL|AMT_RECIVABLE|AMT_TOTAL_RECEIVABLE|CNT_DRAWINGS_ATM_CURRENT|CNT_DRAWINGS_CURRENT|CNT_DRAWINGS_OTHER_CURRENT|CNT_DRAWINGS_POS_CURRENT|CNT_INSTALMENT_MATURE_CUM|NAME_CONTRACT_STATUS|SK_DPD|SK_DPD_DEF|ultimos_3_meses|ulti

# Sumarizar na visão cliente (Automatizada)

In [9]:
from pyspark.sql.functions import col, round, sum, avg, max, min, when

In [10]:
# List the column names of the flagged DataFrame
print(df_tmp_01.columns)

['SK_ID_PREV', 'SK_ID_CURR', 'MONTHS_BALANCE', 'AMT_BALANCE', 'AMT_CREDIT_LIMIT_ACTUAL', 'AMT_DRAWINGS_ATM_CURRENT', 'AMT_DRAWINGS_CURRENT', 'AMT_DRAWINGS_OTHER_CURRENT', 'AMT_DRAWINGS_POS_CURRENT', 'AMT_INST_MIN_REGULARITY', 'AMT_PAYMENT_CURRENT', 'AMT_PAYMENT_TOTAL_CURRENT', 'AMT_RECEIVABLE_PRINCIPAL', 'AMT_RECIVABLE', 'AMT_TOTAL_RECEIVABLE', 'CNT_DRAWINGS_ATM_CURRENT', 'CNT_DRAWINGS_CURRENT', 'CNT_DRAWINGS_OTHER_CURRENT', 'CNT_DRAWINGS_POS_CURRENT', 'CNT_INSTALMENT_MATURE_CUM', 'NAME_CONTRACT_STATUS', 'SK_DPD', 'SK_DPD_DEF', 'ultimos_3_meses', 'ultimos_6_meses', 'ultimos_12_meses']


In [11]:
# Columns to aggregate: every column except the identifiers and the month offset.
# NOTE(review): this still includes the flag columns themselves and the
# categorical NAME_CONTRACT_STATUS column, exactly as before; they are kept so
# the output schema is unchanged. Confirm whether they are wanted.
columns_not_aggregated = ('SK_ID_CURR', 'SK_ID_PREV', 'MONTHS_BALANCE')
columns_agg_total = [c for c in df_tmp_01.columns if c not in columns_not_aggregated]

columns_flags = list_flags_columns

expressions_agg = []

for flag in columns_flags:
    for column in columns_agg_total:
        # Only rows inside the flag's time window contribute (others become NULL).
        # The value is cast to double so that max/min compare numerically; on
        # string-typed columns they would otherwise return the lexicographic
        # extreme (e.g. "9" > "10").
        flagged_value = when(col(flag) == 1, col(column).cast("double"))
        suffix = f'{column.upper()}_{flag.upper()}'

        if "DPD" in column:
            # Days-past-due columns: only the extremes are kept.
            aggregations = (("QT_MAX", max), ("QT_MIN", min))
        else:
            # Remaining columns: total, mean, maximum and minimum.
            aggregations = (("VL_TOT", sum), ("VL_MED", avg), ("VL_MAX", max), ("VL_MIN", min))

        for prefix, agg_function in aggregations:
            expressions_agg.append(
                round(agg_function(flagged_value), 2).alias(f'{prefix}_{suffix}')
            )

expressions_agg = tuple(expressions_agg)

# Apply the aggregation expressions.
# NOTE(review): the section heading describes a customer-level summary, but
# the grouping key is SK_ID_PREV — confirm this is the intended key.
df_tmp_02 = df_tmp_01.groupBy("SK_ID_PREV").agg(*expressions_agg).orderBy("SK_ID_PREV")

# Preview the aggregated data
df_tmp_02.show()



+----------+----------------------------------+----------------------------------+----------------------------------+----------------------------------+----------------------------------------------+----------------------------------------------+----------------------------------------------+----------------------------------------------+-----------------------------------------------+-----------------------------------------------+-----------------------------------------------+-----------------------------------------------+-------------------------------------------+-------------------------------------------+-------------------------------------------+-------------------------------------------+-------------------------------------------------+-------------------------------------------------+-------------------------------------------------+-------------------------------------------------+-----------------------------------------------+--------------------------------------------

# Salvar a tabela sumarizada

In [12]:
# Bring the data into a single partition before writing
# (presumably so only one output part file is produced — TODO confirm)
df_tmp_02 = df_tmp_02.repartition(1)
# Write the aggregated table as CSV with a header row under dir_base,
# using overwrite mode for any existing output at that path
df_tmp_02.write.mode("overwrite").csv(dir_base + "credit_card_balance_agg.csv",header=True)