# Setup do ambiente para utilização do Spark

In [1]:
# Standard-library modules used for environment configuration
import os
import sys

# Installation and environment-variable setup for running with Google Drive.
# When running on Colab, set this to True so the prerequisites get installed.
colab = True

if colab==True:
    # Mount Google Drive so the data directory under /content/drive is reachable
    from google.colab import drive
    drive.mount('/content/drive')

    # Install the requirements (Java, Spark distribution, findspark)
    !apt-get update # Update apt-get repository.
    !apt-get install openjdk-8-jdk-headless -qq > /dev/null # Install Java.
    !wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz # Download Apache Sparks.
    #!wget -q https://archive.apache.org/dist/spark/spark-3.1.2/spark-3.1.2-bin-hadoop2.7.tgz
    !tar xf spark-3.1.1-bin-hadoop3.2.tgz # Unzip the tgz file.
    #!tar xf spark-3.1.2-bin-hadoop2.7.tgz

    !pip install -q findspark # Install findspark. Adds PySpark to the System path during runtime.

    # Point JAVA_HOME at the JDK installed above
    os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

    # Point SPARK_HOME at the directory extracted from the tarball above
    # (must match the Spark/Hadoop version that was downloaded)
    #os.environ["SPARK_HOME"] = "/content/spark-3.1.2-bin-hadoop2.7"
    os.environ["SPARK_HOME"] = "/content/spark-3.1.1-bin-hadoop3.2"

Mounted at /content/drive
Hit:1 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:2 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [119 kB]
Get:3 http://security.ubuntu.com/ubuntu jammy-security InRelease [110 kB]
Get:4 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Hit:5 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:6 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:7 https://ppa.launchpadcontent.net/c2d4u.team/c2d4u4.0+/ubuntu jammy InRelease
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:11 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 Packages [1,920 kB]
Get:12 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 Packages [1,358 kB]
Fetched 3,510 kB in 2s (1,890 kB/s)
R

In [2]:
# Configuration: base directory that holds the input CSVs and receives the outputs.
# For a local run, use a relative folder instead, e.g. dir_base = "data/".
# The value below is the Google Drive location used when running on Colab.
dir_base = (
    "/content/drive/MyDrive/jupyter/"
    "pcd_0124_analise_de_credito/data/"
)

In [3]:
# findspark makes the Spark installation importable from this kernel
import findspark

# Add PySpark to the Python path
findspark.init()
from pyspark.sql import SparkSession

# Make the driver and the Python workers use this kernel's interpreter.
# These variables are read when the Spark context starts, so they have to be
# set BEFORE the session is created; setting them afterwards has no effect on
# the context that is already running.
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

# Create the session for this notebook (or reuse one that already exists)
spark = (
    SparkSession.builder
    .appName("Análise de Crédito - Previous Application")
    .getOrCreate()
)

# Leitura dos dados

In [4]:
# Load the base file; the first CSV line supplies the column names
data_pa = spark.read.csv(path=f"{dir_base}previous_application.csv", header=True)

In [5]:
# Preview the first 10 rows of the data that was just loaded
data_pa.show(10)

+----------+----------+------------------+-----------+---------------+----------+----------------+---------------+--------------------------+-----------------------+---------------------------+----------------------+-----------------+---------------------+------------------------+----------------------+--------------------+-------------+--------------------+------------------+---------------+----------------+-------------------+--------------+-----------------+--------------------+----------------+--------------------+-----------+----------------+--------------------+------------------+--------------+-------------------------+-------------+----------------+-------------------------+
|SK_ID_PREV|SK_ID_CURR|NAME_CONTRACT_TYPE|AMT_ANNUITY|AMT_APPLICATION|AMT_CREDIT|AMT_DOWN_PAYMENT|AMT_GOODS_PRICE|WEEKDAY_APPR_PROCESS_START|HOUR_APPR_PROCESS_START|FLAG_LAST_APPL_PER_CONTRACT|NFLAG_LAST_APPL_IN_DAY|RATE_DOWN_PAYMENT|RATE_INTEREST_PRIMARY|RATE_INTEREST_PRIVILEGED|NAME_CASH_LOAN_PURPOSE|NAME

In [6]:
# Expose the DataFrame to Spark SQL under the name used in the query below
data_pa.createOrReplaceTempView("data_pa")

# Distinct values of the contract status column
spark_query = """
SELECT distinct NAME_CONTRACT_STATUS FROM data_pa
"""
df_tmp_teste = spark.sql(spark_query)
df_tmp_teste.createOrReplaceTempView("df_tmp_teste")

# Bring the single selected column back to the driver as a plain Python list
status_rows = df_tmp_teste.select('NAME_CONTRACT_STATUS').collect()
name_list = [status_row[0] for status_row in status_rows]

# Normalise each status: spaces become underscores, everything upper-case
contract_list_status = [
    raw_status.replace(" ", "_").upper()
    for raw_status in name_list
]

In [7]:
# Show the normalised contract status names
contract_list_status

['APPROVED', 'UNUSED_OFFER', 'CANCELED', 'REFUSED']

# Criação de flags para auxílio na visão temporal dos dados

In [8]:
# Look-back windows (in days) used to build the time-window flags.
column_name = 'DAYS_DECISION'  # column the windows are applied to

max_value = 360  # largest generated window, in days
interval = 360   # step between generated windows, in days

# 90 and 180 days are listed by hand because they do not fall on the interval grid
list_value = [90, 180]
list_value.extend(range(0, max_value + 1, interval))
list_value.remove(0)  # the range starts at 0, which is not a window

def case_when_flags_days(list_value, column_name):
    """Build the SQL ``CASE WHEN`` flag expressions for a list of day windows.

    For every window ``d`` in ``list_value`` one expression is generated:
    ``CASE WHEN <column_name> >= -d THEN 1 ELSE 0 END AS ultimos_<d/30>_meses``
    where ``<d/30>`` is ``d / 30`` truncated to an integer.

    Args:
        list_value: iterable of window sizes in days.
        column_name: name of the column compared against ``-d``.

    Returns:
        A two-element list ``[sql_case, list_columns_flag]``: the expressions
        joined with ``", "`` (an empty string when ``list_value`` is empty) and
        the list of generated flag column names, in input order.
    """
    list_columns_flag = []
    case_expressions = []
    for days in list_value:
        flag_name = "ultimos_" + str(int(days / 30)) + "_meses"
        case_expressions.append(
            "CASE WHEN " + column_name + " >= -" + str(days) + " THEN 1 ELSE 0 END AS " + flag_name
        )
        list_columns_flag.append(flag_name)

    # Join by position instead of comparing each value with the last one:
    # a value equal to the last element but appearing earlier in the list
    # used to lose its separator and produce invalid SQL.
    sql_case = ", ".join(case_expressions)

    return [sql_case, list_columns_flag]

# Generate the flag expressions once, then unpack the [sql, column names] pair
list_case = case_when_flags_days(list_value, column_name)
list_case_query, list_flags_columns = list_case

In [9]:
# Inspect the generated SQL fragment and the names of the flag columns it creates
print(list_case_query)
print(list_flags_columns)

CASE WHEN DAYS_DECISION >= -90 THEN 1 ELSE 0 END AS ultimos_3_meses, CASE WHEN DAYS_DECISION >= -180 THEN 1 ELSE 0 END AS ultimos_6_meses, CASE WHEN DAYS_DECISION >= -360 THEN 1 ELSE 0 END AS ultimos_12_meses
['ultimos_3_meses', 'ultimos_6_meses', 'ultimos_12_meses']


In [10]:
# Register the source DataFrame so the SQL below can refer to it by name
data_pa.createOrReplaceTempView("data_pa")

# SQL query with the time view: every original column plus the window flags
spark_query = f"""
SELECT *, {list_case_query}
    FROM data_pa
    ORDER BY SK_ID_PREV
"""

df_tmp_01 = spark.sql(spark_query)
df_tmp_01.createOrReplaceTempView("df_tmp_01")

In [11]:
#df_tmp_01.columns

In [12]:
#df_tmp_01.show(10)

# Join das bases:
- Pos Cash
- Credit Card Balance
- Installments Payments

Bases já prontas, contendo apenas as variáveis selecionadas para a próxima etapa.

In [13]:
# Load the three pre-aggregated bases from dir_base; each CSV has a header row
pos_cash, credit_card_balance, installments = (
    spark.read.csv(dir_base + csv_name, header=True)
    for csv_name in (
        "pos_cash_balance_agg.csv",
        "credit_card_balance_agg.csv",
        "installments_agg.csv",
    )
)

In [14]:
# Preview the first 2 rows of the POS cash base
pos_cash.show(2)

+----------+----------------------------------------------------------+----------------------------------------------------------+----------------------------------------------------------+----------------------------------------------------------+-----------------------------------------------------------------+-----------------------------------------------------------------+-----------------------------------------------------------------+-----------------------------------------------------------------+----------------------------------------------------------------+----------------------------------------------------------------+----------------------------------------------------------------+----------------------------------------------------------------+--------------------------------------------------+--------------------------------------------------+------------------------------------------------------+------------------------------------------------------+----------------

In [15]:
# Preview the first 2 rows of the credit card balance base
credit_card_balance.show(2)

+----------+----------------------------------+----------------------------------+----------------------------------+----------------------------------+----------------------------------------------+----------------------------------------------+----------------------------------------------+----------------------------------------------+-----------------------------------------------+-----------------------------------------------+-----------------------------------------------+-----------------------------------------------+-------------------------------------------+-------------------------------------------+-------------------------------------------+-------------------------------------------+-------------------------------------------------+-------------------------------------------------+-------------------------------------------------+-------------------------------------------------+-----------------------------------------------+--------------------------------------------

In [None]:
# Preview the first 2 rows of the installments base
installments.show(2)

+----------+----------------------------------------------------------+----------------------------------------------------------+----------------------------------------------------------+----------------------------------------------------------+---------------------------------------------------------+---------------------------------------------------------+---------------------------------------------------------+---------------------------------------------------------+---------------------------------------------------+---------------------------------------------------+------------------------------------------------------+------------------------------------------------------+--------------------------------------------------+--------------------------------------------------+--------------------------------------------------+--------------------------------------------------+-----------------------------------------------+-----------------------------------------------+------

In [16]:
# Left-join the three aggregated bases onto the flagged applications,
# one after another, always matching on SK_ID_PREV
df_tmp_02 = df_tmp_01
for agg_base in (pos_cash, credit_card_balance, installments):
    df_tmp_02 = df_tmp_02.join(agg_base, "SK_ID_PREV", how="left")

In [17]:
# Number of columns after the joins
len(df_tmp_02.columns)

652

In [None]:
# Preview the first 2 rows of the joined data
df_tmp_02.show(2)

+----------+----------+------------------+-----------+---------------+----------+----------------+---------------+--------------------------+-----------------------+---------------------------+----------------------+-----------------+---------------------+------------------------+----------------------+--------------------+-------------+--------------------+------------------+---------------+----------------+-------------------+--------------+-----------------+--------------------+----------------+--------------------+-----------+----------------+-------------------+------------------+--------------+-------------------------+-------------+----------------+-------------------------+---------------+---------------+----------------+----------------------------------------------------------+----------------------------------------------------------+----------------------------------------------------------+----------------------------------------------------------+-------------------------

# Sumarizar na visão cliente (automatizada)

In [18]:
from pyspark.sql.functions import col, round, sum, avg, max, min, when

In [None]:
# Aggregate df_tmp_02 to one row per customer (SK_ID_CURR), using only the rows
# selected by the first time-window flag (list_flags_columns[0]).

# Columns that are left out of the aggregation
columns_agg_total_remove = [
    'SK_ID_CURR', 'SK_ID_PREV', 'DAYS_DECISION', 'NAME_CONTRACT_STATUS', 'WEEKDAY_APPR_PROCESS_START',
    'NAME_CASH_LOAN_PURPOSE', 'NAME_PAYMENT_TYPE', 'CODE_REJECT_REASON', 'NAME_TYPE_SUITE', 'NAME_CLIENT_TYPE',
    'NAME_GOODS_CATEGORY', 'NAME_PORTFOLIO', 'NAME_PRODUCT_TYPE', 'CHANNEL_TYPE', 'SELLERPLACE_AREA',
    'NAME_SELLER_INDUSTRY', 'CNT_PAYMENT', 'NAME_YIELD_GROUP', 'PRODUCT_COMBINATION',
]

# Every remaining column is aggregated (loop variable deliberately not named
# `col`, which is the pyspark function used below)
columns_agg_total = [name for name in df_tmp_02.columns if name not in columns_agg_total_remove]

# The CSVs are read without a schema, so their columns arrive as strings.
# max/min on a string column compare text ("9" > "10"), so string columns are
# cast to double before aggregating. sum/avg already coerce strings to double,
# so their results are unchanged; non-numeric text becomes NULL either way.
string_columns = {name for name, dtype in df_tmp_02.dtypes if dtype == 'string'}

# If a single run over all flags is too slow, replicate this cell and run it per flag
flag = list_flags_columns[0]

suffix = "_PREVIOUS_APPLICATION"

expressions_agg = []
for column in columns_agg_total:
    value = col(column).cast("double") if column in string_columns else col(column)
    # Rows outside the window produce NULL (no otherwise()), which the aggregates ignore
    flagged_value = when(col(flag) == 1, value)

    if "DPD" in column or "DAY" in column:
        # Day-type columns: only max and min, prefixed QT_
        stats = (("QT_MAX", max), ("QT_MIN", min))
    else:
        # Everything else: total, mean, max and min, prefixed VL_
        stats = (("VL_TOT", sum), ("VL_MED", avg), ("VL_MAX", max), ("VL_MIN", min))

    for prefix, agg_function in stats:
        expressions_agg.append(
            round(agg_function(flagged_value), 2).alias(f'{prefix}_{column.upper()}_{flag.upper()}{suffix}')
        )

expressions_agg = tuple(expressions_agg)

# Apply the aggregation expressions, one output row per SK_ID_CURR
df_tmp_03 = df_tmp_02.groupBy("SK_ID_CURR").agg(*expressions_agg).orderBy("SK_ID_CURR")

In [None]:
#expressions_agg

In [None]:
# Number of columns produced by the aggregation (including SK_ID_CURR)
len(df_tmp_03.columns)

2401

In [None]:
#exibir o dados resultantes
#df_tmp_03.show(10)

In [None]:
#df_tmp_03 = df_tmp_03.repartition(1)
#df_tmp_03.write.mode("overwrite").csv(dir_base + "previous_application_agg_1.csv",header=True)

In [None]:
# Save the first aggregation as Parquet under dir_base, replacing any previous output.
# (A df_tmp_03.repartition(1) beforehand would write from a single partition.)
df_tmp_03.write.parquet(dir_base + "previous_application_agg_1.parquet", mode="overwrite")

In [19]:
# Flag - last 6 months
# Aggregate df_tmp_02 to one row per customer (SK_ID_CURR), using only the rows
# selected by the second time-window flag (list_flags_columns[1]).

# Columns that are left out of the aggregation
columns_agg_total_remove = [
    'SK_ID_CURR', 'SK_ID_PREV', 'DAYS_DECISION', 'NAME_CONTRACT_STATUS', 'WEEKDAY_APPR_PROCESS_START',
    'NAME_CASH_LOAN_PURPOSE', 'NAME_PAYMENT_TYPE', 'CODE_REJECT_REASON', 'NAME_TYPE_SUITE', 'NAME_CLIENT_TYPE',
    'NAME_GOODS_CATEGORY', 'NAME_PORTFOLIO', 'NAME_PRODUCT_TYPE', 'CHANNEL_TYPE', 'SELLERPLACE_AREA',
    'NAME_SELLER_INDUSTRY', 'CNT_PAYMENT', 'NAME_YIELD_GROUP', 'PRODUCT_COMBINATION',
]

# Every remaining column is aggregated (loop variable deliberately not named
# `col`, which is the pyspark function used below)
columns_agg_total = [name for name in df_tmp_02.columns if name not in columns_agg_total_remove]

# The CSVs are read without a schema, so their columns arrive as strings.
# max/min on a string column compare text ("9" > "10"), so string columns are
# cast to double before aggregating. sum/avg already coerce strings to double,
# so their results are unchanged; non-numeric text becomes NULL either way.
string_columns = {name for name, dtype in df_tmp_02.dtypes if dtype == 'string'}

# One flag per cell keeps each Spark job smaller
flag = list_flags_columns[1]

suffix = "_PREVIOUS_APPLICATION"

expressions_agg = []
for column in columns_agg_total:
    value = col(column).cast("double") if column in string_columns else col(column)
    # Rows outside the window produce NULL (no otherwise()), which the aggregates ignore
    flagged_value = when(col(flag) == 1, value)

    if "DPD" in column or "DAY" in column:
        # Day-type columns: only max and min, prefixed QT_
        stats = (("QT_MAX", max), ("QT_MIN", min))
    else:
        # Everything else: total, mean, max and min, prefixed VL_
        stats = (("VL_TOT", sum), ("VL_MED", avg), ("VL_MAX", max), ("VL_MIN", min))

    for prefix, agg_function in stats:
        expressions_agg.append(
            round(agg_function(flagged_value), 2).alias(f'{prefix}_{column.upper()}_{flag.upper()}{suffix}')
        )

expressions_agg = tuple(expressions_agg)

# Apply the aggregation expressions, one output row per SK_ID_CURR
df_tmp_04 = df_tmp_02.groupBy("SK_ID_CURR").agg(*expressions_agg).orderBy("SK_ID_CURR")

In [20]:
# Save the second aggregation as Parquet under dir_base, replacing any previous output.
# (A df_tmp_04.repartition(1) beforehand would write from a single partition.)
df_tmp_04.write.parquet(dir_base + "previous_application_agg_2.parquet", mode="overwrite")

In [21]:
# Flag - last 12 months
# Aggregate df_tmp_02 to one row per customer (SK_ID_CURR), using only the rows
# selected by the third time-window flag (list_flags_columns[2]).

# Columns that are left out of the aggregation
columns_agg_total_remove = [
    'SK_ID_CURR', 'SK_ID_PREV', 'DAYS_DECISION', 'NAME_CONTRACT_STATUS', 'WEEKDAY_APPR_PROCESS_START',
    'NAME_CASH_LOAN_PURPOSE', 'NAME_PAYMENT_TYPE', 'CODE_REJECT_REASON', 'NAME_TYPE_SUITE', 'NAME_CLIENT_TYPE',
    'NAME_GOODS_CATEGORY', 'NAME_PORTFOLIO', 'NAME_PRODUCT_TYPE', 'CHANNEL_TYPE', 'SELLERPLACE_AREA',
    'NAME_SELLER_INDUSTRY', 'CNT_PAYMENT', 'NAME_YIELD_GROUP', 'PRODUCT_COMBINATION',
]

# Every remaining column is aggregated (loop variable deliberately not named
# `col`, which is the pyspark function used below)
columns_agg_total = [name for name in df_tmp_02.columns if name not in columns_agg_total_remove]

# The CSVs are read without a schema, so their columns arrive as strings.
# max/min on a string column compare text ("9" > "10"), so string columns are
# cast to double before aggregating. sum/avg already coerce strings to double,
# so their results are unchanged; non-numeric text becomes NULL either way.
string_columns = {name for name, dtype in df_tmp_02.dtypes if dtype == 'string'}

# One flag per cell keeps each Spark job smaller
flag = list_flags_columns[2]

suffix = "_PREVIOUS_APPLICATION"

expressions_agg = []
for column in columns_agg_total:
    value = col(column).cast("double") if column in string_columns else col(column)
    # Rows outside the window produce NULL (no otherwise()), which the aggregates ignore
    flagged_value = when(col(flag) == 1, value)

    if "DPD" in column or "DAY" in column:
        # Day-type columns: only max and min, prefixed QT_
        stats = (("QT_MAX", max), ("QT_MIN", min))
    else:
        # Everything else: total, mean, max and min, prefixed VL_
        stats = (("VL_TOT", sum), ("VL_MED", avg), ("VL_MAX", max), ("VL_MIN", min))

    for prefix, agg_function in stats:
        expressions_agg.append(
            round(agg_function(flagged_value), 2).alias(f'{prefix}_{column.upper()}_{flag.upper()}{suffix}')
        )

expressions_agg = tuple(expressions_agg)

# Apply the aggregation expressions, one output row per SK_ID_CURR
df_tmp_05 = df_tmp_02.groupBy("SK_ID_CURR").agg(*expressions_agg).orderBy("SK_ID_CURR")

In [22]:
# Save the third aggregation as Parquet under dir_base, replacing any previous output.
# (A df_tmp_05.repartition(1) beforehand would write from a single partition.)
df_tmp_05.write.parquet(dir_base + "previous_application_agg_3.parquet", mode="overwrite")

In [None]:
"""
Visão temporal não utilizada

# Flag - Ultimo 24 meses
# Definir as colunas para agregação
columns_agg_total = df_tmp_02.columns

columns_agg_total_remove = ['SK_ID_CURR','SK_ID_PREV','DAYS_DECISION','NAME_CONTRACT_STATUS','WEEKDAY_APPR_PROCESS_START','NAME_CASH_LOAN_PURPOSE','NAME_CONTRACT_STATUS','DAYS_DECISION',
                     'NAME_PAYMENT_TYPE','CODE_REJECT_REASON','NAME_TYPE_SUITE','NAME_CLIENT_TYPE','NAME_GOODS_CATEGORY','NAME_PORTFOLIO','NAME_PRODUCT_TYPE','CHANNEL_TYPE',
                     'SELLERPLACE_AREA','NAME_SELLER_INDUSTRY','CNT_PAYMENT','NAME_YIELD_GROUP','PRODUCT_COMBINATION']


columns_agg_total = [col for col in columns_agg_total if col not in columns_agg_total_remove]

expressions_agg = []

# Caso fique lento replique o código e rode por flags - Ultimo 6 meses
flag = list_flags_columns[3]

suffix = "_PREVIOUS_APPLICATION"

#for flag in list_flags_columns:
for column in columns_agg_total:
  if "DPD" in column or "DAY" in column:
    expressions_agg.append(round(max(when(col(flag)==1, col(column))), 2).alias(f'QT_MAX_{column.upper()}_{flag.upper()}{suffix}'))
    expressions_agg.append(round(min(when(col(flag)==1, col(column))), 2).alias(f'QT_MIN_{column.upper()}_{flag.upper()}{suffix}'))
  else:
    expressions_agg.append(round(sum(when(col(flag)==1, col(column))), 2).alias(f'VL_TOT_{column.upper()}_{flag.upper()}{suffix}'))
    expressions_agg.append(round(avg(when(col(flag)==1, col(column))), 2).alias(f'VL_MED_{column.upper()}_{flag.upper()}{suffix}'))
    expressions_agg.append(round(max(when(col(flag)==1, col(column))), 2).alias(f'VL_MAX_{column.upper()}_{flag.upper()}{suffix}'))
    expressions_agg.append(round(min(when(col(flag)==1, col(column))), 2).alias(f'VL_MIN_{column.upper()}_{flag.upper()}{suffix}'))

expressions_agg = tuple(expressions_agg)

#print(expressions_agg)

#aplicar as expressões de agregação
df_tmp_06 = df_tmp_02.groupBy("SK_ID_CURR").agg(*expressions_agg).orderBy("SK_ID_CURR")

"""

In [None]:
"""
Visão temporal não utilizada

##df_tmp_06 = df_tmp_06.repartition(1)
df_tmp_06.write.mode("overwrite").parquet(dir_base + "previous_application_agg_4.parquet")

"""

In [None]:
# Marker printed once every cell above has finished running
print("Fim")