# Setup do ambiente para utilização do Spark

In [1]:
# Importando a biblioteca os
import os
import sys

In [2]:
# Instalação e configuração de variaveis ambiente para utilizar Google Drive
# Se utilizar o Colab ajustar para True para instalação dos pre-requisitos
colab = True

if colab==True:
    from google.colab import drive
    drive.mount('/content/drive')

    # Instalação de requisitos
    !apt-get update # Update apt-get repository.
    !apt-get install openjdk-8-jdk-headless -qq > /dev/null # Install Java.
    !wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz # Download Apache Sparks.
    !tar xf spark-3.1.1-bin-hadoop3.2.tgz # Unzip the tgz file.
    !pip install -q findspark # Install findspark. Adds PySpark to the System path during runtime.

    # Definindo a variável de ambiente do Java
    os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
    # Definindo a variável de ambiente do Spark
    #os.environ["SPARK_HOME"] = "/content/spark-3.1.2-bin-hadoop2.7"
    os.environ["SPARK_HOME"] = "/content/spark-3.1.1-bin-hadoop3.2"

Mounted at /content/drive
Get:1 http://security.ubuntu.com/ubuntu jammy-security InRelease [110 kB]
Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Get:3 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [119 kB]
Hit:6 https://ppa.launchpadcontent.net/c2d4u.team/c2d4u4.0+/ubuntu jammy InRelease
Hit:7 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:8 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:9 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:11 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 Packages [1,920 kB]
Get:12 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 Packages [1,357 kB]
Fetched 3,510 kB in 2s (2,207 kB/s)
R

In [3]:
# Configuration variables
# Base directory of the data files

# Local PC
#dir_base = "data/"
# Google Drive
dir_base = "/content/drive/MyDrive/jupyter/pcd_0124_analise_de_credito/data/"

In [4]:
# Library imports
import findspark

# Point Spark's driver and workers at the interpreter running this notebook.
# These variables are read when the Spark context starts, so they must be set
# before the session is created (setting them afterwards has no effect on it).
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

# Initialise findspark so that pyspark becomes importable
findspark.init()

from pyspark.sql import SparkSession
spark = SparkSession \
    .builder \
    .appName("Análise de Crédito - Installments") \
    .getOrCreate()

# Leitura dos dados

In [5]:
# Read the data file. inferSchema=True makes Spark detect numeric column types;
# without it every CSV column is loaded as a string, and the max/min
# aggregations computed later would compare text instead of numbers.
data_cc = spark.read.csv(dir_base + "installments_payments.csv", header=True, inferSchema=True)

In [6]:
# Preview the first rows of the loaded data
data_cc.show()

+----------+----------+----------------------+---------------------+---------------+------------------+--------------+-----------+
|SK_ID_PREV|SK_ID_CURR|NUM_INSTALMENT_VERSION|NUM_INSTALMENT_NUMBER|DAYS_INSTALMENT|DAYS_ENTRY_PAYMENT|AMT_INSTALMENT|AMT_PAYMENT|
+----------+----------+----------------------+---------------------+---------------+------------------+--------------+-----------+
|   1054186|    161674|                   1.0|                    6|        -1180.0|           -1187.0|       6948.36|    6948.36|
|   1330831|    151639|                   0.0|                   34|        -2156.0|           -2156.0|      1716.525|   1716.525|
|   2085231|    193053|                   2.0|                    1|          -63.0|             -63.0|       25425.0|    25425.0|
|   2452527|    199697|                   1.0|                    3|        -2418.0|           -2426.0|      24350.13|   24350.13|
|   2714724|    167756|                   1.0|                    2|        -1383.0

# Criação de flags para auxílio na visão temporal dos dados

In [7]:
# Build the list of look-back windows (in days) used to create the time flags
max_value = 360  # upper bound; DAYS_INSTALMENT is expressed in days
interval = 360  # step between generated windows, in days
list_value = [90, 180]  # windows added up front, outside the regular interval

# Add every multiple of the interval up to max_value, then drop the zero-day window
list_value.extend(range(0, max_value + 1, interval))
list_value.remove(0)
column_name = 'DAYS_INSTALMENT'

def case_when_flags_days(list_value, column_name):
    """Build the SQL CASE expressions that flag rows inside each look-back window.

    For every window ``d`` (in days) in ``list_value`` one expression of the form
    ``CASE WHEN <column_name> >= -d THEN 1 ELSE 0 END AS ultimos_<d/30>_meses``
    is generated.

    Args:
        list_value: Sequence of window sizes in days.
        column_name: Name of the column compared against each window.

    Returns:
        A two-element list ``[sql_fragment, flag_column_names]``:
        ``sql_fragment`` holds the CASE expressions joined by ", " and
        ``flag_column_names`` holds the generated aliases in the same order.
        An empty ``list_value`` yields ``["", []]``.
    """
    list_columns_flag = []
    case_clauses = []
    for days in list_value:
        flag_name = "ultimos_" + str(int(days / 30)) + "_meses"
        case_clauses.append(
            "CASE WHEN " + column_name + " >= -" + str(days)
            + " THEN 1 ELSE 0 END AS " + flag_name
        )
        list_columns_flag.append(flag_name)

    # Join by position rather than by comparing each value with the last one:
    # the value comparison dropped a separator whenever an earlier window
    # repeated the last window's value, which produced invalid SQL.
    sql_case = ", ".join(case_clauses)

    return [sql_case, list_columns_flag]

# Generate the CASE expressions, then keep the SQL text and the flag names separately
list_case = case_when_flags_days(list_value, column_name)
list_case_query, list_flags_columns = list_case

print(list_case_query)

CASE WHEN DAYS_INSTALMENT >= -90 THEN 1 ELSE 0 END AS ultimos_3_meses, CASE WHEN DAYS_INSTALMENT >= -180 THEN 1 ELSE 0 END AS ultimos_6_meses, CASE WHEN DAYS_INSTALMENT >= -360 THEN 1 ELSE 0 END AS ultimos_12_meses


In [8]:
# Register the raw data so Spark SQL can reference it by name
data_cc.createOrReplaceTempView("data_cc")

# SQL query that appends the time-window flags to every row
spark_query = f"""
SELECT *, {list_case_query} FROM data_cc
ORDER BY `SK_ID_PREV`
"""

df_tmp_01 = spark.sql(spark_query)
df_tmp_01.createOrReplaceTempView("df_temp_01")
df_tmp_01.show()

+----------+----------+----------------------+---------------------+---------------+------------------+--------------+-----------+---------------+---------------+----------------+
|SK_ID_PREV|SK_ID_CURR|NUM_INSTALMENT_VERSION|NUM_INSTALMENT_NUMBER|DAYS_INSTALMENT|DAYS_ENTRY_PAYMENT|AMT_INSTALMENT|AMT_PAYMENT|ultimos_3_meses|ultimos_6_meses|ultimos_12_meses|
+----------+----------+----------------------+---------------------+---------------+------------------+--------------+-----------+---------------+---------------+----------------+
|   1000001|    158271|                   1.0|                    1|         -268.0|            -294.0|       6404.31|    6404.31|              0|              0|               1|
|   1000001|    158271|                   2.0|                    2|         -238.0|            -244.0|     62039.115|  62039.115|              0|              0|               1|
|   1000002|    101962|                   2.0|                    4|        -1510.0|           -1554

# Sumarizar na visão cliente (Automatizada)

In [9]:
from pyspark.sql.functions import col, round, sum, avg, max, min, when

In [10]:
# Columns to aggregate: every column except the two identifier columns
columns_agg_total = df_tmp_01.columns
columns_agg_total.remove('SK_ID_CURR')
columns_agg_total.remove('SK_ID_PREV')
# NOTE(review): this list still contains the flag columns themselves, so they
# are aggregated as well — confirm that is intended.

columns_flags = list_flags_columns

expressions_agg = []

# round/sum/avg/max/min below are the pyspark.sql.functions versions imported
# in the previous cell, not the Python builtins.
for flag in columns_flags:
    for column in columns_agg_total:
        # Keep the value only for rows inside the flag's window; other rows
        # yield NULL, which the aggregate functions skip.
        in_window = when(col(flag)==1, col(column))
        suffix = f"{column.upper()}_{flag.upper()}_INSTALLMENTS"
        if "DAY" in column:
            # Day-offset columns: only the extremes are computed.
            expressions_agg.append(round(max(in_window),2).alias(f"QT_MAX_{suffix}"))
            # Fix: QT_MIN was built with max(), making it a copy of QT_MAX.
            expressions_agg.append(round(min(in_window),2).alias(f"QT_MIN_{suffix}"))
        else:
            expressions_agg.append(round(sum(in_window),2).alias(f"QT_TOT_{suffix}"))
            expressions_agg.append(round(avg(in_window),2).alias(f"QT_MED_{suffix}"))
            expressions_agg.append(round(max(in_window),2).alias(f"QT_MAX_{suffix}"))
            expressions_agg.append(round(min(in_window),2).alias(f"QT_MIN_{suffix}"))

expressions_agg = tuple(expressions_agg)
# One output row per SK_ID_PREV with all windowed aggregates as columns
df_tmp_02 = df_tmp_01.groupBy("SK_ID_PREV").agg(*expressions_agg)

In [11]:
# Preview the first rows of the aggregated table
df_tmp_02.show()

+----------+----------------------------------------------------------+----------------------------------------------------------+----------------------------------------------------------+----------------------------------------------------------+---------------------------------------------------------+---------------------------------------------------------+---------------------------------------------------------+---------------------------------------------------------+---------------------------------------------------+---------------------------------------------------+------------------------------------------------------+------------------------------------------------------+--------------------------------------------------+--------------------------------------------------+--------------------------------------------------+--------------------------------------------------+-----------------------------------------------+-----------------------------------------------+------

# Salvar a tabela sumarizada

In [12]:
# Collapse to one partition before saving (Spark writes one output file per partition)
df_tmp_02 = df_tmp_02.repartition(1)
csv_writer = df_tmp_02.write.mode("overwrite")
csv_writer.csv(dir_base + "installments_agg.csv", header=True)