## Funções auxiliares

In [1]:
import pandas as pd
import numpy as np
import random
import gc

# Seed Python's built-in `random` module so any use of it is reproducible.
# This does not seed NumPy or pandas; `amostragem` below passes its own
# `random_state` to `DataFrame.sample`.
random.seed(123)

# Metadata describing the columns of a dataset
def pod_academy_generate_metadata(dataframe):
    """
    Build a per-column metadata table for ``dataframe``.

    :param dataframe: pandas DataFrame whose columns will be described.
    :return: DataFrame with one row per input column and the columns
        ``nome_variavel`` (column name), ``tipo`` (dtype), ``qt_nulos``
        (null count), ``percent_nulos`` (null percentage, rounded to two
        decimals) and ``cardinalidade`` (``nunique()`` of the column),
        sorted by ``percent_nulos`` descending with a fresh 0..n-1 index.
    """

    # Count the nulls once and reuse the result for both null-related columns.
    qt_nulos = dataframe.isnull().sum()

    metadata = pd.DataFrame({
        'nome_variavel': dataframe.columns,
        'tipo': dataframe.dtypes,
        'qt_nulos': qt_nulos,
        # Use Series.round rather than the builtin round(): a later cell of this
        # notebook runs `from pyspark.sql.functions import round`, which rebinds
        # the global name and would make a builtin-style call fail on a Series.
        'percent_nulos': ((qt_nulos / len(dataframe)) * 100).round(2),
        'cardinalidade': dataframe.nunique(),
    })
    metadata = metadata.sort_values(by='percent_nulos', ascending=False)
    metadata = metadata.reset_index(drop=True)

    return metadata

def remove_highly_correlated_features(df, threshold):
  """
  Drop one column out of every pair whose absolute correlation exceeds ``threshold``.

  Only numeric and boolean columns take part in the correlation; any other
  column (strings, dates, ...) is left untouched in the result instead of
  making ``DataFrame.corr`` raise on recent pandas versions.

  :param df: pandas DataFrame to reduce.
  :param threshold: absolute correlation above which a column is dropped.
  :return: tuple ``(df_reduced, to_drop)`` - the DataFrame without the dropped
      columns and the list of dropped column names.
  """
  # Absolute correlation matrix of the columns that can be correlated
  corr_matrix = df.select_dtypes(include=['number', 'bool']).corr().abs()

  # Keep only the strict upper triangle so each pair is inspected once and the
  # later column of a correlated pair is the one that gets dropped.
  upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
  upper_triangle = corr_matrix.where(upper_mask)

  # A column is dropped when it correlates above the threshold with any earlier
  # column (NaN entries compare as False and are therefore ignored).
  to_drop = [column for column in upper_triangle.columns if (upper_triangle[column] > threshold).any()]

  df_reduced = df.drop(columns=to_drop)

  return df_reduced, to_drop

def amostragem(df,tamanho_amostra,random_state=42):
  """
  Return a reproducible random sample of rows from ``df``.

  :param df: pandas DataFrame to sample from.
  :param tamanho_amostra: number of rows requested. When it exceeds the number
      of rows available, every row is returned (in sampled order) instead of
      raising, since sampling here is done without replacement.
  :param random_state: seed forwarded to ``DataFrame.sample``; defaults to 42.
  :return: DataFrame holding the sampled rows.
  """
  # Cap the request at the frame size; None is passed through unchanged.
  n = tamanho_amostra if tamanho_amostra is None else min(tamanho_amostra, len(df))
  return df.sample(n=n, random_state=random_state)

def vars_selection_previous_app(df,percentual_preenchimento,threshold,tamanho_amostragem,chave_principal):
  """
  Select the columns of ``df`` worth keeping, judged on a random sample.

  Steps: drop the key column(s), sample the rows, keep the columns whose null
  percentage in the sample is at most ``percentual_preenchimento``, remove
  highly correlated columns among those, and return ``df`` restricted to the
  surviving columns followed by the key column(s).

  :param df: pandas DataFrame to filter.
  :param percentual_preenchimento: maximum null percentage (0-100) a column may
      have in the sample to be kept.
  :param threshold: absolute-correlation threshold forwarded to
      ``remove_highly_correlated_features``.
  :param tamanho_amostragem: number of rows to sample.
  :param chave_principal: key column label, or a list of labels; excluded from
      the analysis and appended back to the selection.
  :return: ``df`` restricted to the selected columns plus the key column(s).
  """
  # `drop` accepts either a label or a list, but the selection below needs a flat
  # list of labels; appending a list as-is would nest it and break `df[...]`.
  chaves = chave_principal if isinstance(chave_principal, list) else [chave_principal]

  df_aux = df.drop(columns = chave_principal)
  amostra = amostragem(df_aux, tamanho_amostragem)
  metadata_df = pod_academy_generate_metadata(amostra)

  colunas_preenchidas = metadata_df.loc[metadata_df.percent_nulos <= percentual_preenchimento, 'nome_variavel'].tolist()
  df_reduced, _ = remove_highly_correlated_features(amostra[colunas_preenchidas], threshold=threshold)

  vars_selected = df_reduced.columns.to_list() + list(chaves)
  df_selected = df[vars_selected]
  return df_selected

## Configuração do ambiente para utilização do Spark

In [2]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

# Fazendo download
!wget -q https://archive.apache.org/dist/spark/spark-3.1.2/spark-3.1.2-bin-hadoop2.7.tgz

# Descompactando os arquivos
!tar xf spark-3.1.2-bin-hadoop2.7.tgz

# Importando a biblioteca os
import os

# Definindo a variável de ambiente do Java
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

# Definindo a variável de ambiente do Spark
os.environ["SPARK_HOME"] = "/content/spark-3.1.2-bin-hadoop2.7"


# instalando a findspark
!pip install -q findspark

# Importando a findspark
import findspark

# Iniciando o findspark
findspark.init()

from pyspark.sql import SparkSession
spark = SparkSession \
        .builder \
        .appName("Minha Primeira Aplicação no Pyspark") \
        .getOrCreate()

import os
import sys

os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

## Leitura dos dados

In [3]:
# Read the previous-application CSV file.
# `inferSchema=True` lets Spark detect numeric column types; without it every
# column is loaded as a string, and the max/min aggregations computed further
# down would compare values lexicographically (e.g. "9" > "10000").
dados = spark.read.csv("/content/drive/MyDrive/bases_dados_projeto_credito/previous_application.csv",header=True,inferSchema=True)

# Preview the first rows
dados.show()

+----------+----------+------------------+-----------+---------------+----------+----------------+---------------+--------------------------+-----------------------+---------------------------+----------------------+-------------------+---------------------+------------------------+----------------------+--------------------+-------------+--------------------+------------------+---------------+----------------+--------------------+--------------+-----------------+--------------------+----------------+--------------------+-----------+----------------+--------------------+------------------+--------------+-------------------------+-------------+----------------+-------------------------+
|SK_ID_PREV|SK_ID_CURR|NAME_CONTRACT_TYPE|AMT_ANNUITY|AMT_APPLICATION|AMT_CREDIT|AMT_DOWN_PAYMENT|AMT_GOODS_PRICE|WEEKDAY_APPR_PROCESS_START|HOUR_APPR_PROCESS_START|FLAG_LAST_APPL_PER_CONTRACT|NFLAG_LAST_APPL_IN_DAY|  RATE_DOWN_PAYMENT|RATE_INTEREST_PRIMARY|RATE_INTEREST_PRIVILEGED|NAME_CASH_LOAN_PURPOSE

In [4]:
import re

# Bring the distinct values of the 'NAME_CONTRACT_STATUS' column to the driver
# as a plain Python list (one value per distinct row).
status = [linha[0] for linha in dados.select("NAME_CONTRACT_STATUS").distinct().collect()]

# Word-normalisation helper
def tratamento_palavra(palavra):
    """
    Normalise a label: every single whitespace character becomes ``_`` and the
    result is upper-cased.

    :param palavra: text to normalise.
    :return: the normalised text.
    """
    pedacos = re.split(r'\s', palavra)
    return '_'.join(pedacos).upper()

# Normalise every distinct status value
nova_lista = list(map(tratamento_palavra, status))

# Show the resulting list
print(nova_lista)

['APPROVED', 'UNUSED_OFFER', 'CANCELED', 'REFUSED']


## Criação de flags para nos auxiliar na visão temporal dos dados

In [5]:
# Register `dados` as a temporary view so it can be queried with Spark SQL.
dados.createOrReplaceTempView("dados")

# Add one 0/1 flag column per look-back window: the flag is 1 when DAYS_DECISION
# is >= -90 / -180 / -360 / -720 / -1080 (named "last 3/6/12/24/36 months") and 0
# otherwise, which includes rows where DAYS_DECISION is NULL (ELSE branch).
# The windows are cumulative: a row flagged for 3 months is also flagged for all
# the longer ones. The result is sorted by SK_ID_PREV.
# NOTE(review): presumably DAYS_DECISION counts days relative to the current
# application, negative meaning the past - confirm against the data dictionary.
df_temp_01 = spark.sql("""
SELECT
    *,
      CASE
        WHEN DAYS_DECISION >= -90 THEN 1
        ELSE 0
    END AS ultimos_3_meses,
    CASE
        WHEN DAYS_DECISION >= -180 THEN 1
        ELSE 0
    END AS ultimos_6_meses,
    CASE
        WHEN DAYS_DECISION >= -360 THEN 1
        ELSE 0
    END AS ultimos_12_meses,
    CASE
        WHEN DAYS_DECISION >= -720 THEN 1
        ELSE 0
    END AS ultimos_24_meses,
    CASE
        WHEN DAYS_DECISION >= -1080 THEN 1
        ELSE 0
    END AS ultimos_36_meses
FROM dados
ORDER BY `SK_ID_PREV`;
""")
# Expose the flagged frame as a view too, then preview it.
df_temp_01.createOrReplaceTempView("df_temp_01")
df_temp_01.show()

+----------+----------+------------------+-----------+---------------+----------+----------------+---------------+--------------------------+-----------------------+---------------------------+----------------------+--------------------+---------------------+------------------------+----------------------+--------------------+-------------+--------------------+------------------+---------------+----------------+--------------------+--------------+-----------------+----------------+----------------+--------------------+-----------+----------------+--------------------+------------------+--------------+-------------------------+-------------+----------------+-------------------------+---------------+---------------+----------------+----------------+----------------+
|SK_ID_PREV|SK_ID_CURR|NAME_CONTRACT_TYPE|AMT_ANNUITY|AMT_APPLICATION|AMT_CREDIT|AMT_DOWN_PAYMENT|AMT_GOODS_PRICE|WEEKDAY_APPR_PROCESS_START|HOUR_APPR_PROCESS_START|FLAG_LAST_APPL_PER_CONTRACT|NFLAG_LAST_APPL_IN_DAY|   RATE_D

## Join com bases (Pos Cash / Credit Card Balance / Installments Payments) contendo apenas as variáveis selecionadas

In [6]:
# Load the three pre-aggregated bases (selected variables only).
# `inferSchema=True` lets Spark detect numeric column types; without it every
# column is loaded as a string, and max/min aggregations over these columns
# would compare values lexicographically instead of numerically.
pos_cash = spark.read.csv("/content/drive/MyDrive/bases_dados_projeto_credito/bases_tratadas/variaveis_selecionadas/pos_cash_agg_selected.csv",header=True,inferSchema=True)
credit_card_balance = spark.read.csv("/content/drive/MyDrive/bases_dados_projeto_credito/bases_tratadas/variaveis_selecionadas/credit_card_balance_agg_selected.csv",header=True,inferSchema=True)
installments = spark.read.csv("/content/drive/MyDrive/bases_dados_projeto_credito/bases_tratadas/variaveis_selecionadas/installments_agg_selected.csv",header=True,inferSchema=True)

In [7]:
# Left-join each auxiliary base onto the flagged applications by SK_ID_PREV,
# in the order: POS cash, credit card balance, installments.
df_temp_02 = df_temp_01
for base in (pos_cash, credit_card_balance, installments):
  df_temp_02 = df_temp_02.join(base, "SK_ID_PREV", how='left')

df_temp_02.show()

+----------+----------+------------------+-----------+---------------+----------+----------------+---------------+--------------------------+-----------------------+---------------------------+----------------------+-------------------+---------------------+------------------------+----------------------+--------------------+-------------+--------------------+------------------+---------------+----------------+--------------------+--------------+-----------------+--------------------+----------------+--------------------+-----------+----------------+--------------------+------------------+--------------+-------------------------+-------------+----------------+-------------------------+---------------+---------------+----------------+----------------+----------------+----------------------------------------------+------------------------------------------------+------------------------------------------------+------------------------------------------------+-----------------------------

## Sumarizar na visão cliente (Automatizada)

In [8]:
# Names of the look-back window flag columns, shortest window first.
colunas_flags = [f'ultimos_{meses}_meses' for meses in (3, 6, 12, 24, 36)]

In [9]:
from pyspark.sql.functions import col, round, sum, avg, max, min, when

# Columns excluded from the aggregation (identifiers, decision date and
# categorical descriptors).
variaveis_remover = ['SK_ID_CURR','SK_ID_PREV','DAYS_DECISION','NAME_CONTRACT_STATUS','WEEKDAY_APPR_PROCESS_START','NAME_CASH_LOAN_PURPOSE',
                     'NAME_PAYMENT_TYPE','CODE_REJECT_REASON','NAME_TYPE_SUITE','NAME_CLIENT_TYPE','NAME_GOODS_CATEGORY','NAME_PORTFOLIO','NAME_PRODUCT_TYPE','CHANNEL_TYPE',
                     'SELLERPLACE_AREA','NAME_SELLER_INDUSTRY','CNT_PAYMENT','NAME_YIELD_GROUP','PRODUCT_COMBINATION']

# Every remaining column of the joined frame is aggregated.
colunas_agregacao_total = [nome for nome in df_temp_02.columns if nome not in variaveis_remover]

# Look-back window handled by this cell.
flag = colunas_flags[0]

# (alias prefix, aggregate) pairs. NOTE: round/sum/avg/max/min are the
# pyspark.sql.functions versions imported above, not the Python builtins.
agregacoes_dias = (("QT_MAX", max), ("QT_MIN", min))
agregacoes_valor = (("VL_TOT", sum), ("VL_MED", avg), ("VL_MAX", max), ("VL_MIN", min))

expressoes_agregacao = []
for coluna in colunas_agregacao_total:
  # Value of the column when the row is inside the window, NULL otherwise.
  # Cast to double first: columns read from CSV without a schema are strings,
  # and max/min over strings compare lexicographically instead of numerically.
  valor_na_janela = when(col(flag) == 1, col(coluna).cast("double"))
  # Day-count style columns only get max/min; everything else gets total/mean/max/min.
  agregacoes = agregacoes_dias if ('DPD' in coluna or 'DAY' in coluna) else agregacoes_valor
  for prefixo, funcao in agregacoes:
    expressoes_agregacao.append(
      round(funcao(valor_na_janela), 2).alias(f"{prefixo}_{coluna.upper()}_{flag.upper()}_PREVIOUS_APPLICATION")
    )

expressoes_agregacao = tuple(expressoes_agregacao)

# One row per customer (SK_ID_CURR), sorted by customer id.
df_temp_03 = df_temp_02.groupBy("SK_ID_CURR").agg(*expressoes_agregacao).orderBy("SK_ID_CURR")

# Preview the aggregated frame
df_temp_03.show()

+----------+--------------------------------------------------------------+--------------------------------------------------------------+--------------------------------------------------------------+--------------------------------------------------------------+-------------------------------------------------------+-------------------------------------------------------+-------------------------------------------------------+-------------------------------------------------------+-----------------------------------------------------------+-----------------------------------------------------------+-----------------------------------------------------------+-----------------------------------------------------------+------------------------------------------------------+------------------------------------------------------+------------------------------------------------------+------------------------------------------------------+----------------------------------------------------

In [16]:
# Bring all rows into a single partition before writing.
df_temp_03 = df_temp_03.repartition(1)

# Write as CSV with a header row, replacing any output already at this path.
caminho_saida = "/content/drive/MyDrive/bases_dados_projeto_credito/bases_tratadas/previous_application_agg_1.csv"
df_temp_03.write.mode("overwrite").csv(caminho_saida, header=True)

In [17]:
# Columns excluded from the aggregation (identifiers, decision date and
# categorical descriptors).
variaveis_remover = ['SK_ID_CURR','SK_ID_PREV','DAYS_DECISION','NAME_CONTRACT_STATUS','WEEKDAY_APPR_PROCESS_START','NAME_CASH_LOAN_PURPOSE',
                     'NAME_PAYMENT_TYPE','CODE_REJECT_REASON','NAME_TYPE_SUITE','NAME_CLIENT_TYPE','NAME_GOODS_CATEGORY','NAME_PORTFOLIO','NAME_PRODUCT_TYPE','CHANNEL_TYPE',
                     'SELLERPLACE_AREA','NAME_SELLER_INDUSTRY','CNT_PAYMENT','NAME_YIELD_GROUP','PRODUCT_COMBINATION']

# Every remaining column of the joined frame is aggregated.
colunas_agregacao_total = [nome for nome in df_temp_02.columns if nome not in variaveis_remover]

# Look-back window handled by this cell.
flag = colunas_flags[1]

# (alias prefix, aggregate) pairs. NOTE: round/sum/avg/max/min are the
# pyspark.sql.functions versions imported earlier in the notebook, not the
# Python builtins.
agregacoes_dias = (("QT_MAX", max), ("QT_MIN", min))
agregacoes_valor = (("VL_TOT", sum), ("VL_MED", avg), ("VL_MAX", max), ("VL_MIN", min))

expressoes_agregacao = []
for coluna in colunas_agregacao_total:
  # Value of the column when the row is inside the window, NULL otherwise.
  # Cast to double first: columns read from CSV without a schema are strings,
  # and max/min over strings compare lexicographically instead of numerically.
  valor_na_janela = when(col(flag) == 1, col(coluna).cast("double"))
  # Day-count style columns only get max/min; everything else gets total/mean/max/min.
  agregacoes = agregacoes_dias if ('DPD' in coluna or 'DAY' in coluna) else agregacoes_valor
  for prefixo, funcao in agregacoes:
    expressoes_agregacao.append(
      round(funcao(valor_na_janela), 2).alias(f"{prefixo}_{coluna.upper()}_{flag.upper()}_PREVIOUS_APPLICATION")
    )

expressoes_agregacao = tuple(expressoes_agregacao)

# One row per customer (SK_ID_CURR), sorted by customer id.
df_temp_04 = df_temp_02.groupBy("SK_ID_CURR").agg(*expressoes_agregacao).orderBy("SK_ID_CURR")

# Preview the aggregated frame
df_temp_04.show()

+----------+--------------------------------------------------------------+--------------------------------------------------------------+--------------------------------------------------------------+--------------------------------------------------------------+-------------------------------------------------------+-------------------------------------------------------+-------------------------------------------------------+-------------------------------------------------------+-----------------------------------------------------------+-----------------------------------------------------------+-----------------------------------------------------------+-----------------------------------------------------------+------------------------------------------------------+------------------------------------------------------+------------------------------------------------------+------------------------------------------------------+----------------------------------------------------

In [18]:
# Bring all rows into a single partition before writing.
df_temp_04 = df_temp_04.repartition(1)

# Write as CSV with a header row, replacing any output already at this path.
caminho_saida = "/content/drive/MyDrive/bases_dados_projeto_credito/bases_tratadas/previous_application_agg_2.csv"
df_temp_04.write.mode("overwrite").csv(caminho_saida, header=True)

In [19]:
# Columns excluded from the aggregation (identifiers, decision date and
# categorical descriptors).
variaveis_remover = ['SK_ID_CURR','SK_ID_PREV','DAYS_DECISION','NAME_CONTRACT_STATUS','WEEKDAY_APPR_PROCESS_START','NAME_CASH_LOAN_PURPOSE',
                     'NAME_PAYMENT_TYPE','CODE_REJECT_REASON','NAME_TYPE_SUITE','NAME_CLIENT_TYPE','NAME_GOODS_CATEGORY','NAME_PORTFOLIO','NAME_PRODUCT_TYPE','CHANNEL_TYPE',
                     'SELLERPLACE_AREA','NAME_SELLER_INDUSTRY','CNT_PAYMENT','NAME_YIELD_GROUP','PRODUCT_COMBINATION']

# Every remaining column of the joined frame is aggregated.
colunas_agregacao_total = [nome for nome in df_temp_02.columns if nome not in variaveis_remover]

# Look-back window handled by this cell.
flag = colunas_flags[2]

# (alias prefix, aggregate) pairs. NOTE: round/sum/avg/max/min are the
# pyspark.sql.functions versions imported earlier in the notebook, not the
# Python builtins.
agregacoes_dias = (("QT_MAX", max), ("QT_MIN", min))
agregacoes_valor = (("VL_TOT", sum), ("VL_MED", avg), ("VL_MAX", max), ("VL_MIN", min))

expressoes_agregacao = []
for coluna in colunas_agregacao_total:
  # Value of the column when the row is inside the window, NULL otherwise.
  # Cast to double first: columns read from CSV without a schema are strings,
  # and max/min over strings compare lexicographically instead of numerically.
  valor_na_janela = when(col(flag) == 1, col(coluna).cast("double"))
  # Day-count style columns only get max/min; everything else gets total/mean/max/min.
  agregacoes = agregacoes_dias if ('DPD' in coluna or 'DAY' in coluna) else agregacoes_valor
  for prefixo, funcao in agregacoes:
    expressoes_agregacao.append(
      round(funcao(valor_na_janela), 2).alias(f"{prefixo}_{coluna.upper()}_{flag.upper()}_PREVIOUS_APPLICATION")
    )

expressoes_agregacao = tuple(expressoes_agregacao)

# One row per customer (SK_ID_CURR), sorted by customer id.
df_temp_05 = df_temp_02.groupBy("SK_ID_CURR").agg(*expressoes_agregacao).orderBy("SK_ID_CURR")

# Preview the aggregated frame
df_temp_05.show()

+----------+---------------------------------------------------------------+---------------------------------------------------------------+---------------------------------------------------------------+---------------------------------------------------------------+--------------------------------------------------------+--------------------------------------------------------+--------------------------------------------------------+--------------------------------------------------------+------------------------------------------------------------+------------------------------------------------------------+------------------------------------------------------------+------------------------------------------------------------+-------------------------------------------------------+-------------------------------------------------------+-------------------------------------------------------+-------------------------------------------------------+------------------------------------

In [20]:
# Bring all rows into a single partition before writing.
df_temp_05 = df_temp_05.repartition(1)

# Write as CSV with a header row, replacing any output already at this path.
caminho_saida = "/content/drive/MyDrive/bases_dados_projeto_credito/bases_tratadas/previous_application_agg_3.csv"
df_temp_05.write.mode("overwrite").csv(caminho_saida, header=True)

In [21]:
# Columns excluded from the aggregation (identifiers, decision date and
# categorical descriptors).
variaveis_remover = ['SK_ID_CURR','SK_ID_PREV','DAYS_DECISION','NAME_CONTRACT_STATUS','WEEKDAY_APPR_PROCESS_START','NAME_CASH_LOAN_PURPOSE',
                     'NAME_PAYMENT_TYPE','CODE_REJECT_REASON','NAME_TYPE_SUITE','NAME_CLIENT_TYPE','NAME_GOODS_CATEGORY','NAME_PORTFOLIO','NAME_PRODUCT_TYPE','CHANNEL_TYPE',
                     'SELLERPLACE_AREA','NAME_SELLER_INDUSTRY','CNT_PAYMENT','NAME_YIELD_GROUP','PRODUCT_COMBINATION']

# Every remaining column of the joined frame is aggregated.
colunas_agregacao_total = [nome for nome in df_temp_02.columns if nome not in variaveis_remover]

# Look-back window handled by this cell.
flag = colunas_flags[3]

# (alias prefix, aggregate) pairs. NOTE: round/sum/avg/max/min are the
# pyspark.sql.functions versions imported earlier in the notebook, not the
# Python builtins.
agregacoes_dias = (("QT_MAX", max), ("QT_MIN", min))
agregacoes_valor = (("VL_TOT", sum), ("VL_MED", avg), ("VL_MAX", max), ("VL_MIN", min))

expressoes_agregacao = []
for coluna in colunas_agregacao_total:
  # Value of the column when the row is inside the window, NULL otherwise.
  # Cast to double first: columns read from CSV without a schema are strings,
  # and max/min over strings compare lexicographically instead of numerically.
  valor_na_janela = when(col(flag) == 1, col(coluna).cast("double"))
  # Day-count style columns only get max/min; everything else gets total/mean/max/min.
  agregacoes = agregacoes_dias if ('DPD' in coluna or 'DAY' in coluna) else agregacoes_valor
  for prefixo, funcao in agregacoes:
    expressoes_agregacao.append(
      round(funcao(valor_na_janela), 2).alias(f"{prefixo}_{coluna.upper()}_{flag.upper()}_PREVIOUS_APPLICATION")
    )

expressoes_agregacao = tuple(expressoes_agregacao)

# One row per customer (SK_ID_CURR), sorted by customer id.
df_temp_06 = df_temp_02.groupBy("SK_ID_CURR").agg(*expressoes_agregacao).orderBy("SK_ID_CURR")

# Preview the aggregated frame
df_temp_06.show()

+----------+---------------------------------------------------------------+---------------------------------------------------------------+---------------------------------------------------------------+---------------------------------------------------------------+--------------------------------------------------------+--------------------------------------------------------+--------------------------------------------------------+--------------------------------------------------------+------------------------------------------------------------+------------------------------------------------------------+------------------------------------------------------------+------------------------------------------------------------+-------------------------------------------------------+-------------------------------------------------------+-------------------------------------------------------+-------------------------------------------------------+------------------------------------

In [22]:
# Bring all rows into a single partition before writing.
df_temp_06 = df_temp_06.repartition(1)

# Write as CSV with a header row, replacing any output already at this path.
caminho_saida = "/content/drive/MyDrive/bases_dados_projeto_credito/bases_tratadas/previous_application_agg_4.csv"
df_temp_06.write.mode("overwrite").csv(caminho_saida, header=True)

In [23]:
# Columns excluded from the aggregation (identifiers, decision date and
# categorical descriptors).
variaveis_remover = ['SK_ID_CURR','SK_ID_PREV','DAYS_DECISION','NAME_CONTRACT_STATUS','WEEKDAY_APPR_PROCESS_START','NAME_CASH_LOAN_PURPOSE',
                     'NAME_PAYMENT_TYPE','CODE_REJECT_REASON','NAME_TYPE_SUITE','NAME_CLIENT_TYPE','NAME_GOODS_CATEGORY','NAME_PORTFOLIO','NAME_PRODUCT_TYPE','CHANNEL_TYPE',
                     'SELLERPLACE_AREA','NAME_SELLER_INDUSTRY','CNT_PAYMENT','NAME_YIELD_GROUP','PRODUCT_COMBINATION']

# Every remaining column of the joined frame is aggregated.
colunas_agregacao_total = [nome for nome in df_temp_02.columns if nome not in variaveis_remover]

# Look-back window handled by this cell.
flag = colunas_flags[4]

# (alias prefix, aggregate) pairs. NOTE: round/sum/avg/max/min are the
# pyspark.sql.functions versions imported earlier in the notebook, not the
# Python builtins.
agregacoes_dias = (("QT_MAX", max), ("QT_MIN", min))
agregacoes_valor = (("VL_TOT", sum), ("VL_MED", avg), ("VL_MAX", max), ("VL_MIN", min))

expressoes_agregacao = []
for coluna in colunas_agregacao_total:
  # Value of the column when the row is inside the window, NULL otherwise.
  # Cast to double first: columns read from CSV without a schema are strings,
  # and max/min over strings compare lexicographically instead of numerically.
  valor_na_janela = when(col(flag) == 1, col(coluna).cast("double"))
  # Day-count style columns only get max/min; everything else gets total/mean/max/min.
  agregacoes = agregacoes_dias if ('DPD' in coluna or 'DAY' in coluna) else agregacoes_valor
  for prefixo, funcao in agregacoes:
    expressoes_agregacao.append(
      round(funcao(valor_na_janela), 2).alias(f"{prefixo}_{coluna.upper()}_{flag.upper()}_PREVIOUS_APPLICATION")
    )

expressoes_agregacao = tuple(expressoes_agregacao)

# One row per customer (SK_ID_CURR), sorted by customer id.
df_temp_07 = df_temp_02.groupBy("SK_ID_CURR").agg(*expressoes_agregacao).orderBy("SK_ID_CURR")

# Preview the aggregated frame
df_temp_07.show()

+----------+---------------------------------------------------------------+---------------------------------------------------------------+---------------------------------------------------------------+---------------------------------------------------------------+--------------------------------------------------------+--------------------------------------------------------+--------------------------------------------------------+--------------------------------------------------------+------------------------------------------------------------+------------------------------------------------------------+------------------------------------------------------------+------------------------------------------------------------+-------------------------------------------------------+-------------------------------------------------------+-------------------------------------------------------+-------------------------------------------------------+------------------------------------

In [24]:
# Bring all rows into a single partition before writing.
df_temp_07 = df_temp_07.repartition(1)

# Write as CSV with a header row, replacing any output already at this path.
caminho_saida = "/content/drive/MyDrive/bases_dados_projeto_credito/bases_tratadas/previous_application_agg_5.csv"
df_temp_07.write.mode("overwrite").csv(caminho_saida, header=True)

In [13]:
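# NOTE(review): everything below in this cell is a disabled draft of a per-status
# aggregation. As written, `categoria` only changes the output alias - the
# `when(...)` condition never filters on NAME_CONTRACT_STATUS - and the result is
# grouped by SK_ID_PREV instead of SK_ID_CURR. Confirm the intent and fix both
# points before re-enabling it.
# # Definir as colunas para agregação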
# colunas_agregacao_total = df_temp_02.columns

# variaveis_remover = ['SK_ID_CURR','SK_ID_PREV','DAYS_DECISION','NAME_CONTRACT_STATUS','WEEKDAY_APPR_PROCESS_START','NAME_CASH_LOAN_PURPOSE','NAME_CONTRACT_STATUS','DAYS_DECISION',
#                      'NAME_PAYMENT_TYPE','CODE_REJECT_REASON','NAME_TYPE_SUITE','NAME_CLIENT_TYPE','NAME_GOODS_CATEGORY','NAME_PORTFOLIO','NAME_PRODUCT_TYPE','CHANNEL_TYPE',
#                      'SELLERPLACE_AREA','NAME_SELLER_INDUSTRY','CNT_PAYMENT','NAME_YIELD_GROUP','PRODUCT_COMBINATION']

# colunas_agregacao_total = [col for col in colunas_agregacao_total if col not in variaveis_remover]

# colunas_flags = ['ultimos_3_meses','ultimos_6_meses','ultimos_12_meses','ultimos_24_meses','ultimos_36_meses']

# expressoes_agregacao = []

# for categoria in nova_lista:
#   for flag in colunas_flags:
#     for coluna in colunas_agregacao_total:
#       if 'DPD' in coluna or 'DAY' in coluna:
#         expressoes_agregacao.append(round(max(when(col(flag) == 1, col(coluna))), 2).alias(f"QT_MAX_{coluna.upper()}_{flag.upper()}_{categoria}_PREVIOUS_APPLICATION"))
#         expressoes_agregacao.append(round(min(when(col(flag) == 1, col(coluna))), 2).alias(f"QT_MIN_{coluna.upper()}_{flag.upper()}_{categoria}_PREVIOUS_APPLICATION"))
#       else:
#         expressoes_agregacao.append(round(sum(when(col(flag) == 1, col(coluna))), 2).alias(f"VL_TOT_{coluna.upper()}_{flag.upper()}_{categoria}_PREVIOUS_APPLICATION"))
#         expressoes_agregacao.append(round(avg(when(col(flag) == 1, col(coluna))), 2).alias(f"VL_MED_{coluna.upper()}_{flag.upper()}_{categoria}_PREVIOUS_APPLICATION"))
#         expressoes_agregacao.append(round(max(when(col(flag) == 1, col(coluna))), 2).alias(f"VL_MAX_{coluna.upper()}_{flag.upper()}_{categoria}_PREVIOUS_APPLICATION"))
#         expressoes_agregacao.append(round(min(when(col(flag) == 1, col(coluna))), 2).alias(f"VL_MIN_{coluna.upper()}_{flag.upper()}_{categoria}_PREVIOUS_APPLICATION"))

# expressoes_agregacao = tuple(expressoes_agregacao)

# # Aplicar as expressões de agregação
# df_temp_04 = df_temp_02.groupBy("SK_ID_PREV").agg(*expressoes_agregacao).orderBy("SK_ID_PREV")

# # Mostrar o DataFrame resultante
# df_temp_04.show()