In [1]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import to_date, lit, date_format

try:
    spark.stop()
except:
    pass

spark = SparkSession.builder \
    .config("spark.driver.memory", "10g") \
    .config("spark.executor.memory", "10g") \
    .config("spark.sql.shuffle.partitions", "200") \
    .getOrCreate()



25/07/21 17:34:49 WARN Utils: Your hostname, DESKTOP-A7MMD62 resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
25/07/21 17:34:49 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/07/21 17:34:50 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
df_previous = spark.read.csv("../../data/raw/previous_application.csv",
                               header=True,
                               inferSchema=True)

df_previous.createOrReplaceTempView("dados")

# Contagem de linhas e colunas
num_rows = df_previous.count()
num_columns = len(df_previous.columns)

print(f'Quantidade de linhas: {num_rows}')
print(f'Quantidade de variaveis (colunas): {num_columns}')

df_previous.show(5, truncate=False)

25/07/21 17:34:59 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


Quantidade de linhas: 1670214
Quantidade de variaveis (colunas): 37
+----------+----------+------------------+-----------+---------------+----------+----------------+---------------+--------------------------+-----------------------+---------------------------+----------------------+-----------------+---------------------+------------------------+----------------------+--------------------+-------------+---------------------+------------------+---------------+----------------+-------------------+--------------+-----------------+-----------------------+----------------+--------------------+-----------+----------------+------------------------+------------------+--------------+-------------------------+-------------+----------------+-------------------------+
|SK_ID_PREV|SK_ID_CURR|NAME_CONTRACT_TYPE|AMT_ANNUITY|AMT_APPLICATION|AMT_CREDIT|AMT_DOWN_PAYMENT|AMT_GOODS_PRICE|WEEKDAY_APPR_PROCESS_START|HOUR_APPR_PROCESS_START|FLAG_LAST_APPL_PER_CONTRACT|NFLAG_LAST_APPL_IN_DAY|RATE_DOWN_PAYMEN

## Criando variáveis de janela temporal

In [3]:
df_temp_01 = spark.sql("""
SELECT
    *,
      CASE
        WHEN DAYS_DECISION >= -90 THEN 1
        ELSE 0
    END AS U3M,
    CASE
        WHEN DAYS_DECISION >= -180 THEN 1
        ELSE 0
    END AS U6M,    
    CASE
        WHEN DAYS_DECISION >= -360 THEN 1
        ELSE 0
    END AS U12M
FROM dados
ORDER BY `SK_ID_PREV`;
""")
df_temp_01.createOrReplaceTempView("df_temp_01")
display(df_temp_01.limit(5))

DataFrame[SK_ID_PREV: int, SK_ID_CURR: int, NAME_CONTRACT_TYPE: string, AMT_ANNUITY: double, AMT_APPLICATION: double, AMT_CREDIT: double, AMT_DOWN_PAYMENT: double, AMT_GOODS_PRICE: double, WEEKDAY_APPR_PROCESS_START: string, HOUR_APPR_PROCESS_START: int, FLAG_LAST_APPL_PER_CONTRACT: string, NFLAG_LAST_APPL_IN_DAY: int, RATE_DOWN_PAYMENT: double, RATE_INTEREST_PRIMARY: double, RATE_INTEREST_PRIVILEGED: double, NAME_CASH_LOAN_PURPOSE: string, NAME_CONTRACT_STATUS: string, DAYS_DECISION: int, NAME_PAYMENT_TYPE: string, CODE_REJECT_REASON: string, NAME_TYPE_SUITE: string, NAME_CLIENT_TYPE: string, NAME_GOODS_CATEGORY: string, NAME_PORTFOLIO: string, NAME_PRODUCT_TYPE: string, CHANNEL_TYPE: string, SELLERPLACE_AREA: int, NAME_SELLER_INDUSTRY: string, CNT_PAYMENT: double, NAME_YIELD_GROUP: string, PRODUCT_COMBINATION: string, DAYS_FIRST_DRAWING: double, DAYS_FIRST_DUE: double, DAYS_LAST_DUE_1ST_VERSION: double, DAYS_LAST_DUE: double, DAYS_TERMINATION: double, NFLAG_INSURED_ON_APPROVAL: doubl

## Lendo as bases pos_cash, credit_card_balance e installments

In [4]:
pos_cash = spark.read.parquet('../../data/books/pos-cash/')
credit_card_balance = spark.read.parquet('../../data/books/credit-card')
installments = spark.read.parquet('../../data/books/installments')

In [5]:
#Quantidade de colunas:
print(f'pos: {len(pos_cash.columns)-1}')
print(f'credit_card_balance: {len(credit_card_balance.columns)-1}')
print(f'installments: {len(installments.columns)-1}')

pos: 180
credit_card_balance: 912
installments: 24


## Join das bases

In [6]:
df_tmp_02 = df_temp_01.join(pos_cash, "SK_ID_PREV", how="left") \
    .join(credit_card_balance, "SK_ID_PREV", how="left") \
    .join(installments, "SK_ID_PREV", how="left")

In [7]:
#Quantidade de colunas:
len(df_tmp_02.columns)-1

1155

## Criando variáveis de primeira camada

In [8]:
from pyspark.sql.functions import col, round, sum, avg, max, min, when

# Definir as colunas para agregação
columns_agg_total = df_tmp_02.columns

columns_agg_total_remove = ['SK_ID_CURR','SK_ID_PREV','DAYS_DECISION','NAME_CONTRACT_STATUS','WEEKDAY_APPR_PROCESS_START','NAME_CASH_LOAN_PURPOSE','NAME_CONTRACT_STATUS','DAYS_DECISION',
                     'NAME_PAYMENT_TYPE','CODE_REJECT_REASON','NAME_TYPE_SUITE','NAME_CLIENT_TYPE','NAME_GOODS_CATEGORY','NAME_PORTFOLIO','NAME_PRODUCT_TYPE','CHANNEL_TYPE',
                     'SELLERPLACE_AREA','NAME_SELLER_INDUSTRY','CNT_PAYMENT','NAME_YIELD_GROUP','PRODUCT_COMBINATION']


columns_agg_total = [col for col in columns_agg_total if col not in columns_agg_total_remove]

expressions_agg = []

# Defindo a lista de colunas de flags.
colunas_flags = ['U3M','U6M','U12M']

suffix = "_PREV_APP"

#for flag in list_flags_columns:
for column in columns_agg_total:
  for flag in colunas_flags:
    if "DPD" in column or "DAY" in column:
        expressions_agg.append(round(max(when(col(flag)==1, col(column))), 2).alias(f'QT_MAX_{column.upper()}_{flag.upper()}{suffix}'))
    else:
        expressions_agg.append(round(avg(when(col(flag)==1, col(column))), 2).alias(f'VL_MED_{column.upper()}_{flag.upper()}{suffix}'))


expressions_agg = tuple(expressions_agg)

#print(expressions_agg)

#aplicar as expressões de agregação
df_tmp_03 = df_tmp_02.groupBy("SK_ID_CURR").agg(*expressions_agg).orderBy("SK_ID_CURR")

In [9]:
df_tmp_03.count()

                                                                                

338857

In [10]:
#Quantidade de colunas:
len(df_tmp_03.columns)-1

3411

(df_tmp_03.repartition(1)
     .write
     .mode("overwrite")
     .option("compression", "snappy")
     .parquet('/data/books/previous-app'))

In [11]:
# Reparticionar para um único arquivo
df_temp_04 = df_tmp_03.repartition(1)
df_temp_04.write.mode("overwrite").parquet('../../data/books/previous-app')


25/07/21 17:35:33 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB
25/07/21 17:49:42 WARN DAGScheduler: Broadcasting large task binary with size 8.1 MiB
25/07/21 17:59:00 WARN DAGScheduler: Broadcasting large task binary with size 8.1 MiB
25/07/21 18:06:16 WARN DAGScheduler: Broadcasting large task binary with size 8.9 MiB
25/07/21 18:06:22 WARN DAGScheduler: Broadcasting large task binary with size 1065.5 KiB
                                                                                

In [12]:
spark.stop()

## Verificar metadados

In [2]:
import pandas as pd
df = pd.read_parquet('../../data/books/previous-app', engine='fastparquet')

In [3]:
import sys
sys.path.append(r'/home/jean/projetos/pod-bank/global/')
from util import *

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
metadados = generate_metadata(df,
                                          ids=['SK_ID_CURR'],
                                          targets=['TARGET'],
                                          orderby = 'PC_NULOS')

In [5]:
metadados

Unnamed: 0,FEATURE,USO_FEATURE,QT_NULOS,PC_NULOS,CARDINALIDADE,TIPO_FEATURE
0,VL_MED_FLAG_LAST_APPL_PER_CONTRACT_U3M_PREV_APP,Explicativa,338857,100.0,0,float64
1,VL_MED_NAME_CONTRACT_TYPE_U12M_PREV_APP,Explicativa,338857,100.0,0,float64
2,VL_MED_FLAG_LAST_APPL_PER_CONTRACT_U6M_PREV_APP,Explicativa,338857,100.0,0,float64
3,VL_MED_NAME_CONTRACT_TYPE_U6M_PREV_APP,Explicativa,338857,100.0,0,float64
4,VL_MED_NAME_CONTRACT_TYPE_U3M_PREV_APP,Explicativa,338857,100.0,0,float64
...,...,...,...,...,...,...
3407,VL_MED_U12M_U12M_PREV_APP,Explicativa,145359,42.9,1,float64
3408,VL_MED_HOUR_APPR_PROCESS_START_U12M_PREV_APP,Explicativa,145359,42.9,1083,float64
3409,VL_MED_U3M_U12M_PREV_APP,Explicativa,145359,42.9,93,float64
3410,QT_MAX_NFLAG_LAST_APPL_IN_DAY_U12M_PREV_APP,Explicativa,145359,42.9,1,float64


## Retirar variáveis com mais de 70% dos valores nulos.

In [6]:
missing_cutoff = 70

drop_vars_nulos = metadados[(metadados['PC_NULOS'] >= missing_cutoff)]
lista_drop_vars = list(drop_vars_nulos.FEATURE.values)

print('Variáveis que serão excluídas por alto percentual de nulos: ',lista_drop_vars)
# retirando lista de variáveis com alto percentual de nulos
abt_fe_01 = df.drop(axis=1,columns=lista_drop_vars)
abt_fe_01.shape

Variáveis que serão excluídas por alto percentual de nulos:  ['VL_MED_FLAG_LAST_APPL_PER_CONTRACT_U3M_PREV_APP', 'VL_MED_NAME_CONTRACT_TYPE_U12M_PREV_APP', 'VL_MED_FLAG_LAST_APPL_PER_CONTRACT_U6M_PREV_APP', 'VL_MED_NAME_CONTRACT_TYPE_U6M_PREV_APP', 'VL_MED_NAME_CONTRACT_TYPE_U3M_PREV_APP', 'VL_MED_FLAG_LAST_APPL_PER_CONTRACT_U12M_PREV_APP', 'VL_MED_VL_MED_AMT_PAYMENT_CURRENT_U12M_ACTIVE_CRED_CARD_U3M_PREV_APP', 'VL_MED_VL_TOT_AMT_PAYMENT_CURRENT_U12M_ACTIVE_CRED_CARD_U3M_PREV_APP', 'VL_MED_VL_MED_AMT_PAYMENT_CURRENT_U6M_ACTIVE_CRED_CARD_U3M_PREV_APP', 'VL_MED_VL_TOT_AMT_PAYMENT_CURRENT_U6M_ACTIVE_CRED_CARD_U3M_PREV_APP', 'VL_MED_VL_TOT_AMT_PAYMENT_CURRENT_U3M_COMPLETED_CRED_CARD_U3M_PREV_APP', 'VL_MED_VL_MED_AMT_PAYMENT_CURRENT_U12M_COMPLETED_CRED_CARD_U3M_PREV_APP', 'VL_MED_VL_TOT_AMT_PAYMENT_CURRENT_U12M_COMPLETED_CRED_CARD_U3M_PREV_APP', 'VL_MED_VL_MED_AMT_PAYMENT_CURRENT_U12M_SENT_PROPOSAL_CRED_CARD_U3M_PREV_APP', 'VL_MED_VL_TOT_AMT_PAYMENT_CURRENT_U12M_SENT_PROPOSAL_CRED_CARD_U3M_

(338857, 227)

In [7]:
abt_fe_02 = abt_fe_01.copy()

In [8]:
abt_fe_02.shape

(338857, 227)

## Salvando os dados em parquet

In [None]:

abt_fe_02.to_parquet('../../data/books/previous-app-fs', index=False)

: 