Previous APPLICATIONS TABLE


In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import FeatureUnion
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

In [2]:
# Load the previous-applications table; the string "XNA" marks a value that is
# not available, so it is converted to NaN as part of the load.
prev_app = pd.read_csv("previous_application.csv").replace("XNA", np.nan)

In [3]:
# Keep the fully duplicated rows (if any) and display how many were found
duplicate_rows = prev_app.loc[prev_app.duplicated()]
duplicate_rows.shape[0]

0

Analyzing missing values


Building a model on variables with more than 40% missing values may lead to misleading results, since we would either have to shrink our sample considerably or replace the missing values with the median.

In [4]:
# Share of missing values per column, on a 0-100 scale
missing_percentages = prev_app.isna().mean().mul(100)
# Parallel lists: column names and their missing-value percentages
feature, percentage = missing_percentages.index.tolist(), missing_percentages.tolist()

In [5]:
# Columns whose share of missing values exceeds this percentage are discarded
MISSING_THRESHOLD_PCT = 40

# Find columns with missing values above the threshold
columns_to_drop = [feat for feat, perc in zip(feature, percentage) if perc > MISSING_THRESHOLD_PCT]

# Drop those columns together with SELLERPLACE_AREA in a single pass.
# errors="ignore" keeps the cell re-runnable: on a second execution the columns
# are already gone and a plain drop would raise a KeyError.
prev_app = prev_app.drop(columns=columns_to_drop + ['SELLERPLACE_AREA'], errors="ignore")

In [6]:
# Show the columns selected for removal because of their missing-value percentage
print(columns_to_drop)

['AMT_DOWN_PAYMENT', 'RATE_DOWN_PAYMENT', 'RATE_INTEREST_PRIMARY', 'RATE_INTEREST_PRIVILEGED', 'NAME_CASH_LOAN_PURPOSE', 'NAME_TYPE_SUITE', 'NAME_GOODS_CATEGORY', 'NAME_PRODUCT_TYPE', 'NAME_SELLER_INDUSTRY', 'DAYS_FIRST_DRAWING', 'DAYS_FIRST_DUE', 'DAYS_LAST_DUE_1ST_VERSION', 'DAYS_LAST_DUE', 'DAYS_TERMINATION', 'NFLAG_INSURED_ON_APPROVAL']


Dividing the variables by type to facilitate the EDA and Data Cleaning


In [7]:
# Group the remaining columns by dtype so each group can be cleaned and
# preprocessed in its own way.
variables_type = pd.DataFrame(prev_app.dtypes)

cat_variables = list(variables_type[variables_type[0] == "object"].index)

int_variables = list(variables_type[variables_type[0] == "int64"].index)

float_variables = list(variables_type[variables_type[0] == "float64"].index)

# A column is binary-coded when its distinct values are exactly 0 and 1.
# unique() never repeats a value, so comparing as a set covers both the
# [0, 1] and the [1, 0] ordering while scanning each column only once.
cat_binary_variables = []
for col in prev_app.columns:
    if col == "TARGET":
        continue
    distinct_values = prev_app[col].unique()
    if len(distinct_values) == 2 and set(distinct_values) == {0, 1}:
        cat_binary_variables.append(col)

int_with_no_binary = [j for j in int_variables if j not in cat_binary_variables and j not in ["SK_ID_CURR","SK_ID_PREV"]]

# Flags stored as floats (0.0 / 1.0) are detected as binary above as well, so
# they are removed here too; otherwise the same column would be listed both as
# a binary and as a numerical variable.
float_with_no_binary = [j for j in float_variables if j not in cat_binary_variables]

num_variables = int_with_no_binary + float_with_no_binary

print("Categorical variables (non-binary coded):", cat_variables)
print("Categorical variables (binary):", cat_binary_variables)
print("Numerical variables:", num_variables)

Categorical variables (non-binary coded): ['NAME_CONTRACT_TYPE', 'WEEKDAY_APPR_PROCESS_START', 'FLAG_LAST_APPL_PER_CONTRACT', 'NAME_CONTRACT_STATUS', 'NAME_PAYMENT_TYPE', 'CODE_REJECT_REASON', 'NAME_CLIENT_TYPE', 'NAME_PORTFOLIO', 'CHANNEL_TYPE', 'NAME_YIELD_GROUP', 'PRODUCT_COMBINATION']
Categorical variables (binary): ['NFLAG_LAST_APPL_IN_DAY']
Numerical variables: ['HOUR_APPR_PROCESS_START', 'DAYS_DECISION', 'AMT_ANNUITY', 'AMT_APPLICATION', 'AMT_CREDIT', 'AMT_GOODS_PRICE', 'CNT_PAYMENT']


### Cleaning all remaining variables

Categorical variables

In [8]:
# List the distinct values of each categorical column to check that they are correctly categorized
for column_name in cat_variables:
    print(column_name, prev_app[column_name].unique())

NAME_CONTRACT_TYPE ['Consumer loans' 'Cash loans' 'Revolving loans' nan]
WEEKDAY_APPR_PROCESS_START ['SATURDAY' 'THURSDAY' 'TUESDAY' 'MONDAY' 'FRIDAY' 'SUNDAY' 'WEDNESDAY']
FLAG_LAST_APPL_PER_CONTRACT ['Y' 'N']
NAME_CONTRACT_STATUS ['Approved' 'Refused' 'Canceled' 'Unused offer']
NAME_PAYMENT_TYPE ['Cash through the bank' nan 'Non-cash from your account'
 'Cashless from the account of the employer']
CODE_REJECT_REASON ['XAP' 'HC' 'LIMIT' 'CLIENT' 'SCOFR' 'SCO' nan 'VERIF' 'SYSTEM']
NAME_CLIENT_TYPE ['Repeater' 'New' 'Refreshed' nan]
NAME_PORTFOLIO ['POS' 'Cash' nan 'Cards' 'Cars']
CHANNEL_TYPE ['Country-wide' 'Contact center' 'Credit and cash offices' 'Stone'
 'Regional / Local' 'AP+ (Cash loan)' 'Channel of corporate sales'
 'Car dealer']
NAME_YIELD_GROUP ['middle' 'low_action' 'high' 'low_normal' nan]
PRODUCT_COMBINATION ['POS mobile with interest' 'Cash X-Sell: low' 'Cash X-Sell: high'
 'Cash X-Sell: middle' 'Cash Street: high' 'Cash'
 'POS household without interest' 'POS household

In [9]:
# Change the Y/N flag to 1/0: 'N' becomes 0 and every other value becomes 1.
# An existing 0 is also mapped to 0, which makes the cell idempotent: applying
# the plain "0 if x == 'N' else 1" rule to an already-encoded column would turn
# every value into 1, because 0 != 'N'.
last_appl_flag = prev_app["FLAG_LAST_APPL_PER_CONTRACT"]
prev_app["FLAG_LAST_APPL_PER_CONTRACT"] = (~last_appl_flag.isin(['N', 0])).astype("int64")

In [10]:
#Exclude them from cat_variables and include them in cat_binary_variables (useful for the final pipeline)
recoded_flags = ["FLAG_LAST_APPL_PER_CONTRACT"]
cat_variables = [i for i in cat_variables if i not in recoded_flags]
# Append only the flags that are not listed yet, so re-running the cell cannot
# register the same column twice.
new_binary_flags = [flag for flag in recoded_flags if flag not in cat_binary_variables]
cat_binary_variables.extend(new_binary_flags)

Numerical variables

In [11]:
# Summary statistics of the numerical columns, used to explore the data
prev_app.loc[:, num_variables].describe()

Unnamed: 0,HOUR_APPR_PROCESS_START,DAYS_DECISION,AMT_ANNUITY,AMT_APPLICATION,AMT_CREDIT,AMT_GOODS_PRICE,CNT_PAYMENT
count,1670214.0,1670214.0,1297979.0,1670214.0,1670213.0,1284699.0,1297984.0
mean,12.48418,-880.6797,15955.12,175233.9,196114.0,227847.3,16.05408
std,3.334028,779.0997,14782.14,292779.8,318574.6,315396.6,14.56729
min,0.0,-2922.0,0.0,0.0,0.0,0.0,0.0
25%,10.0,-1300.0,6321.78,18720.0,24160.5,50841.0,6.0
50%,12.0,-581.0,11250.0,71046.0,80541.0,112320.0,12.0
75%,15.0,-280.0,20658.42,180360.0,216418.5,234000.0,24.0
max,23.0,-1.0,418058.1,6905160.0,6905160.0,6905160.0,84.0


Findings:


* The DAYS_DECISION values are counter-intuitively negative

In [12]:
# Collect the day columns so the preprocessing pipeline can transform them together,
# and take them out of num_variables
day_variables = [name for name in prev_app.columns if 'DAYS_DECISION' in name]
num_variables = [name for name in num_variables if name not in day_variables]

## Pipeline

In [13]:
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that extracts a fixed set of columns from a DataFrame.

    Parameters
    ----------
    attribute_names : list
        Labels of the columns to extract, in output order.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, prev_app, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, bank):
        """Return the selected columns of ``bank`` as a NumPy array.

        The columns are read from the frame passed in, not from a notebook-level
        variable, so the step transforms whatever data the pipeline hands it.
        """
        return bank[self.attribute_names].values

    def get_feature_names(self):
        """Return the labels of the selected columns."""
        return self.attribute_names

In [14]:
class NegativeToPositive(BaseEstimator, TransformerMixin):
    """Pipeline step that replaces the configured columns with their absolute values.

    NOTE(review): ``transform`` ignores its ``pre_app`` argument; it overwrites and
    returns columns of the notebook-level ``prev_app`` frame.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, prev_app, y=None):
        """Stateless step: there is nothing to fit."""
        return self

    def transform(self, pre_app):
        """Overwrite the configured columns of ``prev_app`` with their absolute values and return them."""
        columns = self.attribute_names
        prev_app[columns] = prev_app[columns].abs()
        return prev_app[columns]

In [15]:
# One sub-pipeline per variable group; each one selects its own columns.

# Numerical columns: selection only
num_pipeline = Pipeline(steps=[('selector', DataFrameSelector(num_variables))])

# Day columns: take absolute values, then select
day_col_pipeline = Pipeline(steps=[
    ('neg_to_pos', NegativeToPositive(day_variables)),
    ('selector', DataFrameSelector(day_variables)),
])

# Non-binary categorical columns: select, then one-hot encode
cat_pipeline = Pipeline(steps=[
    ('selector', DataFrameSelector(cat_variables)),
    ('cat_encoder', OneHotEncoder()),
])

# Binary (0/1) columns: selection only
cat_binary_pipeline = Pipeline(steps=[('selector', DataFrameSelector(cat_binary_variables))])

In [16]:
# (name, sub-pipeline, columns) triples; the output columns follow this order
column_groups = [
    ("num_pipeline", num_pipeline, num_variables),
    ("days_pipeline", day_col_pipeline, day_variables),
    ("cat_pipeline", cat_pipeline, cat_variables),
    ("cat_binary_pipeline", cat_binary_pipeline, cat_binary_variables),
]
preprocess_pipeline = ColumnTransformer(transformers=column_groups)

In [17]:
# Fit the preprocessing pipeline and keep its output. fit_transform() already
# returns the transformed data, so a separate transform() call would only
# repeat the whole transformation.
preprocessed_prev_app = preprocess_pipeline.fit_transform(prev_app)

In [18]:
# Names of the one-hot columns, taken from the fitted encoder inside the categorical sub-pipeline
cat_encoder = preprocess_pipeline.named_transformers_['cat_pipeline'].named_steps['cat_encoder']
cat_feature_names = cat_encoder.get_feature_names_out(cat_variables)

# Output column names, in the order the transformers are listed in the ColumnTransformer
feature_names = [*num_variables, *day_variables, *cat_feature_names, *cat_binary_variables]

In [19]:
# Densify the pipeline output. toarray() yields a plain ndarray (todense()
# returns the legacy np.matrix type). An output that has no toarray() is taken
# to be dense already and is passed through as an array.
if hasattr(preprocessed_prev_app, "toarray"):
    dense_matrix = preprocessed_prev_app.toarray()
else:
    dense_matrix = np.asarray(preprocessed_prev_app)

In [20]:
# Rebuild a labelled frame from the dense matrix. Reusing prev_app's index keeps
# every transformed row aligned with its ID columns: pd.concat(axis=1) aligns on
# index labels, so a fresh 0..n-1 index would only match while prev_app still
# has its default index.
transformed_prev_app_df = pd.DataFrame(dense_matrix, columns=feature_names, index=prev_app.index)
transformed_prev_app_df = pd.concat([prev_app[["SK_ID_CURR", "SK_ID_PREV"]],transformed_prev_app_df], axis=1)
transformed_prev_app_df

Unnamed: 0,SK_ID_CURR,SK_ID_PREV,HOUR_APPR_PROCESS_START,AMT_ANNUITY,AMT_APPLICATION,AMT_CREDIT,AMT_GOODS_PRICE,CNT_PAYMENT,DAYS_DECISION,NAME_CONTRACT_TYPE_Cash loans,...,PRODUCT_COMBINATION_POS household without interest,PRODUCT_COMBINATION_POS industry with interest,PRODUCT_COMBINATION_POS industry without interest,PRODUCT_COMBINATION_POS mobile with interest,PRODUCT_COMBINATION_POS mobile without interest,PRODUCT_COMBINATION_POS other with interest,PRODUCT_COMBINATION_POS others without interest,PRODUCT_COMBINATION_nan,NFLAG_LAST_APPL_IN_DAY,FLAG_LAST_APPL_PER_CONTRACT
0,271877,2030495,15.0,1730.430,17145.0,17145.0,17145.0,12.0,73.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0
1,108129,2802425,11.0,25188.615,607500.0,679671.0,607500.0,36.0,164.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
2,122040,2523466,11.0,15060.735,112500.0,136444.5,112500.0,12.0,301.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
3,176158,2819243,7.0,47041.335,450000.0,470790.0,450000.0,12.0,512.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
4,202054,1784265,9.0,31924.395,337500.0,404055.0,337500.0,24.0,781.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1670209,352015,2300464,12.0,14704.290,267295.5,311400.0,267295.5,30.0,544.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
1670210,334635,2357031,15.0,6622.020,87750.0,64291.5,87750.0,12.0,1694.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
1670211,249544,2659632,12.0,11520.855,105237.0,102523.5,105237.0,10.0,1488.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
1670212,400317,2785582,9.0,18821.520,180000.0,191880.0,180000.0,12.0,1185.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0


In [21]:
# Preview the rows from the highest to the lowest SK_ID_CURR (displayed only, not stored)
transformed_prev_app_df.sort_values('SK_ID_CURR', ascending=False)

Unnamed: 0,SK_ID_CURR,SK_ID_PREV,HOUR_APPR_PROCESS_START,AMT_ANNUITY,AMT_APPLICATION,AMT_CREDIT,AMT_GOODS_PRICE,CNT_PAYMENT,DAYS_DECISION,NAME_CONTRACT_TYPE_Cash loans,...,PRODUCT_COMBINATION_POS household without interest,PRODUCT_COMBINATION_POS industry with interest,PRODUCT_COMBINATION_POS industry without interest,PRODUCT_COMBINATION_POS mobile with interest,PRODUCT_COMBINATION_POS mobile without interest,PRODUCT_COMBINATION_POS other with interest,PRODUCT_COMBINATION_POS others without interest,PRODUCT_COMBINATION_nan,NFLAG_LAST_APPL_IN_DAY,FLAG_LAST_APPL_PER_CONTRACT
729432,456255,1708056,14.0,2250.000,45000.0,45000.0,45000.0,0.0,456.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
1383554,456255,2631384,14.0,54022.140,1170000.0,1271929.5,1170000.0,36.0,787.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
1411592,456255,2729207,18.0,11514.555,58225.5,58545.0,58225.5,6.0,500.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0
814647,456255,1296007,14.0,30737.655,765000.0,1067940.0,765000.0,60.0,171.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
214743,456255,1743609,18.0,11090.835,102037.5,112815.0,102037.5,12.0,991.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1021650,100003,2636178,17.0,64567.665,337500.0,348637.5,337500.0,6.0,828.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
575941,100003,1810518,12.0,98356.995,900000.0,1035882.0,900000.0,12.0,746.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
1223745,100003,2396755,15.0,6737.310,68809.5,68053.5,68809.5,12.0,2341.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
892077,100002,1038818,9.0,9251.775,179055.0,179055.0,179055.0,24.0,606.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0


In [22]:
# List the column labels of the transformed frame
transformed_prev_app_df.columns

Index(['SK_ID_CURR', 'SK_ID_PREV', 'HOUR_APPR_PROCESS_START', 'AMT_ANNUITY',
       'AMT_APPLICATION', 'AMT_CREDIT', 'AMT_GOODS_PRICE', 'CNT_PAYMENT',
       'DAYS_DECISION', 'NAME_CONTRACT_TYPE_Cash loans',
       'NAME_CONTRACT_TYPE_Consumer loans',
       'NAME_CONTRACT_TYPE_Revolving loans', 'NAME_CONTRACT_TYPE_nan',
       'WEEKDAY_APPR_PROCESS_START_FRIDAY',
       'WEEKDAY_APPR_PROCESS_START_MONDAY',
       'WEEKDAY_APPR_PROCESS_START_SATURDAY',
       'WEEKDAY_APPR_PROCESS_START_SUNDAY',
       'WEEKDAY_APPR_PROCESS_START_THURSDAY',
       'WEEKDAY_APPR_PROCESS_START_TUESDAY',
       'WEEKDAY_APPR_PROCESS_START_WEDNESDAY', 'NAME_CONTRACT_STATUS_Approved',
       'NAME_CONTRACT_STATUS_Canceled', 'NAME_CONTRACT_STATUS_Refused',
       'NAME_CONTRACT_STATUS_Unused offer',
       'NAME_PAYMENT_TYPE_Cash through the bank',
       'NAME_PAYMENT_TYPE_Cashless from the account of the employer',
       'NAME_PAYMENT_TYPE_Non-cash from your account', 'NAME_PAYMENT_TYPE_nan',
       'CODE

* Categorical columns (one-hot encoded and binary) ---> sum
* SK_ID_PREV ---> count
* HOUR_APPR_PROCESS_START, DAYS_DECISION and CNT_PAYMENT ---> mean
* Every AMT column ---> mean

In [23]:
# Columns aggregated by summing: the one-hot encoded categories and the binary flags
sum_columns = set(cat_feature_names) | set(cat_binary_variables)

# Column -> aggregation applied when collapsing the rows of each SK_ID_CURR
aggregation_functions = {
    'SK_ID_PREV': 'count',
    'HOUR_APPR_PROCESS_START': 'mean',
    'DAYS_DECISION': 'mean',
    'CNT_PAYMENT': 'mean',
}

for col in transformed_prev_app_df.columns:
    if col in sum_columns:
        # 'sum' takes precedence over the AMT rule
        aggregation_functions[col] = 'sum'
    elif col.startswith('AMT'):
        aggregation_functions[col] = 'mean'

In [24]:
# Display the column -> aggregation mapping for a visual check
aggregation_functions

{'SK_ID_PREV': 'count',
 'HOUR_APPR_PROCESS_START': 'mean',
 'DAYS_DECISION': 'mean',
 'CNT_PAYMENT': 'mean',
 'AMT_ANNUITY': 'mean',
 'AMT_APPLICATION': 'mean',
 'AMT_CREDIT': 'mean',
 'AMT_GOODS_PRICE': 'mean',
 'NAME_CONTRACT_TYPE_Cash loans': 'sum',
 'NAME_CONTRACT_TYPE_Consumer loans': 'sum',
 'NAME_CONTRACT_TYPE_Revolving loans': 'sum',
 'NAME_CONTRACT_TYPE_nan': 'sum',
 'WEEKDAY_APPR_PROCESS_START_FRIDAY': 'sum',
 'WEEKDAY_APPR_PROCESS_START_MONDAY': 'sum',
 'WEEKDAY_APPR_PROCESS_START_SATURDAY': 'sum',
 'WEEKDAY_APPR_PROCESS_START_SUNDAY': 'sum',
 'WEEKDAY_APPR_PROCESS_START_THURSDAY': 'sum',
 'WEEKDAY_APPR_PROCESS_START_TUESDAY': 'sum',
 'WEEKDAY_APPR_PROCESS_START_WEDNESDAY': 'sum',
 'NAME_CONTRACT_STATUS_Approved': 'sum',
 'NAME_CONTRACT_STATUS_Canceled': 'sum',
 'NAME_CONTRACT_STATUS_Refused': 'sum',
 'NAME_CONTRACT_STATUS_Unused offer': 'sum',
 'NAME_PAYMENT_TYPE_Cash through the bank': 'sum',
 'NAME_PAYMENT_TYPE_Cashless from the account of the employer': 'sum',
 'NAME_PA

In [27]:
# Collapse the rows to one per SK_ID_CURR using the aggregation mapping, then
# turn the group key back into a regular column.
ready_to_merge = (
    transformed_prev_app_df
    .groupby('SK_ID_CURR')
    .agg(aggregation_functions)
    .reset_index()
)
# `result` is a second name for the same frame
result = ready_to_merge

In [29]:
# SK_ID_PREV was aggregated with 'count', so name the column accordingly
column_renames = {'SK_ID_PREV': 'COUNT_PREV_APP'}
ready_to_merge.rename(columns=column_renames, inplace=True)
ready_to_merge

Unnamed: 0,SK_ID_CURR,COUNT_PREV_APP,HOUR_APPR_PROCESS_START,DAYS_DECISION,CNT_PAYMENT,AMT_ANNUITY,AMT_APPLICATION,AMT_CREDIT,AMT_GOODS_PRICE,NAME_CONTRACT_TYPE_Cash loans,...,PRODUCT_COMBINATION_POS household without interest,PRODUCT_COMBINATION_POS industry with interest,PRODUCT_COMBINATION_POS industry without interest,PRODUCT_COMBINATION_POS mobile with interest,PRODUCT_COMBINATION_POS mobile without interest,PRODUCT_COMBINATION_POS other with interest,PRODUCT_COMBINATION_POS others without interest,PRODUCT_COMBINATION_nan,NFLAG_LAST_APPL_IN_DAY,FLAG_LAST_APPL_PER_CONTRACT
0,100001,1,13.000000,1740.000,8.00,3951.000000,24835.500,23787.00,24835.500,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0
1,100002,1,9.000000,606.000,24.00,9251.775000,179055.000,179055.00,179055.000,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
2,100003,3,14.666667,1305.000,10.00,56553.990000,435436.500,484191.00,435436.500,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,3.0
3,100004,1,5.000000,815.000,4.00,5357.250000,24282.000,20106.00,24282.000,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0
4,100005,2,10.500000,536.000,12.00,4813.200000,22308.750,20076.75,44617.500,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
338852,456251,1,17.000000,273.000,8.00,6605.910000,40455.000,40455.00,40455.000,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0
338853,456252,1,10.000000,2497.000,6.00,10074.465000,57595.500,56821.50,57595.500,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
338854,456253,2,11.500000,2380.000,5.00,4770.405000,24162.750,20625.75,24162.750,0.0,...,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,2.0,2.0
338855,456254,2,15.000000,299.500,15.00,10681.132500,121317.750,134439.75,121317.750,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,2.0
