# Data cleaning

Data cleaning of credit_card_balance.csv and installments_payments.csv.

In [2]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

## Credit card balance

- Monthly balance snapshots of previous credit cards that the applicant has with Home Credit.

- This table has one row for each month of history of every previous credit in Home Credit (consumer credit and cash loans) related to loans in our sample – i.e. the table has (#loans in sample * # of relative previous credit cards * # of months where we have some history observable for the previous credit card) rows.

CUSTOMER RISK PROFILE

- Number of Loans per Customer
- Rate at which Loan is paid back by customer - No of instalments per customer per loan
- How much did the Customer load a Credit line?
- How many times did the Customer miss the minimum payment?
- What is the average number of days did Customer go past due date?
- What fraction of minimum payments were missed?

CUSTOMER BEHAVIOUR PATTERNS

- Cash withdrawals VS Overall Spending ratio
- Average number of drawings per customer - Total Drawings / Number of Drawings

In [3]:
# Load data

credit_card_balance = pd.read_csv('credit_card_balance.csv', sep=',')

credit_card_balance.head(10)

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,AMT_BALANCE,AMT_CREDIT_LIMIT_ACTUAL,AMT_DRAWINGS_ATM_CURRENT,AMT_DRAWINGS_CURRENT,AMT_DRAWINGS_OTHER_CURRENT,AMT_DRAWINGS_POS_CURRENT,AMT_INST_MIN_REGULARITY,...,AMT_RECIVABLE,AMT_TOTAL_RECEIVABLE,CNT_DRAWINGS_ATM_CURRENT,CNT_DRAWINGS_CURRENT,CNT_DRAWINGS_OTHER_CURRENT,CNT_DRAWINGS_POS_CURRENT,CNT_INSTALMENT_MATURE_CUM,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
0,2562384,378907,-6,56.97,135000,0.0,877.5,0.0,877.5,1700.325,...,0.0,0.0,0.0,1,0.0,1.0,35.0,Active,0,0
1,2582071,363914,-1,63975.555,45000,2250.0,2250.0,0.0,0.0,2250.0,...,64875.555,64875.555,1.0,1,0.0,0.0,69.0,Active,0,0
2,1740877,371185,-7,31815.225,450000,0.0,0.0,0.0,0.0,2250.0,...,31460.085,31460.085,0.0,0,0.0,0.0,30.0,Active,0,0
3,1389973,337855,-4,236572.11,225000,2250.0,2250.0,0.0,0.0,11795.76,...,233048.97,233048.97,1.0,1,0.0,0.0,10.0,Active,0,0
4,1891521,126868,-1,453919.455,450000,0.0,11547.0,0.0,11547.0,22924.89,...,453919.455,453919.455,0.0,1,0.0,1.0,101.0,Active,0,0
5,2646502,380010,-7,82903.815,270000,0.0,0.0,0.0,0.0,4449.105,...,82773.315,82773.315,0.0,0,0.0,0.0,2.0,Active,7,0
6,1079071,171320,-6,353451.645,585000,67500.0,67500.0,0.0,0.0,14684.175,...,351881.145,351881.145,1.0,1,0.0,0.0,6.0,Active,0,0
7,2095912,118650,-7,47962.125,45000,45000.0,45000.0,0.0,0.0,0.0,...,47962.125,47962.125,1.0,1,0.0,0.0,51.0,Active,0,0
8,2181852,367360,-4,291543.075,292500,90000.0,289339.425,0.0,199339.425,130.5,...,286831.575,286831.575,3.0,8,0.0,5.0,3.0,Active,0,0
9,1235299,203885,-5,201261.195,225000,76500.0,111026.7,0.0,34526.7,6338.34,...,197224.695,197224.695,3.0,9,0.0,6.0,38.0,Active,0,0


In [4]:
# Identify duplicate rows

duplicate_rows = credit_card_balance[credit_card_balance.duplicated()]
len(duplicate_rows)

0

### Feature selection

In [5]:
# Feature selection

credit_card_balance.loc[0:10, ["SK_ID_PREV", "SK_ID_CURR", "AMT_PAYMENT_CURRENT", "AMT_PAYMENT_TOTAL_CURRENT"]]

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,AMT_PAYMENT_CURRENT,AMT_PAYMENT_TOTAL_CURRENT
0,2562384,378907,1800.0,1800.0
1,2582071,363914,2250.0,2250.0
2,1740877,371185,2250.0,2250.0
3,1389973,337855,11925.0,11925.0
4,1891521,126868,27000.0,27000.0
5,2646502,380010,3825.0,3825.0
6,1079071,171320,15750.0,15750.0
7,2095912,118650,264.69,0.0
8,2181852,367360,4093.515,4093.515
9,1235299,203885,45000.0,45000.0


In [6]:
credit_card_balance.loc[credit_card_balance["AMT_PAYMENT_CURRENT"] < credit_card_balance["AMT_PAYMENT_TOTAL_CURRENT"],
                        :].loc[:, ["AMT_PAYMENT_CURRENT", "AMT_PAYMENT_TOTAL_CURRENT"]]

Unnamed: 0,AMT_PAYMENT_CURRENT,AMT_PAYMENT_TOTAL_CURRENT


AMT_PAYMENT_CURRENT is never lower than AMT_PAYMENT_TOTAL_CURRENT (it is always higher or equal, or they are both null), which means that AMT_PAYMENT_CURRENT contains the value in AMT_PAYMENT_TOTAL_CURRENT. Therefore, we can drop that column.

In [7]:
credit_card_balance.loc[credit_card_balance["AMT_DRAWINGS_CURRENT"] < credit_card_balance["AMT_DRAWINGS_ATM_CURRENT"], :].loc[:, ["AMT_DRAWINGS_CURRENT",
                                                                                                                                  "AMT_DRAWINGS_ATM_CURRENT"]]

Unnamed: 0,AMT_DRAWINGS_CURRENT,AMT_DRAWINGS_ATM_CURRENT
438776,-1687.5,0.0
747302,-519.57,0.0
3284667,-6211.62,0.0


Same thing for AMT_DRAWINGS_CURRENT and AMT_DRAWINGS_ATM_CURRENT. Even though the amount of drawings in the ATM may be zero, the amount of drawings in total can be negative - when the amount of money drawing during a month by a client is lower than zero, it means that the client has made a payment to the bank that is greater than the outstanding balance on their credit card.

Therefore, we can drop AMT_DRAWINGS_ATM_CURRENT, because AMT_DRAWINGS_CURRENT contains all the information. Same thing applies to AMT_DRAWINGS_POS_CURRENT. 

In [8]:
credit_card_balance.loc[0:10, ["AMT_DRAWINGS_CURRENT", "AMT_DRAWINGS_OTHER_CURRENT"]]

Unnamed: 0,AMT_DRAWINGS_CURRENT,AMT_DRAWINGS_OTHER_CURRENT
0,877.5,0.0
1,2250.0,0.0
2,0.0,0.0
3,2250.0,0.0
4,11547.0,0.0
5,0.0,0.0
6,67500.0,0.0
7,45000.0,0.0
8,289339.425,0.0
9,111026.7,0.0


In [9]:
credit_card_balance.loc[credit_card_balance["AMT_DRAWINGS_CURRENT"] < credit_card_balance["AMT_DRAWINGS_OTHER_CURRENT"], :].loc[:, ["AMT_DRAWINGS_CURRENT", "AMT_DRAWINGS_OTHER_CURRENT"]]

Unnamed: 0,AMT_DRAWINGS_CURRENT,AMT_DRAWINGS_OTHER_CURRENT
438776,-1687.5,0.0
747302,-519.57,0.0
3284667,-6211.62,0.0


We can also drop this column.

Regarding AMT_RECEIVABLE_PRINCIPAL, AMT_RECIVABLE and AMT_TOTAL_RECEIVABLE, since AMT_TOTAL_RECEIVABLE is the total amount receivable on the previous credit, that contains the number we need and we can drop the other two columns.

Considering the columns we decided to drop, regarding the 4 columns with the number of drawings in the month in matter, it only makes sense to keep CNT_DRAWINGS_CURRENT, because that's what we can connect to the information that wasn't dropped. Also, that column contains total number of drawings, while the other 3 contained very specific drawings.

In [10]:
# Drop the columns

credit_card_balance.drop(['AMT_PAYMENT_TOTAL_CURRENT', 'AMT_DRAWINGS_ATM_CURRENT',
                          'AMT_DRAWINGS_POS_CURRENT', 'AMT_DRAWINGS_OTHER_CURRENT',
                          "AMT_RECEIVABLE_PRINCIPAL", "AMT_RECIVABLE",
                          "CNT_DRAWINGS_ATM_CURRENT", "CNT_DRAWINGS_OTHER_CURRENT",
                          "CNT_DRAWINGS_POS_CURRENT", "AMT_INST_MIN_REGULARITY"],
                         axis=1, inplace=True)

In [11]:
credit_card_balance

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,AMT_BALANCE,AMT_CREDIT_LIMIT_ACTUAL,AMT_DRAWINGS_CURRENT,AMT_PAYMENT_CURRENT,AMT_TOTAL_RECEIVABLE,CNT_DRAWINGS_CURRENT,CNT_INSTALMENT_MATURE_CUM,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
0,2562384,378907,-6,56.970,135000,877.5,1800.00,0.000,1,35.0,Active,0,0
1,2582071,363914,-1,63975.555,45000,2250.0,2250.00,64875.555,1,69.0,Active,0,0
2,1740877,371185,-7,31815.225,450000,0.0,2250.00,31460.085,0,30.0,Active,0,0
3,1389973,337855,-4,236572.110,225000,2250.0,11925.00,233048.970,1,10.0,Active,0,0
4,1891521,126868,-1,453919.455,450000,11547.0,27000.00,453919.455,1,101.0,Active,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3840307,1036507,328243,-9,0.000,45000,0.0,,0.000,0,0.0,Active,0,0
3840308,1714892,347207,-9,0.000,45000,0.0,1879.11,0.000,0,23.0,Active,0,0
3840309,1302323,215757,-9,275784.975,585000,270000.0,375750.00,273093.975,2,18.0,Active,0,0
3840310,1624872,430337,-10,0.000,450000,0.0,,0.000,0,0.0,Active,0,0


### Dealing with missing values

In [12]:
# Deal with missing values

n_null_dict = {}

for column in credit_card_balance.columns:
    null_sum = credit_card_balance.isnull()[column].sum()
    n_null_dict[column] = null_sum

n_null = pd.DataFrame(n_null_dict, index = ["Number of missing values"])

In [13]:
n_null

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,AMT_BALANCE,AMT_CREDIT_LIMIT_ACTUAL,AMT_DRAWINGS_CURRENT,AMT_PAYMENT_CURRENT,AMT_TOTAL_RECEIVABLE,CNT_DRAWINGS_CURRENT,CNT_INSTALMENT_MATURE_CUM,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
Number of missing values,0,0,0,0,0,0,767988,0,0,305236,0,0,0


We could replace the missing values using a statistical method such as mean, median, or mode. However, each client is different and has different ATM movements, so imputing with a single value may not be appropriate.

Therefore, the approach we will take is to impute the missing values based on similar clients: group clients by some relevant characteristics.

In [14]:
# Missing values in AMT_PAYMENT_CURRENT

# Check variable more correlated with AMT_PAYMENT_CURRENT

corr_payment_current = credit_card_balance.corr().abs()
pearson_payment_current = corr_payment_current["AMT_PAYMENT_CURRENT"]
pearson_payment_current.sort_values(axis = 0, ascending = False)

AMT_PAYMENT_CURRENT          1.000000
AMT_DRAWINGS_CURRENT         0.337343
AMT_CREDIT_LIMIT_ACTUAL      0.308294
CNT_DRAWINGS_CURRENT         0.223483
AMT_BALANCE                  0.143934
AMT_TOTAL_RECEIVABLE         0.142371
CNT_INSTALMENT_MATURE_CUM    0.079266
MONTHS_BALANCE               0.076355
SK_DPD                       0.030222
SK_DPD_DEF                   0.004340
SK_ID_PREV                   0.003472
SK_ID_CURR                   0.000127
Name: AMT_PAYMENT_CURRENT, dtype: float64

AMT_DRAWINGS_CURRENT seems to be the variable with the highest correlation with AMT_PAYMENT_CURRENT. Let's group clients based on that.

In [15]:
# Create new column with quantile bin for AMT_DRAWINGS_CURRENT

credit_card_balance['drawings_quantile'] = pd.qcut(credit_card_balance['AMT_DRAWINGS_CURRENT'],
                                                   2, labels=False, duplicates='drop')

# Compute median for each quantile 

quantile_medians = credit_card_balance.groupby('drawings_quantile')['AMT_DRAWINGS_CURRENT'].median()

quantile_medians

# Replace missing values in AMT_PAYMENT_CURRENT with the median for the corresponding quantile of AMT_DRAWINGS_CURRENT

for quantile in range(2):
    
    mask = (credit_card_balance['drawings_quantile'] == quantile) & credit_card_balance['AMT_PAYMENT_CURRENT'].isna()
    credit_card_balance.loc[mask, 'AMT_PAYMENT_CURRENT'] = quantile_medians[quantile]

credit_card_balance.drop("drawings_quantile", axis=1, inplace=True)

In [16]:
# Missing values in CNT_INSTALMENT_MATURE_CUM

# Check variable more correlated with CNT_INSTALMENT_MATURE_CUM

corr_instalment_mature = credit_card_balance.corr().abs()
pearson_instalment_mature = corr_instalment_mature["CNT_INSTALMENT_MATURE_CUM"]
pearson_instalment_mature.sort_values(axis = 0, ascending = False)

CNT_INSTALMENT_MATURE_CUM    1.000000
AMT_CREDIT_LIMIT_ACTUAL      0.157269
CNT_DRAWINGS_CURRENT         0.099186
AMT_DRAWINGS_CURRENT         0.093491
SK_DPD                       0.059654
AMT_PAYMENT_CURRENT          0.028843
MONTHS_BALANCE               0.008620
SK_ID_PREV                   0.007219
AMT_TOTAL_RECEIVABLE         0.005959
AMT_BALANCE                  0.005009
SK_DPD_DEF                   0.002156
SK_ID_CURR                   0.000581
Name: CNT_INSTALMENT_MATURE_CUM, dtype: float64

In [17]:
# Create new column with quantile bin for AMT_CREDIT_LIMIT_ACTUAL

credit_card_balance['drawings_quantile'] = pd.qcut(credit_card_balance['AMT_CREDIT_LIMIT_ACTUAL'],
                                                   4, labels=False, duplicates='drop')

# Compute median for each quantile 

quantile_medians = credit_card_balance.groupby('drawings_quantile')['AMT_CREDIT_LIMIT_ACTUAL'].median()

quantile_medians

# Replace missing values in AMT_PAYMENT_CURRENT with the median for the corresponding quantile of AMT_DRAWINGS_CURRENT

for quantile in range(2):
    
    mask = (credit_card_balance['drawings_quantile'] == quantile) & credit_card_balance['CNT_INSTALMENT_MATURE_CUM'].isna()
    credit_card_balance.loc[mask, 'CNT_INSTALMENT_MATURE_CUM'] = quantile_medians[quantile]

credit_card_balance.drop("drawings_quantile", axis=1, inplace=True)

### Final pipeline to transform data and get new dataframe:

In [18]:
# Pipeline to transform numerical and categorical features

numerical_features = [col for col in credit_card_balance.columns if col != 'NAME_CONTRACT_STATUS' and col != 'SK_ID_PREV' and col != 'SK_ID_CURR']

categorical_features = ["NAME_CONTRACT_STATUS"]

# Define column transformer to apply scaling and one-hot encoding

preprocessor = make_column_transformer(
    (MinMaxScaler(), numerical_features),
    (OneHotEncoder(), categorical_features)
)

# Define the pipeline to apply the column transformer

pipeline = make_pipeline(preprocessor)

# Fit the pipeline to the original dataset 

pipeline.fit(credit_card_balance)

# Get the transformed numerical feature names

numerical_feature_names = numerical_features

# Get the transformed categorical feature names

categorical_feature_names = pipeline.named_steps['columntransformer'].transformers_[1][1].get_feature_names(categorical_features)

# Combine the feature names

feature_names = numerical_feature_names + categorical_feature_names.tolist()

# Select the columns to keep from the original dataframe

id_columns = ['SK_ID_PREV', 'SK_ID_CURR']

# Create a new dataframe with the transformed features

new_credit_card_balance = pd.DataFrame(pipeline.transform(credit_card_balance), columns=feature_names)

# Concatenate the transformed features dataframe with the original id columns

credit_card_balance = pd.concat([credit_card_balance[id_columns], new_credit_card_balance], axis=1)



### Final dataset:

In [19]:
credit_card_balance

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,AMT_BALANCE,AMT_CREDIT_LIMIT_ACTUAL,AMT_DRAWINGS_CURRENT,AMT_PAYMENT_CURRENT,AMT_TOTAL_RECEIVABLE,CNT_DRAWINGS_CURRENT,CNT_INSTALMENT_MATURE_CUM,SK_DPD,SK_DPD_DEF,NAME_CONTRACT_STATUS_Active,NAME_CONTRACT_STATUS_Approved,NAME_CONTRACT_STATUS_Completed,NAME_CONTRACT_STATUS_Demand,NAME_CONTRACT_STATUS_Refused,NAME_CONTRACT_STATUS_Sent proposal,NAME_CONTRACT_STATUS_Signed
0,2562384,378907,0.947368,0.218211,0.100000,0.003091,0.000420,0.219614,0.006061,0.000389,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2582071,363914,1.000000,0.251395,0.033333,0.003690,0.000525,0.253516,0.006061,0.000767,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1740877,371185,0.936842,0.234699,0.333333,0.002709,0.000525,0.236054,0.000000,0.000333,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1389973,337855,0.968421,0.341002,0.166667,0.003690,0.002780,0.341400,0.006061,0.000111,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1891521,126868,1.000000,0.453842,0.333333,0.007744,0.006295,0.456822,0.006061,0.001122,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3840307,1036507,328243,0.915789,0.218181,0.033333,0.002709,0.000000,0.219614,0.000000,0.000000,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3840308,1714892,347207,0.915789,0.218181,0.033333,0.002709,0.000438,0.219614,0.000000,0.000256,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3840309,1302323,215757,0.915789,0.361360,0.433333,0.120442,0.087604,0.362327,0.012121,0.000200,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3840310,1624872,430337,0.905263,0.218181,0.333333,0.002709,0.000000,0.219614,0.000000,0.000000,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


## Installment payments



In [20]:
# Load data

installment_payments = pd.read_csv('installments_payments.csv', sep=',')

installment_payments.head(10)

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
0,1054186,161674,1.0,6,-1180.0,-1187.0,6948.36,6948.36
1,1330831,151639,0.0,34,-2156.0,-2156.0,1716.525,1716.525
2,2085231,193053,2.0,1,-63.0,-63.0,25425.0,25425.0
3,2452527,199697,1.0,3,-2418.0,-2426.0,24350.13,24350.13
4,2714724,167756,1.0,2,-1383.0,-1366.0,2165.04,2160.585
5,1137312,164489,1.0,12,-1384.0,-1417.0,5970.375,5970.375
6,2234264,184693,4.0,11,-349.0,-352.0,29432.295,29432.295
7,1818599,111420,2.0,4,-968.0,-994.0,17862.165,17862.165
8,2723183,112102,0.0,14,-197.0,-197.0,70.74,70.74
9,1413990,109741,1.0,4,-570.0,-609.0,14308.47,14308.47


In [21]:
# Identify duplicate rows

duplicate_rows = installment_payments[installment_payments.duplicated()]
len(duplicate_rows)

0

### Dealing with missing values

In [22]:
n_null_dict = {}

for column in installment_payments.columns:
    null_sum = installment_payments.isnull()[column].sum()
    n_null_dict[column] = null_sum

n_null = pd.DataFrame(n_null_dict, index = ["Number of missing values"])

In [23]:
n_null

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
Number of missing values,0,0,0,0,0,2905,0,2905


We only have missing values in the collumns:

- DAYS_ENTRY_PAYMENT, which says when was the installments of previous credit paid actually (relative to application date of current loan)
- AMT_PAYMENT, what the client actually paid on previous credit on this installment

We will interpret Null values as situations where the client hasn't paid the installment at all. Let's see some examples:

In [24]:
installment_payments[installment_payments['DAYS_ENTRY_PAYMENT'].isnull()]

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
3764207,1531600,103793,1.0,7,-668.0,,49741.020,
3764208,1947105,159974,1.0,24,-36.0,,22849.515,
3764209,1843773,167270,1.0,22,-20.0,,48092.355,
3764210,1691592,192536,1.0,5,-2561.0,,7675.425,
3764211,1531299,157088,0.0,11,-1847.0,,67.500,
...,...,...,...,...,...,...,...,...
13605396,2186857,428057,0.0,66,-1624.0,,67.500,
13605397,1310347,414406,0.0,47,-1539.0,,67.500,
13605398,1308766,402199,0.0,43,-7.0,,43737.435,
13605399,1062206,409297,0.0,43,-1986.0,,67.500,


Let's replace the null values in DAYS_ENTRY_PAYMENT with "0" so that when we calculate a new column with the delay in payment, we get the delay we have relative to application date, even though the client can take even more time to make the payment.

However, we will create a new column saying whether the client has already payed previous installment or not, to identify these cases.

For the column AMT_PAYMENT, we will replace null values with 0 (because they haven't payed yet.

**How will we analyze data**: For the cases where client has payed, check values of delay time and difference between amount of installment and amount payed. Cases where client hasn't payed yet, take only into consideration that he hasn't payed yet.

In [25]:
installment_payments['DAYS_ENTRY_PAYMENT'] = installment_payments['DAYS_ENTRY_PAYMENT'].fillna(0)
installment_payments['AMT_PAYMENT'] = installment_payments['AMT_PAYMENT'].fillna(0)

### Feature selection and new columns

DAYS_INSTALMENT columns says when the installment of previous credit was supposed to be paid, and DAYS_ENTRY_PAYMENT says when was the installments of previous credit paid actually. We'll drop those two columns and replace them with one with the difference between both. That new column will show the delay in the installment payment.

In [26]:
# Create new column DAYS_PAYMENT_DELAY

installment_payments['DAYS_INSTALMENT'] = installment_payments['DAYS_INSTALMENT'].abs()
installment_payments['DAYS_ENTRY_PAYMENT'] = installment_payments['DAYS_ENTRY_PAYMENT'].abs()

installment_payments['DAYS_PAYMENT_DELAY'] = - (installment_payments['DAYS_ENTRY_PAYMENT'] - installment_payments['DAYS_INSTALMENT'])

In [27]:
# Create new column DAYS_PAYMENT_DELAY

installment_payments['DAYS_INSTALMENT'] = installment_payments['DAYS_INSTALMENT'].abs()
installment_payments['DAYS_ENTRY_PAYMENT'] = installment_payments['DAYS_ENTRY_PAYMENT'].abs()

installment_payments['AMT_MISSING_PAYMENT'] = installment_payments['AMT_INSTALMENT'] - installment_payments['AMT_PAYMENT']

In [28]:
# Create new column INSTALLMENT_PAYED

def payed_or_not(x):
    if x == 0:
        return '0'
    else:
        return '1'

installment_payments['INSTALLMENT_PAYED'] = installment_payments['DAYS_ENTRY_PAYMENT'].apply(payed_or_not)

Since we have new columns containing all the information that matters, we can drop some of the columns we had before.

In [29]:
installment_payments.drop(['NUM_INSTALMENT_VERSION', 'DAYS_INSTALMENT',
                          'DAYS_ENTRY_PAYMENT', 'AMT_INSTALMENT',
                          "AMT_PAYMENT"], axis=1, inplace=True)

In [30]:
installment_payments

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_NUMBER,DAYS_PAYMENT_DELAY,AMT_MISSING_PAYMENT,INSTALLMENT_PAYED
0,1054186,161674,6,-7.0,0.000,1
1,1330831,151639,34,-0.0,0.000,1
2,2085231,193053,1,-0.0,0.000,1
3,2452527,199697,3,-8.0,0.000,1
4,2714724,167756,2,17.0,4.455,1
...,...,...,...,...,...,...
13605396,2186857,428057,66,1624.0,67.500,0
13605397,1310347,414406,47,1539.0,67.500,0
13605398,1308766,402199,43,7.0,43737.435,0
13605399,1062206,409297,43,1986.0,67.500,0


### Final pipeline to transform data and get new dataframe:

In [31]:
# Pipeline to transform numerical features

numerical_features = [col for col in installment_payments.columns if col != 'SK_ID_PREV' and col != 'SK_ID_CURR']

# Define column transformer to apply scaling

preprocessor = make_column_transformer(
    (MinMaxScaler(), numerical_features))

# Define the pipeline to apply the column transformer

pipeline = make_pipeline(preprocessor)

# Fit the pipeline to the original dataset 

pipeline.fit(installment_payments)

# Feature names

feature_names = numerical_features

# Select the columns to keep from the original dataframe

id_columns = ['SK_ID_PREV', 'SK_ID_CURR']

# Create a new dataframe with the transformed features

new_installment_payments = pd.DataFrame(pipeline.transform(installment_payments), columns=feature_names)

# Concatenate the transformed features dataframe with the original id columns

installment_payments = pd.concat([installment_payments[id_columns], new_installment_payments], axis=1)

In [32]:
installment_payments

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_NUMBER,DAYS_PAYMENT_DELAY,AMT_MISSING_PAYMENT,INSTALLMENT_PAYED
0,1054186,161674,0.018116,0.521896,0.520391,1.0
1,1330831,151639,0.119565,0.523044,0.520391,1.0
2,2085231,193053,0.000000,0.523044,0.520391,1.0
3,2452527,199697,0.007246,0.521732,0.520391,1.0
4,2714724,167756,0.003623,0.525832,0.520392,1.0
...,...,...,...,...,...,...
13605396,2186857,428057,0.235507,0.789405,0.520405,0.0
13605397,1310347,414406,0.166667,0.775463,0.520405,0.0
13605398,1308766,402199,0.152174,0.524192,0.529043,0.0
13605399,1062206,409297,0.152174,0.848778,0.520405,0.0


# New CSV files

In [33]:
credit_card_balance.to_csv("processed_credit_card_balance.csv", index=False)

installment_payments.to_csv("processed_installments_payments.csv", index=False)