In [90]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import FunctionTransformer

## Installment payments



In [91]:
# Load data
installment_payments = pd.read_csv('installments_payments.csv', sep=',')

installment_payments.head(10)

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
0,1054186,161674,1.0,6,-1180.0,-1187.0,6948.36,6948.36
1,1330831,151639,0.0,34,-2156.0,-2156.0,1716.525,1716.525
2,2085231,193053,2.0,1,-63.0,-63.0,25425.0,25425.0
3,2452527,199697,1.0,3,-2418.0,-2426.0,24350.13,24350.13
4,2714724,167756,1.0,2,-1383.0,-1366.0,2165.04,2160.585
5,1137312,164489,1.0,12,-1384.0,-1417.0,5970.375,5970.375
6,2234264,184693,4.0,11,-349.0,-352.0,29432.295,29432.295
7,1818599,111420,2.0,4,-968.0,-994.0,17862.165,17862.165
8,2723183,112102,0.0,14,-197.0,-197.0,70.74,70.74
9,1413990,109741,1.0,4,-570.0,-609.0,14308.47,14308.47


In [92]:
# Identify duplicate rows

duplicate_rows = installment_payments[installment_payments.duplicated()]
len(duplicate_rows)

0

### Dealing with missing values

In [93]:
miss_lower40 = []
miss_higher40 = []

for i in installment_payments:
    pct_missing = (installment_payments[i].isna().sum()/len(installment_payments[i]))*100
    if pct_missing > 0 and pct_missing < 40:
        miss_lower40.append(i)
    if pct_missing >= 40:
        miss_higher40.append(i)

In [94]:
print("Missing percentage lower than 40:", miss_lower40)
print("Missing percentage higher than 40:", miss_higher40)

Missing percentage lower than 40: ['DAYS_ENTRY_PAYMENT', 'AMT_PAYMENT']
Missing percentage higher than 40: []


We only have missing values in the collumns:

- DAYS_ENTRY_PAYMENT, which says when was the installments of previous credit paid actually (relative to application date of current loan)
- AMT_PAYMENT, what the client actually paid on previous credit on this installment

Let's replace the null values in DAYS_ENTRY_PAYMENT with "0" so that when we calculate a new column with the delay in payment, we get the delay we have relative to application date, even though the client can take even more time to make the payment.

However, we will create a new column saying whether the client has already payed previous installment or not, to identify these cases.

For the column AMT_PAYMENT, we will replace null values with 0 (because they haven't payed yet.

**How will we analyze data**: For the cases where client has payed, check values of delay time and difference between amount of installment and amount payed. Cases where client hasn't payed yet, take only into consideration that he hasn't payed yet.

### Feature selection and new columns

DAYS_INSTALMENT columns says when the installment of previous credit was supposed to be paid, and DAYS_ENTRY_PAYMENT says when was the installments of previous credit paid actually. We'll drop those two columns and replace them with one with the difference between both. That new column will show the delay in the installment payment.

### Pipeline

In [95]:
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values

In [96]:
class NegativeToPositive(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, installment_payments, y=None):
        return self
    def transform(self, installment_payments):
        installment_payments[self.attribute_names] = abs(installment_payments[self.attribute_names])
        return installment_payments[self.attribute_names]

In [97]:
def add_features(X):
    DAYS_PAYMENT_DELAY = X[:, DAYS_ENTRY_PAYMENT] - X[:, DAYS_INSTALMENT]
    AMT_MISSING_PAYMENT = X[:, AMT_INSTALMENT] - X[:, AMT_PAYMENT]
    INSTALLMENT_PAYED = [0 if i == 0 else 1 for i in X[:, DAYS_ENTRY_PAYMENT]]
    return np.c_[DAYS_PAYMENT_DELAY, AMT_MISSING_PAYMENT, INSTALLMENT_PAYED]

In [98]:
features_1 = ['DAYS_INSTALMENT', 'DAYS_ENTRY_PAYMENT', 'AMT_INSTALMENT', 'AMT_PAYMENT']

DAYS_ENTRY_PAYMENT, DAYS_INSTALMENT, AMT_INSTALMENT, AMT_PAYMENT = [
    list(features_1).index(col)
    for col in ('DAYS_ENTRY_PAYMENT', 'DAYS_INSTALMENT', 'AMT_INSTALMENT', 'AMT_PAYMENT')]

features1_pipeline = Pipeline([('neg_to_pos', NegativeToPositive(features_1)),
                                ('imputer', SimpleImputer(strategy="constant", fill_value=0)),
                                ('add_features', FunctionTransformer(add_features, validate=False))
                                 ])

preprocessor = ColumnTransformer([('features1_pipeline', features1_pipeline, features_1)])

preprocessor.fit(installment_payments)

process_installment_payments = preprocessor.transform(installment_payments)

In [99]:
new_installment_payments = pd.concat([installment_payments[["SK_ID_CURR","SK_ID_PREV"]], 
                                      pd.DataFrame(process_installment_payments, 
                                                   columns=["DAYS_PAYMENT_DELAY", "AMT_MISSING_PAYMENT", "INSTALLMENT_PAYED"])],
                                    axis=1
                                    )
new_installment_payments

Unnamed: 0,SK_ID_CURR,SK_ID_PREV,DAYS_PAYMENT_DELAY,AMT_MISSING_PAYMENT,INSTALLMENT_PAYED
0,161674,1054186,7.0,0.000,1.0
1,151639,1330831,0.0,0.000,1.0
2,193053,2085231,0.0,0.000,1.0
3,199697,2452527,8.0,0.000,1.0
4,167756,2714724,-17.0,4.455,1.0
...,...,...,...,...,...
13605396,428057,2186857,-1624.0,67.500,0.0
13605397,414406,1310347,-1539.0,67.500,0.0
13605398,402199,1308766,-7.0,43737.435,0.0
13605399,409297,1062206,-1986.0,67.500,0.0


In [101]:
grouped_installment_payments = new_installment_payments.groupby("SK_ID_CURR").agg(
    {'SK_ID_PREV': 'count',
     'DAYS_PAYMENT_DELAY': 'mean',
     'AMT_MISSING_PAYMENT': 'sum',
     'INSTALLMENT_PAYED': 'sum'}
)

grouped_installment_payments.rename({'SK_ID_PREV':'COUNT_ID_PREV'}, inplace=True)

grouped_installment_payments

Unnamed: 0_level_0,SK_ID_PREV,DAYS_PAYMENT_DELAY,AMT_MISSING_PAYMENT,INSTALLMENT_PAYED
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
100001,7,7.285714,0.000,7.0
100002,19,20.421053,0.000,19.0
100003,25,7.160000,0.000,25.0
100004,3,7.666667,0.000,3.0
100005,9,23.555556,0.000,9.0
...,...,...,...,...
456251,7,36.285714,0.000,7.0
456252,6,2.833333,0.000,6.0
456253,14,14.500000,3973.095,14.0
456254,19,19.000000,0.000,19.0
