In [2]:
# Data Explore

### Import Packages

In [20]:
from pathlib import Path
import re
import time
from datetime import datetime
from dateutil import relativedelta

import numpy as np
import pandas as pd

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

import config
from util.db_helper import SQLite

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Connect Database

In [21]:
db_path = Path(config.ROOT_DIR, 'data', 'home_credit_default_risk.db')
SQLite.get_conn(db_path)

<sqlite3.Connection at 0x7f81973ae340>

### Application Data

In [37]:
sql = """
    SELECT * 
    FROM application_train 
    WHERE RANDOM() < 0.01
"""
df = SQLite.query(sql)

In [39]:
df.dtypes

SK_ID_CURR                        int64
TARGET                            int64
NAME_CONTRACT_TYPE               object
CODE_GENDER                      object
FLAG_OWN_CAR                     object
FLAG_OWN_REALTY                  object
CNT_CHILDREN                      int64
AMT_INCOME_TOTAL                float64
AMT_CREDIT                      float64
AMT_ANNUITY                     float64
AMT_GOODS_PRICE                 float64
NAME_TYPE_SUITE                  object
NAME_INCOME_TYPE                 object
NAME_EDUCATION_TYPE              object
NAME_FAMILY_STATUS               object
NAME_HOUSING_TYPE                object
REGION_POPULATION_RELATIVE      float64
DAYS_BIRTH                        int64
DAYS_EMPLOYED                     int64
DAYS_REGISTRATION               float64
DAYS_ID_PUBLISH                   int64
OWN_CAR_AGE                     float64
FLAG_MOBIL                        int64
FLAG_EMP_PHONE                    int64
FLAG_WORK_PHONE                   int64


In [40]:
df.head(3)

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,ORGANIZATION_TYPE,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,BASEMENTAREA_AVG,YEARS_BEGINEXPLUATATION_AVG,YEARS_BUILD_AVG,COMMONAREA_AVG,ELEVATORS_AVG,ENTRANCES_AVG,FLOORSMAX_AVG,FLOORSMIN_AVG,LANDAREA_AVG,LIVINGAPARTMENTS_AVG,LIVINGAREA_AVG,NONLIVINGAPARTMENTS_AVG,NONLIVINGAREA_AVG,APARTMENTS_MODE,BASEMENTAREA_MODE,YEARS_BEGINEXPLUATATION_MODE,YEARS_BUILD_MODE,COMMONAREA_MODE,ELEVATORS_MODE,ENTRANCES_MODE,FLOORSMAX_MODE,FLOORSMIN_MODE,LANDAREA_MODE,LIVINGAPARTMENTS_MODE,LIVINGAREA_MODE,NONLIVINGAPARTMENTS_MODE,NONLIVINGAREA_MODE,APARTMENTS_MEDI,BASEMENTAREA_MEDI,YEARS_BEGINEXPLUATATION_MEDI,YEARS_BUILD_MEDI,COMMONAREA_MEDI,ELEVATORS_MEDI,ENTRANCES_MEDI,FLOORSMAX_MEDI,FLOORSMIN_MEDI,LANDAREA_MEDI,LIVINGAPARTMENTS_MEDI,LIVINGAREA_MEDI,NONLIVINGAPARTMENTS_MEDI,NONLIVINGAREA_MEDI,FONDKAPREMONT_MODE,HOUSETYPE_MODE,TOTALAREA_MODE,WALLSMATERIAL_MODE,EMERGENCYSTATE_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_2,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,351000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.018801,-9461,-637,-3648.0,-2120,,1,1,0,1,1,0,Laborers,1.0,2,2,WEDNESDAY,10,0,0,0,0,0,0,Business Entity Type 3,0.083037,0.262949,0.139376,0.0247,0.0369,0.9722,0.6192,0.0143,0.0,0.069,0.0833,0.125,0.0369,0.0202,0.019,0.0,0.0,0.0252,0.0383,0.9722,0.6341,0.0144,0.0,0.069,0.0833,0.125,0.0377,0.022,0.0198,0.0,0.0,0.025,0.0369,0.9722,0.6243,0.0144,0.0,0.069,0.0833,0.125,0.0375,0.0205,0.0193,0.0,0.0,reg oper account,block of flats,0.0149,"Stone, brick",No,2.0,2.0,2.0,2.0,-1134.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100009,0,Cash loans,F,Y,Y,1,171000.0,1560726.0,41301.0,1395000.0,Unaccompanied,Commercial associate,Higher education,Married,House / apartment,0.035792,-13778,-3130,-1213.0,-619,17.0,1,1,0,1,1,0,Accountants,3.0,2,2,SUNDAY,16,0,0,0,0,0,0,Business Entity Type 3,0.774761,0.724,0.49206,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,0.0,1.0,0.0,-1562.0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0.0,0.0,0.0,1.0,1.0,2.0
2,100015,0,Cash loans,F,N,Y,0,38419.155,148365.0,10678.5,135000.0,Children,Pensioner,Secondary / secondary special,Married,House / apartment,0.015221,-20417,365243,-5246.0,-2512,,1,0,0,1,1,0,,2.0,2,2,FRIDAY,7,0,0,0,0,0,0,XNA,0.722044,0.555183,0.652897,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-2396.0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,2.0


In [38]:
df.shape

(153720, 122)

In [41]:
df['SK_ID_CURR'].nunique()

153720

### Bureau data

In [60]:
sql = """
    SELECT * 
    FROM bureau 
    WHERE RANDOM() < 0.01
"""
df = SQLite.query(sql)

In [61]:
df.dtypes

SK_ID_CURR                  int64
SK_ID_BUREAU                int64
CREDIT_ACTIVE              object
CREDIT_CURRENCY            object
DAYS_CREDIT                 int64
CREDIT_DAY_OVERDUE          int64
DAYS_CREDIT_ENDDATE       float64
DAYS_ENDDATE_FACT         float64
AMT_CREDIT_MAX_OVERDUE    float64
CNT_CREDIT_PROLONG          int64
AMT_CREDIT_SUM            float64
AMT_CREDIT_SUM_DEBT       float64
AMT_CREDIT_SUM_LIMIT      float64
AMT_CREDIT_SUM_OVERDUE    float64
CREDIT_TYPE                object
DAYS_CREDIT_UPDATE          int64
AMT_ANNUITY                object
dtype: object

In [62]:
df.head(3)

Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,CREDIT_ACTIVE,CREDIT_CURRENCY,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,CREDIT_TYPE,DAYS_CREDIT_UPDATE,AMT_ANNUITY
0,215354,5714462,Closed,currency 1,-497,0,-153.0,-153.0,,0,91323.0,0.0,,0.0,Consumer credit,-131,
1,215354,5714464,Active,currency 1,-203,0,528.0,,,0,464323.5,,,0.0,Consumer credit,-16,
2,215354,5714465,Active,currency 1,-203,0,,,,0,90000.0,,,0.0,Credit card,-16,


In [63]:
df.shape

(858034, 17)

In [64]:
df[['SK_ID_CURR', 'SK_ID_BUREAU']].nunique()

SK_ID_CURR      266066
SK_ID_BUREAU    858034
dtype: int64

In [65]:
df.loc[df['SK_ID_CURR']==215354].sort_values(by='DAYS_CREDIT')

Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,CREDIT_ACTIVE,CREDIT_CURRENCY,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,CREDIT_TYPE,DAYS_CREDIT_UPDATE,AMT_ANNUITY
112618,215354,5714460,Closed,currency 1,-1333,0,-1089.0,-987.0,,0,102150.0,,,0.0,Consumer credit,-984,
112619,215354,5714461,Closed,currency 1,-1011,0,-830.0,-770.0,,0,500463.0,,,0.0,Consumer credit,-767,
0,215354,5714462,Closed,currency 1,-497,0,-153.0,-153.0,,0,91323.0,0.0,,0.0,Consumer credit,-131,
1,215354,5714464,Active,currency 1,-203,0,528.0,,,0,464323.5,,,0.0,Consumer credit,-16,
2,215354,5714465,Active,currency 1,-203,0,,,,0,90000.0,,,0.0,Credit card,-16,
3,215354,5714468,Active,currency 1,-43,0,79.0,,0.0,0,42103.8,42103.8,0.0,0.0,Consumer credit,-22,


### Bureau Balance Data

In [49]:
sql = """
    SELECT * 
    FROM bureau_balance 
    WHERE RANDOM() < 0.01
"""
df = SQLite.query(sql)

In [50]:
df.dtypes

SK_ID_BUREAU       int64
MONTHS_BALANCE     int64
STATUS            object
dtype: object

In [51]:
df.head(3)

Unnamed: 0,SK_ID_BUREAU,MONTHS_BALANCE,STATUS
0,5715448,-1,C
1,5715448,-3,C
2,5715448,-5,C


In [52]:
df.shape

(13652778, 3)

In [57]:
df[['SK_ID_BUREAU', 'MONTHS_BALANCE']].value_counts()

SK_ID_BUREAU  MONTHS_BALANCE
5001709       -96               1
6297937       -21               1
6297945       -15               1
              -13               1
              -10               1
                               ..
5868166       -3                1
              -1                1
               0                1
5868167       -4                1
6842888       -4                1
Length: 13652778, dtype: int64

In [59]:
df.loc[df['SK_ID_BUREAU']==6297945].sort_values(by='MONTHS_BALANCE')

Unnamed: 0,SK_ID_BUREAU,MONTHS_BALANCE,STATUS
3075799,6297945,-47,X
3075798,6297945,-42,0
3075797,6297945,-41,X
3075796,6297945,-40,0
3075795,6297945,-38,0
3075794,6297945,-37,X
3075793,6297945,-36,0
3075792,6297945,-34,0
3075791,6297945,-31,0
3075790,6297945,-28,0


### Credit Card Balance

In [67]:
sql = """
    SELECT * 
    FROM credit_card_balance 
    WHERE RANDOM() < 0.01
"""
df = SQLite.query(sql)

In [68]:
df.dtypes

SK_ID_PREV                      int64
SK_ID_CURR                      int64
MONTHS_BALANCE                  int64
AMT_BALANCE                   float64
AMT_CREDIT_LIMIT_ACTUAL         int64
AMT_DRAWINGS_ATM_CURRENT      float64
AMT_DRAWINGS_CURRENT          float64
AMT_DRAWINGS_OTHER_CURRENT    float64
AMT_DRAWINGS_POS_CURRENT      float64
AMT_INST_MIN_REGULARITY       float64
AMT_PAYMENT_CURRENT           float64
AMT_PAYMENT_TOTAL_CURRENT     float64
AMT_RECEIVABLE_PRINCIPAL      float64
AMT_RECIVABLE                 float64
AMT_TOTAL_RECEIVABLE          float64
CNT_DRAWINGS_ATM_CURRENT      float64
CNT_DRAWINGS_CURRENT            int64
CNT_DRAWINGS_OTHER_CURRENT    float64
CNT_DRAWINGS_POS_CURRENT      float64
CNT_INSTALMENT_MATURE_CUM     float64
NAME_CONTRACT_STATUS           object
SK_DPD                          int64
SK_DPD_DEF                      int64
dtype: object

In [69]:
df.head(3)

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,AMT_BALANCE,AMT_CREDIT_LIMIT_ACTUAL,AMT_DRAWINGS_ATM_CURRENT,AMT_DRAWINGS_CURRENT,AMT_DRAWINGS_OTHER_CURRENT,AMT_DRAWINGS_POS_CURRENT,AMT_INST_MIN_REGULARITY,AMT_PAYMENT_CURRENT,AMT_PAYMENT_TOTAL_CURRENT,AMT_RECEIVABLE_PRINCIPAL,AMT_RECIVABLE,AMT_TOTAL_RECEIVABLE,CNT_DRAWINGS_ATM_CURRENT,CNT_DRAWINGS_CURRENT,CNT_DRAWINGS_OTHER_CURRENT,CNT_DRAWINGS_POS_CURRENT,CNT_INSTALMENT_MATURE_CUM,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
0,2582071,363914,-1,63975.555,45000,2250.0,2250.0,0.0,0.0,2250.0,2250.0,2250.0,60175.08,64875.555,64875.555,1.0,1,0.0,0.0,69.0,Active,0,0
1,1891521,126868,-1,453919.455,450000,0.0,11547.0,0.0,11547.0,22924.89,27000.0,27000.0,443044.395,453919.455,453919.455,0.0,1,0.0,1.0,101.0,Active,0,0
2,1079071,171320,-6,353451.645,585000,67500.0,67500.0,0.0,0.0,14684.175,15750.0,15750.0,345433.86,351881.145,351881.145,1.0,1,0.0,0.0,6.0,Active,0,0


In [70]:
df.shape

(1920029, 23)

In [72]:
df[['SK_ID_PREV', 'SK_ID_CURR', 'MONTHS_BALANCE']].value_counts()

SK_ID_PREV  SK_ID_CURR  MONTHS_BALANCE
1000018     394447      -5                1
2216356     300414      -23               1
                        -5                1
                        -7                1
                        -8                1
                                         ..
1581831     364822      -4                1
                        -5                1
                        -6                1
                        -7                1
2843496     425374      -2                1
Length: 1920029, dtype: int64

In [75]:
df.loc[df['SK_ID_PREV']==2843496].sort_values(by='MONTHS_BALANCE')

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,AMT_BALANCE,AMT_CREDIT_LIMIT_ACTUAL,AMT_DRAWINGS_ATM_CURRENT,AMT_DRAWINGS_CURRENT,AMT_DRAWINGS_OTHER_CURRENT,AMT_DRAWINGS_POS_CURRENT,AMT_INST_MIN_REGULARITY,AMT_PAYMENT_CURRENT,AMT_PAYMENT_TOTAL_CURRENT,AMT_RECEIVABLE_PRINCIPAL,AMT_RECIVABLE,AMT_TOTAL_RECEIVABLE,CNT_DRAWINGS_ATM_CURRENT,CNT_DRAWINGS_CURRENT,CNT_DRAWINGS_OTHER_CURRENT,CNT_DRAWINGS_POS_CURRENT,CNT_INSTALMENT_MATURE_CUM,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
1288085,2843496,425374,-10,101473.38,630000,0.0,19899.72,0.0,19899.72,8590.86,87750.0,80024.535,99749.205,101473.38,101473.38,0.0,7,0.0,7.0,4.0,Active,0,0
1018661,2843496,425374,-9,76000.545,630000,0.0,0.0,0.0,0.0,7725.465,47250.0,41709.33,74813.535,76000.545,76000.545,0.0,0,0.0,0.0,5.0,Active,0,0
710390,2843496,425374,-6,0.0,630000,0.0,1138.5,0.0,1138.5,1086.75,1350.0,206.325,0.0,0.0,0.0,0.0,1,0.0,1.0,8.0,Active,0,0
143776,2843496,425374,-2,0.0,630000,0.0,0.0,0.0,0.0,328.23,121.77,121.77,0.0,0.0,0.0,0.0,0,0.0,0.0,11.0,Active,0,0


### Installment Payment

In [78]:
sql = """
    SELECT * 
    FROM installments_payments 
    WHERE RANDOM() < 0.01
"""
df = SQLite.query(sql)

In [79]:
df.dtypes

SK_ID_PREV                  int64
SK_ID_CURR                  int64
NUM_INSTALMENT_VERSION    float64
NUM_INSTALMENT_NUMBER       int64
DAYS_INSTALMENT           float64
DAYS_ENTRY_PAYMENT        float64
AMT_INSTALMENT            float64
AMT_PAYMENT               float64
dtype: object

In [80]:
df.head(3)

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
0,2085231,193053,2.0,1,-63.0,-63.0,25425.0,25425.0
1,2452527,199697,1.0,3,-2418.0,-2426.0,24350.13,24350.13
2,2714724,167756,1.0,2,-1383.0,-1366.0,2165.04,2160.585


In [81]:
df.shape

(6802416, 8)

In [82]:
df[['SK_ID_PREV', 'SK_ID_CURR', 'NUM_INSTALMENT_NUMBER']].value_counts()

SK_ID_PREV  SK_ID_CURR  NUM_INSTALMENT_NUMBER
1625131     433276      6                        6
2511252     430829      10                       6
2426357     121033      15                       6
2543281     419106      14                       5
1861225     206795      5                        5
                                                ..
1590334     322932      6                        1
                        5                        1
                        4                        1
                        2                        1
2843499     314148      10                       1
Length: 6614635, dtype: int64

* In one month, the user can repay the loan many times at the same time
* It is tricky to determin the primary key here

In [84]:
df[['SK_ID_PREV', 'SK_ID_CURR', 'NUM_INSTALMENT_VERSION', 'NUM_INSTALMENT_NUMBER', 'DAYS_INSTALMENT', 'DAYS_ENTRY_PAYMENT']].value_counts()

SK_ID_PREV  SK_ID_CURR  NUM_INSTALMENT_VERSION  NUM_INSTALMENT_NUMBER  DAYS_INSTALMENT  DAYS_ENTRY_PAYMENT
2092572     349779      1.0                     5                      -2016.0          -2020.0               2
1779603     184589      1.0                     8                      -2376.0          -2384.0               2
1395434     320306      1.0                     8                      -2318.0          -2324.0               2
2416015     318620      1.0                     6                      -2230.0          -2222.0               2
1574435     348174      1.0                     6                      -965.0           -966.0                2
                                                                                                             ..
1576641     247798      1.0                     6                      -1634.0          -1650.0               1
                                                5                      -1664.0          -1671.0              

In [85]:
df.loc[df['SK_ID_PREV']==2092572].sort_values(by='NUM_INSTALMENT_NUMBER')

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
5261812,2092572,349779,1.0,2,-2106.0,-2110.0,31584.825,31584.825
5497675,2092572,349779,1.0,3,-2076.0,-2082.0,31584.825,31584.825
4914059,2092572,349779,1.0,4,-2046.0,-2051.0,31584.825,31584.825
4320155,2092572,349779,1.0,5,-2016.0,-2020.0,31477.05,1.755
5475821,2092572,349779,1.0,5,-2016.0,-2020.0,31477.05,31475.295


### POS Cash Balance

In [86]:
sql = """
    SELECT * 
    FROM POS_CASH_balance 
    WHERE RANDOM() < 0.01
"""
df = SQLite.query(sql)

In [87]:
df.dtypes

SK_ID_PREV                 int64
SK_ID_CURR                 int64
MONTHS_BALANCE             int64
CNT_INSTALMENT           float64
CNT_INSTALMENT_FUTURE    float64
NAME_CONTRACT_STATUS      object
SK_DPD                     int64
SK_DPD_DEF                 int64
dtype: object

In [88]:
df.head(3)

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,CNT_INSTALMENT,CNT_INSTALMENT_FUTURE,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
0,1803195,182943,-31,48.0,45.0,Active,0,0
1,1715348,367990,-33,36.0,35.0,Active,0,0
2,1784872,397406,-32,12.0,9.0,Active,0,0


In [89]:
df.shape

(4998705, 8)

In [90]:
df[['SK_ID_PREV', 'SK_ID_CURR', 'MONTHS_BALANCE']].value_counts()

SK_ID_PREV  SK_ID_CURR  MONTHS_BALANCE
1000001     158271      -9                1
2211199     160570      -18               1
2211201     315085      -10               1
2211199     160570      -10               1
                        -12               1
                                         ..
1576018     452809      -8                1
                        -11               1
                        -14               1
                        -16               1
2843499     314148      -30               1
Length: 4998705, dtype: int64

In [92]:
df.loc[df['SK_ID_PREV']==2211199].sort_values(by='MONTHS_BALANCE')

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,CNT_INSTALMENT,CNT_INSTALMENT_FUTURE,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
2154614,2211199,160570,-25,36.0,35.0,Active,0,0
378705,2211199,160570,-24,36.0,34.0,Active,0,0
334616,2211199,160570,-20,36.0,30.0,Active,0,0
732187,2211199,160570,-18,36.0,28.0,Active,0,0
172627,2211199,160570,-17,36.0,27.0,Active,0,0
395632,2211199,160570,-16,36.0,26.0,Active,0,0
81695,2211199,160570,-15,36.0,25.0,Active,0,0
776121,2211199,160570,-13,36.0,23.0,Active,0,0
347335,2211199,160570,-12,36.0,22.0,Active,0,0
63245,2211199,160570,-10,36.0,20.0,Active,0,0


In [None]:
### POS Cash Balance

In [34]:
df_stat = df.groupby(['SK_ID_PREV', 'SK_ID_CURR', 'NUM_INSTALMENT_NUMBER', 'NUM_INSTALMENT_VERSION', 'DAYS_ENTRY_PAYMENT']).count()


In [35]:
df_stat

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,DAYS_INSTALMENT,AMT_INSTALMENT,AMT_PAYMENT
SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_NUMBER,NUM_INSTALMENT_VERSION,DAYS_ENTRY_PAYMENT,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1000001,158271,1,1.0,-294.0,1,1,1
1000001,158271,2,2.0,-244.0,1,1,1
1000002,101962,1,1.0,-1611.0,1,1,1
1000002,101962,2,1.0,-1575.0,1,1,1
1000002,101962,3,1.0,-1559.0,1,1,1
...,...,...,...,...,...,...,...
2843499,314148,6,1.0,-1074.0,1,1,1
2843499,314148,7,1.0,-1047.0,1,1,1
2843499,314148,8,1.0,-1018.0,1,1,1
2843499,314148,9,1.0,-980.0,1,1,1


In [36]:
df_stat[df_stat['DAYS_INSTALMENT']>1]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,DAYS_INSTALMENT,AMT_INSTALMENT,AMT_PAYMENT
SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_NUMBER,NUM_INSTALMENT_VERSION,DAYS_ENTRY_PAYMENT,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1000465,337486,12,1.0,-2416.0,2,2,2
1000805,253471,8,1.0,-2442.0,2,2,2
1003935,254866,10,1.0,-1966.0,2,2,2
1008154,269239,10,1.0,-2285.0,2,2,2
1009562,183161,10,1.0,-2168.0,2,2,2
...,...,...,...,...,...,...,...
2834358,315684,6,1.0,-2671.0,2,2,2
2835168,264951,12,1.0,-2249.0,2,2,2
2835368,168196,7,1.0,-2312.0,2,2,2
2837350,152710,6,1.0,-2743.0,2,2,2


In [38]:
df.loc[(df['SK_ID_CURR']==337486) & (df['SK_ID_PREV']==1000465)].sort_values(by='NUM_INSTALMENT_NUMBER')

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
8814848,1000465,337486,1.0,1,-2739.0,-2752.0,9453.06,9453.06
9774342,1000465,337486,1.0,2,-2709.0,-2703.0,9453.06,9453.06
9930946,1000465,337486,1.0,3,-2679.0,-2690.0,9453.06,9453.06
9433887,1000465,337486,1.0,4,-2649.0,-2658.0,9453.06,9453.06
10023486,1000465,337486,1.0,5,-2619.0,-2620.0,9453.06,9453.06
9025394,1000465,337486,1.0,6,-2589.0,-2595.0,9453.06,9453.06
8230471,1000465,337486,1.0,7,-2559.0,-2565.0,9453.06,9450.0
8319316,1000465,337486,1.0,7,-2559.0,-2536.0,9453.06,3.06
7751676,1000465,337486,1.0,8,-2529.0,-2536.0,9453.06,9450.0
10929268,1000465,337486,1.0,8,-2529.0,-2502.0,9453.06,3.06
