## Init

In [1]:
# List the data directory (long format, including hidden entries, with
# human-readable sizes) to see which input files are available.
%ls -lah ../data

total 22M
drwxr-xr-x  5 root root  160 Oct  7 18:54 ./
drwxr-xr-x 16 root root  512 Oct  8 17:54 ../
-rw-r--r--  1 root root  21M Feb 25  2019 dataset.csv
-rw-r--r--  1 root root  396 Feb 25  2019 template.csv
-rw-r--r--  1 root root 571K Oct  8 17:18 test_predictions.csv


In [2]:
import numpy as np
import pandas as pd
from IPython.display import display

import matplotlib.pyplot as plt
%matplotlib inline

## Read data

In [3]:
# The raw dataset is a semicolon-separated CSV; load it in full and show
# its (rows, columns) shape as the cell output.
df = pd.read_csv(
    filepath_or_buffer="../data/dataset.csv",
    delimiter=";",
)
df.shape

(99976, 43)

## Sanity checks

In [4]:
# Peek at the first five rows to check that the file was parsed sensibly.
df.head(n=5)

Unnamed: 0,uuid,default,account_amount_added_12_24m,account_days_in_dc_12_24m,account_days_in_rem_12_24m,account_days_in_term_12_24m,account_incoming_debt_vs_paid_0_24m,account_status,account_worst_status_0_3m,account_worst_status_12_24m,...,status_3rd_last_archived_0_24m,status_max_archived_0_6_months,status_max_archived_0_12_months,status_max_archived_0_24_months,recovery_debt,sum_capital_paid_account_0_12m,sum_capital_paid_account_12_24m,sum_paid_inv_0_12m,time_hours,worst_status_active_inv
0,63f69b2c-8b1c-4740-b78d-52ed9a4515ac,0.0,0,0.0,0.0,0.0,0.0,1.0,1.0,,...,1,1,1,1,0,0,0,178839,9.653333,1.0
1,0e961183-8c15-4470-9a5e-07a1bd207661,0.0,0,0.0,0.0,0.0,,1.0,1.0,1.0,...,1,1,2,2,0,0,0,49014,13.181389,
2,d8edaae6-4368-44e0-941e-8328f203e64e,0.0,0,0.0,0.0,0.0,,,,,...,1,1,2,2,0,0,0,124839,11.561944,1.0
3,0095dfb6-a886-4e2a-b056-15ef45fdb0ef,0.0,0,,,,,,,,...,1,1,1,1,0,0,0,324676,15.751111,1.0
4,c8f8b835-5647-4506-bf15-49105d8af30b,0.0,0,0.0,0.0,0.0,,,,,...,0,1,1,1,0,0,0,7100,12.698611,


In [5]:
# Peek at the last five rows as well (the end of the file can differ from the start).
df.tail(n=5)

Unnamed: 0,uuid,default,account_amount_added_12_24m,account_days_in_dc_12_24m,account_days_in_rem_12_24m,account_days_in_term_12_24m,account_incoming_debt_vs_paid_0_24m,account_status,account_worst_status_0_3m,account_worst_status_12_24m,...,status_3rd_last_archived_0_24m,status_max_archived_0_6_months,status_max_archived_0_12_months,status_max_archived_0_24_months,recovery_debt,sum_capital_paid_account_0_12m,sum_capital_paid_account_12_24m,sum_paid_inv_0_12m,time_hours,worst_status_active_inv
99971,5c03bc63-ea65-4ffd-aa7b-95ea9a46db34,,0,0.0,0.0,0.0,,1.0,1.0,,...,1,1,1,1,0,0,0,60127,10.765556,
99972,f8db22f4-9819-420c-abbc-9ddf1843176e,,0,0.0,0.0,0.0,0.004044,1.0,1.0,,...,1,0,1,1,0,7948,0,4740,21.708333,
99973,b22e21ea-b1b2-4df3-b236-0ff6d5fdc0d8,,45671,0.0,20.0,0.0,0.705078,2.0,2.0,2.0,...,0,0,0,0,0,17447,19627,3100,2.185278,
99974,bafcab15-9898-479c-b729-c9dda7edb78f,,56102,0.0,0.0,0.0,0.064175,1.0,2.0,1.0,...,1,1,1,1,0,18339,56180,34785,9.725278,
99975,ac88f18c-96a6-49bc-9e9d-a780225914af,,0,0.0,0.0,0.0,,1.0,1.0,,...,1,2,2,2,0,0,0,30602,11.585278,


In [6]:
# Count rows that exactly repeat an earlier row across all columns.
n_duplicates = df.duplicated().sum()
print(f"Number of duplicates: {n_duplicates}")

Number of duplicates: 0


In [7]:
# Distribution of the number of NaNs per row:
# index = count of missing values in a row, value = how many rows have that count.
df.isna().sum(axis="columns").value_counts()

1     13582
7     13034
12    11752
2     10173
8      9130
0      9111
6      6460
10     5901
3      4595
9      3921
11     3367
4      3239
15     1841
5      1831
13     1604
14      258
16      177
dtype: int64

In [8]:
# Summary statistics for every column (numeric and non-numeric alike),
# transposed so that each column of `df` becomes one row of the table.
df.describe(include="all").transpose()

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
uuid,99976,99976.0,4d011c2f-799a-4fb2-b224-935a611067ed,1.0,,,,,,,
default,89976,,,,0.0143149,0.118786,0.0,0.0,0.0,0.0,1.0
account_amount_added_12_24m,99976,,,,12255.1,35481.5,0.0,0.0,0.0,4937.25,1128780.0
account_days_in_dc_12_24m,88140,,,,0.223043,5.80812,0.0,0.0,0.0,0.0,365.0
account_days_in_rem_12_24m,88140,,,,5.04462,22.864,0.0,0.0,0.0,0.0,365.0
account_days_in_term_12_24m,88140,,,,0.286896,2.92991,0.0,0.0,0.0,0.0,97.0
account_incoming_debt_vs_paid_0_24m,40661,,,,1.33129,26.4823,0.0,0.0,0.152082,0.662952,3914.0
account_status,45603,,,,1.04217,0.202713,1.0,1.0,1.0,1.0,4.0
account_worst_status_0_3m,45603,,,,1.17291,0.420142,1.0,1.0,1.0,1.0,4.0
account_worst_status_12_24m,33215,,,,1.33735,0.575043,1.0,1.0,1.0,2.0,4.0


### Notes

- 10k test rows (the rows where `default` is missing)
- Minimum value for `time_hours` looks suspicious (~0.000028h; note that 0.000028h ≈ 0.1 sec, whereas 1 sec ≈ 0.000278h — TODO: re-check which figure is correct)
- NaNs in some features are probably caused by historical data that is missing for some of the clients

## Split data

In [12]:
# Rows with a known `default` label form the training set; rows where the
# label is missing form the test set.
has_label = df["default"].notna()
df_train = df[has_label]
df_test = df[~has_label]

# Show the (rows, columns) shape of each part.
for subset in (df_train, df_test):
    print(subset.shape)

(89976, 43)
(10000, 43)
