In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import polars as pl
import re

from collections import Counter
from ydata_profiling import ProfileReport
from utils import get_non_castable, cleanup_and_cast, replace_outliers, fill_nas_from_other_customer_records
from constants import *

In [2]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. the local utils / constants files) are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

# Pandas

## Read data

In [3]:
# Display configuration: never truncate columns when rendering a frame,
# but cap the number of rendered rows at 100.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 100)

In [4]:
# Initial load of the training data with pandas' default parsing options.
pd_df_train = pd.read_csv("train.csv")

  pd_df_train = pd.read_csv("train.csv")


In [5]:
# Look up the name of the column at positional index 26.
# NOTE(review): presumably the column flagged by the warning emitted by the
# initial read_csv call - confirm against the full warning text.
pd_df_train.columns[26]

'Monthly_Balance'

In [6]:
# List the Monthly_Balance entries that the utils helper get_non_castable
# reports when asked about a cast to float.
get_non_castable(pd_df_train, "Monthly_Balance", float)

['__-333333333333333333333333333__',
 '__-333333333333333333333333333__',
 '__-333333333333333333333333333__',
 '__-333333333333333333333333333__',
 '__-333333333333333333333333333__',
 '__-333333333333333333333333333__',
 '__-333333333333333333333333333__',
 '__-333333333333333333333333333__',
 '__-333333333333333333333333333__']

In [7]:
# Per-column missing-value marker: the placeholder string found in
# Monthly_Balance is parsed as NaN when the file is read again.
# Note: this rebinds pd_df_train, replacing the frame from the initial load.
NA_VALUES = {"Monthly_Balance": "__-333333333333333333333333333__"}
pd_df_train = pd.read_csv("train.csv", na_values=NA_VALUES)

## Check validity

In [8]:
# Inspect the dtype pandas inferred for each column.
pd_df_train.dtypes

ID                           object
Customer_ID                  object
Month                        object
Name                         object
Age                          object
SSN                          object
Occupation                   object
Annual_Income                object
Monthly_Inhand_Salary       float64
Num_Bank_Accounts             int64
Num_Credit_Card               int64
Interest_Rate                 int64
Num_of_Loan                  object
Type_of_Loan                 object
Delay_from_due_date           int64
Num_of_Delayed_Payment       object
Changed_Credit_Limit         object
Num_Credit_Inquiries        float64
Credit_Mix                   object
Outstanding_Debt             object
Credit_Utilization_Ratio    float64
Credit_History_Age           object
Payment_of_Min_Amount        object
Total_EMI_per_month         float64
Amount_invested_monthly      object
Payment_Behaviour            object
Monthly_Balance             float64
Credit_Score                

In [9]:
# Survey every column: print its name, a sample (at most 10) of the values
# get_non_castable reports for a float cast, and a sample (at most 10) of the
# column's distinct values.
for column_name in pd_df_train.columns:
    print(column_name)
    bad_values = get_non_castable(pd_df_train, column_name, float)[:10]
    print("Examples of non-castable values: {}".format(bad_values))
    distinct_values = pd_df_train[column_name].unique()[:10]
    # end="\n\n" emits the blank separator line between two columns.
    print("Unique values (potentially trimmed): {}".format(distinct_values), end="\n\n")

ID
Examples of non-castable values: ['0x1602', '0x1603', '0x1604', '0x1605', '0x1606', '0x1607', '0x1608', '0x1609', '0x160e', '0x160f']
Unique values (potentially trimmed): ['0x1602' '0x1603' '0x1604' '0x1605' '0x1606' '0x1607' '0x1608' '0x1609'
 '0x160e' '0x160f']

Customer_ID
Examples of non-castable values: ['CUS_0xd40', 'CUS_0xd40', 'CUS_0xd40', 'CUS_0xd40', 'CUS_0xd40', 'CUS_0xd40', 'CUS_0xd40', 'CUS_0xd40', 'CUS_0x21b1', 'CUS_0x21b1']
Unique values (potentially trimmed): ['CUS_0xd40' 'CUS_0x21b1' 'CUS_0x2dbc' 'CUS_0xb891' 'CUS_0x1cdb'
 'CUS_0x95ee' 'CUS_0x284a' 'CUS_0x5407' 'CUS_0x4157' 'CUS_0xba08']

Month
Examples of non-castable values: ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'January', 'February']
Unique values (potentially trimmed): ['January' 'February' 'March' 'April' 'May' 'June' 'July' 'August']

Name
Examples of non-castable values: ['Aaron Maashoh', 'Aaron Maashoh', 'Aaron Maashoh', 'Aaron Maashoh', 'Aaron Maashoh', 'Aaron Maashoh', 

### Type_of_Loan

In [10]:
# Break each non-missing Type_of_Loan string into its individual loan names
# (separators are ", " and ", and "), then count how often each name occurs
# across all rows.
split = pd_df_train.Type_of_Loan.dropna().apply(lambda entry: re.split(r", and |, ", entry))
Counter(loan for loans in split.to_list() for loan in loans)

Counter({'Payday Loan': 40568,
         'Credit-Builder Loan': 40440,
         'Not Specified': 39616,
         'Home Equity Loan': 39104,
         'Student Loan': 38968,
         'Mortgage Loan': 38936,
         'Personal Loan': 38888,
         'Debt Consolidation Loan': 38776,
         'Auto Loan': 37992})

**TODO:** split `Type_of_Loan` into its individual loan types, determine the unique values, and expand them into categorical columns.

## Clean

### Replace with NaNs

In [18]:
# Derive the cleaned frame from the raw one via the utils helper.
# NOTE(review): presumably cleanup_and_cast replaces invalid entries with NaN
# and casts columns to their intended dtypes - verify against utils.
pd_df_train_clean = cleanup_and_cast(pd_df_train)

### Initial EDA

In [39]:
# Build a ydata-profiling report object for the cleaned frame.
pr = ProfileReport(pd_df_train_clean)

In [44]:
# Write the profiling report to eda.html in the working directory.
pr.to_file("eda.html")

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

### Clean up outliers

In [19]:
# Apply the utils helper replace_outliers to the cleaned frame.
# NOTE(review): this rebinds pd_df_train_clean to its own transformed value,
# so re-running the cell feeds already-processed data back into the helper.
pd_df_train_clean = replace_outliers(pd_df_train_clean)

In [13]:
# For each column listed as a key of VARS_WITH_OUTLIERS_TOLERANCES (imported
# from constants), print the column name followed by its describe() summary
# and a blank separator line.
for column_name in VARS_WITH_OUTLIERS_TOLERANCES.keys():
    print(column_name)
    print(pd_df_train_clean[column_name].describe(), end="\n\n")

Age
count    97219.000000
mean        33.320009
std         10.769558
min         14.000000
25%         24.000000
50%         33.000000
75%         42.000000
max         56.000000
Name: Age, dtype: float64

Annual_Income
count     99001.000000
mean      50497.576053
std       38296.622971
min        7005.930000
25%       19344.270000
50%       36993.940000
75%       71676.000000
max      179987.280000
Name: Annual_Income, dtype: float64

Num_Bank_Accounts
count    98685.000000
mean         5.367624
std          2.593839
min         -1.000000
25%          3.000000
50%          5.000000
75%          7.000000
max         11.000000
Name: Num_Bank_Accounts, dtype: float64

Num_Credit_Card
count    97729.000000
mean         5.533322
std          2.067703
min          0.000000
25%          4.000000
50%          5.000000
75%          7.000000
max         11.000000
Name: Num_Credit_Card, dtype: float64

Interest_Rate
count    97966.000000
mean        14.531603
std          8.739748
min         

### EDA after removing outliers

In [82]:
# Profile the frame again and write the report to eda_no_outliers.html so it
# can be compared with the earlier eda.html report.
pr_no_outliers = ProfileReport(pd_df_train_clean)
pr_no_outliers.to_file("eda_no_outliers.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

### Fill NaNs

In [14]:
# Count missing values per column. The vectorised isna().sum() gives the same
# per-column counts as applying a lambda column by column, without the
# Python-level callback.
pd_df_train_clean.isna().sum()

ID                              0
Customer_ID                     0
Month                           0
Name                         9985
Age                          2781
SSN                             0
Occupation                   7062
Annual_Income                 999
Monthly_Inhand_Salary       15002
Num_Bank_Accounts            1315
Num_Credit_Card              2271
Interest_Rate                2034
Num_of_Loan                  4348
Type_of_Loan                    0
Delay_from_due_date             0
Num_of_Delayed_Payment      28643
Changed_Credit_Limit         2091
Num_Credit_Inquiries        17774
Credit_Mix                  20195
Outstanding_Debt                0
Credit_Utilization_Ratio        0
Credit_History_Age           9030
Payment_of_Min_Amount       12007
Total_EMI_per_month          5404
Amount_invested_monthly      8784
Payment_Behaviour               0
Monthly_Balance              1209
Credit_Score                    0
dtype: int64

In [21]:
# Apply the utils helper fill_nas_from_other_customer_records.
# NOTE(review): presumably fills a customer's missing values from that same
# customer's other rows - verify against utils. The result is rebound to the
# same name, so re-running the cell operates on already-filled data.
pd_df_train_clean = fill_nas_from_other_customer_records(pd_df_train_clean)

In [35]:
# Re-count missing values per column after the fill step. The vectorised
# isna().sum() gives the same per-column counts as the apply/lambda form.
pd_df_train_clean.isna().sum()

ID                              0
Customer_ID                     0
Month                           0
Name                            0
Age                             0
SSN                             0
Occupation                      0
Annual_Income                   0
Monthly_Inhand_Salary           0
Num_Bank_Accounts               0
Num_Credit_Card              2271
Interest_Rate                2034
Num_of_Loan                  4348
Type_of_Loan                    0
Delay_from_due_date             0
Num_of_Delayed_Payment      28643
Changed_Credit_Limit         2091
Num_Credit_Inquiries        17774
Credit_Mix                      0
Outstanding_Debt                0
Credit_Utilization_Ratio        0
Credit_History_Age              0
Payment_of_Min_Amount       12007
Total_EMI_per_month          5404
Amount_invested_monthly      8784
Payment_Behaviour            7600
Monthly_Balance              1209
Credit_Score                    0
dtype: int64

# Read data - Polars

In [41]:
# Demonstration: with default settings Polars cannot parse the placeholder
# string in Monthly_Balance as f64 and raises ComputeError. The error is
# caught and printed so the message stays visible while a full
# Restart & Run All can continue past this cell.
try:
    pl_df_train = pl.read_csv("train.csv")
except pl.exceptions.ComputeError as err:
    print(f"ComputeError: {err}")

ComputeError: Could not parse `__-333333333333333333333333333__` as dtype `f64` at column 'Monthly_Balance' (column number 27).
The current offset in the file is 23442390 bytes.

You might want to try:
- increasing `infer_schema_length` (e.g. `infer_schema_length=10000`),
- specifying correct dtype with the `dtypes` argument
- setting `ignore_errors` to `True`,
- adding `__-333333333333333333333333333__` to the `null_values` list.

In [None]:
# Declare the placeholder string as a null marker so that Polars can parse
# the file (one of the remedies suggested by the ComputeError message).
NULL_VALUES_LIST = ["__-333333333333333333333333333__"]
pl_df_train = pl.read_csv("train.csv", null_values=NULL_VALUES_LIST)

In [None]:
# Alternative remedy suggested by the ComputeError message: read the file
# with ignore_errors=True instead of declaring null markers.
# NOTE(review): `x` is a throwaway name; consider a descriptive one.
x = pl.read_csv("train.csv", ignore_errors=True)

In [None]:
# Display the frame that was read with ignore_errors=True.
x

In [None]:
# Inspect the column dtypes of the Polars frame.
pl_df_train.dtypes

In [None]:
# Inspect the schema (column name to dtype mapping) of the Polars frame.
pl_df_train.schema

In [None]:
# Summary statistics for Delay_from_due_date on the Polars frame.
# `df_train` is not defined anywhere in this notebook, so the cell would raise
# NameError on a fresh kernel; it now reads from pl_df_train. Polars frames
# select columns with brackets rather than attribute access.
pl_df_train["Delay_from_due_date"].describe()