In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from IPython.display import display, HTML
display(HTML("<style>:root { --jp-notebook-max-width: 100% !important; }</style>"))

In [3]:
import pandas as pd
import numpy as np

In [4]:
import sys
import os
from collections import defaultdict, Counter

In [5]:
import math

In [6]:
import seaborn as sns
import matplotlib.pyplot as plt 

## Constants

In [7]:
# Root folder of the raw competition data.
# Set the HOME_CREDIT_DATA_PATH environment variable to run the notebook on another
# machine; when it is unset the original local location is used.
data_path = os.environ.get(
    "HOME_CREDIT_DATA_PATH",
    "/Users/hardiksahi/Personal/MachineLearning/kaggle/home-credit-risk-model-stability/data/01_raw/home-credit-credit-risk-model-stability",
)

## Get paths of files

In [8]:
# Collect the parquet files found under data_path, grouped by split ("train" / "test"),
# separately for the base table and for the credit_bureau_a_1 table.
base_file_dict = defaultdict(list)
credit_bureau_a_1_file_dict = defaultdict(list)
for dirname, _, filenames in os.walk(data_path):
    for filename in filenames:
        if not filename.endswith('parquet'):
            continue
        # Classify on the file name only, so a 'train' or 'base' substring somewhere in
        # the directory part of the path cannot mislabel a file.
        file_type = 'train' if 'train' in filename else 'test'
        file_path = os.path.join(dirname, filename)
        if 'base' in filename:
            base_file_dict[file_type].append(file_path)
        if 'credit_bureau_a_1' in filename:
            credit_bureau_a_1_file_dict[file_type].append(file_path)

# os.walk returns entries in arbitrary order; sort so the file lists (and the row order
# of the frames concatenated from them) are the same on every run.
for file_dict in (base_file_dict, credit_bureau_a_1_file_dict):
    for file_paths in file_dict.values():
        file_paths.sort()

In [9]:
credit_bureau_a_1_file_dict.get("train")

['/Users/hardiksahi/Personal/MachineLearning/kaggle/home-credit-risk-model-stability/data/01_raw/home-credit-credit-risk-model-stability/parquet_files/train/train_credit_bureau_a_1_0.parquet',
 '/Users/hardiksahi/Personal/MachineLearning/kaggle/home-credit-risk-model-stability/data/01_raw/home-credit-credit-risk-model-stability/parquet_files/train/train_credit_bureau_a_1_1.parquet',
 '/Users/hardiksahi/Personal/MachineLearning/kaggle/home-credit-risk-model-stability/data/01_raw/home-credit-credit-risk-model-stability/parquet_files/train/train_credit_bureau_a_1_3.parquet',
 '/Users/hardiksahi/Personal/MachineLearning/kaggle/home-credit-risk-model-stability/data/01_raw/home-credit-credit-risk-model-stability/parquet_files/train/train_credit_bureau_a_1_2.parquet']

## Functions

In [10]:
def get_column_description(feature_def_df, column_name):
    """Return the description of a feature from the feature-definition table.

    Parameters
    ----------
    feature_def_df : pandas.DataFrame
        Table with a ``Variable`` column (feature names) and a ``Description`` column.
    column_name : str
        Feature name to look up in ``Variable``.

    Returns
    -------
    The ``Description`` value of the first row whose ``Variable`` equals ``column_name``.

    Raises
    ------
    IndexError
        If no row matches ``column_name``. The exception type is the same one the bare
        ``.iloc[0]`` lookup raises; the message now names the missing column.
    """
    matching_descriptions = feature_def_df.loc[
        feature_def_df["Variable"] == column_name, "Description"
    ]
    if matching_descriptions.empty:
        raise IndexError(f"No description found for column '{column_name}'")
    return matching_descriptions.iloc[0]

In [11]:
def plot_scatterplot(df, x_col_name, y_col_name, show_x_y_line=False):
    """Scatter-plot two columns of ``df`` and optionally overlay the x = y line.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot. It is not modified.
    x_col_name, y_col_name : str
        Names of the columns plotted on the x and y axes; both must exist in ``df``.
    show_x_y_line : bool, default False
        When True, draw a diagonal from the smallest to the largest finite value found
        in either column and print those limits.

    Returns
    -------
    matplotlib.figure.Figure
        The figure holding the plot.

    Raises
    ------
    AssertionError
        If either column is missing from ``df``.
    """
    assert x_col_name in df.columns, f"Ensure that {x_col_name} is in the df"
    assert y_col_name in df.columns, f"Ensure that {y_col_name} is in the df"

    fig, axs = plt.subplots()
    sns.scatterplot(data=df, x=df[x_col_name], y=df[y_col_name], ax=axs)

    if show_x_y_line:
        # Work on copies: treating +/-inf as missing must not overwrite the caller's columns.
        x_values_series = df[x_col_name].replace([np.inf, -np.inf], np.nan)
        y_values_series = df[y_col_name].replace([np.inf, -np.inf], np.nan)
        # Keep only rows where both coordinates are present.
        not_null_filter = x_values_series.notnull() & y_values_series.notnull()
        if not_null_filter.any():
            x_values_series = x_values_series[not_null_filter]
            y_values_series = y_values_series[not_null_filter]
            lims = [
                min(x_values_series.min(), y_values_series.min()),
                max(x_values_series.max(), y_values_series.max()),
            ]
            print(f"lims: {lims}")
            # Draw on the axes created above rather than on pyplot's "current" axes.
            axs.plot(lims, lims)
        # With no finite (x, y) pair there are no limits, so the line is skipped.
    return fig

In [12]:
def get_percent_of_column(df, groupby_column, target_column):
    """Compute, per group, the sum of ``target_column`` as a percentage of the row count.

    Parameters
    ----------
    df : pandas.DataFrame
        Input rows.
    groupby_column : str
        Column whose values define the groups.
    target_column : str
        Column that is summed within each group (e.g. a 0/1 flag).

    Returns
    -------
    pandas.DataFrame
        One row per group with the columns ``groupby_column`` (cast to ``str``),
        ``target_column`` (group sum), ``total_rows`` (group size) and
        ``percentage_default`` (100 * sum / size).
    """
    grouped = df.groupby(by=[groupby_column])
    target_df = grouped.agg({target_column: "sum"}).reset_index()
    all_rows_df = grouped.size().reset_index(name="total_rows")
    merged_df = target_df.merge(all_rows_df, on=[groupby_column], how="left")
    # Use the caller-supplied column; it was previously hard-coded to "target", which
    # raised KeyError for any other target_column.
    merged_df["percentage_default"] = 100 * merged_df[target_column] / merged_df["total_rows"]
    # Cast group labels to str so they are handled as categories when plotted.
    merged_df[groupby_column] = merged_df[groupby_column].astype(str)
    return merged_df

In [13]:
def plot_barplot(df, percent_column, category_column):
    """Draw a horizontal bar chart of ``percent_column`` for each ``category_column`` value.

    Bars are listed in decreasing order of ``percent_column``. Returns the matplotlib
    figure (8 x 5 inches) that holds the chart.
    """
    fig, ax = plt.subplots(1, 1, figsize=(8, 5))
    # Categories ranked from the largest to the smallest percentage.
    ranked_df = df.sort_values(by=[percent_column], ascending=False)
    category_order = ranked_df[category_column].values
    sns.barplot(
        data=df,
        x=percent_column,
        y=category_column,
        order=category_order,
        ax=ax,
    )
    return fig

In [14]:
def get_relevant_indices(series):
    """Return the index labels of the values that are not IQR outliers.

    A value is kept when it lies within [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR], bounds
    included, where Q1 and Q3 are the 25th and 75th percentiles of ``series``.
    """
    q1, q3 = series.quantile(0.25), series.quantile(0.75)
    whisker = 1.5 * (q3 - q1)
    # Missing values fail both comparisons and are therefore dropped.
    within_fences = (series >= q1 - whisker) & (series <= q3 + whisker)
    return series[within_fences].index

## Read datasets

In [15]:
# Load the feature dictionary used by get_column_description
# (it reads the "Variable" and "Description" columns of this table).
feature_df = pd.read_csv(f"{data_path}/feature_definitions.csv")
print(f"Shape of feature_df: {feature_df.shape}")

Shape of feature_df: (465, 2)


In [16]:
# Read every train file of the base table and stack them row-wise.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it each file
# keeps its own index labels, which can then repeat across files.
train_base_df = pd.concat(
    [pd.read_parquet(file_path) for file_path in base_file_dict['train']],
    axis=0,
    ignore_index=True,
)
print(f"Shape of train_base_df: {train_base_df.shape}")

Shape of train_base_df: (1526659, 5)


## Properties: depth=1, external data source (credit bureau A)

In [17]:
# Read every train file of the credit_bureau_a_1 table and stack them row-wise.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it each file
# keeps its own index labels, which can then repeat across files and make label-based
# selection ambiguous.
credit_bureau_a_1_df = pd.concat(
    [pd.read_parquet(file_path) for file_path in credit_bureau_a_1_file_dict['train']],
    axis=0,
    ignore_index=True,
)
print(f"Shape of credit_bureau_a_1_df: {credit_bureau_a_1_df.shape}")

Shape of credit_bureau_a_1_df: (15940537, 79)


In [20]:
credit_bureau_a_1_df.columns.unique()

Index(['case_id', 'annualeffectiverate_199L', 'annualeffectiverate_63L',
       'classificationofcontr_13M', 'classificationofcontr_400M',
       'contractst_545M', 'contractst_964M', 'contractsum_5085717L',
       'credlmt_230A', 'credlmt_935A', 'dateofcredend_289D',
       'dateofcredend_353D', 'dateofcredstart_181D', 'dateofcredstart_739D',
       'dateofrealrepmt_138D', 'debtoutstand_525A', 'debtoverdue_47A',
       'description_351M', 'dpdmax_139P', 'dpdmax_757P',
       'dpdmaxdatemonth_442T', 'dpdmaxdatemonth_89T', 'dpdmaxdateyear_596T',
       'dpdmaxdateyear_896T', 'financialinstitution_382M',
       'financialinstitution_591M', 'instlamount_768A', 'instlamount_852A',
       'interestrate_508L', 'lastupdate_1112D', 'lastupdate_388D',
       'monthlyinstlamount_332A', 'monthlyinstlamount_674A',
       'nominalrate_281L', 'nominalrate_498L', 'num_group1',
       'numberofcontrsvalue_258L', 'numberofcontrsvalue_358L',
       'numberofinstls_229L', 'numberofinstls_320L',
       'n

In [21]:
# Parse date_decision into datetimes so it can be subtracted from other dates later on.
train_base_df["date_decision"] = pd.to_datetime(train_base_df["date_decision"])
# NOTE(review): the commented-out line below refers to tax_registry_c_1_df, which is not
# defined in the visible cells of this notebook — looks like a leftover; confirm and remove.
#tax_registry_c_1_df["processingdate_168D"] = pd.to_datetime(tax_registry_c_1_df["processingdate_168D"])

In [26]:
credit_bureau_a_1_df[credit_bureau_a_1_df.case_id == 388].sort_values(by=["num_group1"])

Unnamed: 0,case_id,annualeffectiverate_199L,annualeffectiverate_63L,classificationofcontr_13M,classificationofcontr_400M,contractst_545M,contractst_964M,contractsum_5085717L,credlmt_230A,credlmt_935A,...,residualamount_488A,residualamount_856A,subjectrole_182M,subjectrole_93M,totalamount_6A,totalamount_996A,totaldebtoverduevalue_178A,totaldebtoverduevalue_718A,totaloutstanddebtvalue_39A,totaloutstanddebtvalue_668A
1,388,,,4408ff0f,a55475b1,7241344e,a55475b1,,,,...,,,ab3c25cf,ab3c25cf,,268897.62,0.0,0.0,374419.5,0.0
0,388,,,ea6782cc,a55475b1,7241344e,a55475b1,,,135806.0,...,,114325.805,a55475b1,a55475b1,,,,,,
2,388,,,a55475b1,a55475b1,a55475b1,a55475b1,,,,...,,,a55475b1,a55475b1,,,,,,
3,388,,,a55475b1,a55475b1,a55475b1,a55475b1,,,,...,,,a55475b1,a55475b1,,,,,,
4,388,,,a55475b1,a55475b1,a55475b1,a55475b1,,,,...,,,a55475b1,a55475b1,,,,,,
5,388,,,a55475b1,a55475b1,a55475b1,a55475b1,,,,...,,,a55475b1,a55475b1,,,,,,
6,388,,,a55475b1,a55475b1,a55475b1,a55475b1,,,,...,,,a55475b1,a55475b1,,,,,,
7,388,,,a55475b1,a55475b1,a55475b1,a55475b1,,,,...,,,a55475b1,a55475b1,,,,,,
8,388,,,a55475b1,a55475b1,a55475b1,a55475b1,,,,...,,,a55475b1,a55475b1,,,,,,
9,388,,,a55475b1,a55475b1,a55475b1,a55475b1,,,,...,,,a55475b1,a55475b1,,,,,,


## This means that there are multiple rows for each case_id indexed by num_group1

In [None]:
# Count the distinct num_group1 values recorded for each case_id.
# NOTE(review): the output column name mentions tax information although the count is
# computed from credit_bureau_a_1_df — consider renaming it here and in the cells that use it.
unique_num_group_case_id_df = (
    credit_bureau_a_1_df
    .groupby(by=["case_id"])
    .agg(generated_count_of_tax_information_by_provider_c=("num_group1", "nunique"))
    .reset_index()
)

In [None]:
unique_num_group_case_id_df[unique_num_group_case_id_df.generated_count_of_tax_information_by_provider_c>10].sort_values(by=["generated_count_of_tax_information_by_provider_c"]).head(5)

In [None]:
# Show the first records of one case_id, ordered by num_group1.
# NOTE(review): tax_registry_c_1_df is not defined in the visible cells of this notebook —
# confirm it is loaded before this cell runs.
(
    tax_registry_c_1_df
    .loc[tax_registry_c_1_df["case_id"] == 1846104]
    .sort_values(by=["num_group1"])
    .head()
)

In [None]:
sns.boxplot(data=unique_num_group_case_id_df, y="generated_count_of_tax_information_by_provider_c")

## The number of tax-related records provided per case_id varies from 1 to 99

### 1. processingdate_168D

In [None]:
get_column_description(feature_df, "processingdate_168D")

In [None]:
tax_registry_c_1_df.processingdate_168D.isnull().value_counts()

In [None]:
## For each case_id: how many distinct processing dates its records span, and the latest one.
# NOTE(review): tax_registry_c_1_df is not defined in the visible cells of this notebook —
# confirm it is loaded before this cell runs.
statistics_processingdate_168D_for_case_id_df = (
    tax_registry_c_1_df
    .groupby(by=["case_id"])
    .agg(
        generated_processingdate_168D_nunique=("processingdate_168D", "nunique"),
        generated_processingdate_168D_max=("processingdate_168D", "max"),
    )
    .reset_index()
)

In [None]:
statistics_processingdate_168D_for_case_id_df.head(2)

In [None]:
print(f"Minimum number of unique dates associated with case_id: {statistics_processingdate_168D_for_case_id_df['generated_processingdate_168D_nunique'].min()}")
print(f"Maximum number of unique dates associated with case_id: {statistics_processingdate_168D_for_case_id_df['generated_processingdate_168D_nunique'].max()}")

## A case_id is associated with 1 to 38 unique dates

## Is this the date on which the tax deduction record was provided by the registry to Home Credit, rather than the date of the tax deduction itself?

In [None]:
print(f"Min processingdate_168D: {tax_registry_c_1_df.processingdate_168D.min()}")
print(f"Max processingdate_168D: {tax_registry_c_1_df.processingdate_168D.max()}")

## These records were obtained over the span of a year.

In [None]:
# Attach the per-case_id date statistics to the base table. The left join keeps every
# base row; validate="one_to_one" makes pandas raise if case_id repeats on either side.
merged_train_base_statistics_processingdate_168D_for_case_id_df = train_base_df.merge(
    statistics_processingdate_168D_for_case_id_df,
    on=["case_id"],
    how="left",
    validate="one_to_one",
)
print(f"Shape of merged_train_base_statistics_processingdate_168D_for_case_id_df: {merged_train_base_statistics_processingdate_168D_for_case_id_df.shape}")

In [None]:
merged_train_base_statistics_processingdate_168D_for_case_id_df.head(2)

In [None]:
# Days from date_decision to the latest processing date of the case:
# positive when that processing date falls after the decision, negative when before.
merged_train_base_statistics_processingdate_168D_for_case_id_df["day_gap_between_processingdate_168D_and_date_decision"] = (
    merged_train_base_statistics_processingdate_168D_for_case_id_df["generated_processingdate_168D_max"]
    .sub(merged_train_base_statistics_processingdate_168D_for_case_id_df["date_decision"])
    .dt.days
)

In [None]:
sns.boxplot(data=merged_train_base_statistics_processingdate_168D_for_case_id_df, y="day_gap_between_processingdate_168D_and_date_decision", x="target")

## Some tax deductions were processed before date_decision and some after.

In [None]:
# Distribution of the day gap, restricted to rows that received date statistics from the merge.
merged_train_base_statistics_processingdate_168D_for_case_id_df.loc[
    merged_train_base_statistics_processingdate_168D_for_case_id_df["generated_processingdate_168D_nunique"].notnull(),
    "day_gap_between_processingdate_168D_and_date_decision",
].value_counts()

### 2. pmtamount_36A

In [None]:
get_column_description(feature_df, "pmtamount_36A")

In [None]:
# Per case_id: largest, smallest and average pmtamount_36A.
# NOTE(review): tax_registry_c_1_df is not defined in the visible cells of this notebook —
# confirm it is loaded before this cell runs.
statistics_pmtamount_36A_df = (
    tax_registry_c_1_df
    .groupby(by=["case_id"])
    .agg(
        generated_pmtamount_36A_max=("pmtamount_36A", "max"),
        generated_pmtamount_36A_min=("pmtamount_36A", "min"),
        generated_pmtamount_36A_mean=("pmtamount_36A", "mean"),
    )
    .reset_index()
)

In [None]:
statistics_pmtamount_36A_df.sort_values(by=["case_id"]).head()

### 3. employername_160M

In [None]:
get_column_description(feature_df, "employername_160M")

In [None]:
# Per case_id: number of distinct employer names and the most frequent one.
# NOTE(review): tax_registry_c_1_df is not defined in the visible cells of this notebook —
# confirm it is loaded before this cell runs.
statistics_employername_160M_for_case_id_df = tax_registry_c_1_df.groupby(by=["case_id"]).agg(
    generated_employername_160M_nunique=pd.NamedAgg(column="employername_160M", aggfunc="nunique"),
    # value_counts() drops missing values, so a case_id whose employer names are all
    # missing has no mode: return NaN for it instead of failing with IndexError.
    generated_employername_160M_mode=pd.NamedAgg(
        column="employername_160M",
        aggfunc=lambda x: x.value_counts().index[0] if x.notna().any() else np.nan,
    ),
)

In [None]:
statistics_employername_160M_for_case_id_df

Use tax_registry b and c