In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from IPython.display import display, HTML
display(HTML("<style>:root { --jp-notebook-max-width: 100% !important; }</style>"))

In [None]:
import pandas as pd
import numpy as np

In [None]:
import sys
import os
from collections import defaultdict, Counter

In [None]:
import math

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt 

## Constants

In [None]:
# Root folder of the raw competition data. Set the HOME_CREDIT_DATA_PATH environment
# variable to point at another location; the original local path remains the default.
data_path = os.environ.get(
    "HOME_CREDIT_DATA_PATH",
    "/Users/hardiksahi/Personal/MachineLearning/kaggle/home-credit-risk-model-stability/data/01_raw/home-credit-credit-risk-model-stability",
)

## Get paths of files

In [None]:
# Collect the parquet files found under data_path, keyed by 'train' / 'test', separately
# for the "base" tables and for the "applprev_1" tables.
base_file_dict = defaultdict(list)
applprev_1_file_dict = defaultdict(list)
for dirname, subdirnames, filenames in os.walk(data_path):
    # os.walk yields entries in filesystem order; sort so the file lists are reproducible.
    subdirnames.sort()
    for filename in sorted(filenames):
        if not filename.endswith('parquet'):
            continue
        file_path = os.path.join(dirname, filename)
        # Classify using the path relative to data_path, so that a parent directory whose
        # name happens to contain 'train', 'base' or 'applprev_1' cannot mislabel a file.
        relative_path = os.path.relpath(file_path, data_path)
        file_type = 'train' if 'train' in relative_path else 'test'
        if 'base' in relative_path:
            base_file_dict[file_type].append(file_path)
        if 'applprev_1' in relative_path:
            applprev_1_file_dict[file_type].append(file_path)

In [None]:
applprev_1_file_dict.get("train")

## Functions

In [None]:
def get_column_description(feature_def_df, column_name):
    """Return the description of a column from the feature-definitions table.

    Args:
        feature_def_df: DataFrame with a ``Variable`` column and a ``Description`` column.
        column_name: value to look up in ``Variable``.

    Returns:
        The ``Description`` of the first row whose ``Variable`` equals ``column_name``.

    Raises:
        IndexError: if no row matches ``column_name``.
    """
    matching_descriptions = feature_def_df.loc[feature_def_df["Variable"] == column_name, "Description"]
    if matching_descriptions.empty:
        # Same exception type as the bare .iloc[0] would raise, but with a message that names the column.
        raise IndexError(f"No description found for column '{column_name}' in the feature definitions")
    return matching_descriptions.iloc[0]

In [None]:
def plot_scatterplot(df, x_col_name, y_col_name, show_x_y_line=False):
    """Scatter-plot two columns of ``df`` and optionally overlay the y = x line.

    Args:
        df: DataFrame holding both columns. It is not modified.
        x_col_name: column plotted on the x axis.
        y_col_name: column plotted on the y axis.
        show_x_y_line: when True, draw a line from (lo, lo) to (hi, hi), where lo / hi are
            the smallest / largest values over both columns, taken from rows where both
            values are finite and non-null.

    Returns:
        The matplotlib Figure containing the plot.
    """
    assert x_col_name in df.columns, f"Ensure that {x_col_name} is in the df"
    assert y_col_name in df.columns, f"Ensure that {y_col_name} is in the df"

    fig, axs = plt.subplots()
    sns.scatterplot(data=df, x=x_col_name, y=y_col_name, ax=axs)

    if show_x_y_line:
        # Work on a copy of the two columns so the caller's frame keeps its +/-inf values.
        finite_xy = df[[x_col_name, y_col_name]].replace([np.inf, -np.inf], np.nan).dropna()
        if not finite_xy.empty:  # with no usable rows there are no limits to draw
            lims = [finite_xy.min().min(), finite_xy.max().max()]
            print(f"lims: {lims}")
            # Draw on this figure's axes rather than on whatever pyplot considers current.
            axs.plot(lims, lims)
    return fig

In [None]:
def get_percent_of_column(df, groupby_column, target_column):
    """Compute, per group, the sum of ``target_column`` as a percentage of the group's row count.

    Args:
        df: input DataFrame.
        groupby_column: column whose values define the groups.
        target_column: column that is summed within each group.

    Returns:
        DataFrame with one row per group and the columns ``groupby_column`` (cast to str),
        ``target_column`` (group sum), ``total_rows`` (group size) and
        ``percentage_default`` (100 * sum / size).
    """
    grouped = df.groupby(by=[groupby_column])
    target_df = grouped.agg({target_column: "sum"}).reset_index()
    all_rows_df = grouped.size().reset_index(name="total_rows")
    merged_df = target_df.merge(all_rows_df, on=[groupby_column], how="left")
    # Use the caller-supplied target column; it was previously hard-coded as "target".
    merged_df["percentage_default"] = 100 * merged_df[target_column] / merged_df["total_rows"]
    merged_df[groupby_column] = merged_df[groupby_column].astype(str)
    return merged_df

In [None]:
def plot_barplot(df, percent_column, category_column):
    """Draw a horizontal bar plot of ``percent_column`` per ``category_column``.

    Bars are ordered by descending ``percent_column``. Returns the matplotlib Figure.
    """
    fig, ax = plt.subplots(1, 1, figsize=(8, 5))
    # Category labels, largest percentage first, used as the bar order.
    descending_categories = df.sort_values(by=[percent_column], ascending=False)[category_column].values
    sns.barplot(
        data=df,
        x=percent_column,
        y=category_column,
        order=descending_categories,
        ax=ax,
    )
    return fig

## Read datasets

In [None]:
# Load the feature-definitions table that is looked up by get_column_description.
feature_definitions_path = f"{data_path}/feature_definitions.csv"
feature_df = pd.read_csv(feature_definitions_path)
print(f"Shape of feature_df: {feature_df.shape}")

## Properties: depth=1, internal data source

In [None]:
# Read every train applprev_1 parquet file and stack them into a single frame.
train_applprev_1_paths = applprev_1_file_dict['train']
# pd.concat raises an unhelpful "No objects to concatenate" on an empty list; fail with a clear hint instead.
assert train_applprev_1_paths, "No train applprev_1 parquet files were found under data_path"
train_applprev_1_df = pd.concat(
    [pd.read_parquet(file_path) for file_path in train_applprev_1_paths],
    axis=0,
    # Build one fresh 0..n-1 index instead of keeping each file's own index, which would
    # otherwise repeat labels across files.
    ignore_index=True,
)
print(f"Shape of train_applprev_1_df: {train_applprev_1_df.shape}")

## This means that there are multiple rows for each case_id indexed by num_group1

In [None]:
# Number of distinct num_group1 values (i.e. previous-application rows) per case_id.
unique_num_group_case_id_df = (
    train_applprev_1_df
    .groupby("case_id")
    .agg(generated_count_of_previous_applications=("num_group1", "nunique"))
    .reset_index()
)

In [None]:
unique_num_group_case_id_df.head()

In [None]:
sns.boxplot(data=unique_num_group_case_id_df, y="generated_count_of_previous_applications")

In [None]:
unique_num_group_case_id_df[unique_num_group_case_id_df.generated_count_of_previous_applications>15]

## Some case_id values have many num_group1 values associated with them, i.e. that many previous applications are linked to the case_id. We need to aggregate them into a single row per case_id. The most recent previous application has the lowest num_group1 (0).

## Analyse columns

## 1. Previous contract information

## Assumption: if an application was approved, then approvaldate_319D is not null

In [None]:
# Parse both date columns to datetimes so they can be compared and subtracted below.
for date_column in ("creationdate_885D", "approvaldate_319D"):
    train_applprev_1_df[date_column] = pd.to_datetime(train_applprev_1_df[date_column])

### 1.1 creationdate_885D

In [None]:
get_column_description(feature_df, "creationdate_885D")

In [None]:
train_applprev_1_df.creationdate_885D.isnull().value_counts()

In [None]:
train_applprev_1_df[train_applprev_1_df.creationdate_885D.isnull()]["status_219L"].isnull().value_counts()
## , "approvaldate_319D"]

In [None]:
train_applprev_1_df[train_applprev_1_df.creationdate_885D.isnull()]["approvaldate_319D"].isnull().value_counts()

In [None]:
# Keep only the rows where both the creation date and the approval date are present.
creation_date_present = train_applprev_1_df["creationdate_885D"].notna()
approval_date_present = train_applprev_1_df["approvaldate_319D"].notna()
not_null_creationdate_885D_df = train_applprev_1_df[creation_date_present & approval_date_present]

In [None]:
(not_null_creationdate_885D_df["creationdate_885D"]< not_null_creationdate_885D_df["approvaldate_319D"]).value_counts()

In [None]:
(not_null_creationdate_885D_df["creationdate_885D"] == not_null_creationdate_885D_df["approvaldate_319D"]).value_counts()

In [None]:
(not_null_creationdate_885D_df["creationdate_885D"] > not_null_creationdate_885D_df["approvaldate_319D"]).value_counts()

## There are 3481795 applications where approval was done the same day the application was created.
## There are 32042 applications where approval was done some days after the application was created.
## Weirdly enough, there are 1848 applications where approval date is before creation date

## Number of days between application creation date and application approval date

In [None]:
# Whole days from application creation to approval (negative when approval precedes creation).
creation_to_approval_gap = train_applprev_1_df["approvaldate_319D"].sub(train_applprev_1_df["creationdate_885D"])
train_applprev_1_df["generated_number_of_days_passed_between_application_creation_and_approval"] = creation_to_approval_gap.dt.days

In [None]:
sns.boxplot(data=train_applprev_1_df, y="generated_number_of_days_passed_between_application_creation_and_approval", x="status_219L")

1. Statuses A, L and H are the ones where number_of_days_passed_between_application_creation_and_approval >= 0 (makes sense).
2. For the remaining statuses, approvaldate_319D < creationdate_885D (does not make sense).

In [None]:
train_applprev_1_df[train_applprev_1_df.generated_number_of_days_passed_between_application_creation_and_approval<=-100].head()

In [None]:
train_applprev_1_df[train_applprev_1_df.generated_number_of_days_passed_between_application_creation_and_approval<0].groupby(by=["status_219L"]).size()

In [None]:
train_applprev_1_df[train_applprev_1_df.case_id == 104430].sort_values(by=["num_group1"])[["num_group1", "approvaldate_319D", "creationdate_885D", "status_219L", "generated_number_of_days_passed_between_application_creation_and_approval"]]

## Hence use approvaldate_319D and creationdate_885D where creationdate_885D<=approvaldate_319D

In [None]:
sns.boxplot(data=train_applprev_1_df[train_applprev_1_df.generated_number_of_days_passed_between_application_creation_and_approval>=0], y="generated_number_of_days_passed_between_application_creation_and_approval", x="status_219L")

## Days count between creation and approval date is quite varied for status=K

### 1.2 status_219L

In [None]:
get_column_description(feature_df, "status_219L")

In [None]:
## There is no specific status that represents non approval ()
set(train_applprev_1_df.status_219L.dropna().unique()).difference(train_applprev_1_df[~train_applprev_1_df.approvaldate_319D.isnull()]['status_219L'].dropna().unique())

In [None]:
# Count rows per status_219L, once over rows that have an approval date and once over all rows.
# rename_axis + reset_index(name=...) names both columns explicitly instead of relying on the
# "count" column name that value_counts().reset_index() only produces on pandas >= 2.0.
approval_date_present_mask = train_applprev_1_df["approvaldate_319D"].notna()
not_null_approvaldate_319D_status_219L_count_df = (
    train_applprev_1_df.loc[approval_date_present_mask, "status_219L"]
    .value_counts()
    .rename_axis("status_219L")
    .reset_index(name="not_null_approval_date_status_count")
)
all_status_219L_count_df = (
    train_applprev_1_df["status_219L"]
    .value_counts()
    .rename_axis("status_219L")
    .reset_index(name="all_status_count")
)
merged_status_count_df = all_status_219L_count_df.merge(
    not_null_approvaldate_319D_status_219L_count_df,
    on="status_219L",
    how="left",
    validate="one_to_one",
)
# A status with no approval date at all is missing from the right-hand frame after the left
# merge; count it as 0 so its percentage is 0 rather than NaN.
merged_status_count_df["not_null_approval_date_status_count"] = merged_status_count_df["not_null_approval_date_status_count"].fillna(0)
merged_status_count_df["percentage_with_not_null_approval_date"] = (
    100 * merged_status_count_df["not_null_approval_date_status_count"] / merged_status_count_df["all_status_count"]
)

In [None]:
merged_status_count_df.sort_values(by=["percentage_with_not_null_approval_date"], ascending=True)

In [None]:
bar_plot = plot_barplot(merged_status_count_df, "percentage_with_not_null_approval_date", "status_219L")
bar_plot.show()

## Status D is the status with the lowest percentage of rows that have an approval date. Interesting...

## Get number of previous applications with different status

In [None]:
# Number of previous applications per (case_id, status_219L) pair.
groupby_case_id_status_219L_count_df = (
    train_applprev_1_df
    .groupby(["case_id", "status_219L"])
    .size()
    .reset_index(name="count")
)

In [None]:
# Share of each status among the counted applications of a case_id.
# NOTE: despite the column name, this is a fraction (count / total), not multiplied by 100.
applications_per_case_id = groupby_case_id_status_219L_count_df.groupby("case_id")["count"].transform("sum")
groupby_case_id_status_219L_count_df["percentage_of_status"] = groupby_case_id_status_219L_count_df["count"].div(applications_per_case_id)

In [None]:
# Reshape to one row per case_id with one column per status_219L value.
percentage_of_applications_for_status_df = groupby_case_id_status_219L_count_df.pivot_table(
    index=["case_id"],
    columns=["status_219L"],
    values="percentage_of_status",
)

In [None]:
# Flatten the pivoted status columns into "generated_percentage_<axis name>_<status>" names.
# NOTE: re-running this cell without re-running the pivot cell renames the columns a second time.
status_axis_name = percentage_of_applications_for_status_df.columns.name
flattened_status_columns = []
for status_value in percentage_of_applications_for_status_df.columns:
    flattened_status_columns.append(f"generated_percentage_{status_axis_name}_{status_value}")
percentage_of_applications_for_status_df.columns = flattened_status_columns

In [None]:
percentage_of_applications_for_status_df.head()

### 1.3 actualdpd_943P

In [None]:
get_column_description(feature_df, "actualdpd_943P")

In [None]:
train_applprev_1_df.actualdpd_943P.isnull().value_counts()

In [None]:
# Per-case_id max / min / mean of actualdpd_943P over all previous applications.
statistics_actualdpd_943P_df = (
    train_applprev_1_df
    .groupby("case_id")
    .agg(
        generated_actualdpd_943P_max=("actualdpd_943P", "max"),
        generated_actualdpd_943P_min=("actualdpd_943P", "min"),
        generated_actualdpd_943P_mean=("actualdpd_943P", "mean"),
    )
    .reset_index()
)

In [None]:
# actualdpd_943P of the row with the smallest num_group1 per case_id (the notebook treats
# the lowest num_group1 as the most recent previous application).
most_recent_previous_application_actualdpd_943P_df = (
    train_applprev_1_df[["case_id", "num_group1", "actualdpd_943P"]]
    .sort_values(by="num_group1", ascending=True)
    .groupby("case_id")[["case_id", "actualdpd_943P"]]
    .head(1)
    .rename(columns={"actualdpd_943P": "generated_most_recent_previous_application_actualdpd_943P"})
)

In [None]:
# Add the most-recent-application value to the per-case statistics; validate enforces a
# single row per case_id on both sides.
# NOTE: re-run the aggregation cell first if this cell has to be executed again.
statistics_actualdpd_943P_df = statistics_actualdpd_943P_df.merge(
    most_recent_previous_application_actualdpd_943P_df,
    on="case_id",
    validate="one_to_one",
)

In [None]:
sns.boxplot(data=statistics_actualdpd_943P_df, y="generated_most_recent_previous_application_actualdpd_943P")

### 1.4 annuity_853A

In [None]:
get_column_description(feature_df, "annuity_853A")

In [None]:
train_applprev_1_df.annuity_853A.isnull().value_counts()

In [None]:
# Per-case_id max / min / mean of annuity_853A over all previous applications.
statistics_annuity_853A_df = (
    train_applprev_1_df
    .groupby("case_id")
    .agg(
        generated_annuity_853A_max=("annuity_853A", "max"),
        generated_annuity_853A_min=("annuity_853A", "min"),
        generated_annuity_853A_mean=("annuity_853A", "mean"),
    )
    .reset_index()
)

In [None]:
# annuity_853A of the row with the smallest num_group1 per case_id (the notebook treats
# the lowest num_group1 as the most recent previous application).
most_recent_previous_application_annuity_853A_df = (
    train_applprev_1_df[["case_id", "num_group1", "annuity_853A"]]
    .sort_values(by="num_group1", ascending=True)
    .groupby("case_id")[["case_id", "annuity_853A"]]
    .head(1)
    .rename(columns={"annuity_853A": "generated_most_recent_previous_application_annuity_853A"})
)

In [None]:
# Add the most-recent-application value to the per-case statistics; validate enforces a
# single row per case_id on both sides.
# NOTE: re-run the aggregation cell first if this cell has to be executed again.
statistics_annuity_853A_df = statistics_annuity_853A_df.merge(
    most_recent_previous_application_annuity_853A_df,
    on="case_id",
    validate="one_to_one",
)

In [None]:
sns.boxplot(data=statistics_annuity_853A_df, y="generated_annuity_853A_mean")

### 1.5 byoccupationinc_3656910L

In [None]:
get_column_description(feature_df, "byoccupationinc_3656910L")

In [None]:
train_applprev_1_df.byoccupationinc_3656910L.isnull().value_counts(normalize=True)

In [None]:
train_applprev_1_df.byoccupationinc_3656910L.describe()

In [None]:
# Per-case_id max / min / mean of byoccupationinc_3656910L over all previous applications.
statistics_byoccupationinc_3656910L_df = (
    train_applprev_1_df
    .groupby("case_id")
    .agg(
        generated_byoccupationinc_3656910L_max=("byoccupationinc_3656910L", "max"),
        generated_byoccupationinc_3656910L_min=("byoccupationinc_3656910L", "min"),
        generated_byoccupationinc_3656910L_mean=("byoccupationinc_3656910L", "mean"),
    )
    .reset_index()
)

In [None]:
# byoccupationinc_3656910L of the row with the smallest num_group1 per case_id (the notebook
# treats the lowest num_group1 as the most recent previous application).
most_recent_previous_application_byoccupationinc_3656910L_df = (
    train_applprev_1_df[["case_id", "num_group1", "byoccupationinc_3656910L"]]
    .sort_values(by="num_group1", ascending=True)
    .groupby("case_id")[["case_id", "byoccupationinc_3656910L"]]
    .head(1)
    .rename(columns={"byoccupationinc_3656910L": "generated_most_recent_previous_application_byoccupationinc_3656910L"})
)

In [None]:
# Add the most-recent-application value to the per-case statistics; validate enforces a
# single row per case_id on both sides.
# NOTE: re-run the aggregation cell first if this cell has to be executed again.
statistics_byoccupationinc_3656910L_df = statistics_byoccupationinc_3656910L_df.merge(
    most_recent_previous_application_byoccupationinc_3656910L_df,
    on="case_id",
    validate="one_to_one",
)

In [None]:
statistics_byoccupationinc_3656910L_df.tail()

In [None]:
sns.boxplot(data=statistics_byoccupationinc_3656910L_df, y="generated_byoccupationinc_3656910L_max")

In [None]:
sns.boxplot(data=train_applprev_1_df, y="byoccupationinc_3656910L", x="status_219L")

## H and S have higher byoccupationinc_3656910L 

### 1.6 cancelreason_3545846M

In [None]:
get_column_description(feature_df, "cancelreason_3545846M")

In [None]:
# Most frequent cancelreason_3545846M per case_id.
statistics_cancelreason_3545846M_df = (
    train_applprev_1_df
    .groupby(by=["case_id"])
    .agg({"cancelreason_3545846M": pd.Series.mode})
    .reset_index()
    .rename({"cancelreason_3545846M": "generated_cancelreason_3545846M_mode"}, axis=1)
)
# The aggregated value is a plain string or a sequence of tied modes (possibly empty).
# Collapse it to a single scalar, as the credtype_587L and district_544M cells do, so the
# column stays hashable for value_counts().
statistics_cancelreason_3545846M_df["generated_cancelreason_3545846M_mode"] = statistics_cancelreason_3545846M_df["generated_cancelreason_3545846M_mode"].apply(
    lambda x: x if isinstance(x, str) else (x[0] if len(x) > 0 else np.nan)
)

In [None]:
# cancelreason_3545846M of the row with the smallest num_group1 per case_id (the notebook
# treats the lowest num_group1 as the most recent previous application).
most_recent_previous_application_cancelreason_3545846M_df = (
    train_applprev_1_df[["case_id", "num_group1", "cancelreason_3545846M"]]
    .sort_values(by="num_group1", ascending=True)
    .groupby("case_id")[["case_id", "cancelreason_3545846M"]]
    .head(1)
    .rename(columns={"cancelreason_3545846M": "generated_most_recent_previous_application_cancelreason_3545846M"})
)

In [None]:
# Add the most-recent-application value to the per-case mode; validate enforces a
# single row per case_id on both sides.
# NOTE: re-run the aggregation cell first if this cell has to be executed again.
statistics_cancelreason_3545846M_df = statistics_cancelreason_3545846M_df.merge(
    most_recent_previous_application_cancelreason_3545846M_df,
    on="case_id",
    validate="one_to_one",
)

In [None]:
statistics_cancelreason_3545846M_df.generated_most_recent_previous_application_cancelreason_3545846M.value_counts()

In [None]:
statistics_cancelreason_3545846M_df.generated_cancelreason_3545846M_mode.value_counts()

### 1.7 childnum_21L

In [None]:
get_column_description(feature_df, "childnum_21L")

In [None]:
# Per-case_id maximum of childnum_21L over all previous applications.
statistics_childnum_21L_df = (
    train_applprev_1_df
    .groupby("case_id")
    .agg(generated_childnum_21L_max=("childnum_21L", "max"))
    .reset_index()
)

In [None]:
sns.boxplot(data=statistics_childnum_21L_df, y="generated_childnum_21L_max")

### 1.8 credacc_actualbalance_314A [IGNORE]

In [None]:
get_column_description(feature_df, "credacc_actualbalance_314A")

In [None]:
train_applprev_1_df.credacc_actualbalance_314A.isnull().value_counts()

In [None]:
# Per-case_id max / min / mean of credacc_actualbalance_314A over all previous applications.
statistics_credacc_actualbalance_314A_df = (
    train_applprev_1_df
    .groupby("case_id")
    .agg(
        generated_credacc_actualbalance_314A_max=("credacc_actualbalance_314A", "max"),
        generated_credacc_actualbalance_314A_min=("credacc_actualbalance_314A", "min"),
        generated_credacc_actualbalance_314A_mean=("credacc_actualbalance_314A", "mean"),
    )
    .reset_index()
)

In [None]:
sns.boxplot(data=statistics_credacc_actualbalance_314A_df, y="generated_credacc_actualbalance_314A_max")

In [None]:
statistics_credacc_actualbalance_314A_df.generated_credacc_actualbalance_314A_max.describe()

### 1.9 credacc_credlmt_575A

In [None]:
get_column_description(feature_df, "credacc_credlmt_575A")

In [None]:
train_applprev_1_df.credacc_credlmt_575A.isnull().value_counts()

In [None]:
# Per-case_id max / min / mean of credacc_credlmt_575A over all previous applications.
statistics_credacc_credlmt_575A_df = (
    train_applprev_1_df
    .groupby("case_id")
    .agg(
        generated_credacc_credlmt_575A_max=("credacc_credlmt_575A", "max"),
        generated_credacc_credlmt_575A_min=("credacc_credlmt_575A", "min"),
        generated_credacc_credlmt_575A_mean=("credacc_credlmt_575A", "mean"),
    )
    .reset_index()
)

In [None]:
# credacc_credlmt_575A of the row with the smallest num_group1 per case_id (the notebook
# treats the lowest num_group1 as the most recent previous application).
most_recent_previous_application_credacc_credlmt_575A_df = (
    train_applprev_1_df[["case_id", "num_group1", "credacc_credlmt_575A"]]
    .sort_values(by="num_group1", ascending=True)
    .groupby("case_id")[["case_id", "credacc_credlmt_575A"]]
    .head(1)
    .rename(columns={"credacc_credlmt_575A": "generated_most_recent_previous_application_credacc_credlmt_575A"})
)

In [None]:
# Add the most-recent-application value to the per-case statistics; validate enforces a
# single row per case_id on both sides.
# NOTE: re-run the aggregation cell first if this cell has to be executed again.
statistics_credacc_credlmt_575A_df = statistics_credacc_credlmt_575A_df.merge(
    most_recent_previous_application_credacc_credlmt_575A_df,
    on="case_id",
    validate="one_to_one",
)

In [None]:
statistics_credacc_credlmt_575A_df.head(5)

In [None]:
sns.boxplot(data=statistics_credacc_credlmt_575A_df, y="generated_credacc_credlmt_575A_max")

### 1.10 credacc_maxhisbal_375A

In [None]:
get_column_description(feature_df, "credacc_maxhisbal_375A")

In [None]:
train_applprev_1_df[["case_id", "credacc_actualbalance_314A", "credacc_maxhisbal_375A"]]

In [None]:
train_applprev_1_df[train_applprev_1_df.case_id == 2703453][["credacc_actualbalance_314A", "credacc_maxhisbal_375A"]]

### 1.11 credacc_minhisbal_90A

In [None]:
get_column_description(feature_df, "credacc_minhisbal_90A")

### 1.12 credacc_status_367L [IGNORE]

In [None]:
# get_column_description(feature_df, "credacc_status_367L")

# train_applprev_1_df.credacc_status_367L.value_counts()

# train_applprev_1_df.credacc_status_367L.isnull().value_counts(normalize=True)

# statistics_credacc_status_367L_df = train_applprev_1_df.groupby(by=["case_id"]).agg({"credacc_status_367L":pd.Series.mode}).reset_index().rename({"credacc_status_367L": "generated_credacc_status_367L_mode"}, axis=1)

# most_recent_previous_application_credacc_status_367L_df = train_applprev_1_df.sort_values(by=["num_group1"], ascending=True).groupby(by=["case_id"])[["case_id", "credacc_status_367L"]].head(1).rename({"credacc_status_367L": "generated_most_recent_previous_application_credacc_status_367L"}, axis=1)

# statistics_credacc_status_367L_df.generated_credacc_status_367L_mode.value_counts()

# most_recent_previous_application_credacc_status_367L_df

### 1.13 credacc_transactions_402L

In [None]:
get_column_description(feature_df, "credacc_transactions_402L")

In [None]:
train_applprev_1_df.credacc_transactions_402L.isnull().value_counts()

In [None]:
# Per-case_id max / min / mean of credacc_transactions_402L over all previous applications.
statistics_credacc_transactions_402L_df = (
    train_applprev_1_df
    .groupby("case_id")
    .agg(
        generated_credacc_transactions_402L_max=("credacc_transactions_402L", "max"),
        generated_credacc_transactions_402L_min=("credacc_transactions_402L", "min"),
        generated_credacc_transactions_402L_mean=("credacc_transactions_402L", "mean"),
    )
    .reset_index()
)

In [None]:
# credacc_transactions_402L of the row with the smallest num_group1 per case_id (the notebook
# treats the lowest num_group1 as the most recent previous application).
most_recent_previous_application_credacc_transactions_402L_df = (
    train_applprev_1_df[["case_id", "num_group1", "credacc_transactions_402L"]]
    .sort_values(by="num_group1", ascending=True)
    .groupby("case_id")[["case_id", "credacc_transactions_402L"]]
    .head(1)
    .rename(columns={"credacc_transactions_402L": "generated_most_recent_previous_application_credacc_transactions_402L"})
)

In [None]:
# Add the most-recent-application value to the per-case statistics; validate enforces a
# single row per case_id on both sides.
# NOTE: re-run the aggregation cell first if this cell has to be executed again.
statistics_credacc_transactions_402L_df = statistics_credacc_transactions_402L_df.merge(
    most_recent_previous_application_credacc_transactions_402L_df,
    on="case_id",
    validate="one_to_one",
)

In [None]:
sns.boxplot(data=statistics_credacc_transactions_402L_df, y="generated_credacc_transactions_402L_max")

### 1.14 credamount_590A

In [None]:
get_column_description(feature_df, "credamount_590A")

In [None]:
# Per-case_id max / min / mean of credamount_590A over all previous applications.
statistics_credamount_590A_df = (
    train_applprev_1_df
    .groupby("case_id")
    .agg(
        generated_credamount_590A_max=("credamount_590A", "max"),
        generated_credamount_590A_min=("credamount_590A", "min"),
        generated_credamount_590A_mean=("credamount_590A", "mean"),
    )
    .reset_index()
)
# Value on the row with the smallest num_group1 per case_id (treated in this notebook as
# the most recent previous application).
most_recent_previous_application_credamount_590A_df = (
    train_applprev_1_df[["case_id", "num_group1", "credamount_590A"]]
    .sort_values(by="num_group1", ascending=True)
    .groupby("case_id")[["case_id", "credamount_590A"]]
    .head(1)
    .rename(columns={"credamount_590A": "generated_most_recent_previous_application_credamount_590A"})
)
# Combine both; validate enforces a single row per case_id on each side.
statistics_credamount_590A_df = statistics_credamount_590A_df.merge(
    most_recent_previous_application_credamount_590A_df,
    on="case_id",
    validate="one_to_one",
)

In [None]:
sns.boxplot(data=statistics_credamount_590A_df, y="generated_credamount_590A_max")

### 1.15 credtype_587L

In [None]:
get_column_description(feature_df, "credtype_587L")

In [None]:
train_applprev_1_df.credtype_587L.value_counts()

In [None]:
# Most frequent credtype_587L per case_id.
statistics_credtype_587L_df = (
    train_applprev_1_df
    .groupby("case_id")
    .agg({"credtype_587L": pd.Series.mode})
    .reset_index()
    .rename(columns={"credtype_587L": "generated_credtype_587L_mode"})
)
# The aggregated value is kept when it is a string; otherwise it is treated as a sequence
# of modes and reduced to its first element (NaN when the sequence is empty).
statistics_credtype_587L_df["generated_credtype_587L_mode"] = statistics_credtype_587L_df["generated_credtype_587L_mode"].apply(
    lambda mode_value: mode_value if isinstance(mode_value, str) else (np.nan if len(mode_value) == 0 else mode_value[0])
)
# Value on the row with the smallest num_group1 per case_id (treated in this notebook as
# the most recent previous application).
most_recent_previous_application_credtype_587L_df = (
    train_applprev_1_df[["case_id", "num_group1", "credtype_587L"]]
    .sort_values(by="num_group1", ascending=True)
    .groupby("case_id")[["case_id", "credtype_587L"]]
    .head(1)
    .rename(columns={"credtype_587L": "generated_most_recent_previous_application_credtype_587L"})
)

In [None]:
# Add the most-recent-application value to the per-case mode; validate enforces a
# single row per case_id on both sides.
# NOTE: re-run the aggregation cell first if this cell has to be executed again.
statistics_credtype_587L_df = statistics_credtype_587L_df.merge(
    most_recent_previous_application_credtype_587L_df,
    on="case_id",
    validate="one_to_one",
)

### 1.16 currdebt_94A

In [None]:
get_column_description(feature_df, "currdebt_94A")

In [None]:
# Per-case_id max / min / mean of currdebt_94A over all previous applications.
statistics_currdebt_94A_df = (
    train_applprev_1_df
    .groupby("case_id")
    .agg(
        generated_currdebt_94A_max=("currdebt_94A", "max"),
        generated_currdebt_94A_min=("currdebt_94A", "min"),
        generated_currdebt_94A_mean=("currdebt_94A", "mean"),
    )
    .reset_index()
)
# Value on the row with the smallest num_group1 per case_id (treated in this notebook as
# the most recent previous application).
most_recent_previous_application_currdebt_94A_df = (
    train_applprev_1_df[["case_id", "num_group1", "currdebt_94A"]]
    .sort_values(by="num_group1", ascending=True)
    .groupby("case_id")[["case_id", "currdebt_94A"]]
    .head(1)
    .rename(columns={"currdebt_94A": "generated_most_recent_previous_application_currdebt_94A"})
)
# Combine both; validate enforces a single row per case_id on each side.
statistics_currdebt_94A_df = statistics_currdebt_94A_df.merge(
    most_recent_previous_application_currdebt_94A_df,
    on="case_id",
    validate="one_to_one",
)

In [None]:
sns.boxplot(data=statistics_currdebt_94A_df, y="generated_currdebt_94A_max")

### 1.17 district_544M

In [None]:
get_column_description(feature_df, "district_544M")

In [None]:
train_applprev_1_df.district_544M.isnull().value_counts()

In [None]:
# Most frequent district_544M per case_id.
statistics_district_544M_df = (
    train_applprev_1_df
    .groupby("case_id")
    .agg({"district_544M": pd.Series.mode})
    .reset_index()
    .rename(columns={"district_544M": "generated_district_544M_mode"})
)
# The aggregated value is kept when it is a string; otherwise it is treated as a sequence
# of modes and reduced to its first element (NaN when the sequence is empty).
statistics_district_544M_df["generated_district_544M_mode"] = statistics_district_544M_df["generated_district_544M_mode"].apply(
    lambda mode_value: mode_value if isinstance(mode_value, str) else (np.nan if len(mode_value) == 0 else mode_value[0])
)
# Value on the row with the smallest num_group1 per case_id (treated in this notebook as
# the most recent previous application).
most_recent_previous_application_district_544M_df = (
    train_applprev_1_df[["case_id", "num_group1", "district_544M"]]
    .sort_values(by="num_group1", ascending=True)
    .groupby("case_id")[["case_id", "district_544M"]]
    .head(1)
    .rename(columns={"district_544M": "generated_most_recent_previous_application_district_544M"})
)

In [None]:
# Add the most-recent-application value to the per-case mode; validate enforces a
# single row per case_id on both sides.
# NOTE: re-run the aggregation cell first if this cell has to be executed again.
statistics_district_544M_df = statistics_district_544M_df.merge(
    most_recent_previous_application_district_544M_df,
    on="case_id",
    validate="one_to_one",
)

In [None]:
statistics_district_544M_df.head(2)

In [None]:
# (statistics_district_544M_df.generated_district_544M_mode == statistics_district_544M_df.generated_most_recent_previous_application_district_544M).value_counts()

### 1.18 downpmt_134A

In [None]:
get_column_description(feature_df, "downpmt_134A")

In [None]:
train_applprev_1_df.downpmt_134A.isnull().value_counts()

In [None]:
# Per-case_id max / min / mean of downpmt_134A over all previous applications.
statistics_downpmt_134A_df = (
    train_applprev_1_df
    .groupby("case_id")
    .agg(
        generated_downpmt_134A_max=("downpmt_134A", "max"),
        generated_downpmt_134A_min=("downpmt_134A", "min"),
        generated_downpmt_134A_mean=("downpmt_134A", "mean"),
    )
    .reset_index()
)
# Value on the row with the smallest num_group1 per case_id (treated in this notebook as
# the most recent previous application).
most_recent_previous_application_downpmt_134A_df = (
    train_applprev_1_df[["case_id", "num_group1", "downpmt_134A"]]
    .sort_values(by="num_group1", ascending=True)
    .groupby("case_id")[["case_id", "downpmt_134A"]]
    .head(1)
    .rename(columns={"downpmt_134A": "generated_most_recent_previous_application_downpmt_134A"})
)
# Combine both; validate enforces a single row per case_id on each side.
statistics_downpmt_134A_df = statistics_downpmt_134A_df.merge(
    most_recent_previous_application_downpmt_134A_df,
    on="case_id",
    validate="one_to_one",
)

In [None]:
statistics_downpmt_134A_df.head(2)

### 1.19 dtlastpmt_581D

In [None]:
get_column_description(feature_df, "dtlastpmt_581D")

### 1.20 dtlastpmtallstes_3545839D

In [None]:
get_column_description(feature_df, "dtlastpmtallstes_3545839D")

### 1.21 education_1138M

In [None]:
get_column_description(feature_df, "education_1138M")

In [None]:
train_applprev_1_df.education_1138M.value_counts()

In [None]:
# education_1138M of the row with the smallest num_group1 per case_id (the notebook treats
# the lowest num_group1 as the most recent previous application).
most_recent_previous_application_education_1138M_df = (
    train_applprev_1_df[["case_id", "num_group1", "education_1138M"]]
    .sort_values(by="num_group1", ascending=True)
    .groupby("case_id")[["case_id", "education_1138M"]]
    .head(1)
    .rename(columns={"education_1138M": "generated_most_recent_previous_application_education_1138M"})
)

### 1.22 employedfrom_700D

In [None]:
get_column_description(feature_df, "employedfrom_700D")

### 1.23 familystate_726L

In [None]:
get_column_description(feature_df, "familystate_726L")

In [None]:
train_applprev_1_df.familystate_726L.isnull().value_counts()

In [None]:
# familystate_726L of the row with the smallest num_group1 per case_id (the notebook treats
# the lowest num_group1 as the most recent previous application).
most_recent_previous_application_familystate_726L_df = (
    train_applprev_1_df[["case_id", "num_group1", "familystate_726L"]]
    .sort_values(by="num_group1", ascending=True)
    .groupby("case_id")[["case_id", "familystate_726L"]]
    .head(1)
    .rename(columns={"familystate_726L": "generated_most_recent_previous_application_familystate_726L"})
)

### 1.24 outstandingdebt_522A

In [None]:
get_column_description(feature_df, "outstandingdebt_522A")

In [None]:
train_applprev_1_df.outstandingdebt_522A.isnull().value_counts()

In [None]:
# Per-case_id max / min / mean of outstandingdebt_522A over all previous applications.
statistics_outstandingdebt_522A_df = (
    train_applprev_1_df
    .groupby("case_id")
    .agg(
        generated_outstandingdebt_522A_max=("outstandingdebt_522A", "max"),
        generated_outstandingdebt_522A_min=("outstandingdebt_522A", "min"),
        generated_outstandingdebt_522A_mean=("outstandingdebt_522A", "mean"),
    )
    .reset_index()
)
# Value on the row with the smallest num_group1 per case_id (treated in this notebook as
# the most recent previous application).
most_recent_previous_application_outstandingdebt_522A_df = (
    train_applprev_1_df[["case_id", "num_group1", "outstandingdebt_522A"]]
    .sort_values(by="num_group1", ascending=True)
    .groupby("case_id")[["case_id", "outstandingdebt_522A"]]
    .head(1)
    .rename(columns={"outstandingdebt_522A": "generated_most_recent_previous_application_outstandingdebt_522A"})
)
# Combine both; validate enforces a single row per case_id on each side.
statistics_outstandingdebt_522A_df = statistics_outstandingdebt_522A_df.merge(
    most_recent_previous_application_outstandingdebt_522A_df,
    on="case_id",
    validate="one_to_one",
)

In [None]:
statistics_outstandingdebt_522A_df.head()

### 1.25 pmtnum_8L

In [None]:
get_column_description(feature_df, "pmtnum_8L")

In [None]:
train_applprev_1_df.pmtnum_8L.isnull().value_counts()

In [None]:
# Per-case_id max / min / mean of pmtnum_8L over all previous applications.
statistics_pmtnum_8L_df = (
    train_applprev_1_df
    .groupby("case_id")
    .agg(
        generated_pmtnum_8L_max=("pmtnum_8L", "max"),
        generated_pmtnum_8L_min=("pmtnum_8L", "min"),
        generated_pmtnum_8L_mean=("pmtnum_8L", "mean"),
    )
    .reset_index()
)
# Value on the row with the smallest num_group1 per case_id (treated in this notebook as
# the most recent previous application).
most_recent_previous_application_pmtnum_8L_df = (
    train_applprev_1_df[["case_id", "num_group1", "pmtnum_8L"]]
    .sort_values(by="num_group1", ascending=True)
    .groupby("case_id")[["case_id", "pmtnum_8L"]]
    .head(1)
    .rename(columns={"pmtnum_8L": "generated_most_recent_previous_application_pmtnum_8L"})
)
# Combine both; validate enforces a single row per case_id on each side.
statistics_pmtnum_8L_df = statistics_pmtnum_8L_df.merge(
    most_recent_previous_application_pmtnum_8L_df,
    on="case_id",
    validate="one_to_one",
)

In [None]:
statistics_pmtnum_8L_df.head()

### 1.26 postype_4733339M [IGNORE]

In [None]:
get_column_description(feature_df, "postype_4733339M")

### 1.27 profession_152M

In [None]:
get_column_description(feature_df, "profession_152M")

In [None]:
# profession_152M of the row with the smallest num_group1 per case_id (the notebook treats
# the lowest num_group1 as the most recent previous application).
# The rename key is the profession column itself; it was previously "education_1138M",
# which matched no column and left the output column un-renamed.
most_recent_previous_application_profession_152M_df = (
    train_applprev_1_df
    .sort_values(by=["num_group1"], ascending=True)
    .groupby(by=["case_id"])[["case_id", "profession_152M"]]
    .head(1)
    .rename({"profession_152M": "generated_most_recent_previous_application_profession_152M"}, axis=1)
)

In [None]:
most_recent_previous_application_profession_152M_df

### 1.28 rejectreason_755M

In [None]:
get_column_description(feature_df, "rejectreason_755M")

In [None]:
# rejectreason_755M of the row with the smallest num_group1 per case_id (the notebook treats
# the lowest num_group1 as the most recent previous application).
most_recent_previous_application_rejectreason_755M_df = (
    train_applprev_1_df[["case_id", "num_group1", "rejectreason_755M"]]
    .sort_values(by="num_group1", ascending=True)
    .groupby("case_id")[["case_id", "rejectreason_755M"]]
    .head(1)
    .rename(columns={"rejectreason_755M": "generated_most_recent_previous_application_rejectreason_755M"})
)

In [None]:
most_recent_previous_application_rejectreason_755M_df.generated_most_recent_previous_application_rejectreason_755M.value_counts()

### 1.29 tenor_203L

In [None]:
get_column_description(feature_df, "tenor_203L")

In [None]:
# Per-case_id max / min / mean of tenor_203L over all previous applications.
statistics_tenor_203L_df = (
    train_applprev_1_df
    .groupby("case_id")
    .agg(
        generated_tenor_203L_max=("tenor_203L", "max"),
        generated_tenor_203L_min=("tenor_203L", "min"),
        generated_tenor_203L_mean=("tenor_203L", "mean"),
    )
    .reset_index()
)
# Value on the row with the smallest num_group1 per case_id (treated in this notebook as
# the most recent previous application).
most_recent_previous_application_tenor_203L_df = (
    train_applprev_1_df[["case_id", "num_group1", "tenor_203L"]]
    .sort_values(by="num_group1", ascending=True)
    .groupby("case_id")[["case_id", "tenor_203L"]]
    .head(1)
    .rename(columns={"tenor_203L": "generated_most_recent_previous_application_tenor_203L"})
)
# Combine both; validate enforces a single row per case_id on each side.
statistics_tenor_203L_df = statistics_tenor_203L_df.merge(
    most_recent_previous_application_tenor_203L_df,
    on="case_id",
    validate="one_to_one",
)

In [None]:
statistics_tenor_203L_df

In [None]:
sns.boxplot(data=statistics_tenor_203L_df, y="generated_tenor_203L_max")