In [12]:
import pandas as pd
import numpy as np


In [13]:
# Read application_train.csv into a DataFrame.
# The path is relative, so it resolves against the kernel's current working directory.
df = pd.read_csv(filepath_or_buffer='../data/raw/application_train.csv')

In [14]:
# Size at a glance: (rows, columns), deep memory footprint converted from bytes
# to MiB, and the first 20 column names.
print("shape:", df.shape)
print("memory (MB):", df.memory_usage(deep=True).sum() / (1024 * 1024))
print("\nColumns sample:", list(df.columns[:20]))

shape: (307511, 122)
memory (MB): 504.98532581329346

Columns sample: ['SK_ID_CURR', 'TARGET', 'NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE', 'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'REGION_POPULATION_RELATIVE', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_REGISTRATION']


In [16]:
# Fraction of missing values per column, ordered from most to least missing.
missing = df.isnull().mean().sort_values(ascending=False)
# Number of distinct non-null values per column, ordered from fewest to most.
unique_counts = df.nunique().sort_values()

print("\nTop missing columns:\n", missing.iloc[:20])
# unique_counts is sorted ascending, so each listing below starts at the low end
# of its range and shows at most 40 entries (for the ">50" listing these are the
# least-cardinal columns above the threshold, not the most-cardinal ones).
print("\nColumns with few uniques (<=5):\n", unique_counts.loc[lambda counts: counts <= 5].iloc[:40])
print("\nHigh-cardinality columns (>50 uniques):\n", unique_counts.loc[lambda counts: counts > 50].iloc[:40])


Top missing columns:
 COMMONAREA_MEDI             0.698723
COMMONAREA_AVG              0.698723
COMMONAREA_MODE             0.698723
NONLIVINGAPARTMENTS_MODE    0.694330
NONLIVINGAPARTMENTS_AVG     0.694330
NONLIVINGAPARTMENTS_MEDI    0.694330
FONDKAPREMONT_MODE          0.683862
LIVINGAPARTMENTS_MODE       0.683550
LIVINGAPARTMENTS_AVG        0.683550
LIVINGAPARTMENTS_MEDI       0.683550
FLOORSMIN_AVG               0.678486
FLOORSMIN_MODE              0.678486
FLOORSMIN_MEDI              0.678486
YEARS_BUILD_MEDI            0.664978
YEARS_BUILD_MODE            0.664978
YEARS_BUILD_AVG             0.664978
OWN_CAR_AGE                 0.659908
LANDAREA_MEDI               0.593767
LANDAREA_MODE               0.593767
LANDAREA_AVG                0.593767
dtype: float64

Columns with few uniques (<=5):
 LIVE_CITY_NOT_WORK_CITY        2
FLAG_DOCUMENT_18               2
FLAG_DOCUMENT_14               2
FLAG_MOBIL                     2
FLAG_EMP_PHONE                 2
FLAG_WORK_PHONE        

In [18]:
# Class balance: share of rows for each TARGET value (proportions, not raw counts).
print(
    df['TARGET']
    .value_counts(normalize=True)
)

TARGET
0    0.919271
1    0.080729
Name: proportion, dtype: float64


In [24]:
# Descriptive statistics for the numeric columns, one row per column, with skewness
# appended as an extra column.
num = df.select_dtypes(include="number")
desc = (
    num
    .drop(columns=['SK_ID_CURR', 'TARGET'])  # leave the id and label columns out of the summary
    .describe()
    .transpose()
    # Skew is computed on every numeric column; assign() matches it to the summary
    # rows by column name, so entries for the dropped columns are simply not used.
    .assign(skew=num.skew().round(3))
)
desc.iloc[:20]

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,skew
CNT_CHILDREN,307511.0,0.417052,0.722121,0.0,0.0,0.0,1.0,19.0,1.975
AMT_INCOME_TOTAL,307511.0,168797.919297,237123.146279,25650.0,112500.0,147150.0,202500.0,117000000.0,391.56
AMT_CREDIT,307511.0,599025.999706,402490.776996,45000.0,270000.0,513531.0,808650.0,4050000.0,1.235
AMT_ANNUITY,307499.0,27108.573909,14493.737315,1615.5,16524.0,24903.0,34596.0,258025.5,1.58
AMT_GOODS_PRICE,307233.0,538396.207429,369446.46054,40500.0,238500.0,450000.0,679500.0,4050000.0,1.349
REGION_POPULATION_RELATIVE,307511.0,0.020868,0.013831,0.00029,0.010006,0.01885,0.028663,0.072508,1.488
DAYS_BIRTH,307511.0,-16036.995067,4363.988632,-25229.0,-19682.0,-15750.0,-12413.0,-7489.0,-0.116
DAYS_EMPLOYED,307511.0,63815.045904,141275.766519,-17912.0,-2760.0,-1213.0,-289.0,365243.0,1.664
DAYS_REGISTRATION,307511.0,-4986.120328,3522.886321,-24672.0,-7479.5,-4504.0,-2010.0,0.0,-0.591
DAYS_ID_PUBLISH,307511.0,-2994.202373,1509.450419,-7197.0,-4299.0,-3254.0,-1720.0,0.0,0.349


In [26]:
# Age diagnostic: derive a whole-year age from DAYS_BIRTH and compare the mean of
# TARGET across ten equal-width age intervals.
if "DAYS_BIRTH" in df:
    # DAYS_BIRTH is negative in the summary shown above, so dividing by -365 yields
    # a positive number of years; astype(int) then drops the fractional part.
    # NOTE: this adds the AGE_YEARS column to df in place.
    df["AGE_YEARS"] = (df["DAYS_BIRTH"] / -365).astype(int)
    print("\nAGE_YEARS range:", df["AGE_YEARS"].min(), df["AGE_YEARS"].max())
    if "TARGET" in df:
        # Ten equal-width intervals spanning the observed age range.
        bins = pd.cut(df["AGE_YEARS"], bins=10)
        # Mean of TARGET within each interval; observed=True keeps only intervals
        # that actually contain rows.
        age_rate = df["TARGET"].groupby(bins, observed=True).mean().round(4)
        print("\nTARGET rate by age bin:\n", age_rate)


AGE_YEARS range: 20 69

TARGET rate by age bin:
 AGE_YEARS
(19.951, 24.9]    0.1231
(24.9, 29.8]      0.1114
(29.8, 34.7]      0.1029
(34.7, 39.6]      0.0893
(39.6, 44.5]      0.0786
(44.5, 49.4]      0.0742
(49.4, 54.3]      0.0670
(54.3, 59.2]      0.0553
(59.2, 64.1]      0.0528
(64.1, 69.0]      0.0372
Name: TARGET, dtype: float64


In [27]:
# DAYS_EMPLOYED placeholder check (the Kaggle dataset uses 365243).
# If the count of positive values equals the count of exact 365243 matches,
# every positive entry is that placeholder value.
if "DAYS_EMPLOYED" in df:
    sentinel365 = df["DAYS_EMPLOYED"].eq(365243).sum()
    sentinel = df["DAYS_EMPLOYED"].gt(0).sum()
    print("\nDAYS_EMPLOYED > 0 count:", int(sentinel))
    print("DAYS_EMPLOYED == 365243 count:", int(sentinel365))
    print("DAYS_EMPLOYED describe:\n", df["DAYS_EMPLOYED"].describe())


DAYS_EMPLOYED > 0 count: 55374
DAYS_EMPLOYED == 365243 count: 55374
DAYS_EMPLOYED describe:
 count    307511.000000
mean      63815.045904
std      141275.766519
min      -17912.000000
25%       -2760.000000
50%       -1213.000000
75%        -289.000000
max      365243.000000
Name: DAYS_EMPLOYED, dtype: float64


In [30]:
# Binary-like columns detection (0/1)
def _is_binary_like(values: pd.Series) -> bool:
    """Return True when the non-null values of ``values`` are all 0 or 1.

    A column with no non-null values at all is NOT treated as binary: its value
    set is empty, and an empty set is trivially a subset of {0, 1}, which would
    otherwise flag entirely-missing columns as 0/1 indicators.
    Columns holding only 0 or only 1 still qualify.
    """
    observed = set(values.dropna().unique())
    return bool(observed) and observed <= {0, 1}


binary_cols = [c for c in df.columns if _is_binary_like(df[c])]
print(f"\nBinary-like columns (0/1) count {len(binary_cols)}:", binary_cols)


Binary-like columns (0/1) count 33: ['TARGET', 'FLAG_MOBIL', 'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE', 'FLAG_PHONE', 'FLAG_EMAIL', 'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION', 'LIVE_REGION_NOT_WORK_REGION', 'REG_CITY_NOT_LIVE_CITY', 'REG_CITY_NOT_WORK_CITY', 'LIVE_CITY_NOT_WORK_CITY', 'FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_3', 'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_5', 'FLAG_DOCUMENT_6', 'FLAG_DOCUMENT_7', 'FLAG_DOCUMENT_8', 'FLAG_DOCUMENT_9', 'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_11', 'FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_13', 'FLAG_DOCUMENT_14', 'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_16', 'FLAG_DOCUMENT_17', 'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20', 'FLAG_DOCUMENT_21']


In [31]:
# Extreme-value check for the amount columns: print the minimum, the 1st and 99th
# percentiles, and the maximum so outliers at either end are easy to spot.
for col in ("AMT_INCOME_TOTAL", "AMT_CREDIT", "AMT_ANNUITY", "AMT_GOODS_PRICE"):
    if col not in df.columns:
        continue  # skip columns that are absent from this frame
    s = df[col].dropna()  # statistics are computed on non-missing values only
    print(f"\n{col}: min={s.min():.2f}, 1%={s.quantile(0.01):.2f}, 99%={s.quantile(0.99):.2f}, max={s.max():.2f}")


AMT_INCOME_TOTAL: min=25650.00, 1%=45000.00, 99%=472500.00, max=117000000.00

AMT_CREDIT: min=45000.00, 1%=76410.00, 99%=1854000.00, max=4050000.00

AMT_ANNUITY: min=1615.50, 1%=6182.91, 99%=70006.50, max=258025.50

AMT_GOODS_PRICE: min=40500.00, 1%=67500.00, 99%=1800000.00, max=4050000.00
