# Credit Loss Forecasting Model

In [1]:
# Outline of the notebook's five parts, in order.
# NOTE: this is a bare string literal rather than a comment, so as the cell's
# only expression its value is echoed back as the cell output.
"""
Part 1: Setup and Initial Data Analysis
Part 2: Data Cleaning and Preprocessing
Part 3: Univariate Analysis
Part 4: Feature Engineering
Part 5: Model Deployment
"""

'\nPart 1: Setup and Initial Data Analysis\nPart 2: Data Cleaning and Preprocessing\nPart 3: Univariate Analysis\nPart 4: Feature Engineering\nPart 5: Model Deployment\n'

## Part 1: Setup and Initial Data Analysis

In [2]:
import os

import numpy as np
import pandas as pd

In [3]:
# Title banner: a 60-character rule above and below the notebook title.
# One print call with a newline separator emits the same three lines.
print("=" * 60, "CREDIT LOSS FORECASTING MODEL", "=" * 60, sep="\n")

CREDIT LOSS FORECASTING MODEL


In [4]:
print("\n Step1: Load Data")

# Root folder that contains the unpacked Kaggle "home-credit-default-risk"
# directory. Set the CREDIT_RISK_BASE_DIR environment variable to run the
# notebook on another machine; the fallback is the original hardcoded path.
# `base_dir` stays a plain string so any code that concatenates onto it
# keeps working.
base_dir = os.environ.get(
    "CREDIT_RISK_BASE_DIR",
    "/Users/koushalsmodi/Desktop/MachineLearning/MachineLearningProjects/TensorFlow/CreditRisk",
)
train_csv_path = os.path.join(base_dir, "home-credit-default-risk", "application_train.csv")

try:
    df = pd.read_csv(train_csv_path)
except FileNotFoundError as exc:
    # Stop here with an actionable message. Printing a hint and carrying on
    # would leave `df` undefined, and the shape report below would then fail
    # with a NameError that hides the real cause.
    raise FileNotFoundError(
        f"Could not find {train_csv_path!r}. "
        "Please download 'application_train.csv' from Kaggle: "
        "https://www.kaggle.com/competitions/home-credit-default-risk/data"
    ) from exc

# Only reached when the CSV was read successfully.
print(f"Loaded application_train.csv: {df.shape}")
print(f"\n Dataset shape: {df.shape[0]} rows * {df.shape[1]} columns")



 Step1: Load Data
Loaded application_train.csv: (307511, 122)

 Dataset shape: 307511 rows * 122 columns


In [5]:
# Section banner for the exploration cells: rule, heading, rule.
print("\n".join(["=" * 60, "Initial Data Exploration", "=" * 60]))

Initial Data Exploration


In [6]:
# How many columns there are of each dtype, printed under a heading.
print("\n [Data Types]", df.dtypes.value_counts(), sep="\n")


 [Data Types]
float64    65
int64      41
str        16
Name: count, dtype: int64


In [7]:
# Peek at the first five records to sanity-check the load.
print("\n [First Few Rows] ", df.head(n=5), sep="\n")


 [First Few Rows] 
   SK_ID_CURR  TARGET NAME_CONTRACT_TYPE CODE_GENDER FLAG_OWN_CAR  \
0      100002       1         Cash loans           M            N   
1      100003       0         Cash loans           F            N   
2      100004       0    Revolving loans           M            Y   
3      100006       0         Cash loans           F            N   
4      100007       0         Cash loans           M            N   

  FLAG_OWN_REALTY  CNT_CHILDREN  AMT_INCOME_TOTAL  AMT_CREDIT  AMT_ANNUITY  \
0               Y             0          202500.0    406597.5      24700.5   
1               N             0          270000.0   1293502.5      35698.5   
2               Y             0           67500.0    135000.0       6750.0   
3               Y             0          135000.0    312682.5      29686.5   
4               Y             0          121500.0    513000.0      21865.5   

   ...  FLAG_DOCUMENT_18 FLAG_DOCUMENT_19 FLAG_DOCUMENT_20 FLAG_DOCUMENT_21  \
0  ...           

In [8]:
# Summary statistics (count, mean, std, quartiles, min/max) from describe().
print("\n".join(["\n [Basic Statistics]", str(df.describe())]))


 [Basic Statistics]
          SK_ID_CURR         TARGET   CNT_CHILDREN  AMT_INCOME_TOTAL  \
count  307511.000000  307511.000000  307511.000000      3.075110e+05   
mean   278180.518577       0.080729       0.417052      1.687979e+05   
std    102790.175348       0.272419       0.722121      2.371231e+05   
min    100002.000000       0.000000       0.000000      2.565000e+04   
25%    189145.500000       0.000000       0.000000      1.125000e+05   
50%    278202.000000       0.000000       0.000000      1.471500e+05   
75%    367142.500000       0.000000       1.000000      2.025000e+05   
max    456255.000000       1.000000      19.000000      1.170000e+08   

         AMT_CREDIT    AMT_ANNUITY  AMT_GOODS_PRICE  \
count  3.075110e+05  307499.000000     3.072330e+05   
mean   5.990260e+05   27108.573909     5.383962e+05   
std    4.024908e+05   14493.737315     3.694465e+05   
min    4.500000e+04    1615.500000     4.050000e+04   
25%    2.700000e+05   16524.000000     2.385000e+05   


In [9]:
# Number of rows in the frame (the first entry of its shape).
print(df.shape[0])

307511


In [10]:
print("\n Missing Values Summary")

# Per-column count of null cells, then the same expressed as a percentage of
# all rows. Both names are reused when the ranked table is built.
missing = df.isna().sum()
missing_pct = missing.mul(100).div(len(df))
print(missing_pct)


 Missing Values Summary
SK_ID_CURR                     0.000000
TARGET                         0.000000
NAME_CONTRACT_TYPE             0.000000
CODE_GENDER                    0.000000
FLAG_OWN_CAR                   0.000000
                                ...    
AMT_REQ_CREDIT_BUREAU_DAY     13.501631
AMT_REQ_CREDIT_BUREAU_WEEK    13.501631
AMT_REQ_CREDIT_BUREAU_MON     13.501631
AMT_REQ_CREDIT_BUREAU_QRT     13.501631
AMT_REQ_CREDIT_BUREAU_YEAR    13.501631
Length: 122, dtype: float64


In [11]:
# Combine the count and percentage series into one table, worst columns first.
missing_columns = {
    "Missing_Count": missing,
    "Missing_Percentage": missing_pct,
}
missing_df = pd.DataFrame(missing_columns).sort_values(
    by="Missing_Percentage", ascending=False
)

# Show only columns that actually have gaps, limited to the top ten.
print(missing_df.loc[missing_df["Missing_Count"].gt(0)].head(10))

                          Missing_Count  Missing_Percentage
COMMONAREA_MEDI                  214865           69.872297
COMMONAREA_AVG                   214865           69.872297
COMMONAREA_MODE                  214865           69.872297
NONLIVINGAPARTMENTS_MODE         213514           69.432963
NONLIVINGAPARTMENTS_AVG          213514           69.432963
NONLIVINGAPARTMENTS_MEDI         213514           69.432963
FONDKAPREMONT_MODE               210295           68.386172
LIVINGAPARTMENTS_MODE            210199           68.354953
LIVINGAPARTMENTS_AVG             210199           68.354953
LIVINGAPARTMENTS_MEDI            210199           68.354953


In [12]:
# Next step: Data Cleaning and Preprocessing