In [None]:
# Import the required libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt



In [None]:
# Authenticate the current Colab user.
# NOTE(review): presumably required so the BigQuery client created later can
# access the project with this user's credentials -- confirm.
from google.colab import auth
auth.authenticate_user()


In [None]:
# big query add-on installation

!pip install --quiet google-cloud-bigquery
from google.cloud import bigquery


# Data Table Import from BigQuery

Application Training Import

In [None]:
# Location of the source table. Defined once so that the client and the query
# always point at the same project.
PROJECT_ID = "homecredit-478707"
DATASET_ID = "Homecredit_Tables"
TABLE_NAME = "application_training"

# Create the BigQuery client
client = bigquery.Client(project=PROJECT_ID)

# Pull the whole table from BigQuery into a pandas DataFrame
query = f"""
SELECT *
FROM `{PROJECT_ID}.{DATASET_ID}.{TABLE_NAME}`
"""
app_train = client.query(query).to_dataframe()

In [None]:
# Preview the first three rows of the loaded table.
app_train.head(3)

In [None]:
# Print the frame's structural summary (columns, dtypes, non-null counts).
app_train.info()

In [None]:
# Descriptive statistics for the columns that describe() selects by default.
app_train.describe()

In [None]:
# Primary key check: does any SK_ID_CURR value occur more than once?
pk_is_unique = app_train["SK_ID_CURR"].is_unique

print("Primary key unique?", pk_is_unique)

In [None]:
app_train["SK_ID_CURR"].isna().sum() # number of missing values in the SK_ID_CURR column

In [None]:
# Show every column name in the loaded table.
app_train.columns

In [None]:
import pandas as pd

# 4) Null ratios: missing values per column divided by the row count,
#    ordered from the most to the least incomplete column
missing_counts = app_train.isna().sum()
null_ratio = missing_counts.div(len(app_train)).sort_values(ascending=False)
print(null_ratio.head(20))  # the 20 columns with the most nulls

# Many variables have a large share of missing values. Because LightGBM will be used, they are left as they are.

In [None]:
# 5) Target distribution

sns.countplot(x='TARGET', data=app_train)
plt.title("Target Distribution")

# Save BEFORE plt.show(): once the figure has been shown it is no longer the
# current figure, so a later savefig would write an empty image.
plt.savefig("target_dist.png")
plt.show()

# Class shares as proportions
print(app_train['TARGET'].value_counts(normalize=True))

The target variable is dominated by less risky applicants (91%).

In [None]:
# 6) Summary statistics of the numerical variables
numeric_cols = list(app_train.select_dtypes(include=['float64', 'int64']).columns)
numeric_summary = app_train[numeric_cols].describe()
print(numeric_summary)

# 106 numerical variables.

In [None]:
# 7) Categorical variables: the 10 most frequent values of each column
categorical_cols = list(app_train.select_dtypes(include=['object']).columns)
for col in categorical_cols:
    most_frequent = app_train[col].value_counts().head(10)
    print(f"{col} value counts:")
    print(most_frequent)


In [None]:
# 8) Bar chart of the 20 highest null ratios
top_null_ratio = null_ratio.head(20)

plt.figure(figsize=(12,6))
sns.barplot(x=top_null_ratio.index, y=top_null_ratio.values)
plt.xticks(rotation=90)
plt.xlabel("Variables")
plt.ylabel("Null Ratio")
plt.title("Top 20 Null Ratios")

# Save BEFORE plt.show(); saving afterwards writes an empty image.
# bbox_inches="tight" keeps the rotated column names inside the saved file.
plt.savefig("null_ratio.png", bbox_inches="tight")
plt.show()

In [None]:
from numpy import append  # NOTE(review): not used in this cell; kept in case another cell relies on it
important_numeric_cols = ['AMT_INCOME_TOTAL', 'AMT_CREDIT']

# Draw every variable into ONE figure (one row per variable: histogram on the
# left, boxplot on the right) so that a single file can hold all of the plots.
# squeeze=False keeps `axes` two-dimensional even when there is one variable.
fig, axes = plt.subplots(
    nrows=len(important_numeric_cols),
    ncols=2,
    figsize=(12, 4 * len(important_numeric_cols)),
    squeeze=False,
)

for (hist_ax, box_ax), col in zip(axes, important_numeric_cols):
    # Histogram
    sns.histplot(app_train[col], kde=True, bins=50, ax=hist_ax)
    hist_ax.set_title(f"{col} Histogram")

    # Boxplot -- pass the values explicitly as `x` so the orientation does not
    # depend on how the installed seaborn version treats a positional argument.
    sns.boxplot(x=app_train[col], ax=box_ax)
    box_ax.set_title(f"{col} Boxplot")

fig.tight_layout()

# Save BEFORE plt.show(); saving afterwards writes an empty image.
fig.savefig("important_numeric_cols.png")
plt.show()


In [None]:
important_categorical_cols = ['NAME_CONTRACT_TYPE', 'CODE_GENDER']

# Draw all count plots side by side in ONE figure so that a single file can
# hold them. squeeze=False keeps `axes` two-dimensional for a single column.
fig, axes = plt.subplots(
    nrows=1,
    ncols=len(important_categorical_cols),
    figsize=(6 * len(important_categorical_cols), 4),
    squeeze=False,
)

for ax, col in zip(axes[0], important_categorical_cols):
    sns.countplot(x=col, data=app_train, ax=ax)
    ax.set_title(f"{col} Count Plot")
    ax.set_ylabel("Count")

fig.tight_layout()

# Save BEFORE plt.show(); saving afterwards writes an empty image.
fig.savefig("important_categorical_cols.png")
plt.show()

# Detailed explanation: exact counts behind each plot
for col in important_categorical_cols:
    print(f"{col} value counts:")
    print(app_train[col].value_counts())

In [None]:
# Correlation of every numeric variable with TARGET (bar chart of the top 20)

# Correlation matrix
numeric_cols = app_train.select_dtypes(include=['float64', 'int64']).columns
corr = app_train[numeric_cols].corr()

# Absolute correlation with the target, strongest first
target_corr = corr['TARGET'].abs().sort_values(ascending=False)

# Highest 20 excluding the target itself. TARGET is removed by name rather than
# by position, so the selection stays correct even if another column ties it.
top_features = target_corr.drop('TARGET').head(20).index

plt.figure(figsize=(10,6))
sns.barplot(x=target_corr[top_features], y=top_features)
plt.title("Top 20 Highest Correlations with TARGET")
# The plotted values are absolute values, so the sign of the relationship is
# not visible in this chart; the label says so explicitly.
plt.xlabel("Absolute Correlation")
plt.ylabel("Features")

# Save BEFORE plt.show(); saving afterwards writes an empty image.
plt.savefig("highest_corr_with_target.png", bbox_inches="tight")
plt.show()

print(top_features)

In [None]:
#  |corr| > 0.3 Variables Heatmap

# Correlation matrix
numeric_cols = app_train.select_dtypes(include=['float64', 'int64']).columns
corr = app_train[numeric_cols].corr()

# Threshold
threshold = 0.3

# Pairs whose absolute correlation exceeds the threshold.
strong_pairs = corr.abs() > threshold

# Every variable correlates perfectly with itself, so the diagonal must be
# ignored -- otherwise each column trivially passes and nothing is filtered.
for name in strong_pairs.columns:
    strong_pairs.loc[name, name] = False

# Variables with at least one OTHER variable over the threshold
filtered_cols = corr.columns[strong_pairs.any()]

plt.figure(figsize=(12,10))
sns.heatmap(corr.loc[filtered_cols, filtered_cols], cmap="coolwarm", center=0)
plt.title("Correlation Heatmap (|corr| > 0.3)")
plt.show()


In [None]:
# Heatmap of the 10 features most strongly correlated with TARGET.
# Uses the correlation matrix `corr` computed in an earlier cell.
target_corr = corr['TARGET'].abs().sort_values(ascending=False)

# Remove TARGET by name (not by position) before taking the top 10.
top10 = target_corr.drop('TARGET').head(10).index

plt.figure(figsize=(8,6))
sns.heatmap(corr.loc[top10, top10], annot=True, cmap="coolwarm", center=0)
plt.title("Top 10 Correlated Features Heatmap")

# Save BEFORE plt.show(); saving afterwards writes an empty image.
plt.savefig("top10_corr_heatmap.png", bbox_inches="tight")
plt.show()

print(top10)

In [None]:
# (Feature Type Summary)
# Column counts by kind; a column is "binary" when it has exactly two
# distinct non-null values.

n_numeric = len(app_train.select_dtypes(include=['float64','int64']).columns)
n_categorical = len(app_train.select_dtypes(include=['object']).columns)
n_binary = sum(app_train[col].nunique() == 2 for col in app_train.columns)

summary = {
    "Total Columns": app_train.shape[1],
    "Numeric Columns": n_numeric,
    "Categorical Columns": n_categorical,
    "Binary Columns": n_binary,
}

summary


In [None]:
# Categorical Feature Cardinality Analysis
# Prints each object-typed column followed by its number of distinct values.

cat_cols = app_train.select_dtypes(include=['object']).columns

for col, n_unique in app_train[cat_cols].nunique().items():
    print(col, n_unique)


High-cardinality features were identified for special encoding (Organization type cannot be encoded!).

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

# ---------------------------
# High-correlation / important features
# These columns are iterated further down in this cell to print an IQR outlier
# summary and to draw a histogram and a boxplot for each of them.
# NOTE(review): the per-feature notes below are the author's domain
# annotations; they are not derived from code in this notebook.
# ---------------------------
high_corr_cols = [
    "EXT_SOURCE_3",                 # External risk score 3 from bureau; strong predictor of default
    "EXT_SOURCE_2",                 # External risk score 2 from bureau; highly correlated with EXT_SOURCE_3
    "EXT_SOURCE_1",                 # External risk score 1; commonly used in credit scoring models
    "DAYS_BIRTH",                   # Client's age in negative days; younger clients usually higher risk
    "REGION_RATING_CLIENT_W_CITY",  # Socio-economic rating of client's region including city; lower rating → higher risk
    "REGION_RATING_CLIENT",         # Socio-economic rating of client's region without city
    "DAYS_LAST_PHONE_CHANGE",       # Days since last phone change; recent change may indicate instability
    "DAYS_ID_PUBLISH",              # Days since ID was issued; recently issued IDs may indicate higher risk
    "REG_CITY_NOT_WORK_CITY",       # Client works in a city different from registered city; mismatch → risk signal
    "FLAG_EMP_PHONE",               # Employment phone provided? Missing slightly increases risk
    "DAYS_EMPLOYED",                # Days employed (negative); special value 365243 = not employed
    "AMT_GOODS_PRICE",              # Price of purchased goods financed by loan; higher prices may affect risk
    "AMT_INCOME_TOTAL",             # Total income of the client; key indicator of repayment capacity
    "AMT_CREDIT",                   # Total credit amount; strongly correlated with repayment risk
    "AMT_ANNUITY"                   # Monthly loan installment; important for debt-to-income ratio
]

# ---------------------------
# Coerce every column that might not be numeric to a numeric dtype
# ---------------------------
numeric_cols = [
    "DAYS_BIRTH", "DAYS_EMPLOYED", "DAYS_LAST_PHONE_CHANGE", "DAYS_ID_PUBLISH",
    "REGION_RATING_CLIENT_W_CITY", "REGION_RATING_CLIENT",
    "REG_CITY_NOT_WORK_CITY", "FLAG_EMP_PHONE",
    "EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3",
    "AMT_GOODS_PRICE", "AMT_INCOME_TOTAL", "AMT_CREDIT", "AMT_ANNUITY",
]

# errors='coerce' turns any value that cannot be parsed into NaN.
# The columns of app_train are overwritten with their converted versions.
col = None
for col in numeric_cols:
    converted = pd.to_numeric(app_train[col], errors='coerce')
    app_train[col] = converted

# ---------------------------
# Function to detect outliers using IQR
# ---------------------------
def outlier_summary(df, col, k=1.5):
    """
    Print an IQR-based outlier summary for one numeric column.

    A value counts as an outlier when it lies below ``Q1 - k * IQR`` or above
    ``Q3 + k * IQR``, where Q1/Q3 are the 25th/75th percentiles of the column
    and ``IQR = Q3 - Q1``. Missing values are never counted as outliers.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the column.
    col : str
        Name of the numeric column to analyse.
    k : float, default 1.5
        Multiplier applied to the IQR to build the fences. The default
        reproduces the conventional 1.5 * IQR rule.

    Returns
    -------
    None
        The column name, both bounds and the outlier count are printed.
    """
    values = df[col]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower = q1 - k * iqr
    upper = q3 + k * iqr

    # Count directly on the boolean mask; filtering the whole frame just to
    # take its length would copy every column of every outlier row.
    outlier_count = int(((values < lower) | (values > upper)).sum())

    print(f"--- {col} ---")
    print(f"Lower bound: {lower:.2f}, Upper bound: {upper:.2f}")
    print(f"Outlier count: {outlier_count}\n")

# ---------------------------
# For each feature: print its outlier summary, then draw its distribution
# ---------------------------
for col in high_corr_cols:
    # 1) IQR-based outlier summary
    outlier_summary(app_train, col)

    # 2) Histogram (left panel) and boxplot (right panel) of the non-null values
    fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(12,4))

    sns.histplot(app_train[col].dropna(), bins=50, kde=True, color='skyblue', ax=hist_ax)
    hist_ax.set_title(f"{col} Histogram")

    sns.boxplot(x=app_train[col].dropna(), color='lightgreen', ax=box_ax)
    box_ax.set_title(f"{col} Boxplot")

    fig.tight_layout()
    plt.show()


The key drivers of credit risk in the Home Credit dataset are captured by a combination of external risk scores, demographic characteristics, socio-economic factors, and behavioral stability indicators. Specifically:

External risk scores (EXT_SOURCE_1, EXT_SOURCE_2, EXT_SOURCE_3) are the most powerful predictors of credit default, summarizing historical credit behavior and bureau information.

Age (DAYS_BIRTH) is a strong demographic signal, with younger clients generally showing higher default probability.

Regional socio-economic indicators (REGION_RATING_CLIENT, REGION_RATING_CLIENT_W_CITY) reflect the financial and social environment of the client, influencing repayment behavior.

Behavioral stability features (DAYS_LAST_PHONE_CHANGE, DAYS_ID_PUBLISH, REG_CITY_NOT_WORK_CITY, FLAG_EMP_PHONE) provide insight into personal stability and consistency, which are well-known factors in credit risk modeling.

Financial characteristics (AMT_GOODS_PRICE, AMT_INCOME_TOTAL, AMT_CREDIT, AMT_ANNUITY, DAYS_EMPLOYED) capture the client’s income, debt burden, and employment duration, which are essential for assessing repayment capacity.

Together, these variables exhibit the highest correlation with the target variable and are widely recognized in the credit risk literature as robust structural predictors of default.

In [None]:
# Show the column names of app_train again.
app_train.columns