In [1]:
# import all necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os

ModuleNotFoundError: No module named 'seaborn'

In [None]:
# Raw GitHub locations of the home-loan practice CSVs (train split, test split)
train_data, test_data = (
    "https://raw.githubusercontent.com/ek-chris/Practice_datasets/refs/heads/main/home_loan_train.csv",
    "https://raw.githubusercontent.com/ek-chris/Practice_datasets/refs/heads/main/home_loan_test.csv",
)

In [None]:
# Categorical columns: every column whose dtype is neither int64 nor float64,
# followed by "Loan_Amount_Term", which is always added explicitly
# (presumably it is stored as a number but treated as a category - confirm).
cat_cols = [
    name
    for name, dtype in df_train.dtypes.items()
    if dtype not in ["int64", "float64"]
] + ["Loan_Amount_Term"]

In [None]:
# Numerical columns are whatever remains once the categorical ones are masked out
num_col = df_train.columns[~df_train.columns.isin(cat_cols)].tolist()

In [None]:
# Define a function to check the statistical summary
def univariate_numerical_eda(df, column):
    """
    Perform univariate EDA on a single numerical column.

    Prints the descriptive statistics, skewness and kurtosis of ``df[column]``,
    draws a histogram with a KDE overlay next to a boxplot, and finally prints
    a short note classifying how skewed the column is.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds the column to analyse.
    column : str
        Name of the column in ``df``.

    Returns
    -------
    None
        Everything is reported through printed text and the displayed figure.
    """
    # Always read from the frame that was passed in (not from a notebook
    # global) so the printed statistics and the plots describe the same data.
    values = df[column]
    skewness = values.skew()  # computed once, reused for the verdict below

    print(f"Feature: {column}")
    print("-" * 40)
    print(values.describe().to_frame())
    print(f"Skewness: {skewness:.3f}")
    print(f"Kurtosis: {values.kurt():.3f}")

    fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(12, 4))

    # Histogram + KDE
    sns.histplot(values, kde=True, bins=30, color='orange', ax=ax_hist)
    ax_hist.set_title(f"\n Distribution of {column}", fontsize=13)
    ax_hist.set_xlabel(column)
    ax_hist.set_ylabel("Frequency")

    # Box Plot
    sns.boxplot(x=values, color='orange', ax=ax_box)
    ax_box.set_title(f"\n Boxplot of {column}", fontsize=13)

    fig.tight_layout()
    plt.show()

    # Transformation hint: |skew| > 1 is reported as highly skewed,
    # |skew| > 0.5 as moderately skewed, anything else as fairly symmetric.
    if abs(skewness) > 1:
        print(f"\n {column} is highly skewed, Consider log or Box-Cox transformation")
    elif abs(skewness) > 0.5:
        print(f"\n{column} is moderately skewed")
    else:
        print(f"{column} is fairly symmetric")

In [None]:

# Summary statistics, histogram and boxplot for the applicant income column
univariate_numerical_eda(df=df_train, column="ApplicantIncome")

In [None]:

# Run the same univariate summary for every numerical column
for col in num_col:
    univariate_numerical_eda(df=df_train, column=col)

In [None]:
def univariate_categorical_eda(df, column):
    """
    Perform univariate EDA on a single categorical column.

    Prints a frequency/percentage table for ``df[column]`` and draws a bar
    chart (annotated with percentages) next to a pie chart of the same counts.
    Missing values are excluded from the table, the bar annotations and the
    pie chart alike, so all three report the same percentages.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds the column to analyse.
    column : str
        Name of the column in ``df``.

    Returns
    -------
    None
        Everything is reported through printed text and the displayed figure.
    """

    print(f"Feature: {column}")
    print("="*40)

    # Frequency + proportion (value_counts drops missing values by default)
    freq = df[column].value_counts()
    prop = df[column].value_counts(normalize=True) * 100
    summary = pd.DataFrame({"Count": freq, "Percentage": prop.round(2)})
    print(summary)
    print()

    # Nothing to draw when the column holds no non-missing values.
    if freq.empty:
        print(f"{column} has no non-missing values to plot")
        return

    fig, (ax_bar, ax_pie) = plt.subplots(1, 2, figsize=(8, 5))

    # Visualization (Barchart) - drawn from the frame passed in, not a global.
    # Only `palette` is given: the original also passed `color`, which appears
    # redundant next to a palette.
    sns.countplot(x=column, data=df, palette="Set3", order=freq.index, ax=ax_bar)
    ax_bar.set_title(f"Distribution Barchart of {column}", fontsize=13)
    ax_bar.set_xlabel(column)
    ax_bar.set_ylabel("Count")

    # Annotate each bar with its share of the non-missing rows, matching the
    # percentages printed in the table above.
    total = freq.sum()
    for bar in ax_bar.patches:
        height = bar.get_height()
        ax_bar.text(
            bar.get_x() + bar.get_width() / 2,
            height + 1,
            f"{(height / total) * 100:.1f}%",
            ha="center",
        )

    # Visualization (piechart) - same counts as the bar chart, for `column`
    ax_pie.pie(freq.to_numpy(), labels=freq.index, autopct="%1.1f%%", startangle=90)
    ax_pie.set_title(f"Distribution Piechart of {column}", fontsize=13)

    fig.tight_layout()
    plt.show()

In [None]:

# Inspect the dtype pandas assigned to each column of the training frame
df_train.dtypes

In [None]:

# Keep only the numerical features of the training frame. The lowercase numpy
# dtype names are used here, matching the dtype filters elsewhere in this
# notebook; select_dtypes already returns a DataFrame, so no extra wrapping
# is needed and `num_features` only ever refers to a DataFrame.
num_features = df_train.select_dtypes(include=['int64', 'float64'])
num_features.head(2)

In [None]:

# Count the missing values in each column of the training frame
df_train.isna().sum()

In [None]:
def correlation_with_target(df_train, target_col, figsize=(8,5), feature_cols=None):
    """
    Plot a one-row heatmap of the correlation of each numeric feature with
    ``target_col`` and return those correlations.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Frame holding both the features and the target column.
    target_col : str
        Name of the (numeric) target column in ``df_train``.
    figsize : tuple, default (8, 5)
        Size of the figure passed to matplotlib.
    feature_cols : iterable of str, optional
        Columns to correlate with the target. When omitted, every numeric
        column of ``df_train`` is used. The target itself is always left out.

    Returns
    -------
    pandas.Series
        Correlation of each feature with the target, ordered by absolute
        magnitude (largest first).

    Raises
    ------
    KeyError
        If ``target_col`` is not a column of ``df_train``.
    TypeError
        If the target column is not numeric (encode it first).
    ValueError
        If there is no feature column left to correlate.
    """
    target = df_train[target_col]
    if not pd.api.types.is_numeric_dtype(target):
        raise TypeError(
            f"Target column '{target_col}' is not numeric; "
            "encode it as numbers before computing correlations"
        )

    # Derive the features from the frame that was passed in rather than from
    # a notebook-level list, and never correlate the target with itself.
    if feature_cols is None:
        feature_cols = df_train.select_dtypes(include="number").columns
    feature_cols = [name for name in feature_cols if name != target_col]
    if not feature_cols:
        raise ValueError("No feature columns available to correlate with the target")

    # Compute correlations of every selected column with the target
    corrs = df_train[feature_cols].corrwith(target)

    # Naming the single row after the target lets the heatmap label it itself.
    corrs_df = corrs.to_frame(name=target_col).T  # shape (1, n)

    fig, ax = plt.subplots(figsize=figsize)
    sns.heatmap(
        corrs_df,
        annot=True,
        fmt=".3f",
        cmap="coolwarm",
        center=0,
        vmin=-1,
        vmax=1,
        cbar_kws={"orientation": "vertical", "shrink": 0.7},
        ax=ax,
    )

    # Unrotated tick labels read best when centred under/next to their cell.
    plt.setp(ax.get_xticklabels(), rotation=0, ha="center")
    plt.setp(ax.get_yticklabels(), rotation=0)
    ax.set_title(f"Correlation of numeric features with {target_col}", fontsize=12)
    fig.tight_layout()
    plt.show()

    # Return sorted correlation for downstream use
    return corrs.reindex(corrs.abs().sort_values(ascending=False).index)

In [None]:

# Correlation of the numeric features with the loan status, largest magnitude first
corr_series = correlation_with_target(df_train, target_col='Loan_Status', figsize=(12, 2.5))
corr_series

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Restrict the frame to its int64/float64 columns
numeric_df = df_train.select_dtypes(include=['int64', 'float64'])

# Pairwise correlation between those columns
corr_matrix = numeric_df.corr()

# Draw the annotated heatmap on a fresh 8x5 figure and title its axes
plt.figure(figsize=(8, 5))
heatmap_ax = sns.heatmap(data=corr_matrix, annot=True, fmt=".2f", cmap='coolwarm')
heatmap_ax.set_title('Correlation Heatmap (Numeric Columns Only)', fontsize=14)
plt.show()