In [None]:
#importing Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
#load data in dataframe
df = pd.read_excel("Data_Dictionary.xlsx")

In [None]:
df.head()

In [None]:
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids the mixed-type DtypeWarning on load.
loandf = pd.read_csv("loan.csv", low_memory=False)

In [None]:
# Read the CSV file with low_memory set to False
loandf = pd.read_csv("loan.csv", low_memory=False)

In [None]:
loandf.shape

In [None]:
loandf.head()

## Data Cleaning

In [None]:
def get_nan_columns_sorted(df):
    """Return the NaN count of every column that has at least one NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.Series
        Indexed by column name, holding the number of missing values in that
        column, sorted from most to fewest. Columns without any missing value
        are left out, so the result is empty for a fully populated frame.
    """
    missing_per_column = df.isna().sum()
    has_missing = missing_per_column.gt(0)
    return missing_per_column.loc[has_missing].sort_values(ascending=False)

In [None]:
get_nan_columns_sorted(loandf)

In [None]:
pd.set_option('display.max_rows', None)

In [None]:
# Drop columns where all values are NaN
loandf_cleaned = loandf.dropna(axis=1, how='all')

In [None]:
loandf_cleaned.shape

In [None]:
# Get columns where the number of unique values is 1
single_value_cols = loandf_cleaned.columns[loandf_cleaned.nunique() == 1].tolist()

In [None]:
single_value_cols

In [None]:
# Drop these columns from the DataFrame as these will not contribute to analysis
loandf_cleaned = loandf_cleaned.drop(columns=single_value_cols)

In [None]:
loandf_cleaned.shape

In [None]:
get_nan_columns_sorted(loandf_cleaned).head(2)

In [None]:
loandf_cleaned[["next_pymnt_d","loan_status"]].head()

In [None]:
# Leaving NaN as is for next_pymnt_d as it actually means "No Payment due", considering the loan_status column
# Find rows where next_pymnt_d is NaN and loan_status is 'Current'
missing_next_pymnt_rows = loandf_cleaned[(loandf_cleaned['next_pymnt_d'].isnull()) & (loandf_cleaned['loan_status'] == 'Current')]
missing_next_pymnt_rows.shape

In [None]:
# Find rows where next_pymnt_d is not NaN and loan_status is not 'Current'
non_current_with_payment_due = loandf_cleaned[(loandf_cleaned['next_pymnt_d'].notnull()) & (loandf_cleaned['loan_status'] != 'Current')]
non_current_with_payment_due.shape

In [None]:
# It is clear that next_pymnt_d is related to loan_status.
# A value is present only when loan_status = Current
# Hence it is safe to drop the column, as the information is already in the loan_status column
loandf_cleaned = loandf_cleaned.drop(columns=["next_pymnt_d"])

In [None]:
get_nan_columns_sorted(loandf_cleaned).head(2)

In [None]:
# Handling the mths_since_last_record field
loandf_cleaned[["mths_since_last_record","loan_status"]].head()

In [None]:
# View the full text in the dictionary (do not truncate long cell values)
pd.set_option('display.max_colwidth', None)

In [None]:
loandf_cleaned.mths_since_last_record.value_counts()

In [None]:
# Create a new column 'has_public_record' where 'mths_since_last_record' is NaN => 'No', else 'Yes'
loandf_cleaned.loc[:, 'has_public_record'] = np.where(loandf_cleaned['mths_since_last_record'].isnull(), 'No', 'Yes')

In [None]:
loandf_cleaned.has_public_record.value_counts()

In [None]:
# Dropping the mths_since_last_record column
loandf_cleaned = loandf_cleaned.drop(columns=["mths_since_last_record"])

In [None]:
#handling mths_since_last_delinq column
# mths_since_last_delinq has 64.6% null values, but a NaN value means the account has never been delinquent
# Create a new column has_delinquency with Yes and No values
loandf_cleaned.loc[:,'has_delinquency'] = np.where(loandf_cleaned['mths_since_last_delinq'].isnull(), 'No', 'Yes')

In [None]:
loandf_cleaned.has_delinquency.value_counts()

In [None]:
# Dropping the mths_since_last_delinq column
loandf_cleaned = loandf_cleaned.drop(columns=["mths_since_last_delinq"])

In [None]:
loandf_cleaned.desc.head()

In [None]:
#handling desc column. It is difficult to make any sense of text at this point.
#create a new column has_desc
loandf_cleaned.loc[:,'has_desc'] = np.where(loandf_cleaned['desc'].isnull(), 'No', 'Yes')

In [None]:
loandf_cleaned.has_desc.value_counts()

In [None]:
#dropping desc column
loandf_cleaned = loandf_cleaned.drop(columns=['desc'])

In [None]:
loandf_cleaned.emp_title.head()

In [None]:
# handling emp_title column
# About 6% is null. Either the borrower was unemployed or simply did not provide details
# Setting the null values with "Unknown"
loandf_cleaned.loc[:,'emp_title'] = loandf_cleaned['emp_title'].fillna('Unknown')

In [None]:
# display.max_rows is set to None earlier in this notebook, so an uncapped
# value_counts() would print one row per distinct title. Show the most frequent only.
loandf_cleaned.emp_title.value_counts().head(20)

In [None]:
loandf_cleaned.emp_length.value_counts()

In [None]:
#handle emp_length
# This indicates job stability. Missing values may mean the borrower is unemployed or simply did not provide it.
# hence replacing nulls with Unknown seems a good approach
loandf_cleaned.loc[:,'emp_length'] = loandf_cleaned['emp_length'].fillna('Unknown')

In [None]:
loandf_cleaned.emp_length.value_counts()

In [None]:
loandf_cleaned.pub_rec_bankruptcies.value_counts()

In [None]:
# Handling pub_rec_bankruptcies
# This column has 1.75% missing values
# It is very likely that values are missing because the borrowers have no bankruptcies, as any bankruptcy is likely to be discovered by the loan provider
# Hence filling with 0.0 (which is also the median) seems a reasonable approach here
loandf_cleaned.loc[:, 'pub_rec_bankruptcies'] = loandf_cleaned['pub_rec_bankruptcies'].fillna(0.0)

In [None]:
loandf_cleaned.pub_rec_bankruptcies.value_counts()

In [None]:
# Handling last_pymnt_d
# Relative small .18% is missing
# No clear reason can be attributed
# In order to avoid any bias, dropping the rows seems reasonable
loandf_cleaned = loandf_cleaned.dropna(subset=['last_pymnt_d'])

In [None]:
loandf_cleaned.shape

In [None]:
# display.max_rows is set to None earlier in this notebook, so cap the output
# to the most frequent values instead of printing every distinct one.
loandf_cleaned.revol_util.value_counts().head(20)

In [None]:
# Handling revol_util
# Relatively low: 0.13% null values
# As there is no clear meaning of the null values, it seems best to drop the rows with null values
loandf_cleaned = loandf_cleaned.dropna(subset=['revol_util'])

In [None]:
# Remove '%' and convert 'revol_util' to a float fraction (percent value / 100).
# The column is rebound with assign() instead of `.loc[:, col] = ...`: since pandas 2.0
# a full-column .loc assignment writes into the existing object-dtype column, so the
# values would become floats while the column dtype stayed `object`.
loandf_cleaned = loandf_cleaned.assign(
    revol_util=loandf_cleaned['revol_util'].str.rstrip('%').astype('float') / 100
)

In [None]:
loandf_cleaned.shape

In [None]:
get_nan_columns_sorted(loandf_cleaned).head(2)

In [None]:
# Handling title
# title has only 11 null values
# Introducing a new category "Unknown" seems appropriate
loandf_cleaned.loc[:, 'title'] = loandf_cleaned['title'].fillna('Unknown')

In [None]:
# Handling last_credit_pull_d
# Since there are only 2 null values, it is safe to drop these rows without any impact on EDA
loandf_cleaned = loandf_cleaned.dropna(subset=['last_credit_pull_d'])

In [None]:
loandf_cleaned.shape

In [None]:
get_nan_columns_sorted(loandf_cleaned).head(2)

In [None]:
# url (e.g. https://lendingclub.com/browse/loanDetail.action?loan_id=1077501) also does not seem to contribute anything to EDA
loandf_cleaned = loandf_cleaned.drop(columns=["url"])

In [None]:
loandf_cleaned.shape

In [None]:
# Dropping the id and member_id columns as these will not be analysed
loandf_cleaned = loandf_cleaned.drop(columns=["id", "member_id"])

In [None]:
loandf_cleaned.shape

In [None]:
# Heuristic split by cardinality: a column with more than 20 distinct values is
# treated as numeric, anything with 20 or fewer as categorical.
distinct_counts = loandf_cleaned.nunique()

num_cols = [name for name, n_distinct in distinct_counts.items() if n_distinct > 20]
cat_cols = [name for name, n_distinct in distinct_counts.items() if n_distinct <= 20]
extra_cols = []

In [None]:
print(num_cols, "----" ,cat_cols)

In [None]:
# Columns that exceed the cardinality threshold but are categorical by nature
# (sub_grade, free text, locations, dates), so they are moved out of num_cols by hand.
cat_cols_observed = ["sub_grade", "emp_title", "title", "addr_state", "earliest_cr_line", "last_pymnt_d", "last_credit_pull_d", "issue_d", "zip_code"]
num_cols_observed = []  # kept for symmetry; nothing is forced into num_cols
# Remove columns from num_cols that are present in cat_cols_observed
num_cols = [col for col in num_cols if col not in cat_cols_observed]
# Append only the columns not already in cat_cols, so re-running this cell
# does not add the same column twice (which would plot it twice later on).
cat_cols = cat_cols + [col for col in cat_cols_observed if col not in cat_cols]

In [None]:
print(loandf_cleaned.shape, len(num_cols + cat_cols))

In [None]:
loandf_cleaned.iloc[:,0:10].head()

In [None]:
#Observe the remaining columns and take necessary action

In [None]:
# Clean the 'int_rate' column: strip the trailing '%' and store it as a float fraction.
# The column is rebound with assign() instead of `.loc[:, col] = ...`: since pandas 2.0
# a full-column .loc assignment writes into the existing object-dtype column, so the
# values would become floats while the column dtype stayed `object`.
loandf_cleaned = loandf_cleaned.assign(
    int_rate=loandf_cleaned['int_rate'].str.rstrip('%').astype('float') / 100
)

In [None]:
loandf_cleaned.iloc[:,40:50].head()

## Univariate Analysis

In [None]:
# One figure per numeric column: histogram on the left, boxplot on the right.
for col in num_cols:
    column_values = loandf_cleaned[col]
    fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(12, 5))

    # Left panel: distribution of the column
    sns.histplot(x=column_values, ax=hist_ax)
    hist_ax.set_title(f'Histogram of {col}')

    # Right panel: spread and outliers of the same column
    sns.boxplot(y=column_values, ax=box_ax)
    box_ax.set_title(f'Boxplot of {col}')

    plt.tight_layout()  # keep the two panels from overlapping
    plt.show()

#### loan_amnt:
##### The plot is right-skewed, and there are outliers on the higher side: most of the loans are of lower value.
##### However, there are a few very high-value loans. This indicates increased risk.


#### funded_amnt, funded_amnt_inv
##### This has similar distribution to loan_amnt

#### int_rate, installment, open_acc
##### These are also right-skewed, with outliers on the upper end.

#### annual_income - **

#### dti:
##### Has a normal distribution with no outliers

#### revol_bal
##### Is right-skewed and has outliers on the upper end

### Segment Univariate Analysis

In [None]:
# Compare the distribution of every numeric column across loan_status groups.
for col in num_cols:
    fig, ax = plt.subplots(figsize=(12, 5))
    sns.boxplot(data=loandf_cleaned, x='loan_status', y=col, ax=ax)
    ax.set_title(f'{col} by Loan Status')
    ax.tick_params(axis='x', labelrotation=45)
    plt.show()

##### 1. For loan_amnt, funded_amnt and funded_amnt_inv, the median and quartiles are slightly higher for defaulted loans. They are higher still for current loans, which suggests that the risk of defaults may increase.
##### 2. int_rate is also higher for defaults. Looking at the box plot of current loans, it seems the current loans are also at risk.

In [None]:
# Derived metrics: each new column is numerator / denominator of two existing columns.
ratio_definitions = {
    # Debt-to-Loan Ratio: dti / loan_amnt.
    "debt_to_loan_ratio": ("dti", "loan_amnt"),
    # Utilization Ratio: revol_bal / total_acc for revolving credit.
    "utilization_ratio": ("revol_bal", "total_acc"),
    # Income-to-Loan Ratio: annual_inc / loan_amnt.
    "income-to-loan_ratio": ("annual_inc", "loan_amnt"),
}

for ratio_name, (numerator, denominator) in ratio_definitions.items():
    loandf_cleaned[ratio_name] = loandf_cleaned[numerator] / loandf_cleaned[denominator]

derived_cols = list(ratio_definitions)

In [None]:
# Same segmented view as for the raw numeric columns, now for the derived ratios.
for col in derived_cols:
    fig, ax = plt.subplots(figsize=(12, 5))
    sns.boxplot(data=loandf_cleaned, x='loan_status', y=col, ax=ax)
    ax.set_title(f'{col} by Loan Status')
    ax.tick_params(axis='x', labelrotation=45)
    plt.show()

In [None]:
df

In [None]:
# Cap the number of bars per chart: cat_cols also holds columns with more than 20
# distinct values, for which one bar per value is unreadable and very slow to draw.
MAX_CATEGORIES_TO_PLOT = 20

for col in cat_cols:
    level_counts = loandf_cleaned[col].value_counts()
    if len(level_counts) > MAX_CATEGORIES_TO_PLOT:
        # Keep only the most frequent levels, most frequent first.
        plot_order = level_counts.index[:MAX_CATEGORIES_TO_PLOT].tolist()
        plot_title = f'Top {MAX_CATEGORIES_TO_PLOT} values of {col}'
    else:
        plot_order = None  # let seaborn infer the levels from the data
        plot_title = f'Count of {col}'

    fig, ax = plt.subplots(figsize=(12, 5))
    sns.countplot(x=loandf_cleaned[col], order=plot_order, ax=ax)
    ax.set_title(plot_title)
    ax.tick_params(axis='x', labelrotation=75)
    plt.show()

### Univariate Analysis observations and Corrective actions

## Bivariate Analysis

In [None]:
# Pairwise correlations between the numeric and the derived columns.
corr = loandf_cleaned[num_cols + derived_cols].corr()
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap of Numerical Features')
plt.show()
            
    

In [None]:
# Share of each loan_status within every level of a categorical column.
# The number of levels per chart is capped: cat_cols also holds columns with more than
# 20 distinct values, and one stacked bar per value is unreadable and slow to draw.
MAX_CATEGORIES_TO_PLOT = 20

for col in cat_cols:
    if col == 'loan_status':
        continue  # loan_status against itself carries no information

    level_counts = loandf_cleaned[col].value_counts()
    top_levels = level_counts.index[:MAX_CATEGORIES_TO_PLOT]
    # With MAX_CATEGORIES_TO_PLOT or fewer levels this keeps every non-null row.
    rows_to_plot = loandf_cleaned[loandf_cleaned[col].isin(top_levels)]

    status_share = pd.crosstab(rows_to_plot[col], rows_to_plot['loan_status'], normalize='index')
    ax = status_share.plot(kind='bar', stacked=True, figsize=(12, 5))

    plot_title = 'Loan Status Distribution by ' + col
    if len(level_counts) > MAX_CATEGORIES_TO_PLOT:
        plot_title += f' (top {MAX_CATEGORIES_TO_PLOT} values)'
    ax.set_title(plot_title)
    ax.set_ylabel('Share of loans')
    plt.show()