# LENDING CLUB CASE STUDY 
## Group Members :
1. Karthick Chetti 
2. Anirudh KVC

## Objectives :
1. Reduce credit loss from 'charged-off' customers (the 'defaulters'). Lending to such 'risky' applicants is the largest source of financial loss
2. Understand the driving factors (or driver variables) behind loan default

## Data Cleaning

### Task 1 : Removing the unnecessary columns based on its relevance to the objective

#### 1. id	& member_id are unique and shouldn't influence the loan_status
#### 2. emp_title - contains subjective information and is irrelevant to the analysis


### Task 2 : Format Correction of Columns

#### 1. term - remove months from each cell and convert to int

### Task 3 : Relevant Columns
loan_amnt
funded_amnt
funded_amnt_inv
term
int_rate
installment
grade
sub_grade
emp_length
home_ownership
annual_inc
verification_status
loan_status
purpose
addr_state
dti
delinq_2yrs
earliest_cr_line
mths_since_last_delinq
open_acc
pub_rec
revol_bal
revol_util
total_acc
last_pymnt_d
pub_rec_bankruptcies



In [None]:
# Load the libraries
import pandas as pd #To work with dataset
pd.set_option("display.max_columns", 100)
import numpy as np #Math library
import seaborn as sns #Graph library that use matplot in background
import matplotlib.pyplot as plt #to plot some parameters in seaborn
from matplotlib.ticker import PercentFormatter # for percentage

In [None]:
# Read the loan records from loan.csv (relative path) into a DataFrame
lc_df = pd.read_csv(filepath_or_buffer="loan.csv")

In [None]:
# The analysis is about charged-off customers, so rows whose loan_status is
# "Current" are not useful here; keep every other status.
lc_df = lc_df[lc_df["loan_status"] != "Current"]

In [None]:
# Key features retained for the analysis:
# id, loan_amnt, term, int_rate, installment, grade, sub_grade, home_ownership,
# emp_length, loan_status, annual_inc, verification_status, purpose, zip_code,
# addr_state, dti, open_acc
# Add other features here if they turn out to be relevant in the future.
# "id" is kept only because it is used for aggregation (counting unique loans).
req_features = [
    "id",
    "loan_amnt",
    "term",
    "int_rate",
    "installment",
    "grade",
    "sub_grade",
    "home_ownership",
    "emp_length",
    "loan_status",
    "annual_inc",
    "verification_status",
    "purpose",
    "zip_code",
    "addr_state",
    "dti",
    "open_acc",
]
# .copy() makes lc_df an independent frame; without it the column assignments
# made later (emp_length, int_rate) operate on a slice of the filtered data
# and pandas emits SettingWithCopyWarning.
lc_df = lc_df[req_features].copy()

In [None]:
# Preview the first five rows of the reduced dataframe
lc_df.head(n=5)

In [None]:
# Inspect column dtypes and non-null counts, then the summary statistics
lc_df.info(show_counts=True, verbose=True)
lc_df.describe()

# Missing value imputation
# Total number of data points found are 38577 
## It is observed that for emp_length, some data is missing

In [None]:
# Only a small number of emp_length values are missing, so they can be imputed.
# Look at the frequency of each emp_length value first.
lc_df.loc[:, "emp_length"].value_counts()

In [None]:
# "10+ years" is the most frequent emp_length value, so use it wherever the
# value is missing, then re-check the non-null counts and the frequencies.
lc_df["emp_length"] = lc_df["emp_length"].fillna(value="10+ years")
lc_df.info()
lc_df.loc[:, "emp_length"].value_counts()

In [None]:
#Removing columns having 50% null values in them as such data is not useful
#lc_df_nulldrop=lc_df.dropna(axis=1,thresh=int(0.5*len(lc_df)))

In [None]:
#Printing number of columns removed
#removed_columns=len(lc_df.columns)-len(lc_df_nulldrop.columns)
#print(f"{removed_columns} columns were removed")

In [None]:
# Checking the summary statistics of the reduced dataframe
# (describe() reports the numeric columns by default)
lc_df.describe()

### It is observed that some columns have only one value(mostly 0) throughout the rows, which are not useful for the analysis

In [None]:
#Dropping such columns in the dataframe
# dropped_col=[]
# for col in lc_df_nulldrop.columns:
#     if len(lc_df_nulldrop[col].unique())==1:
#         dropped_col.append(col)
# print(f"The columns to be dropped are : {dropped_col}")
# lc_df_droppedcol=lc_df_nulldrop.drop(dropped_col,axis=1)

### Dropping columns based on their relevance to the business objective
#### 1. id ,member id and url will be specific to each customer 
#### 2. title,emp_title,desc - Very subjective and unstructured values in these columns

In [None]:
#Collecting all non relevant columns
# non_relevant_cols=["id","member_id","url","title","emp_title","desc"]
# #Dropping these columns
# lc_df_droppedcol.drop(non_relevant_cols,axis=1,inplace=True)

In [None]:
#lc_df_droppedcol.info(verbose=True,show_counts=True)

### It is observed that some columns still have missing values and 0 as only values in column
#### collections_12_mths_ex_med, chargeoff_within_12_mths, tax_liens

In [None]:
# print("Unique values of collections_12_mths_ex_med column :",lc_df_droppedcol["collections_12_mths_ex_med"].unique())
# print("Unique values of chargeoff_within_12_mths column :",lc_df_droppedcol["chargeoff_within_12_mths"].unique())
# print("Unique values of tax_liens column :",lc_df_droppedcol["tax_liens"].unique())
# #Hence, dropping these columns
# lc_df_droppedcol.drop(["collections_12_mths_ex_med","chargeoff_within_12_mths","tax_liens"],axis=1,inplace=True)

In [None]:
# lc_df_droppedcol.head()

In [None]:
# lc_df_droppedcol.info(verbose=True,show_counts=True)

### There are still some columns which have missing values, Therefore understanding the number of missing values in these columns

In [None]:
# empty_cols=lc_df_droppedcol.columns[lc_df_droppedcol.isna().any()].tolist()
# empty_vals=[lc_df_droppedcol[col].isna().sum() for col in empty_cols]
# empty_per_dict=dict(zip(empty_cols,empty_vals))
# print("The missing values in each column is :\n",empty_per_dict)

### Missing Value Imputation

In [None]:
#Add mean and median to missing values

In [None]:
# lc_df still contains string columns (term, grade, purpose, ...); restrict the
# median to numeric columns so pandas does not raise TypeError (pandas >= 2.0)
# or warn about dropped columns (pandas 1.x).
lc_df.median(numeric_only=True)

### Fixing data types of columns

In [None]:
# Drop the "%" character from int_rate and store the column as float
lc_df["int_rate"] = lc_df["int_rate"].str.replace("%", "").astype("float64")
# lc_df_droppedcol["revol_util"]=lc_df_droppedcol["revol_util"].str.replace("%","").astype(float)

## Checking Data Distributions for Numeric Data

In [None]:
#Checking the distributions for numeric data and identifying relevant columns for business objective
def box_plot_col(df):
    """Show a box plot for every int64/float64 column of ``df``.

    Columns are visited in ``df.columns`` order. For each qualifying column
    a box plot is drawn, the x-axis is labelled with the column name and
    ``plt.show()`` is called, so every column is displayed separately.

    Args:
        df: DataFrame whose int64/float64 columns are plotted.

    Returns:
        None.
    """
    for col in df.columns:
        if df[col].dtype not in ("int64", "float64"):
            continue
        # Missing values make the quartiles NaN and the box is drawn empty,
        # so plot only the observed values.
        plt.boxplot(df[col].dropna())
        plt.xlabel(col)
        plt.show()
# Draw a box plot for every int64/float64 column of the loan dataframe
box_plot_col(lc_df)

## EDA for Business Objective 1 :
### Reduce credit loss from 'charged-off' customers (the 'defaulters'). Lending to such 'risky' applicants is the largest source of financial loss

In [None]:
# Univariate Analysis
# Functions for plotting graphs wrt columns (numerical) as percentage


def plot_hist(df, column_name, bins):
    """Plot side-by-side histograms of a column for the two loan outcomes.

    The rows of ``df`` are split on ``loan_status`` into "Fully Paid" and
    "Charged Off". Each row is weighted by 1 / (size of its own group), so a
    bar shows the share of that group falling into the bin, and the y-axis
    is formatted as a percentage.

    Args:
        df: DataFrame with a ``loan_status`` column and ``column_name``.
        column_name: Name of the column to plot.
        bins: Bin specification forwarded to ``plt.hist``.

    Returns:
        The ``(n, bins, patches)`` tuple returned by ``plt.hist``.
    """
    status = df["loan_status"]
    fully_paid = df.loc[status == "Fully Paid", column_name]
    charged_off = df.loc[status == "Charged Off", column_name]
    plot_data = (fully_paid, charged_off)
    # Normalise each group by its own size, not by the combined size.
    weight_data = tuple(
        np.ones(len(group)) / len(group) for group in plot_data
    )

    plt.figure(figsize=(20, 10))
    hist_result = plt.hist(
        plot_data,
        bins=bins,
        weights=weight_data,
        color=("g", "r"),
    )
    plt.xlabel(column_name, fontsize=22)
    plt.ylabel("Percentage", fontsize=22)
    plt.xticks(fontsize=15)
    plt.yticks(fontsize=15)
    # Weights are fractions of 1, so 1.0 on the axis corresponds to 100%.
    plt.gca().yaxis.set_major_formatter(PercentFormatter(1))
    plt.legend(("Fully Paid", "Charged Off"), fontsize=15)
    plt.show()

    counts, bin_edges, bar_patches = hist_result
    return counts, bin_edges, bar_patches

# Functions for plotting bar chart for categorical variables

def plot_bar_cat(df, index_name, column_name, width, x_lbl_rotation=0):
    """Draw a grouped bar chart of unique ``id`` counts for two categoricals.

    ``df`` is pivoted so that each value of ``index_name`` becomes a group on
    the x-axis and each value of ``column_name`` becomes one bar within the
    group; the bar height is the number of unique ``id`` values in that cell.

    Args:
        df: DataFrame containing ``id``, ``index_name`` and ``column_name``.
        index_name: Column whose values form the x-axis groups.
        column_name: Column whose values form the bars inside each group.
        width: Width of a single bar.
        x_lbl_rotation: Rotation (degrees) of the x tick labels. Defaults
            to 0 so callers may omit it.

    Returns:
        None. The chart is displayed with ``plt.show()``.
    """
    # Pivot the frame that was passed in, not the notebook-global lc_df.
    pivot = pd.pivot_table(
        df,
        values="id",
        index=index_name,
        columns=column_name,
        aggfunc=lambda ids: len(ids.unique()),
    )
    plt.figure(figsize=(20, 10))
    x_label = pivot.index
    x_data = np.arange(len(x_label))  # one integer slot per x-axis group
    # Shift each category's bars right by one bar width so they sit side by
    # side within the group.
    chart = [
        plt.bar(x_data + j * width, pivot[category], width=width, align="edge")
        for j, category in enumerate(pivot.columns)
    ]
    plt.xlabel(index_name, fontsize=22)
    plt.ylabel("Frequency", fontsize=22)
    # Centre the tick label under the whole group of bars.
    x_label_pos = x_data + (len(pivot.columns) * width) / 2
    plt.xticks(x_label_pos, x_label, fontsize=15, rotation=x_lbl_rotation)
    plt.yticks(fontsize=15)
    plt.legend(chart, pivot.columns, fontsize=15)
    plt.show()


In [None]:
# Histograms of loan amount, interest rate and annual income by loan status;
# the returned (counts, bin edges, patches) are kept for reading off values.
a1, b1, c1 = plot_hist(lc_df, "loan_amnt", np.linspace(0, 40000, 9))
a2, b2, c2 = plot_hist(lc_df, "int_rate", np.linspace(0, 25, 6))
a3, b3, c3 = plot_hist(lc_df, "annual_inc", np.linspace(4000, 6000000, 100))

In [None]:
# Insights from the histogram
# No loan is offered from 0 to 5 % interest rate

# wrt interest rate
#1) Around 48% of the charged-off customers fall under an interest rate of 10 to 15 percent
#2) 85% of the charged-off customers have taken the loan at an interest rate of > 10%

# wrt loan_amnt
#1) 82% of the charged-off customers have taken a loan amount > 5000
#2) 50% of fully paid customers have taken a loan amount < 10000

In [None]:
# Grouped bar charts of loan counts per loan_status for each categorical
# feature. The last argument is the x tick label rotation in degrees; it is
# passed explicitly on every call (the addr_state call previously omitted it).
plot_bar_cat(lc_df, "term", "loan_status", 0.2, 0)
plot_bar_cat(lc_df, "home_ownership", "loan_status", 0.2, 0)
plot_bar_cat(lc_df, "emp_length", "loan_status", 0.2, 0)
plot_bar_cat(lc_df, "verification_status", "loan_status", 0.2, 0)
plot_bar_cat(lc_df, "purpose", "loan_status", 0.2, 45)
plot_bar_cat(lc_df, "addr_state", "loan_status", 0.2, 0)

In [None]:
# Insights from bar chart


In [None]:
# Names of the int64/float64 columns, the numeric sub-frame, and its
# pairwise correlation matrix.
num_cols = [
    col for col in lc_df.columns if lc_df[col].dtype in ("int64", "float64")
]
num_lc_df = lc_df[num_cols]
num_lc_df_corr = num_lc_df.corr()

In [None]:
# Annotated heatmap of the correlations between the numeric columns
plt.figure(figsize=(20, 10))
sns.heatmap(data=num_lc_df_corr, annot=True, cmap="viridis")
plt.show()

In [None]:
# sns.pairplot(num_lc_df.iloc[:,1:10])
# plt.show()

In [None]:
### Derived Metrics
#### earliest_cr_line to year
#### closed acc = total - open acc
#### ratio of funded_amt_inv to loan_amt