# Lending Case Study

In [None]:
## Importing all necessary libs

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option('display.max_rows', 99999)

In [None]:
df = pd.read_csv('loan.csv')

In [None]:
df.head()

In [None]:
print("Shape: ", df.shape)
df.columns[df.isna().sum() > 0]

In [None]:
## Drop only the columns in which every value is NULL (how='all');
## columns that are partially populated are kept.

df = df.dropna(axis='columns', how='all')
print('Shape: ', df.shape)
df.columns[df.isna().any()]

In [None]:
## Fraction of rows that are missing in each column.
## The mean of the boolean null mask equals null_count / number_of_rows;
## dividing by the number of columns would not give a meaningful ratio.
print(df.isna().mean())

## Columns that still contain at least one null value
print(df.columns[df.isna().sum() > 0])

There are 14 more columns that still contain missing values.

The missing values in these columns could be filled in with the mode/mean/median of each column.
Since there are 14 such columns, 
let's first identify which of them are needed for the analysis and which are not.

In [None]:
print(df.columns)

Columns that contain only a single unique value can be dropped, as they do not provide any information.

In [None]:
## Identify constant columns, i.e. columns holding exactly one distinct value.
## nunique() ignores NaN by default, so "one value plus missing" also counts as constant.
print("Shape: ",df.shape)
# print(df.nunique())
is_constant = df.nunique().eq(1)
df_unique = is_constant[is_constant]
print(df_unique)


## Drop the constant columns: a value shared by every row cannot separate one loan from another
df = df.drop(columns=df_unique.index)
df.shape

In [None]:
def print_unique_and_null_values(df_x):
    """Print the shape of ``df_x`` followed by a per-column summary table.

    The table has one row per column of ``df_x``:

    * ``Column``             - the column name
    * ``Unique Values``      - number of distinct non-null values (``nunique``)
    * ``Missing Values``     - number of null cells (``isna().sum()``)
    * ``Missing Percentage`` - null cells as a whole percent of the rows,
      rounded down, e.g. ``'29%'``

    Parameters
    ----------
    df_x : pandas.DataFrame
        Frame to summarise. It is not modified.

    Returns
    -------
    None
        The summary is written to stdout only.
    """
    print(df_x.shape)

    na_sum = df_x.isna().sum()
    n_unique = df_x.nunique()

    # Integer arithmetic on purpose: the float form (na_sum / rows) * 100 can
    # land just below the true value (29 / 100 * 100 == 28.999...) and then
    # truncate to the wrong percent. max(..., 1) keeps an empty frame from
    # dividing by zero; every count is 0 there, so the result is '0%'.
    n_rows = max(len(df_x), 1)
    na_percentage = (na_sum * 100 // n_rows).astype(str) + '%'

    # All three Series are indexed by the column names of df_x in the same
    # order, so they can be laid side by side without any merge.
    merged_df = pd.DataFrame({
        'Column': list(na_sum.index),
        'Unique Values': n_unique.to_numpy(),
        'Missing Values': na_sum.to_numpy(),
        'Missing Percentage': na_percentage.to_numpy(),
    })
    print(merged_df)

In [None]:
print("Shape: \n", df.shape)
print_unique_and_null_values(df)

Columns that are not required for analysis
(based on the value present in the cells and data dictionary)

In [None]:
def remove_columns(df, columns_to_remove):
    """Return a new DataFrame equal to ``df`` without ``columns_to_remove``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is left untouched.
    columns_to_remove : list-like of column labels
        Columns to leave out of the result.

    Returns
    -------
    pandas.DataFrame
        A separate frame holding the remaining columns.

    Raises
    ------
    KeyError
        If a label in ``columns_to_remove`` is not a column of ``df``
        (pandas' default ``errors='raise'`` behaviour for ``drop``).
    """
    return df.drop(columns=columns_to_remove)


# Columns that are not required for analysis based on the present value and data dictionary
irrelevant_columns = np.array([
    # 'id',
    'member_id',
    'url',
    'desc',
    'title',
    'zip_code',
    # 'addr_state',
])

print("irrelevant_columns.shape: ", irrelevant_columns.shape)

df_1 = remove_columns(df, irrelevant_columns)

print("Shape: \n", df_1.shape)
print_unique_and_null_values(df_1)

In [None]:
# Columns that are post-loan approval and not required for analysis
post_approval_columns = np.array([
    'emp_title', 

    'funded_amnt',

    'issue_d', 
    'delinq_2yrs', 
    
    'mths_since_last_delinq',
    'mths_since_last_record',

    'revol_bal',

    "out_prncp",
    "out_prncp_inv",

    'total_pymnt',
    'total_pymnt_inv',
    'total_rec_prncp',
    'total_rec_int',
    'total_rec_late_fee',

    'recoveries',
    'collection_recovery_fee', 

    'last_pymnt_d',
    'last_pymnt_amnt', 
    'next_pymnt_d', 
    'last_credit_pull_d'
    # 'pub_rec_bankruptcies'
    ])

print("post_approval_columns.shape: ",post_approval_columns.shape)

df_1 = remove_columns(df_1, post_approval_columns)

print("Shape: \n", df_1.shape)
print_unique_and_null_values(df_1)


In [None]:
# removing outliers from annual_income
sns.boxplot(df_1['annual_inc'])

In [None]:
quantile_info = df_1.annual_inc.quantile([0.5, 0.75,0.90, 0.95, 0.97,0.98, 0.99,1.0])
quantile_info

In [None]:
# Keep only the rows whose annual income is at or below the 99th percentile,
# which removes the extreme high earners seen in the box plot.
per_99_annual_inc = df_1['annual_inc'].quantile(0.99)
# .copy() makes df_1 an independent frame, so later column assignments do not
# operate on a slice of the unfiltered data.
df_1 = df_1[df_1.annual_inc <= per_99_annual_inc].copy()

# Data Standardization

In [None]:
## emp_length column

# Missing employment lengths are imputed with the most frequent value (mode).
emp_length_default = df_1['emp_length'].mode()[0]

# Show the distribution; a bare expression in the middle of a cell is discarded.
print(df_1['emp_length'].value_counts())

# Assign the filled column back to the frame. Calling fillna(inplace=True) on
# df_1['emp_length'] modifies a temporary object and is not guaranteed to reach
# df_1 (it has no effect when pandas Copy-on-Write is enabled).
df_1 = df_1.assign(emp_length=df_1['emp_length'].fillna(emp_length_default))

print_unique_and_null_values(df_1)

In [None]:
## revol_util column
# Print the count so the "very low" claim below is visible in the output;
# a bare expression in the middle of a cell is not displayed.
print("Missing revol_util values: ", df_1['revol_util'].isna().sum())

## since the number of missing values is very low, we can drop the rows with missing values
df_1 = df_1.dropna(axis=0, subset=['revol_util'])

print_unique_and_null_values(df_1)

In [None]:
## pub_rec_bankruptcies column

df_1['pub_rec_bankruptcies'].value_counts()

## records with loan_status == 'Current', should not be considered for analysis, since the loan_status is not final
df_1= df_1[ df_1['loan_status'] != 'Current' ]


## records with loan_status == 'Fully Paid', probabily have pub_rec_bankruptcies == 0
# df_1['pub_rec_bankruptcies'] = np.where(df_1['loan_status'] == 'Fully Paid', 0, df_1['pub_rec_bankruptcies'])

print_unique_and_null_values(df_1)

In [None]:
## pub_rec_bankruptcies column
df_1_pub_rec_bankruptcies_na = df_1[ df_1['pub_rec_bankruptcies'].isna()]

count = df_1['loan_status'].value_counts()
na_len =  df_1_pub_rec_bankruptcies_na['loan_status'].value_counts()

print(count)
print(na_len)
print('\n',(na_len/count)*100)

In [None]:
## since the ratio of missing values is very low, we can drop the rows with missing values
df_1.dropna(axis=0, subset=['pub_rec_bankruptcies'], inplace=True)

print_unique_and_null_values(df_1)

In [None]:
df_1.info()

In [None]:
df_2 = df_1.copy()

# Raw strings keep "\d" a regex escape instead of an (invalid) Python string
# escape; expand=False makes str.extract return a Series for the single group.
df_2['term'] = df_2['term'].str.extract(r'(\d+)', expand=False).astype(int)
# The decimal point is escaped and the fraction is optional, so both "10.65%"
# and a whole-number rate such as "8%" are parsed.
df_2['int_rate'] = df_2['int_rate'].str.extract(r'(\d+(?:\.\d+)?)', expand=False).astype(float)
# Only the first run of digits is kept, so a value like '10+ years' becomes 10.
# NOTE(review): if the data has a "< 1 year" level it becomes 1 here and is
# indistinguishable from "1 year" - confirm that this is acceptable.
df_2['emp_length'] = df_2['emp_length'].str.extract(r'(\d+)', expand=False).astype(int)


def standardize_dates(date_str):
    """Convert a date string to ISO ``YYYY-MM-DD`` text.

    Two input layouts are recognised:

    * ``DD/MM/YYYY`` when the string contains a ``/``
    * ``Mon-YY`` (e.g. ``Dec-11``) otherwise; the day defaults to the 1st

    Parameters
    ----------
    date_str : str
        The raw date text. Any other type (e.g. a float NaN from a missing
        cell) is treated as "no date".

    Returns
    -------
    str or None
        The date formatted as ``YYYY-MM-DD``, or ``None`` when the input is
        not a string or does not match the expected layout.
    """
    # Missing cells arrive as float NaN; "'/' in nan" would raise TypeError.
    if not isinstance(date_str, str):
        return None

    date_format = '%d/%m/%Y' if '/' in date_str else '%b-%y'
    # NOTE(review): '%y' expands two-digit years using the parser's century
    # pivot - confirm the resulting century is right for old dates.
    parsed = pd.to_datetime(date_str, format=date_format, errors='coerce')

    # errors='coerce' turns unparsable text into NaT, and NaT has no strftime.
    if pd.isna(parsed):
        return None
    return parsed.strftime('%Y-%m-%d')

# df_2['earliest_cr_line'] = df_2['earliest_cr_line'].apply(lambda x: standardize_dates(x))

# df_2['earliest_cr_line'] = pd.to_datetime(df_2['earliest_cr_line'], format='%Y-%m-%d', errors='coerce')


print(df_2.info())
(df_2.head())


#  Analysis

##  Univariate Analysis

### Categorical data

In [None]:
df_3 = df_2.copy()

df_3_charged_off = df_3[ df_3['loan_status'] == 'Charged Off']
df_3_fully_paid = df_3[ df_3['loan_status'] == 'Fully Paid']

In [None]:
# print(df_3['loan_status'].value_counts())
# df_3['loan_status'].value_counts().plot(kind='bar')


sns.countplot(x='loan_status', data=df_2)
plt.title('Count of Items in Each Status')
plt.xlabel('Loan Status')
plt.ylabel('Count')
plt.show()

In [None]:
# sns.countplot( x='grade', order = ['A', 'B', 'C', 'D', 'E', 'F', 'G'], data=df_3_charged_off)
# plt.title('Distribution of Charged Off Loans by Grade')
# plt.xlabel('Category')
# plt.ylabel('Value')
# plt.show()

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(12, 6))  # 1 row, 2 columns

axes[0].set_title('Distribution of Fully Paid Loans by Grade')
axes[1].set_title('Distribution of Charged Off Loans by Grade')

sns.countplot( x='grade', order = ['A', 'B', 'C', 'D', 'E', 'F', 'G'], data=df_3_fully_paid, ax=axes[0])
sns.countplot( x='grade', order = ['A', 'B', 'C', 'D', 'E', 'F', 'G'], data=df_3_charged_off, ax=axes[1])

plt.show()

In [None]:
# 2 x 2 grid: grades on the top row, sub-grades on the bottom row;
# fully paid loans on the left, charged off loans on the right.
fig, axes = plt.subplots(2,2, figsize=(10, 6))

grade_order = ['A', 'B', 'C', 'D', 'E', 'F', 'G']
# 'A1', 'A2', ..., 'G5' - five levels per grade, in grade order
sub_grade_order = [grade + str(level) for grade in grade_order for level in range(1, 6)]

sns.countplot(x='grade', order=grade_order, data=df_3_fully_paid, ax=axes[0][0])
sns.countplot(x='grade', order=grade_order, data=df_3_charged_off, ax=axes[0][1])

## by subgrades
sns.countplot(x='sub_grade', order=sub_grade_order, data=df_3_fully_paid, ax=axes[1][0])
sns.countplot(x='sub_grade', order=sub_grade_order, data=df_3_charged_off, ax=axes[1][1])

axes[0][0].set_title('Distribution of Fully Paid Loans by Grade')
axes[0][1].set_title('Distribution of Charged Off Loans by Grade')
axes[1][0].set_title('Distribution of Fully Paid Loans by Subgrade')
axes[1][1].set_title('Distribution of Charged Off Loans by Subgrade')

# Keep the bottom-row titles from colliding with the top-row axis labels.
fig.tight_layout()
plt.show()

In [None]:
# Keep only the numeric level (1-5) of the sub-grade, e.g. 'B5' -> 5.
# The text is read from df_2 (df_3 is a copy of it with the same index) so this
# cell can be re-run: after the first run df_3['sub_grade'] is already numeric
# and could not be sliced again.
df_3['sub_grade'] = pd.to_numeric(df_2['sub_grade'].str[-1])
df_3_charged_off = df_3[ df_3['loan_status'] == 'Charged Off']
df_3_fully_paid = df_3[ df_3['loan_status'] == 'Fully Paid']

In [None]:
# Single axes: charged off loans per grade, split by sub-grade level via hue
fig, axes = plt.subplots(figsize=(12, 6))

sns.countplot(
    data=df_3_charged_off,
    x='grade',
    hue='sub_grade',
    order=list('ABCDEFG'),
    ax=axes,
)
axes.set_title('Distribution of Charged Off Loans by Grade')
plt.show()

In [None]:
print(df_3_charged_off['purpose'].value_counts())

fig, ax = plt.subplots(figsize = (10,8))
# Draw on the axes created above rather than relying on the "current axes".
sns.countplot(x ='purpose', data=df_3_charged_off, ax=ax)
ax.set_title('Distribution of Charged Off Loans by Purpose')
# The purpose names are long; slant them so they do not overlap.
plt.setp(ax.get_xticklabels(), rotation=40, ha="right")
plt.show()

In [None]:
# since the number of records for 'debt_consolidation' is very high, let's try using a log scale
fig, ax = plt.subplots(figsize = (10,8))
# Draw on the axes created above rather than relying on the "current axes".
sns.countplot(y ='purpose', data=df_3_charged_off, ax=ax)
ax.set_title('Distribution of Charged Off Loans by Purpose (log scale)')
ax.set(xscale = 'log')
plt.show()

In [None]:
df_3.info()

In [None]:
print(df_3['home_ownership'].value_counts())

fig, axes = plt.subplots( figsize=(12, 6))
axes.set_title('Distribution of Charged Off Loans by Home Ownership')
sns.countplot( x='home_ownership', order=[ 'RENT', 'MORTGAGE', 'OWN', 'OTHER'], data=df_3_charged_off)
axes.set(yscale = 'log')  # since the count of 'OTHER' is very low
plt.show() 

In [None]:
# NOTE(review): this cell repeats the preceding home_ownership cell verbatim;
# confirm whether a different subset (e.g. fully paid loans) was intended here.
# The printed counts come from df_3 (all loans) while the plot uses only the
# charged off subset.
print(df_3['home_ownership'].value_counts())

fig, axes = plt.subplots( figsize=(12, 6))
axes.set_title('Distribution of Charged Off Loans by Home Ownership')
sns.countplot( x='home_ownership', order=[ 'RENT', 'MORTGAGE', 'OWN', 'OTHER'], data=df_3_charged_off)
axes.set(yscale = 'log')  # since the count of 'OTHER' is very low
plt.show() 

In [None]:
print(df_3_charged_off['verification_status'].value_counts())

fig, axes = plt.subplots( figsize=(12, 6))
axes.set_title('Distribution of Charged Off Loans by Verification Status')
sns.countplot( x='verification_status', order=[ 'Source Verified', 'Verified','Not Verified',], data=df_3_charged_off)
plt.show() 

In [None]:
# Strip the trailing '%' and parse the utilisation as a float. The fraction is
# optional so whole-number values such as "0%" or "21%" are kept instead of
# silently becoming NaN. astype(str) lets the cell be re-run after the column
# has already been converted to float.
df_3['revol_util'] = (
    df_3['revol_util']
    .astype(str)
    .str.extract(r'(\d+(?:\.\d+)?)', expand=False)
    .astype(float)
)

# Explicit edges make every bin match its label exactly; an integer bin count
# would instead split the observed min-max range into equal-width intervals.
# include_lowest=True keeps a utilisation of exactly 0 in the first bin.
# NOTE(review): values above 100 would fall outside the bins and become NaN -
# confirm the column never exceeds 100.
df_3['revol_util_groups'] = pd.cut(
    df_3['revol_util'],
    bins=[0, 20, 40, 60, 80, 100],
    include_lowest=True,
    labels=['0-20','20-40','40-60','60-80','80-100'],
)

df_3_charged_off = df_3[ df_3['loan_status'] == 'Charged Off']
df_3_fully_paid = df_3[ df_3['loan_status'] == 'Fully Paid']

In [None]:
print(df_3_charged_off['revol_util_groups'].value_counts())

fig, axes = plt.subplots( figsize=(12, 6))
axes.set_title('Distribution of Charged Off Loans by Revolving Utilization')
sns.countplot( x='revol_util_groups', data=df_3_charged_off)
plt.show()

### Numerical data

In [None]:
df_3.info()

In [None]:
# function to create bins and show chart

def count_plot_with_custom_bins(df_x,column, title, bins, labels, log_scale=False):
    """Bin a numeric column with ``pd.cut`` and draw a count plot of the bins.

    Parameters
    ----------
    df_x : pandas.DataFrame
        Source data; it is not modified (the binned column is added to a copy).
    column : str
        Name of the numeric column to bin.
    title : str
        Title of the plot.
    bins : int or sequence
        Forwarded to ``pd.cut``. With an int, ``pd.cut`` splits the observed
        range of the column into that many equal-width intervals, so the
        labels are only names for those intervals and need not match their
        real edges.
    labels : list
        One label per bin; also used as the bar order on the x axis.
    log_scale : bool, default False
        When True the y axis is drawn on a log scale.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    grouped_column = column + '_groups'
    binned = df_x.assign(
        **{grouped_column: pd.cut(df_x[column], bins=bins, labels=labels, precision=0)}
    )

    fig, axes = plt.subplots(figsize=(12, 6))
    sns.countplot(data=binned, x=grouped_column, order=labels, ax=axes)
    axes.set_title(title)
    if log_scale:
        axes.set_yscale('log')
    plt.show()

In [None]:
# print(df_3_charged_off['loan_amnt'].describe( percentiles=[0.25, 0.5, 0.75, 0.9, 0.95,0.98, 0.99,1]))

min_loan_amnt = df_3['loan_amnt'].quantile(0.01)
max_loan_amnt = df_3['loan_amnt'].quantile(0.99)
diff = max_loan_amnt - min_loan_amnt

print(min_loan_amnt, max_loan_amnt, diff)
#min = 1200
#max = 35000


no_of_bins = 7
bin_labels= [
    '0-5k',
    '5k-10k',
    '10k-15k',
    '15k-20k',
    '20k-25k',
    '25k-30k',
    '30k-35k'
]

count_plot_with_custom_bins(df_3_charged_off, 'loan_amnt', 'Distribution of Charged Off Loans by Loan Amount',  bins=no_of_bins, labels=bin_labels, log_scale=False)


no_of_bins = 9
bin_labels= [
    '0-4k',
    '4k-8k',
    '8k-12k',
    '12k-16k',
    '16k-20k',
    '20k-24k',
    '24k-28k',
    '28k-32k',
    '32k-36k'
]

count_plot_with_custom_bins(df_3_charged_off, 'loan_amnt', 'Distribution of Charged Off Loans by Loan Amount',  bins=no_of_bins, labels=bin_labels, log_scale=False)




In [None]:
print(df_3_charged_off['term'].value_counts())

fig, axes = plt.subplots( figsize=(12, 6))
axes.set_title('Distribution of Charged Off Loans by Term')
sns.countplot( x='term', data=df_3_charged_off)
plt.show()

In [None]:
no_of_bins = 5
bin_labels= [
    '5%-9%',
    '9%-13%',
    '13%-17%',
    '17%-21%',
    '21%-24%'
]

count_plot_with_custom_bins(df_3_charged_off, 'int_rate', 'Distribution of Charged Off Loans by Interest Rate',  bins=no_of_bins, labels=bin_labels, log_scale=False)

no_of_bins = 7
bin_labels= [
    '5%-8%',
    '8%-11%',
    '11%-14%',
    '14%-17%',
    '17%-20%',
    '20%-23%',
    '23%-26%'
]

count_plot_with_custom_bins(df_3_charged_off, 'int_rate', 'Distribution of Charged Off Loans by Interest Rate',  bins=no_of_bins, labels=bin_labels, log_scale=False)


no_of_bins = 9
bin_labels = [
    '5%-7%', '7%-9%', '9%-11%', '11%-13%', '13%-15%', '15%-17%', '17%-19%', '19%-21%', '21%-24%'
]
count_plot_with_custom_bins(df_3_charged_off, 'int_rate', 'Distribution of Charged Off Loans by Interest Rate',  bins=no_of_bins, labels=bin_labels, log_scale=False)

In [None]:
# installments

# min - 16.08 
# max - 1305.19

no_of_bins = 5
bin_labels = [
    '0-300', '300-600', '600-900', '900-1200', '1200-1500'
]

count_plot_with_custom_bins(df_3_charged_off, 'installment', 'Distribution of Charged Off Loans by Installment',  bins=no_of_bins, labels=bin_labels, log_scale=False)


no_of_bins = 7
bin_labels = [
    '0-200', '200-400', '400-600', '600-800', '800-1000', '1000-1200', '1200-1400'
]

count_plot_with_custom_bins(df_3_charged_off, 'installment', 'Distribution of Charged Off Loans by Installment',  bins=no_of_bins, labels=bin_labels, log_scale=False)


no_of_bins = 9
bin_labels = [
    '0-150', '150-300', '300-450', '450-600', '600-750', '750-900', '900-1050', '1050-1200', '1200-1350'
]

count_plot_with_custom_bins(df_3_charged_off, 'installment', 'Distribution of Charged Off Loans by Installment',  bins=no_of_bins, labels=bin_labels, log_scale=False)


In [None]:
print(df_3_charged_off['emp_length'].value_counts())

fig, axes = plt.subplots( figsize=(12, 6))
axes.set_title('Distribution of Charged Off Loans by Employment Length')
sns.countplot( x='emp_length', data=df_3_charged_off)
plt.show()

In [None]:
df_3['annual_inc'].describe( percentiles=[.25, .5, .75, .9, .95, .99])

In [None]:
min_value = df_3['annual_inc'].min()  #4000.0
max_value = df_3['annual_inc'].max()  #234996.0

no_of_bins = 5
bin_labels = [
    '0-50k', '50k-100k', '100k-150k', '150k-200k', '200k-250k'
]
count_plot_with_custom_bins(df_3_charged_off, 'annual_inc', 'Distribution of Charged Off Loans by Annual Income',  bins=no_of_bins, labels=bin_labels, log_scale=False)


no_of_bins = 7
bin_labels = [
    '0-35k', '35k-70k', '70k-105k', '105k-140k', '140k-175k', '175k-210k', '210k-245k' 
]
count_plot_with_custom_bins(df_3_charged_off, 'annual_inc', 'Distribution of Charged Off Loans by Annual Income',  bins=no_of_bins, labels=bin_labels, log_scale=False)

no_of_bins = 9
bin_labels = [
    '0-26k', '26k-52k', '52k-78k', '78k-104k', '104k-130k', '130k-156k', '156k-182k', '182k-208k', '208k-234k'
]

count_plot_with_custom_bins(df_3_charged_off, 'annual_inc', 'Distribution of Charged Off Loans by Annual Income',  bins=no_of_bins, labels=bin_labels, log_scale=False)

no_of_bins = 11
bin_labels = [
  '4k-24k',
'24k-45k',
'45k-66k',
'66k-87k',
'87k-108k',
'108k-129k',
'129k-150k',
'150k-171k',
'171k-192k',
'192k-213k',
'213k-234k'
]
count_plot_with_custom_bins(df_3_charged_off, 'annual_inc', 'Distribution of Charged Off Loans by Annual Income',  bins=no_of_bins, labels=bin_labels, log_scale=False)

In [None]:
# dti
min_value = df_3_charged_off['dti'].min()  #0.0
max_value = df_3_charged_off['dti'].max()  #29.99
print(min_value, max_value)

no_of_bins = 3
bin_labels = [
    '0-10', '10-20', '20-30'
]
count_plot_with_custom_bins(df_3_charged_off, 'dti', 'Distribution of Charged Off Loans by Debt to Income Ratio',  bins=no_of_bins, labels=bin_labels, log_scale=False)

no_of_bins = 5
bin_labels = [
    '0-6', '6-12', '12-18', '18-24', '24-30'
]
count_plot_with_custom_bins(df_3_charged_off, 'dti', 'Distribution of Charged Off Loans by Debt to Income Ratio',  bins=no_of_bins, labels=bin_labels, log_scale=False)

no_of_bins = 7
bin_labels = [
    '0-4', '4-8', '8-12', '12-16', '16-20', '20-24', '24-30'
]
count_plot_with_custom_bins(df_3_charged_off, 'dti', 'Distribution of Charged Off Loans by Debt to Income Ratio',  bins=no_of_bins, labels=bin_labels, log_scale=False)

no_of_bins = 9
bin_labels = [
    '0-3', '3-6', '6-9', '9-12', '12-15', '15-18', '18-21', '21-24', '24-30'
]
count_plot_with_custom_bins(df_3_charged_off, 'dti', 'Distribution of Charged Off Loans by Debt to Income Ratio',  bins=no_of_bins, labels=bin_labels, log_scale=False)

no_of_bins = 11
bin_labels = [
    '0-2', '2-4', '4-6', '6-8', '8-10', '10-12', '12-14', '14-16', '16-18', '18-20', '20-30'
]
count_plot_with_custom_bins(df_3_charged_off, 'dti', 'Distribution of Charged Off Loans by Debt to Income Ratio',  bins=no_of_bins, labels=bin_labels, log_scale=False)

In [None]:
# print(df_3_charged_off['inq_last_6mths'].value_counts())

fig, axes = plt.subplots( figsize=(12, 6))
sns.countplot( x='inq_last_6mths', data=df_3_charged_off)
axes.set(yscale = 'log')
plt.show()

In [None]:
print(df_3_charged_off['open_acc'].min(), df_3_charged_off['open_acc'].max())

# min - 2
# max - 38

no_of_bins = 5
bin_labels = [
    '0-8', '8-16', '16-24', '24-32', '32-40'
]
count_plot_with_custom_bins(df_3_charged_off, 'open_acc', 'Distribution of Charged Off Loans by Open Accounts',  bins=no_of_bins, labels=bin_labels, log_scale=False)

# min - 2
# max - 38

no_of_bins = 6
bin_labels = [
    '0-6', '6-12', '12-18', '18-24', '24-30', '30-36'
]
count_plot_with_custom_bins(df_3_charged_off, 'open_acc', 'Distribution of Charged Off Loans by Open Accounts',  bins=no_of_bins, labels=bin_labels, log_scale=False)

# min - 2
# max - 38

no_of_bins = 7
bin_labels = [
    '0-6', '6-12', '12-18', '18-24', '24-30', '30-36', '36-42'
]
count_plot_with_custom_bins(df_3_charged_off, 'open_acc', 'Distribution of Charged Off Loans by Open Accounts',  bins=no_of_bins, labels=bin_labels, log_scale=False)

# min - 2
# max - 38

no_of_bins = 9
bin_labels = [
    '0-4', '4-8', '8-12', '12-16', '16-20', '20-24', '24-28', '28-32', '32-36'
]
count_plot_with_custom_bins(df_3_charged_off, 'open_acc', 'Distribution of Charged Off Loans by Open Accounts',  bins=no_of_bins, labels=bin_labels, log_scale=False)

# min - 2
# max - 38
no_of_bins = 11
bin_labels = [
    '0-3', '3-6', '6-9', '9-12', '12-15', '15-18', '18-21', '21-24', '24-27', '27-30', '30-36'
]
count_plot_with_custom_bins(df_3_charged_off, 'open_acc', 'Distribution of Charged Off Loans by Open Accounts',  bins=no_of_bins, labels=bin_labels, log_scale=False)


In [None]:
fig, axes = plt.subplots(figsize=(15, 5))

# The plot counts charged off loans per state, so the title and x label name
# the state rather than the loan status.
sns.countplot(x='addr_state', data=df_3_charged_off, ax=axes)
axes.set_title('Distribution of Charged Off Loans by State')
axes.set_xlabel('State')
axes.set_ylabel('Count')
# rotate x-axis labels
plt.xticks(rotation=90)
plt.show()

### Observations from Uni-variate Analysis

From the univariate analysis, we can observe that there is a higher possibility of defaulting in the following cases:
* When the 'grade' is 'B'
* When the 'sub_grade' is 'B5'
* When the purpose is 'debt_consolidation'
* When the 'home_ownership' is 'RENT'
* When the 'term' is '36 months'
* When the 'revol_util' is between '60-80'
* When the 'loan_amnt' is between '4k-8k'
* When the 'int_rate' is between '11%-14%'
* When the 'installment' is between '150-300'
* When the 'emp_length' is '10+ years'
* When the 'annual_inc' is between '26k-52k'
* When the 'dti' is between '10-20'
* When the 'open_acc' is between '3-6'
* When the 'addr_state' is 'CA'

## Bi-variate Analysis

In [None]:
# df_3.info()

object_cols = df_3.select_dtypes(include=['object']).columns.tolist()
# 'number' selects every numeric dtype. Listing only int64/float64 would skip
# numeric columns stored with another width (e.g. int32, which astype(int) can
# produce on some platforms).
int_float_cols = df_3.select_dtypes(include='number').columns.tolist()

print(object_cols)
print(int_float_cols)

In [None]:
plt.figure(figsize=(10,10))
sns.barplot(data =df_3_charged_off,x='annual_inc', y='purpose', hue ='loan_status')
plt.show()

In [None]:
plt.figure(figsize=(10,10))
sns.barplot(data =df_3,x='annual_inc', y='grade', hue ='loan_status',order=['A', 'B', 'C', 'D', 'E', 'F', 'G'])
plt.show()

In [None]:
plt.figure(figsize=(10,10))
sns.barplot(data =df_3,x='loan_amnt', y='grade', hue ='loan_status', order=['A', 'B', 'C', 'D', 'E', 'F', 'G'])
plt.show()

In [None]:
plt.figure(figsize=(10,10))
sns.barplot(data =df_3_charged_off,x='annual_inc', y='home_ownership', hue ='loan_status')
plt.show()

In [None]:
plt.figure(figsize=(10,10))
sns.barplot(data =df_3_charged_off,x='annual_inc', y='revol_util_groups', hue ='loan_status')
plt.show()

In [None]:
plt.figure(figsize=(10,10))
sns.barplot(data =df_3_charged_off,x='loan_amnt', y='purpose', hue ='loan_status')
plt.show()

In [None]:
plt.figure(figsize=(10,10))
sns.barplot(data =df_3_charged_off,x='dti', y='purpose', hue ='loan_status')
plt.show()

In [None]:
plt.figure(figsize=(10,10))
sns.barplot(data =df_3_charged_off,x='emp_length', y='purpose', hue ='loan_status')
plt.show()

In [None]:
plt.figure(figsize=(10,10))
sns.barplot(data =df_3_charged_off,x='open_acc', y='purpose', hue ='loan_status')
plt.show()

In [None]:
plt.figure(figsize=(10,10))
sns.barplot(data =df_3_charged_off,x='open_acc', y='home_ownership', hue ='loan_status')
plt.show()

In [None]:
plt.figure(figsize=(10,10))
sns.barplot(data =df_3_charged_off,x='total_acc', y='purpose', hue ='loan_status')
plt.show()

In [None]:
plt.figure(figsize=(10,10))
sns.barplot(data =df_3_charged_off,x='loan_amnt', y='addr_state', hue ='loan_status')
plt.show()

In [None]:
plt.figure(figsize=(10,10))
sns.barplot(data =df_3_charged_off,x='annual_inc', y='addr_state', hue ='loan_status')
plt.show()

In [None]:
plt.figure(figsize=(10,10))
sns.barplot(data =df_3_charged_off,x='inq_last_6mths', y='addr_state', hue ='loan_status')
plt.show()

### Observations from Bi-variate Analysis

From the bi-variate analysis, we can observe that there is a higher possibility of defaulting in the following cases:
* When the 'annual_inc' is between '60k-70k' and the 'purpose' is 'home_improvement' or 'house' 
* When the 'annual_inc' is between '60k-70k' and 'grade' is 'F' or 'G'
* When the 'annual_inc' is between '60k-70k' and 'home_ownership' is 'MORTGAGE'
* When the 'annual_inc' is above '60k' and 'revol_util' is between '80-100'
* When the 'purpose' is 'small_business' and 'loan_amnt' is '14k-16k' 
* When the 'purpose' is 'credit_card' and 'dti' is between '14-16' 
* When the 'purpose' is 'vacation' and 'emp_length' is between '6-7 years'
* When the 'purpose' is 'house' and 'inq_last_6mths' is between '1-2' 
* When the 'home_ownership' is 'MORTGAGE' and 'open_acc' is close to '10'
* When the 'loan_amnt' is between '15k-20k' and 'grade' is 'F'
* When the 'addr_state' is 'WY' and 'loan_amnt' is above '15k'
* When the 'addr_state' is 'DC' and 'annual_inc' is between '70-80k'
* When the 'addr_state' is 'WY' and 'inq_last_6mths' is above '2'