In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns


%matplotlib inline

# Lift pandas' display limits so printed frames show every column and every row.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

# Select seaborn's 'whitegrid' plot style.
sns.set_style('whitegrid')

## Step 1 : Reading Data in Dataframe 
Now we read the data from "loan.csv" into a pandas DataFrame. Because the file is large, it is read in chunks and a progress message is printed after each chunk.

In [3]:
# Load the raw loan CSV in fixed-size chunks so that progress can be reported
# while the large file is being parsed.
file1 = "../data/raw/loan.csv"
ChunkSize = 200000

# Collect the chunks and concatenate them once at the end: calling pd.concat
# inside the loop re-copies every row read so far on each iteration.
chunks = []
for i, chunk in enumerate(pd.read_csv(file1, chunksize=ChunkSize, low_memory=False), start=1):
    chunks.append(chunk)
    print('-->Read Chunk...', i)

# No ignore_index here, so the row labels of the chunks are kept as read.
loan_data = pd.concat(chunks)
del chunks  # drop the extra references to the per-chunk frames

-->Read Chunk... 1
-->Read Chunk... 2
-->Read Chunk... 3
-->Read Chunk... 4
-->Read Chunk... 5
-->Read Chunk... 6
-->Read Chunk... 7
-->Read Chunk... 8
-->Read Chunk... 9
-->Read Chunk... 10
-->Read Chunk... 11
-->Read Chunk... 12


## Step 2 : Checking data size 
Let's quickly check the shape of the data. This gives us an idea of how large the current dataset is. As we can see, it contains more than 2 million rows and 145 columns. Not all of these columns are useful for our analysis.

In [4]:
# Show the dimensions of the loaded data as a (rows, columns) tuple.
n_rows, n_cols = loan_data.shape
print((n_rows, n_cols))

(2260668, 145)


## Step 3 : Calculating Percentage of missing data per column 
Here we create a DataFrame called "df_null" that holds, for each column, the count and the percentage of missing values. We then print the columns with the largest share of missing data, sorted by percentage in descending order. We treat any column with more than 60% of its values missing as useless for our analysis. As we can see, some columns such as "url", "id" and "member_id" are 100% missing, and many other columns are also missing more than 60% of their data. We will simply drop every column that has more than 60% of its data missing.

In [5]:
# Summarise the missing values of every column: absolute count and percentage of rows.
# The per-column null counts are computed once and reused for both columns.
null_counts = loan_data.isnull().sum()
df_null = pd.DataFrame({'Count': null_counts, 'Percent': 100*null_counts/len(loan_data)})

# Preview the columns above the 60% threshold, worst first. The strict '>' matches
# the rule used to build the drop list ("more than 60%" missing).
print(df_null[df_null['Percent'] > 60].sort_values(by='Percent', ascending=False).round(1))

                                              Count  Percent
id                                          2260668    100.0
url                                         2260668    100.0
member_id                                   2260668    100.0
orig_projected_additional_accrued_interest  2252242     99.6
hardship_dpd                                2250055     99.5
hardship_length                             2250055     99.5
hardship_reason                             2250055     99.5
hardship_status                             2250055     99.5
deferral_term                               2250055     99.5
hardship_amount                             2250055     99.5
hardship_start_date                         2250055     99.5
hardship_end_date                           2250055     99.5
payment_plan_start_date                     2250055     99.5
hardship_loan_status                        2250055     99.5
hardship_type                               2250055     99.5
hardship_payoff_balance_

## Step 4 : Dropping column with more than 60% missing data 
We build a list of the columns that are missing more than 60% of their data, drop those columns, and store the result in a new DataFrame called df_clean. We then print the shape of the new DataFrame to see how much the data has been reduced. The total number of columns goes down from 145 to 103.

In [6]:
# Names of the columns whose share of missing values is above 60%.
mostly_missing = df_null['Percent'] > 60
list_60 = df_null.index[mostly_missing].tolist()
print(len(list_60))

# Remove those columns and report the dimensions of the reduced frame.
df_clean = loan_data.drop(columns=list_60)
print(df_clean.shape)

42
(2260668, 103)


In [7]:
# Draw a 40% random sample of the cleaned rows. A fixed random_state makes the
# sample identical on every run, so the exported file is reproducible.
df_sample = df_clean.sample(frac=0.4, random_state=42)

In [9]:
# Write the sampled rows to CSV; index=False leaves the row index out of the file.
sample_csv_path = '../data/raw/loan_sample.csv'
df_sample.to_csv(sample_csv_path, index=False)