## P2P Lending Dataset Preparation

This script merges the loan data published by [Lending Club](https://www.lendingclub.com) for the years 2007 to 2016 into a single dataset.
In this work, we tackle only "Charged Off" and "Fully Paid" loans.
The main steps taken to prepare the dataset are the following:

1. Data load and header sanity check
2. Data concatenation
3. Removal and treatment of string variables
4. Removal of instances (loan requests) with many missing values
5. Removal of features (attributes) with many missing values
6. Removal of variables of low variability
7. Removal of features to avoid data leakage
8. Missing values imputation

### 1. Data load and header sanity check

In [1]:
import pandas as pd
import numpy as np
from IPython.display import display
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# One CSV per period, listed in chronological order. The first line of each
# file is skipped (skiprows=[0]) so that the second line is used as the header.
csv_paths = [
    "./LoanStats3a_securev1.csv",
    "./LoanStats3b_securev1.csv",
    "./LoanStats3c_securev1.csv",
    "./LoanStats3d_securev1.csv",
    "./LoanStats_securev1_2016Q1.csv",
    "./LoanStats_securev1_2016Q2.csv",
    "./LoanStats_securev1_2016Q3.csv",
    "./LoanStats_securev1_2016Q4.csv",
]

all_dfs = [pd.read_csv(path, low_memory=False, skiprows=[0]) for path in csv_paths]

# Keep the per-period names available for ad-hoc inspection of a single file.
(df2007to2011, df2012to2013, df2014, df2015,
 df2016Q1, df2016Q2, df2016Q3, df2016Q4) = all_dfs

#### Checking that all files share the same set of columns

In [3]:
# Every file must expose the same set of columns as the first one (column
# order is irrelevant). pd.concat would otherwise pad the missing columns
# with NaN instead of failing, so a mismatch is reported explicitly here.
columnsFirstDF = list(all_dfs[0].columns.values)
reference_columns = set(columnsFirstDF)
mismatching = [position for position, frame in enumerate(all_dfs)
               if set(frame.columns.values) != reference_columns]
error = bool(mismatching)

if error:
    print("Subfiles are not matching!")
    # Name the offending files and columns so the problem can be located.
    for position in mismatching:
        differing = sorted(
            reference_columns ^ set(all_dfs[position].columns.values), key=str)
        print("  all_dfs[{}] differs from all_dfs[0] in columns: {}".format(
            position, differing))

### 2. Data concatenation

In [4]:
# Stack all files on top of each other. Moving 'id' into the index and back
# out again leaves 'id' as the first column and replaces the repeated
# per-file row labels with a fresh 0..n-1 index, which is then sorted.
df = (
    pd.concat(all_dfs)
    .set_index('id')
    .reset_index()
    .sort_index()
)
print(df.shape)

(1321864, 128)


#### Filters dataset to contain only "Charged Off" and "Fully Paid" loans

In [5]:
# Keep only loans whose outcome is final. The explicit .copy() detaches the
# result from the concatenated frame: the cells below modify `df` in place
# (drop, column assignment), which on a plain .loc slice triggers pandas'
# SettingWithCopyWarning.
df = df.loc[df.loan_status.isin(["Charged Off", "Fully Paid"])].copy()
print(df.shape)

(578331, 128)


### 3. Removal and treatment of string variables

In [6]:
# Isolate the non-numeric columns to see what still needs treatment, and let
# notebook displays show up to 30 columns.
df_string = df.select_dtypes(exclude="number")
print(df_string.shape)
pd.options.display.max_columns = 30
# display(df_string.head(1))


#### Converts some features to numeric
def convertToNumeric(dataframe, list_of_attributes):
    """Strip non-numeric characters from the given columns and make them numeric.

    Every run of characters other than digits and '.' is removed from each
    value (e.g. ' 36 months' -> '36', '10.65%' -> '10.65'), then the column
    is converted with ``pd.to_numeric``. A column that still cannot be parsed
    keeps its stripped string values.

    Args:
        dataframe: DataFrame modified in place.
        list_of_attributes: iterable of column names to convert.

    Returns:
        None; ``dataframe`` is updated in place.
    """
    for f in list_of_attributes:
        # Assign the result back instead of calling replace(inplace=True) on
        # dataframe[f]: the chained in-place form operates on a temporary and
        # is not guaranteed to reach the DataFrame (Copy-on-Write).
        cleaned = dataframe[f].replace(regex=True, to_replace=r'[^\d.]+', value=r'')
        try:
            cleaned = pd.to_numeric(cleaned)
        except (ValueError, TypeError):
            # Deliberate best effort, matching the former errors='ignore'
            # (deprecated in pandas): leave unparseable columns as strings.
            pass
        dataframe[f] = cleaned

# Columns whose values are numbers wrapped in text (units, '%', 'xx' suffix).
features_to_convert_to_numeric = ['term', 'zip_code', 'revol_util', 'int_rate']
convertToNumeric(dataframe=df, list_of_attributes=features_to_convert_to_numeric)


#### Drops pure string text features
# Free-text columns are removed outright; what remains non-numeric is
# collected again in df_string.
df.drop(columns=['emp_title', 'url', 'desc', 'title'], inplace=True)
df_string = df.select_dtypes(exclude="number")


#### Applies one-hot-encoding to categorical variables
def oneHotEncoding(dataframe, columnsToEncode):
    """Replace categorical columns by 0/1 indicator columns.

    For each feature, one indicator column per distinct value is built with
    ``pd.get_dummies``; the indicator columns are named after the values
    themselves (no prefix). The last indicator of each feature is dropped
    because it is implied by the others (n-1 encoding).

    Args:
        dataframe: source DataFrame. It is NOT modified; a new frame is
            returned.
        columnsToEncode: iterable of column names to encode.

    Returns:
        A tuple ``(encoded, new_dummies)``: ``encoded`` is the DataFrame with
        the original features replaced by their indicators, and
        ``new_dummies`` lists every indicator name generated, including the
        last one of each feature, which is not present in ``encoded``.
    """
    new_dummies = []
    encoded = dataframe
    for feature in columnsToEncode:
        dummies = pd.get_dummies(encoded[feature])
        new_dummies.extend(list(dummies.columns))
        # iloc[:, :-1] keeps all but the last indicator and, unlike indexing
        # the last column by position, also works when the feature produced
        # no indicator columns at all (e.g. an entirely missing feature).
        # drop() without inplace leaves the caller's frame untouched.
        encoded = encoded.drop(feature, axis=1).join(dummies.iloc[:, :-1])
    return encoded, new_dummies

# for f in df_string.columns.values:
#     display(df[f].value_counts())

# Categorical columns replaced by indicator columns. `new_dummies` collects the
# generated indicator names; the low-variability filter further down uses it
# to leave those columns alone.
categorical_features = ['grade', 'sub_grade', 'emp_length', 'home_ownership', 
                        'verification_status', 'pymnt_plan', 'purpose', 
                        'addr_state', 'initial_list_status', 'application_type']
df, new_dummies = oneHotEncoding(df, categorical_features)


#### TREATS DATE COLUMNS
from datetime import datetime
def separateDates(dataframe, columns):
    """Split 'Mon-YYYY' date columns into separate month and year columns.

    Each column ``c`` in ``columns`` is parsed with the format ``'%b-%Y'``,
    removed, and replaced by two string columns appended at the end of the
    frame: ``c + '_month'`` (zero-padded, e.g. '01') and ``c + '_year'``
    (e.g. '2015'). Missing dates become empty strings in both new columns.

    Args:
        dataframe: DataFrame modified in place.
        columns: iterable of date column names to split.

    Returns:
        The same ``dataframe`` object, for convenience.
    """
    for f in columns:
        parsed = pd.to_datetime(dataframe[f], format='%b-%Y')
        dataframe.drop(f, axis=1, inplace=True)
        # Write to the frame that was passed in. The previous version assigned
        # to (and returned) the module-level `df`, so it only worked when the
        # argument happened to be that very object.
        # strftime yields a missing value for NaT; turn it into ''.
        dataframe[f + '_month'] = parsed.dt.strftime('%m').fillna('')
        dataframe[f + '_year'] = parsed.dt.strftime('%Y').fillna('')
    return dataframe

# Date columns to be replaced by '<name>_month' / '<name>_year' columns.
date_columns = ['issue_d', 'earliest_cr_line', 'last_pymnt_d', 
                'next_pymnt_d', 'last_credit_pull_d']
# all of these dates are in the mmm-YYYY format
# and we wish to break them down into two separate columns: mm and YYYY
df = separateDates(df, date_columns)

# Shape after all string/categorical/date treatment.
print(df.shape)
# display(df.head(1))
# print(new_dummies)

(578331, 26)
(578331, 243)


### 4. Removal of instances (loan requests) with many missing values

In [7]:
#### Drops loan requests that are more than 90% empty
# A row survives only if at least 10% of its columns hold a value.
min_filled_per_row = 0.1 * df.shape[1]
df.dropna(axis="index", thresh=min_filled_per_row, inplace=True)
display(df.shape)

(578331, 243)

### 5. Removal of features (attributes) with many missing values

In [8]:
#### Drops features that are more than 70% empty
# A column survives only if at least 30% of its rows hold a value.
min_filled_per_column = 0.3 * df.shape[0]
df.dropna(axis="columns", thresh=min_filled_per_column, inplace=True)
display(df.shape)

(578331, 209)

### 6. Removal of variables of low variability (below 25%)

In [9]:
# "Variability" of a column = share of its non-missing values that differ from
# the most frequent value. Columns below 25% are removed; the target
# ('loan_status') and the one-hot indicator columns are never considered.
toRemove = []
for column_name in df.columns.values:
    if column_name == 'loan_status' or column_name in new_dummies:
        continue
    frequencies = df[column_name].value_counts()
    variability = 1.0 - (float(frequencies.max()) / frequencies.sum())
    if variability < .25:
        print("{} has a variability of {}".format(column_name, variability))
        toRemove.append(column_name)

# Remove all flagged columns in one go.
df.drop(toRemove, axis=1, inplace=True)
print(df.shape)

term has a variability of 0.23898252039057222
delinq_2yrs has a variability of 0.18065606028381676
pub_rec has a variability of 0.15178677954320274
out_prncp has a variability of 0.0
out_prncp_inv has a variability of 0.0
total_rec_late_fee has a variability of 0.02915631359895976
recoveries has a variability of 0.1213595674449407
collection_recovery_fee has a variability of 0.10549840835092705
collections_12_mths_ex_med has a variability of 0.011065669447926973
policy_code has a variability of 0.0
acc_now_delinq has a variability of 0.004236328331007622
tot_coll_amt has a variability of 0.13598230628431973
chargeoff_within_12_mths has a variability of 0.007214560546452864
delinq_amnt has a variability of 0.0031279665105277132
num_accts_ever_120_pd has a variability of 0.22768339043506824
num_tl_120dpd_2m has a variability of 0.0007747292986883814
num_tl_30dpd has a variability of 0.0031151734171868117
num_tl_90g_dpd_24m has a variability of 0.05583650130629991
pub_rec_bankruptcies has

### 7. Removal of features to avoid data leakage

In [10]:
# Columns dropped before modelling. Only 'id' is active: every other entry
# below is commented out and therefore stays in the dataset.
# NOTE(review): several of the disabled names no longer exist at this point
# anyway -- the date columns were split into '<name>_month'/'<name>_year' by
# separateDates, and 'title'/'url' were dropped earlier -- so re-enabling the
# list as-is would make drop() fail on missing labels. Confirm which
# post-outcome columns (payment totals, last payment dates, ...) should really
# be removed to avoid leakage.
featuresToRemove = ['id',
#                     'issue_d',
#                     'last_credit_pull_d',
#                     'last_pymnt_amnt',
#                     'last_pymnt_d',
#                     'member_id',
#                     'next_pymnt_d',
#                     'policy_code',
#                     'revol_bal',
#                     'revol_bal_joint',
#                     'revol_util',
#                     'title',
#                     'total_pymnt',
#                     'total_pymnt_inv',
#                     'total_rec_int',
#                     'total_rec_late_fee',
#                     'total_rec_prncp',
#                     'url']
                   ]
df.drop(featuresToRemove, axis = 1, inplace = True)
display(df.head())

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,int_rate,installment,annual_inc,loan_status,zip_code,dti,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,open_acc,revol_bal,...,VT,WA,WI,WV,f,DIRECT_PAY,INDIVIDUAL,issue_d_month,issue_d_year,earliest_cr_line_month,earliest_cr_line_year,last_pymnt_d_month,last_pymnt_d_year,last_credit_pull_d_month,last_credit_pull_d_year
0,5000.0,5000.0,4975.0,10.65,162.87,24000.0,Fully Paid,860,27.65,735.0,739.0,1.0,,3.0,13648.0,...,0,0,0,0,1,0,1,12,2011,1,1985,1,2015,4,2017
1,2500.0,2500.0,2500.0,15.27,59.83,30000.0,Charged Off,309,1.0,740.0,744.0,5.0,,3.0,1687.0,...,0,0,0,0,1,0,1,12,2011,4,1999,4,2013,10,2016
2,2400.0,2400.0,2400.0,15.96,84.33,12252.0,Fully Paid,606,8.72,735.0,739.0,2.0,,2.0,2956.0,...,0,0,0,0,1,0,1,12,2011,11,2001,6,2014,4,2017
3,10000.0,10000.0,10000.0,13.49,339.31,49200.0,Fully Paid,917,20.0,690.0,694.0,1.0,35.0,10.0,5598.0,...,0,0,0,0,1,0,1,12,2011,2,1996,1,2015,4,2016
4,3000.0,3000.0,3000.0,12.69,67.79,80000.0,Fully Paid,972,17.94,695.0,699.0,0.0,38.0,15.0,27783.0,...,0,0,0,0,1,0,1,12,2011,1,1996,1,2017,1,2017


### 8. Missing values imputation

In [11]:
# Fill the remaining gaps column by column: float64/int64 columns get their
# median, every other column gets its most frequent value.
for f in df.columns.values:
    column = df[f]
    if not column.isnull().any():
        continue  # nothing to impute in this column
    if column.dtype == np.float64 or column.dtype == np.int64:
        fill_value = column.median()
    else:
        observed = column.value_counts()
        if observed.empty:
            # Entirely missing column: there is no most-frequent value to use.
            continue
        fill_value = observed.index[0]
    # Assign back rather than df[f].fillna(..., inplace=True): the chained
    # in-place form works on a temporary Series and does not update `df`
    # under pandas Copy-on-Write.
    df[f] = column.fillna(fill_value)

## Saves this final DF to a csv file

In [18]:
# Write the cleaned dataset next to the notebook; index=False leaves the row
# index out of the file.
df.to_csv("./p2p_lendingclub_clean.csv", index = False)

# OTHER STUFF

#### Converting n/a into NaNs and NaNs into 0s

In [None]:
# Turns the literal string 'n/a' into NaN, then replaces every NaN in the
# frame with 0.0 (both in place).
# NOTE(review): this cell sits after the imputation and CSV export and has no
# execution count -- presumably leftover exploration; confirm before relying on it.
df.replace('n/a', np.nan, inplace = True)
df.replace(np.nan, 0.0, inplace = True)

#### Dropping irrelevant features that would strongly bias our models

#### Checking data types

In [None]:
# display(df.dtypes)

#### Checking how many missing values we have per feature

In [None]:
# display(df.isnull().sum() / df.shape[0])

#### Getting rid of non-numeric data

In [None]:
# df_num = df.select_dtypes(include=[np.number])
# display(df_num.shape)
# display(df_num)

#### Checking the correlation between features

In [15]:
# Absolute pairwise correlation between features. Only numeric and boolean
# columns are used: `df` still holds string columns (e.g. 'loan_status' and
# the '<date>_month'/'<date>_year' columns), and recent pandas versions raise
# on them in corr() instead of silently skipping them.
df_correlation = df.select_dtypes(include=[np.number, 'bool']).corr().abs()
display(df_correlation.head(5))
#display(df_correlation)


# import seaborn as sns
# sns.heatmap(df_correlation,
#             xticklabels=df_correlation.columns.values,
#             yticklabels=df_correlation.columns.values)

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,int_rate,installment,annual_inc,zip_code,dti,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,open_acc,revol_bal,revol_util,...,PA,RI,SC,SD,TN,TX,UT,VA,VT,WA,WI,WV,f,DIRECT_PAY,INDIVIDUAL
loan_amnt,1.0,0.998905,0.996433,0.164108,0.953794,0.343733,0.000146,0.010949,0.097511,0.097511,0.010189,0.026296,0.196406,0.327251,0.103727,...,0.007766,0.006231,0.002857,0.00452,0.000809,0.030464,0.000409,0.016641,0.005981,0.003217,0.008672,0.003023,0.074834,0.003844,0.020541
funded_amnt,0.998905,1.0,0.997746,0.164593,0.95545,0.343434,0.000269,0.011273,0.095408,0.095407,0.010478,0.026274,0.197126,0.327021,0.104342,...,0.007733,0.006301,0.002881,0.004478,0.000595,0.030476,0.000447,0.016663,0.005961,0.00336,0.008635,0.002933,0.076943,0.003775,0.020686
funded_amnt_inv,0.996433,0.997746,1.0,0.165536,0.953116,0.342465,0.000576,0.012041,0.092713,0.092712,0.011791,0.024845,0.197991,0.326093,0.105314,...,0.007519,0.006286,0.002929,0.004387,0.000173,0.030461,0.000436,0.016515,0.005931,0.003284,0.008579,0.002721,0.081333,0.00366,0.020864
int_rate,0.164108,0.164593,0.165536,1.0,0.159958,0.062902,0.005846,0.05635,0.458455,0.458451,0.22302,0.017819,0.007037,0.015379,0.28253,...,0.001621,0.000144,0.000261,0.001371,0.008895,0.005956,0.000133,0.000738,0.00321,0.001651,0.005052,0.001292,0.069017,0.046714,0.03926
installment,0.953794,0.95545,0.953116,0.159958,1.0,0.336469,0.00647,0.010489,0.047084,0.047084,0.011859,0.028809,0.186511,0.314858,0.125522,...,0.01203,0.006441,0.003623,0.003863,0.001332,0.032988,0.001816,0.013516,0.00607,0.004421,0.009856,0.00539,0.036317,0.000638,0.021357


#### Let's take a look at string data

#### Runs a decision tree to see what's going on

In [None]:
# from sklearn import tree
# from sklearn.model_selection import train_test_split

# X = df_num
# y = df['loan_status']

# xTrain, xTest, yTrain, yTest = train_test_split(X, y, test_size=0.33, random_state=42)

# clf = tree.DecisionTreeClassifier()
# clf.fit(xTrain, yTrain)


In [None]:
# df.groupby('loan_status').count()

In [None]:
# from IPython.display import Image  
# import pydotplus 
# dot_data = tree.export_graphviz(clf, out_file=None, 
#                          feature_names=X.columns.values,  
#                          class_names=y,  
#                          filled=True, rounded=True,  
#                          special_characters=True)  
# graph = pydotplus.graph_from_dot_data(dot_data)  
# Image(graph.create_png())