### Work in Progress
This is a second version of the lending Jupyter notebook, created to work on a subset of the original Lending Club dataset. The subset contains only 27 of the original 150+ features: the ones most commonly used for teaching classification problems in machine learning.

In [1]:
import warnings
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Stats models 
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer as Imputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Metrics
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
warnings.filterwarnings('ignore')

In [126]:
# Read in a subset of the dataset: only the columns listed in `usecols`
# are parsed from the CSV file.
usecols = [
    "loan_amnt",
    "term",
    "int_rate",
    "installment",
    "grade",
    "sub_grade",
    "emp_title",
    "emp_length",
    "home_ownership",
    "annual_inc",
    "verification_status",
    "issue_d",
    "loan_status",
    "purpose",
    "title",
    "zip_code",
    "addr_state",
    "dti",
    "earliest_cr_line",
    "open_acc",
    "pub_rec",
    "revol_bal",
    "revol_util",
    "total_acc",
    "initial_list_status",
    "application_type",
    "mort_acc",
    "pub_rec_bankruptcies",
]

df = pd.read_csv(
    "../dataset/lendingclub/accepted_2007_to_2018Q4.csv",
    usecols=usecols,
)

In [127]:
# Perform some data cleaning for the numerical data similar to lending_v1.ipynb
# Certain features were not part of the 27 features.
#
# Every step below rebinds `df` to a new frame instead of writing into a slice
# or mutating in place, so no chained-assignment (SettingWithCopy) writes occur.

# Drop the last two rows of the file.
# NOTE(review): presumably non-loan footer rows of the CSV - confirm against
# the raw file. Re-running this cell without re-reading the CSV trims two more
# rows each time, because `df` is rebuilt from itself.
df = df.iloc[:-2, :]

# Subset data frame based on loan status - Charged Off and Fully Paid
mapping = {"Does not meet the credit policy. Status:Charged Off": "Charged Off",
           "Default": "Charged Off",
           "Does not meet the credit policy. Status:Fully Paid": "Fully Paid"}

# `assign` returns a new frame, so the relabelled column is not written into
# the slice produced by `iloc` above.
df = df.assign(loan_status=df["loan_status"].replace(mapping))

df = df[~df["loan_status"].isin(["Current",
                                 "In Grace Period",
                                 "Late (16-30 days)",
                                 "Late (31-120 days)"])]

# Determine the percentage of missing values from each feature.
ms_values_count = df.isnull().sum()
ms_values_perc = 100 * ms_values_count / len(df)

ms_values_df = pd.DataFrame({"ms_values_count": ms_values_count,
                             "ms_values_perc": ms_values_perc})
ms_values_df = ms_values_df.sort_values("ms_values_perc", ascending=False)

# Remove features with more than 50% missing values.
feat_rm = list(ms_values_df[ms_values_df["ms_values_perc"] > 50].index)
df = df.drop(columns=feat_rm)

# Drop rows that contain missing values.
df = df.dropna(axis=0, how="any")

* emp_title

There are too many distinct employment titles in the feature `emp_title` for one-hot encoding. I tried binning titles that contain certain keywords (e.g. `["general manager", "sales manager"] -> "manager"`) down to 30 job titles and converting the remaining titles into `"others"`. The resulting 31 job titles were then one-hot encoded. Random forest models were trained once with the one-hot encoded titles and once without the employment title. There was no difference in the models' accuracy or in the other classification metrics, so `emp_title` is removed from the analysis.

In [129]:
# Remove the employment title from the analysis. Rebinding `df` (instead of
# dropping in place) and ignoring an already-missing column keeps this cell
# safe to re-run.
df = df.drop(columns="emp_title", errors="ignore")

In [130]:
# Display the (rows, columns) of the data frame at this point in the notebook.
df.shape

(1199638, 27)

In [84]:
# Deprecated
# Codes for binning employment titles for one-hot encoding.
#
# This cell reads df["emp_title"], so it only does work while that column is
# still present in `df`. If the column has already been dropped, the cell
# reports that and skips instead of raising a KeyError.

# Ordered (keyword, label) rules used to bin the employment titles. Each rule
# relabels every title that contains the keyword. Rules are applied from top
# to bottom, so when a title contains several keywords the earliest rule wins.
emp_title_rules = [
    ("teacher", "teacher"),
    ("manager", "manager"),
    ("owner", "owner"),
    ("driver", "driver"),
    ("supervisor", "supervisor"),
    ("engineer", "engineer"),
    ("director", "director"),
    ("president", "president"),
    ("ceo", "president"),
    ("police", "police"),
    ("account", "accountant"),
    ("nurse", "registered nurse"),
    ("technician", "technician"),
    ("attorney", "attorney"),
    ("mechanic", "mechanic"),
    ("analyst", "analyst"),
    ("server", "server"),
    ("foreman", "foreman"),
    ("executive", "executive"),
    ("administrative", "executive"),
    ("administrator", "executive"),
    ("operator", "operator"),
    ("controller", "controller"),
    ("paralegal", "paralegal"),
    ("consultant", "consultant"),
    ("clerk", "clerk"),
    ("principal", "principal"),
    ("professor", "professor"),
    ("lpn", "lpn"),
    ("machinist", "machinist"),
    ("customer service", "customer service"),
    ("electrician", "electrician"),
    ("cna", "cna"),
]


def replace_title(title):
    """Return `title` unchanged if it is listed in `emp_list`, else "others".

    `emp_list` is the module-level list of the most frequent binned titles,
    built below before this function is applied.
    """
    if title not in emp_list:
        return "others"
    return title


if "emp_title" in df.columns:
    # Feature engineering with the employment title provided by the borrowers
    emp_title = df["emp_title"].str.lower()
    # Exact-match replacement: only titles that are exactly "rn" are expanded.
    emp_title = emp_title.replace({"rn": "registered nurse"})

    # Bin the categories based on keywords found in the title. The keywords are
    # plain substrings, so regex matching is switched off; na=False keeps the
    # mask boolean even if a title is missing.
    for keyword, label in emp_title_rules:
        matches = emp_title.str.contains(keyword, regex=False, na=False)
        emp_title.loc[matches] = label

    # Keep the 30 most frequent titles and replace the remaining ones with "others"
    emp_list = list(emp_title.value_counts().head(30).index)
    emp_title = emp_title.map(replace_title)
    df["new_emp_title"] = emp_title
else:
    print("Skipping deprecated emp_title binning: column 'emp_title' is not in df.")