In [1]:
import category_encoders as ce
from joblib import dump, load
import numpy as np
import pandas as pd
from sklearn.pipeline import make_pipeline
from xgboost import XGBRegressor

history = pd.read_csv('LoanStats_securev1_2019Q1.csv.zip', engine='python', skiprows=1, skipfooter=2)

condition = (history.grade.isin(['A','B','C','D'])) & (history.term==' 36 months')
history = history[condition]

history['Interest Rate'] = history['int_rate'].str.strip('%').astype(float)

history = history.rename(columns=                     
    {'annual_inc': 'Annual Income', 
     'fico_range_high': 'Credit Score', 
     'funded_amnt': 'Loan Amount', 
     'title': 'Loan Purpose'})

history['Monthly Debts'] = history['Annual Income'] / 12 * history['dti'] / 100

columns = ['Annual Income', 
           'Credit Score', 
           'Loan Amount', 
           'Loan Purpose', 
           'Monthly Debts', 
           'Interest Rate']

history = history[columns]
history = history.dropna()

X = history.drop(columns='Interest Rate')
y = history['Interest Rate']
y_log = np.log1p(y)

pipeline = make_pipeline(
    ce.OneHotEncoder(use_cat_names=True), 
    XGBRegressor(n_estimators=200, n_jobs=-1)
)

pipeline.fit(X, y_log)
dump(pipeline, 'pipeline.joblib')

history['Annual Income'] = history['Annual Income'].astype(int)
history['Monthly Debts'] = history['Monthly Debts'].round(2)
history.to_csv('lending-club.csv', index=False)

In [6]:
%matplotlib inline
import matplotlib.pyplot as plt
from plotly.offline import init_notebook_mode, iplot
from plotly.tools import mpl_to_plotly
import seaborn as sns
init_notebook_mode(connected=True)

y_pred_log = pipeline.predict(X)
y_pred = np.expm1(y_pred_log)

fig, ax = plt.subplots()
sns.distplot(history['Interest Rate'], hist=False, kde=True, ax=ax, label='Actual')
sns.distplot(y_pred, hist=False, kde=True, ax=ax, label='Predicted')
ax.set_title('Distribution of predictions is simpler and less spread than actuals')
ax.legend().set_visible(False)

plotly_fig = mpl_to_plotly(fig)
plotly_fig['layout']['showlegend'] = True

iplot(plotly_fig)

In [3]:
from sklearn.inspection import partial_dependence
features = [X.columns.get_loc('Annual Income'), 
            X.columns.get_loc('Credit Score')]
X_encoded = pipeline.named_steps['onehotencoder'].transform(X)
model = pipeline.named_steps['xgbregressor']
partial_dependence(model, X_encoded, features=[0,1])

ValueError: feature_names mismatch: ['Annual Income', 'Credit Score', 'Loan Amount', 'Loan Purpose_Home improvement', 'Loan Purpose_Credit card refinancing', 'Loan Purpose_Debt consolidation', 'Loan Purpose_Medical expenses', 'Loan Purpose_Other', 'Loan Purpose_Car financing', 'Loan Purpose_Major purchase', 'Loan Purpose_Business', 'Loan Purpose_Home buying', 'Loan Purpose_Moving and relocation', 'Loan Purpose_Vacation', 'Loan Purpose_Green loan', 'Monthly Debts'] ['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12', 'f13', 'f14', 'f15']
expected Loan Purpose_Vacation, Loan Purpose_Business, Loan Purpose_Home buying, Loan Purpose_Major purchase, Loan Amount, Loan Purpose_Other, Loan Purpose_Credit card refinancing, Loan Purpose_Debt consolidation, Monthly Debts, Credit Score, Loan Purpose_Moving and relocation, Loan Purpose_Home improvement, Loan Purpose_Green loan, Annual Income, Loan Purpose_Medical expenses, Loan Purpose_Car financing in input data
training data did not have the following fields: f10, f0, f8, f11, f1, f2, f14, f3, f4, f9, f6, f15, f13, f7, f5, f12

https://plot.ly/scikit-learn/plot-partial-dependence/

In [4]:
import plotly.graph_objs as go

import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble.partial_dependence import plot_partial_dependence
from sklearn.ensemble.partial_dependence import partial_dependence
from sklearn.datasets.california_housing import fetch_california_housing

cal_housing = fetch_california_housing()

# split 80/20 train-test
X_train, X_test, y_train, y_test = train_test_split(cal_housing.data,
                                                    cal_housing.target,
                                                    test_size=0.2,
                                                    random_state=1)
names = cal_housing.feature_names

print("Training GBRT...")
clf = GradientBoostingRegressor(n_estimators=100, max_depth=4,
                                learning_rate=0.1, loss='huber',
                                random_state=1)
clf.fit(X_train, y_train)
print(" done.")

fig.suptitle('Partial dependence of house value on nonlocation features\n'
             'for the California housing dataset')
print('Custom 3d plot via ``partial_dependence``')
fig = plt.figure()

target_feature = (1, 5)
pdp, axes = partial_dependence(clf, target_feature,
                               X=X_train, grid_resolution=50)
XX, YY = np.meshgrid(axes[0], axes[1])
Z = pdp[0].reshape(list(map(np.size, axes))).T

surf = go.Surface(x=XX, y=YY, z=Z)

layout = go.Layout(title='Partial dependence of house value on median age and '
                          'average occupancy',
                   scene=dict(xaxis=dict(title=names[target_feature[0]],
                                         showticklabels=False),
                              yaxis=dict(title=names[target_feature[1]],
                                         showticklabels=False),
                              zaxis=dict(title='Partial dependence',
                                         showticklabels=False))
                   )
fig = go.Figure(data = [surf], layout=layout)
iplot(fig)





Training GBRT...
 done.
Custom 3d plot via ``partial_dependence``



Function partial_dependence is deprecated; The function ensemble.partial_dependence has been deprecated in favour of inspection.partial_dependence in 0.21 and will be removed in 0.23.



<Figure size 432x288 with 0 Axes>