In [None]:
# https://www.kaggle.com/lct14558/imbalanced-data-why-you-should-not-use-roc-curve

In [None]:
import sys
import os
import math
from tqdm import tqdm
sys.path.append(os.path.join(os.path.expanduser('~'), 'projects'))
import j_utils.munging as mg
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.metrics import classification_report,confusion_matrix,roc_curve,auc,precision_recall_curve,roc_curve
from sklearn.linear_model import LogisticRegression

# Project root (~/projects/lendingclub) and the data directory inside it.
ppath = os.path.join(os.path.expanduser('~'), 'projects', 'lendingclub')
dpath = os.path.join(ppath, 'data')
# Raise the pandas column display limit to 999.
pd.set_option('display.max_columns', 999)

In [None]:
# Column names kept together for inspecting maturity, target and date fields.
check_cols = [
    'maturity_time',
    'maturity_paid',
    'maturity_time_stat_adj',
    'maturity_paid_stat_adj',
    'target_loose',
    'target_strict',
    'loan_status',
    'issue_d',
    'end_d',
    'id',
]
# Loan statuses grouped as "good" (presumably the non-defaulting ones -- TODO confirm).
good_statuses = [
    'paid',
    'current',
]

In [None]:
# Load the loan tables from feather files under dpath. Only eval_loan_info and
# base_loan_info are read; the other two loads are disabled.
eval_loan_info = pd.read_feather(os.path.join(dpath,'eval_loan_info.fth'))
# scaled_pmt_hist = pd.read_feather(os.path.join(dpath,'scaled_pmt_hist.fth'))
base_loan_info = pd.read_feather(os.path.join(dpath,'base_loan_info.fth'))
# str_loan_info = pd.read_feather(os.path.join(dpath,'str_loan_info.fth'))

In [None]:
# get loans that are >= .95 maturity time, maturity paid, or status is in defaulted, charged_off, paid
# NOTE(review): this runs before the "quick fix" cell that patches the -1 values in the
# *_stat_adj columns, so on unpatched data those rows are judged on -1 -- confirm the order.
mature_by_time = eval_loan_info['maturity_time_stat_adj'] >= .95
mature_by_paid = eval_loan_info['maturity_paid_stat_adj'] >= .95
has_final_status = eval_loan_info['loan_status'].isin(['paid', 'charged_off', 'defaulted'])
trainable_loan_ids = eval_loan_info.loc[
    mature_by_time | mature_by_paid | has_final_status, 'id'
].values

# fix step to data

In [None]:
# quick fix to eval_loan_info, future times see if this cell needs to be run
# For each status-adjusted maturity column, rows holding -1 (apparently an "unset"
# marker -- TODO confirm) are patched: 'paid' loans become 1, 'current' loans take the
# unadjusted maturity value, and everything else is left as it was.
for adj_col, raw_col in (('maturity_time_stat_adj', 'maturity_time'),
                         ('maturity_paid_stat_adj', 'maturity_paid')):
    is_unset = eval_loan_info[adj_col] == -1
    is_paid = eval_loan_info['loan_status'] == 'paid'
    is_current = eval_loan_info['loan_status'] == 'current'
    patched_current = np.where(is_unset & is_current,
                               eval_loan_info[raw_col],
                               eval_loan_info[adj_col])
    eval_loan_info[adj_col] = np.where(is_unset & is_paid, 1, patched_current)

In [None]:
# Write the patched frame back to the same feather file it was loaded from (overwrites it).
eval_loan_info.to_feather(os.path.join(dpath,'eval_loan_info.fth'))

In [None]:
# Sanity check: number of rows whose maturity_paid_stat_adj is still negative.
(eval_loan_info['maturity_paid_stat_adj'] < 0).sum()

In [None]:
# Sanity check: number of rows whose maturity_time_stat_adj is still negative.
(eval_loan_info['maturity_time_stat_adj'] < 0).sum()

# continue

In [None]:
# Partition eval_loan_info by membership of its id in trainable_loan_ids.
is_trainable = eval_loan_info['id'].isin(trainable_loan_ids)
trainable_loans = eval_loan_info[is_trainable]
untrainable_loans = eval_loan_info[~is_trainable]

In [None]:
# make sure untrainable loans actually look untrainable:
# show up to 3 randomly sampled rows for each loan_status
untrainable_loans.groupby('loan_status',).apply(lambda x: x.sample(min(len(x), 3)))

In [None]:
# make sure untrainable loans actually look untrainable
# NOTE(review): this cell is identical to the previous one (it only draws a fresh random
# sample); possibly trainable_loans was meant to be inspected here -- confirm.
untrainable_loans.groupby('loan_status',).apply(lambda x: x.sample(min(len(x), 3)))

In [None]:
# trainable_eli is another name for the trainable rows of eval_loan_info;
# trainable_li is the matching subset of base_loan_info (same id filter).
trainable_eli = trainable_loans
trainable_li = base_loan_info[base_loan_info['id'].isin(trainable_loan_ids)]
# Print both shapes so the two subsets can be compared.
print(trainable_eli.shape, trainable_li.shape)

In [None]:
# Build the modelling frame: base loan features joined on id with target_strict and
# loan_status from the eval table (merge default, i.e. inner join).
df = trainable_li.merge(trainable_eli[['target_strict', 'id', 'loan_status']], on='id')

# split into time_series_cv splits

In [None]:
# Remove loan_status from the modelling frame.
# Rebinding df with errors='ignore' (instead of an in-place drop) keeps this cell
# re-runnable: a second execution no longer raises KeyError on the missing column.
df = df.drop(columns='loan_status', errors='ignore')

In [None]:
# get categorical feature indices for catboost
# Positional index in df of every object/datetime column.
obj_cols = df.select_dtypes(['object', 'datetime']).columns
categorical_features_indices = list(map(df.columns.get_loc, obj_cols))
obj_cols

In [None]:
# map dates and strings to numbers
# Every object/datetime column is replaced by ordinal codes: missing -> 0 and the sorted
# distinct values -> 1..n. pd.factorize does this in one vectorised pass, skips missing
# values when sorting (sorted() on NaN mixed with strings raises TypeError), and always
# produces an integer column. np.nan replaces np.NaN, which NumPy 2 removed.
date_cols = df.select_dtypes(['datetime', 'object']).columns
for col in date_cols:
    codes, uniques = pd.factorize(df[col], sort=True)
    # value -> code lookup, kept in the same shape as before (NaN key maps to 0)
    mapper = {np.nan: 0}
    mapper.update((val, i) for i, val in enumerate(uniques, 1))
    # factorize marks missing values with -1, so shifting by one sends them to 0
    df[col] = codes + 1
    if col == 'issue_d':
        # kept so issue_d codes can be translated back to dates later
        issue_d_mapper = mapper

In [None]:
# for scikit, make sure everything is number and nulls are filled
non_num_cols = df.select_dtypes(['object', 'datetime']).columns
# Fails loudly if any object/datetime column survived the encoding step.
assert len(non_num_cols) == 0
# Remaining nulls are filled with -9999 in place.
# NOTE(review): linear models will treat -9999 as a real magnitude -- confirm this is intended.
df.fillna(-9999, inplace=True)

In [None]:
# split out the test set, say the last 12 months
# issue_d holds ordinal codes here, so the cutoff is expressed in code units; the
# comparison is inclusive, so codes max-12 .. max all go to test.
# .copy() makes train/test independent frames, so later column drops do not trigger
# SettingWithCopyWarning or depend on df.
test_cutoff = df['issue_d'].max() - 12
test = df[df['issue_d'] >= test_cutoff].copy()
train = df[df['issue_d'] < test_cutoff].copy()

In [None]:
# Size and first rows of the test split.
print(test.shape)
test.head()

In [None]:
# Size and first rows of the train split.
print(train.shape)
train.head()

In [None]:
# see if there's any difference in pct defaulted from train and test
# (prints the test rate first, then the train rate)
print(test['target_strict'].sum()/len(test), train['target_strict'].sum()/len(train))

# The test set (the most recent loans) has a slightly higher percentage of defaulting loans. This makes sense: defaulting loans finish sooner, so they already qualify as trainable, whereas recently issued loans that will not default still need to reach term before they can be included.

In [None]:
# basic split for now; probably some leakage (loans issued in the same month can end up in both train and validation). Go with this for now and come back to make better custom splits later.
from sklearn.model_selection import TimeSeriesSplit

In [None]:
# 5-fold time-series splitter (each fold trains on earlier rows, validates on later ones).
# assumes the rows of train are in chronological order -- TODO confirm
tscv = TimeSeriesSplit(n_splits=5)

In [None]:
# Print the row positions of every fold.
# NOTE: after the loop, train_index / test_index hold the LAST fold's positions;
# the next cell uses them to build X_train / X_valid.
for train_index, test_index in tscv.split(train):
    print("TRAIN:", train_index, "TEST:", test_index)

In [None]:
# Separate features from the target without mutating train/test. The previous in-place
# drops removed target_strict from train/test themselves, so re-running the cell failed
# with KeyError on the first line.
y = train['target_strict']
X = train.drop(columns='target_strict')

# train_index / test_index are the positions left behind by the last fold of the
# tscv.split loop: earlier rows train the model, later rows validate it.
X_train = X.iloc[train_index,:]
y_train = y.iloc[train_index]

X_valid = X.iloc[test_index,:]
y_valid = y.iloc[test_index]

# Held-out test set (the most recent issue_d codes).
y_test = test['target_strict']
X_test = test.drop(columns='target_strict')

# train

In [None]:
# Count each class by its label (0 = non-default, 1 = default). Unpacking
# value_counts() relied on its most-frequent-first ordering, which silently swaps the
# two numbers if defaults ever outnumber non-defaults, and fails if a class is absent.
y_test_non_def = (y_test == 0).sum()
y_test_def = (y_test == 1).sum()
y_test_non_def, y_test_def

In [None]:
# Fit a logistic regression with default settings on the training fold and predict
# class labels for the held-out test set.
# NOTE(review): the same three lines are repeated in a later cell that also plots the result.
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)
pred = lr_model.predict(X_test)

In [None]:
def PlotConfusionMatrix(y_test,pred,y_test_legit=None,y_test_fraud=None):
    """Plot raw and row-normalised confusion matrices, then print a classification report.

    Parameters
    ----------
    y_test : array-like
        True class labels.
    pred : array-like
        Predicted class labels for the same samples.
    y_test_legit, y_test_fraud : number, optional
        Divisors for the first and second row of the confusion matrix, i.e. the number
        of true samples in each class. When omitted they are taken from the row sums
        of the confusion matrix, which are those counts.

    Returns
    -------
    None. The figure is shown and the report is printed.
    """
    cfn_matrix = confusion_matrix(y_test,pred)

    # Each row of a confusion matrix sums to the number of true samples of that class.
    row_totals = cfn_matrix.sum(axis=1)
    if y_test_legit is None:
        y_test_legit = row_totals[0]
    if y_test_fraud is None:
        y_test_fraud = row_totals[1]
    # Divide every row by its class size so each row shows per-class rates.
    norm_cfn_matrix = cfn_matrix / np.array([[y_test_legit],[y_test_fraud]], dtype=float)

    fig, (ax_counts, ax_rates) = plt.subplots(1, 2, figsize=(15,5))

    # fmt='d' prints exact integer counts; the default '.2g' turns large counts into
    # scientific notation.
    sns.heatmap(cfn_matrix,cmap='coolwarm_r',linewidths=1,annot=True,fmt='d',ax=ax_counts)
    ax_counts.set_title('Confusion Matrix')
    ax_counts.set_ylabel('Real Classes')
    ax_counts.set_xlabel('Predicted Classes')

    sns.heatmap(norm_cfn_matrix,cmap='coolwarm_r',linewidths=1,annot=True,ax=ax_rates)
    ax_rates.set_title('Normalized Confusion Matrix')
    ax_rates.set_ylabel('Real Classes')
    ax_rates.set_xlabel('Predicted Classes')
    plt.show()

    print('---Classification Report---')
    print(classification_report(y_test,pred))

In [None]:
%matplotlib inline
from mpl_toolkits.mplot3d import Axes3D

In [None]:
# Baseline: logistic regression with default settings (no class weighting), evaluated on the test set.
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)
pred = lr_model.predict(X_test)
PlotConfusionMatrix(y_test,pred,y_test_non_def,y_test_def)

In [None]:
# Same model with class_weight='balanced' (scikit-learn weights classes inversely to their frequency).
lr_model = LogisticRegression(class_weight='balanced')
lr_model.fit(X_train, y_train)
pred = lr_model.predict(X_test)
PlotConfusionMatrix(y_test,pred,y_test_non_def,y_test_def)

In [None]:
# Sweep explicit weights for class 1 (default) while class 0 keeps weight 1, refitting
# and plotting the confusion matrices for each weight.
for w in [1,5,10,100,500,1000]:
    print('---Weight of {} for Default class---'.format(w))
    lr_model = LogisticRegression(class_weight={0:1,1:w})
    lr_model.fit(X_train, y_train)
    pred = lr_model.predict(X_test)
    PlotConfusionMatrix(y_test,pred,y_test_non_def,y_test_def)

In [None]:
# Compare precision-recall and ROC curves on the test set for a range of
# default-class weights. Left panel: PR curve; right panel: ROC curve.
fig = plt.figure(figsize=(15,8))
ax1 = fig.add_subplot(1,2,1)
ax1.set_xlim([-0.05,1.05])
ax1.set_ylim([-0.05,1.05])
ax1.set_xlabel('Recall')
ax1.set_ylabel('Precision')
ax1.set_title('PR Curve')

ax2 = fig.add_subplot(1,2,2)
ax2.set_xlim([-0.05,1.05])
ax2.set_ylim([-0.05,1.05])
ax2.set_xlabel('False Positive Rate')
ax2.set_ylabel('True Positive Rate')
ax2.set_title('ROC Curve')

# zip stops at the shorter sequence, so only the first seven colour codes are used.
for w,k in zip([1,5,10,20,50,100,10000],'bgrcmykw'):
    lr_model = LogisticRegression(class_weight={0:1,1:w})
    lr_model.fit(X_train,y_train)
    # probability of class 1 (default)
    pred_prob = lr_model.predict_proba(X_test)[:,1]

    p,r,_ = precision_recall_curve(y_test,pred_prob)
    # roc_curve returns (fpr, tpr, thresholds) in that order; the names now match,
    # so the x/y arguments below read the same way as the axis labels.
    fpr,tpr,_ = roc_curve(y_test,pred_prob)

    ax1.plot(r,p,c=k,label=w)
    ax2.plot(fpr,tpr,c=k,label=w)
ax1.legend(loc='lower left')
ax2.legend(loc='lower left')

plt.show()

In [None]:
# make a custom loss function that puts more emphasis on misclassifying defaulting loans
class CustomObjective(object):
    '''
    Class-weighted log-loss objective exposing calc_ders_range, the per-object
    derivative hook this notebook defines for a custom training objective.

    Samples with a positive target are weighted by ``pos_weight`` (default 4, the factor
    that was previously hard-coded); all other samples have weight 1.

    for derivations
    https://stats.stackexchange.com/questions/231220/how-to-compute-the-gradient-and-hessian-of-logarithmic-loss-question-is-based
    https://math.stackexchange.com/questions/78575/derivative-of-sigmoid-function-sigma-x-frac11e-x
    https://socratic.org/questions/what-is-the-derivative-of-e-x-8
    '''

    def __init__(self, pos_weight=4.0):
        # Multiplier applied to both derivatives of positive-target samples.
        self.pos_weight = pos_weight

    def calc_ders_range(self, approxes, targets, weights):
        '''
        Return a list of (first derivative, second derivative) pairs, one per sample.

        approxes : sequence of raw (pre-sigmoid) predictions
        targets  : sequence of labels; values > 0.0 are treated as the positive class
        weights  : optional per-sample weights (same length as approxes) or None

        The derivatives are those of the weighted log-likelihood (the negative of
        log-loss) with respect to the raw prediction:
            der1 = w * (t - p),   der2 = -w * p * (1 - p)
        which is why der2 is never positive. The class weight now scales der2 as well
        as der1, so the two derivatives describe the same function.
        '''
        assert len(approxes) == len(targets)
        if weights is not None:
            assert len(weights) == len(approxes)

        result = []
        for index in range(len(targets)):
            approx = approxes[index]
            # Numerically stable sigmoid: only ever exponentiate a non-positive number,
            # so large |approx| cannot overflow to inf and produce NaN.
            if approx >= 0:
                p = 1.0 / (1.0 + np.exp(-approx))
            else:
                e = np.exp(approx)
                p = e / (1.0 + e)

            is_positive = targets[index] > 0.0
            class_weight = self.pos_weight if is_positive else 1.0
            target = 1.0 if is_positive else 0.0

            der1 = class_weight * (target - p)
            der2 = -class_weight * p * (1.0 - p)

            if weights is not None:
                der1 *= weights[index]
                der2 *= weights[index]

            result.append((der1, der2))
        return result

In [None]:
# Keyword arguments for the CatBoostClassifier created in the next cell.
# No loss_function is set here (the entry is commented out).
# NOTE(review): eval_metric / use_best_model select on Accuracy, which is dominated by
# the majority class on imbalanced data -- consider one of the custom_metric entries instead.
params = {
    'iterations': 1000,
#     'one_hot_max_size': 45,
#     'learning_rate': 0.01,
    'has_time': True,
    'eval_metric': 'Accuracy',
    'random_seed': 42,
    'logging_level': 'Silent',
    'use_best_model': True,
    'task_type': 'GPU',
    'boosting_type': 'Ordered',
#     'loss_function': 'Log',
    'custom_metric': ['F1', 'Precision', 'Recall', 'Accuracy', 'AUC'],
    'od_type': 'Iter',
    'od_wait': 300,
}

In [None]:
# Instantiate the classifier from the params dict.
model = CatBoostClassifier(**params)

In [None]:
# Train on the last time-series fold and evaluate on its validation rows.
# The trailing ';' suppresses the cell's return-value output.
model.fit(
    X_train, y_train,
    cat_features=categorical_features_indices,
    eval_set=(X_valid, y_valid),
#     logging_level='Verbose',  # you can uncomment this for text output
    plot=True
);

In [None]:
import hyperopt

def hyperopt_objective(params):
    """Score one hyperparameter set for hyperopt.

    Builds a CatBoostClassifier from ``params``, prints its parameters, runs a
    'TimeSeries' cross-validation over the full training data (X, y), and returns
    1 minus the best mean test accuracy, because hyperopt minimises its objective.
    """
    estimator = CatBoostClassifier(**params)
    estimator_params = estimator.get_params()
    print(estimator_params)

    training_pool = Pool(X, y, cat_features=categorical_features_indices)
    cv_results = cv(
        training_pool,
        estimator_params,
        type='TimeSeries',
        plot=True,
    )

    # as hyperopt minimises
    return 1 - np.max(cv_results['test-Accuracy-mean'])

In [None]:
from numpy.random import RandomState

# Search space handed to hyperopt: l2_leaf_reg and learning_rate are sampled, every
# other entry is a fixed value passed through unchanged to hyperopt_objective.
params_space = {
    'l2_leaf_reg': hyperopt.hp.qloguniform('l2_leaf_reg', 0, 2, 1),
    'learning_rate': hyperopt.hp.uniform('learning_rate', 1e-3, 5e-1),
#     'iterations': 1000,
#     'one_hot_max_size': 45,
#     'learning_rate': 0.01,
    'has_time': True,
    'eval_metric': 'Accuracy',
    'random_seed': 42,
    'logging_level': 'Silent',
    'use_best_model': True,
    'task_type': 'GPU',
    'boosting_type': 'Ordered',
    'loss_function': 'Logloss',
#     'custom_metric': ['F1', 'Precision', 'Recall', 'Accuracy', 'AUC'],
#     'od_type': 'Iter',
#     'od_wait': 300,
}

# Trials records every evaluation made by fmin.
trials = hyperopt.Trials()

# Run 50 TPE-guided evaluations of hyperopt_objective with a fixed random state.
# NOTE(review): some hyperopt releases expect a numpy Generator for rstate rather than
# a RandomState -- confirm against the installed version.
best = hyperopt.fmin(
    hyperopt_objective,
    space=params_space,
    algo=hyperopt.tpe.suggest,
    max_evals=50,
    trials=trials,
    rstate=RandomState(123)
)

print(best)

In [None]:
# IPython `??` introspection: shows the source of cv. Development leftover -- it
# produces no result and can be removed from the finished notebook.
cv??