In this workbook, I assume that I have cleaned the data for 2017 and for 2014-15-16. Instead of splitting the data into training and testing set and may suffer from class imbalances, I decide to use data from 2014 through 2016 as the training set and data for 2017 as the testing set. If this approach is successful, I can validate it by further backtesting:
- Train data on 2014, test on 2015
- Train data on 2014-15, train on 2016

In [1]:
import os 
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import datetime
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn import metrics

from sklearn.externals import joblib
salary_limit = 7e4

os.chdir(r"C:\Users\nkieu\Desktop\Python\Loan data")
accepted2017_clean = pd.read_csv(r".\input\accepted2017_clean.csv", encoding = "ISO-8859-1")
accepted_train_clean = pd.read_csv(r".\input\acceptedtrain_clean.csv", encoding = "ISO-8859-1")
# accepted_train_copy = accepted_train.copy()

In [2]:
print("Training size", accepted_train[accepted_train.loan_status != 'Current'].shape)
print("Testing size", accepted2017[accepted2017.loan_status != 'Current'].shape)

NameError: name 'accepted_train' is not defined

The cleaned dataset are created using the following functions:

In [2]:
def getAUC(model, X_train, y_train, X_test, y_test):
    res = [0.0, 0.0]
    y_train_score = model.predict_proba(X_train)[:, 1]
    res[0] = metrics.roc_auc_score(y_train, y_train_score)
    print("In sample", res[0])
    
    y_test_score = model.predict_proba(X_test)[:, 1]
    res[1] = metrics.roc_auc_score(y_test, y_test_score)
    print("Out of sample", res[1])
    
    return res

def GetAccepted2017():
    accepted_original = pd.read_csv('accepted2017.csv', encoding = "ISO-8859-1")
    
    accepted = accepted_original.drop(['title', 'funded_amnt'], axis = 1)
    # accepted = accepted.drop(['title', 'funded_amnt', 'funded_amnt_inv'], axis = 1)
    
    # linear combination of other columns
    accepted = accepted.drop(['out_prncp_inv','total_rec_prncp','total_pymnt_inv'], axis = 1)
    
    # repeated information
    accepted = accepted.drop(["fico_range_low", "last_fico_range_low"], axis = 1)
    accepted = accepted.drop(['addr_state', 'initial_list_status', 'pymnt_plan'], axis = 1)
    accepted = accepted.drop(['application_type', 'hardship_flag', 'disbursement_method','debt_settlement_flag','sub_grade'], axis = 1)
    accepted = accepted.drop(['zip_code', 'id','policy_code','tax_liens', 'tax_liens'], axis = 1)
    
    ##############
    #
    # Beginning transformation
    #
    ##############
    accepted.issue_d = pd.to_datetime(accepted.issue_d)
    accepted.earliest_cr_line = pd.to_datetime(accepted.earliest_cr_line)
    
    accepted['issue_yr'] = accepted.issue_d.dt.year
    
    accepted['earliest_cr_yr'] = pd.to_datetime(accepted['earliest_cr_line']).dt.year
    accepted = accepted.drop('earliest_cr_line', axis = 1)
    

    accepted.term = accepted.term.apply(str)
    accepted['term'] = accepted['term'].apply(lambda x: x.strip().split(" ")[0])
    
    ### Later: we do not even use emp_length
    accepted.emp_length[accepted.emp_length == '10+ years'] = '10 years'
    accepted.emp_length[accepted.emp_length == '< 1 year'] = '0 year'
    accepted.emp_length = accepted.emp_length.apply(lambda x: int(x.split(' ')[0]))
    
    accepted.emp_title = accepted.emp_title.apply(lambda x: x.strip().lower())
    
    accepted.emp_title = accepted.emp_title.replace(['registered nurse', 'rn'], 'nurse')
    accepted.home_ownership = accepted.home_ownership.replace(['ANY', 'NONE'], 'RENT')
    accepted = accepted.drop("avg_cur_bal", axis = 1)
    
    accepted['target'] = 1
    accepted.loc[(accepted.loan_status == 'Current') |(accepted.loan_status == 'Fully Paid'),
          'target'] = 0
    
    leq1mil = accepted['annual_inc'] <= 1e6
    accepted = accepted[leq1mil]
    accepted = accepted[accepted.dti < 100.0]
    
    return accepted

def GetLog(dataframe, header, drop = True):
    if header not in dataframe.columns.values:
        print(header, "not found in data frame")
        return dataframe
    
    dataframe.loc[:, "log_"+header] = np.log(dataframe.loc[:, header])
    if drop:
        return dataframe.drop([header], axis = 1)
    else:
        return dataframe
    
def GetXY(train, features):
    # To leave behind Fully paid / Default
    train = train.loc[(train.loan_status != 'Current') | (train.loan_status != 'In Grace Period'),
                      features]

    categorical_features = ['term', 'home_ownership', 'verification_status']
    
    for cat_feature in categorical_features:
        if cat_feature in train.columns:
            train = pd.get_dummies(train, prefix = [cat_feature], columns = [cat_feature], drop_first = True)
            
    X = train.drop(['loan_status', 'target'], axis = 1)
    y = train.target
    
    return X, y

Let's explore our testing data to see the rough relationship between variables.

In [3]:
accepted2017.columns

NameError: name 'accepted2017' is not defined

Generally, *bc* stands for bank cards. *il* stands for installment.
*num_actv_bc_tl* is number of active bank card accounts.

I calculate a few more variables:
- *Balance_annual_inc* is the ratio of current balance over annual income, similarly for *install_annual*.
- *install_loan_ratio* is the ratio of monthly installment / loan amount. This should be very similar to rate, and therefore to the grade of the loan
- *disposable_inc* is the amount of disposable income each month after mortgage payment and payment for Lending Club loans
- *dti_new* is *disposable_inc* \* 12 / *annual_inc*

I plan to approach using a series of tree-model, which does not handle well the difference and interaction between features. 

There are a number of variables that may be considered look-ahead here, which are of no use in practice. I manually filtered for these variables and come up a list of potential variables:

In [3]:
selected_feature = ['loan_amnt', 'term', 'int_rate', 'installment'
            # , 'grade' - grade is dropped because it is highly correlated with int_rate
            , 'emp_length', 'home_ownership', 'annual_inc', 'verification_status'
            # ,'issue_d'
            # , 'purpose'
            ,'dti', 'delinq_2yrs'
            ,'fico_range_high', 'inq_last_6mths', 'open_acc', 'pub_rec', 'revol_bal'
            ,'revol_util', 'total_acc'
            #, 'out_prncp', 'total_pymnt', 'total_rec_int','total_rec_late_fee', 'recoveries', 'collection_recovery_fee'
            #,'last_pymnt_d', 'last_pymnt_amnt', 'last_credit_pull_d',
            #,'last_fico_range_high', 'collections_12_mths_ex_med', 'acc_now_delinq'
            #,'tot_coll_amt', 
            ,'tot_cur_bal', 'total_rev_hi_lim','acc_open_past_24mths', 'bc_open_to_buy', 'bc_util'
            ,'chargeoff_within_12_mths', 'delinq_amnt', 'mo_sin_old_il_acct'
            ,'mo_sin_old_rev_tl_op', 'mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl'
            ,'mort_acc', 'mths_since_recent_bc', 'num_accts_ever_120_pd'
            ,'num_actv_bc_tl', 'num_actv_rev_tl', 'num_bc_sats', 'num_bc_tl'
            ,'num_il_tl', 'num_op_rev_tl', 'num_rev_accts', 'num_rev_tl_bal_gt_0'
            ,'num_sats', 'num_tl_120dpd_2m', 'num_tl_30dpd', 'num_tl_90g_dpd_24m'
            ,'num_tl_op_past_12m', 'pct_tl_nvr_dlq', 'percent_bc_gt_75'
            ,'pub_rec_bankruptcies', 'tot_hi_cred_lim'
            ,'total_bal_ex_mort'
            ,'total_bc_limit', 'total_il_high_credit_limit'
            #,'issue_yr'
            #,'earliest_cr_yr'
            ,'loan_status','target'
            ,'cr_yr_before_loan'
            ,'log_annual_inc', 'log_installment'
            ,'disposable_inc', 'install_loan_ratio', 'dti_new', 'balance_annual_inc'
            ,'install_annual']

In [4]:
def Transform_2(accepted_train):
    accepted_train = accepted_train.loc[accepted_train.loan_status != 'Current']
    
    # If it is debt consolidation then I assume that the borrower will save some money (half of the monthly payment)
    # else assume that the borrow will need to pay extra, which decrease his disposable income
    accepted_train.loc[:, 'temp'] = np.where((accepted_train.purpose == 'debt_consolidation') | (accepted_train.purpose == 'credit_card'),0.5, -1.0)
    accepted_train.loc[:, 'disposable_inc'] = (1 - accepted_train['dti']/100)*accepted_train['annual_inc']/12 + accepted_train['temp'] * accepted_train['installment']
    
    accepted_train.loc[:, 'dti_new']= accepted_train['disposable_inc']*12/ accepted_train['annual_inc']

    accepted_train.loc[:, 'earliest_cr_yr'] = pd.to_datetime(accepted_train.earliest_cr_line).dt.year
    accepted_train.loc[:, 'cr_yr_before_loan'] = accepted_train['issue_yr'] - accepted_train['earliest_cr_yr']
    return accepted_train.drop('temp', axis = 1)

In [5]:
accepted_train = Transform_2(accepted_train_clean)
X, y = GetXY(accepted_train, selected_feature)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [6]:
accepted2017 = accepted2017_clean.loc[accepted2017_clean.loan_status != 'Current', :]
accepted2017.loc[:, 'temp'] = np.where((accepted2017.purpose == 'debt_consolidation') | (accepted2017.purpose == 'credit_card'),0.5, -1.0)
accepted2017.loc[:, 'disposable_inc'] = (1 - accepted2017['dti'] / 100)*accepted2017['annual_inc']/12 + accepted2017['temp']*accepted2017['installment']

accepted2017.loc[:, 'dti_new']= accepted2017['disposable_inc']*12/ accepted2017['annual_inc']

# accepted2017['earliest_cr_yr'] = pd.to_datetime(accepted2017.earliest_cr_line).dt.year
accepted2017.loc[:, 'cr_yr_before_loan'] = accepted2017['issue_yr'] - accepted2017['earliest_cr_yr']
accepted2017 = accepted2017.drop('temp', axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [7]:
X_test, y_test = GetXY(accepted2017, selected_feature)

In [8]:
pd.options.display.float_format = '{:,.2f}'.format
print("Shape of training is", X.shape)
X.head().T

Shape of training is (508676, 61)


Unnamed: 0,0,1,2,4,5
loan_amnt,15000.00,10400.00,21425.00,7650.00,9600.00
int_rate,12.39,6.99,15.59,13.66,13.66
installment,336.64,321.08,516.36,260.20,326.53
emp_length,10.00,8.00,6.00,0.00,10.00
annual_inc,78000.00,58000.00,63800.00,50000.00,69000.00
dti,12.03,14.92,18.49,34.81,25.81
delinq_2yrs,0.00,0.00,0.00,0.00,0.00
fico_range_high,754.00,714.00,689.00,689.00,684.00
inq_last_6mths,0.00,2.00,0.00,1.00,0.00
open_acc,6.00,17.00,10.00,11.00,12.00


Let's test a GBM model on the training data:

In [31]:
gbm = GradientBoostingClassifier(max_depth = 6, n_estimators= 400, max_features = 0.3)
gbm.fit(X, y)
y_score = gbm.predict_proba(X)[:,1]

# joblib.dump(gbm, 'gbm_0402.pkl')
metrics.roc_auc_score(y, y_score)   # 0.77

0.77623497039370504

In [40]:
y_score_test = gbm.predict_proba(X_test)[:,1]
metrics.roc_auc_score(y_test, y_score_test)   # 0.69

0.69535824948250335

Let's try a Random Forest Classifier and compare the feature importances between these 2 approaches to zero in on the important features.

In [49]:
rfc = RandomForestClassifier(n_estimators= 600, max_depth = 8)
rfc.fit(X, y)
y_score_rfc = rfc.predict_proba(X)[:,1]
metrics.roc_auc_score(y, y_score_rfc)

0.72440799242850162

In [50]:
y_score_test_rfc = rfc.predict_proba(X_test)[:,1]
metrics.roc_auc_score(y_test, y_score_test_rfc) 

0.67542684914564854

In [18]:
pd.options.display.float_format = '{:,.2f}'.format
feature_imp = pd.DataFrame({'name': X.columns, 'imp': gbm.feature_importances_}).sort_values(by = 'imp', ascending = False)
feature_imp['mult_gbm'] = feature_imp.imp.max() / feature_imp['imp']

feature_imp['mult_rfc'] = rfc.feature_importances_.max()/rfc.feature_importances_

#rfc_imp = pd.DataFrame({'name': X.columns, 'imp': rfc.feature_importances_}).sort_values(by = 'imp', ascending = False)
#rfc_imp['mult'] = rfc_imp.imp.max() / rfc_imp['imp']
feature_imp


Unnamed: 0,imp,name,mult_gbm,mult_rfc
52,0.06,install_loan_ratio,1.00,20.31
1,0.06,int_rate,1.01,1.00
55,0.03,install_annual,1.74,27.63
53,0.03,dti_new,1.86,510.86
21,0.03,mo_sin_old_il_acct,1.89,42.07
22,0.03,mo_sin_old_rev_tl_op,1.93,5.92
11,0.03,revol_bal,1.95,334.03
54,0.03,balance_annual_inc,2.02,4.79
51,0.03,disposable_inc,2.05,188.94
44,0.03,tot_hi_cred_lim,2.12,188.88


In [53]:
# Dump all the results
from sklearn.externals import joblib
date_txt = '0402'
joblib.dump(gbm, "gbm_"+ date_txt + '.pkl')
joblib.dump(rfc, "rfc_"+ date_txt + '.pkl')
feature_imp.to_csv("Feature importance 0420.csv", index = False)

# Further testing
Remove some features due to too low values of feature importance

In [9]:
# gbm = joblib.load(r'C:\Users\nkieu\Desktop\Python\Loan data\2018-04-02\gbm_0402.pkl')
# rfc = joblib.load(r'C:\Users\nkieu\Desktop\Python\Loan data\2018-04-02\rfc_0402.pkl')

feature_imp = pd.read_csv(r"C:\Users\nkieu\Desktop\Python\Loan data\2018-04-02\Feature importance 0420.csv")
xx = feature_imp.loc[feature_imp.mult_gbm < 6, ['name','mult_gbm']]
X_reduced = X[xx.name.values] # shape is around 42
X_test_reduced = X_test[xx.name.values]

X_reduced.shape
# gbm_2 = GradientBoostingClassifier(max_depth = 6, n_estimators= 400, max_features = 0.3)
# gbm_2.fit(X_test, y)

# date_txt = '0409'
# joblib.dump(gbm_2, "gbm_" + date_txt + '.pkl')


(508676, 42)

In [17]:
gbm_2 = joblib.load(r'C:\Users\nkieu\Desktop\Python\Loan data\2018-04-09\gbm_0409.pkl')

In [24]:
# Try logistic regression
from sklearn import linear_model
logistic = linear_model.LogisticRegression(penalty='l1', C=0.5)
logistic.fit(X_reduced, y)

joblib.dump(logistic, "logistic"+'0410'+ '.pkl')

LogisticRegression(C=0.5, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [10]:
from sklearn import linear_model

logistic_l2 = linear_model.LogisticRegression(penalty='l2')
logistic_l2.fit(X_reduced, y)
y_score = logistic_l2.predict(X_reduced)
print("In sample", metrics.roc_auc_score(y, y_score)) # 0.51

y_test_score = logistic_l2.predict(X_test_reduced)
print("Out of sample", metrics.roc_auc_score(y_test, y_test_score)) #0.50


In sample 0.511529222645
Out of sample 0.506961365992


In [11]:
logistic = joblib.load(r"C:\Users\nkieu\Desktop\Python\Loan data\logistic0410.pkl")

y_score = logistic.predict(X_reduced)
print("In sample", metrics.roc_auc_score(y, y_score)) # 0.56

y_test_score = logistic.predict(X_test_reduced)
print("Out of sample", metrics.roc_auc_score(y_test, y_test_score)) # 0.55


In sample 0.560843830239
Out of sample 0.555258767477


Let's attempt to overfit the model and see whether there is any room to improve performance on the testing data set.

In [None]:
### 04-11- increase depth of trees
gbm_4 = GradientBoostingClassifier(max_depth = 10, n_estimators= 200,
                                   subsample = 0.8, max_features = 0.5)
gbm_4.fit(X_reduced, y)
joblib.dump(gbm_4, "gbm4_" + '0411'+ '.pkl')

### Alternatively, RandomUnderSampler
y_score_train = gbm_4.predict_proba(X_reduced)[:,1]
metrics.roc_auc_score(y, y_score_train)   # 0.88

y_score_test = gbm_4.predict_proba(X_test_reduced)[:, 1]
metrics.roc_auc_score(y_test, y_score_test)   # 0.68


In [62]:
from imblearn.under_sampling import RandomUnderSampler
sampler = RandomUnderSampler(random_state=1234)

X_res, y_res = sampler.fit_sample(X_reduced, y)
gbm_5 = GradientBoostingClassifier(max_depth = 6, n_estimators= 200,subsample = 0.8, max_features = 0.3)
gbm_5.fit(X_res, y_res)
joblib.dump(gbm_5, "gbm5_" + '0411'+ '.pkl')

res = getAUC(gbm_5, X_res, y_res, X_test_reduced, y_test)

In sample 0.748723525732
In sample 0.672409136889


In [None]:
# Using all samples 
appended = X_reduced.append(X_test_reduced, ignore_index = True)
y_appended = y.append(y_test)

gbm_appended = GradientBoostingClassifier(max_depth = 8, n_estimators= 300,
                                   subsample = 0.8, max_features = 0.3,
                                   learning_rate = 0.8)

gbm_appended.fit(appended, y_appended)
appended_score = gbm_appended.predict_proba(appended)[:, 1]
metrics.roc_auc_score(y_appended, appended_score) # 0.91

joblib.dump(gbm_appended, "gbm_alldata_" + '0411' + '.pkl')

However, if we reduce max_depth = 5, our in sample AUC is reduced to 0.78.

In the next experiment, I attempt a feature engineering method from Airbnb [here](https://medium.com/airbnb-engineering/overcoming-missing-values-in-a-random-forest-classifier-7b1fc1fc03ba) and transform all features into 0 and 1 scale. Luckily, my reduced set of features are all numerical.

In [103]:
X_reshaped_target = X_reduced[y == 1] # Shape: 125819, 42

def Transform_to_CDF(data_, compare):
    data_sorted = data_.sort_values()
    data = data_sorted.values
    compare = compare.sort_values().values
    output = [0] * len(data)
    idx_data = 0
    idx_compare = 0
    loop = True
    
    tmp = 0
    
    while loop:
        if idx_compare == len(compare):
            if idx_data < len(data):
                for i in range(idx_data, len(data)):
                    output[i] = tmp    
            break
        if idx_data == len(data):
            break
        if data[idx_data] < compare[idx_compare]:
            output[idx_data] = tmp
            idx_data += 1
        else:
            tmp += 1
            idx_compare += 1
    
    output = pd.Series(output, index = data_sorted.index)
    return output[data_.index]

X_reshaped = X_reduced.copy()
for i in range(X_reduced.shape[1]):
    X_reshaped.iloc[:, i] = Transform_to_CDF(X_reduced.iloc[:, i], X_reshaped_target.iloc[:, i])
X_reshaped = X_reshaped / X_reshaped_target.shape[0]


X_test_reshaped = X_test_reduced.copy()
for i in range(X_test_reduced.shape[1]):
    X_test_reshaped.iloc[:, i] = Transform_to_CDF(X_test_reduced.iloc[:, i], X_reshaped_target.iloc[:, i])
X_test_reshaped = X_test_reshaped / X_reshaped_target.shape[0]

gbm_feature_trans = GradientBoostingClassifier(max_depth = 5, n_estimators= 300,
                                   subsample = 0.8, max_features = 0.5)
gbm_feature_trans.fit(X_reshaped, y)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=5,
              max_features=0.5, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=300,
              presort='auto', random_state=None, subsample=0.8, verbose=0,
              warm_start=False)

In [104]:
res = getAUC(gbm_feature_trans, X_reshaped, y, X_test_reshaped, y_test)

In sample 0.750952999628
In sample 0.690618386132


Now that all the features are within 0 and 1, we can apply SMOTE

In [None]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=1234)
X_smote, y_smote = smote.fit_sample(X_reshaped, y)

gbm_smote = GradientBoostingClassifier(max_depth = 5, n_estimators= 300,
                                   subsample = 0.8, maxv_features = 0.2)
gbm_smote.fit(X_smote, y_smote)
joblib.dump(gbm_smote, "gbm_reshaped_smote_0417.pkl")

In [70]:
res = getAUC(gbm_smote, X_reshaped, y, X_test_reshaped, y_test)

In sample 0.735191662897
In sample 0.681365550218


In [105]:
from sklearn import linear_model

logistic1 = linear_model.LogisticRegression(penalty='l1', C=0.5)
logistic1.fit(X_reshaped, y)
res1 = getAUC(logistic1, X_reshaped, y, X_test_reshaped, y_test)
print("")
logistic2 = linear_model.LogisticRegression()
logistic2.fit(X_reshaped, y)
res2 = getAUC(logistic2, X_reshaped, y, X_test_reshaped, y_test)

In sample 0.718069853128
In sample 0.678938536834

In sample 0.718098987798
In sample 0.678927442437


In [108]:
from imblearn.under_sampling import RandomUnderSampler
sampler = RandomUnderSampler(random_state=1234)

X_reshaped_RUS, y_reshaped_RUS = sampler.fit_sample(X_reshaped, y)
gbm_reshaped_RUS = GradientBoostingClassifier(max_depth = 5, n_estimators= 300,subsample = 0.8, max_features = 0.3)
gbm_reshaped_RUS.fit(X_reshaped_RUS, y_reshaped_RUS)
#joblib.dump(gbm_reshaped_RUS, "gbm5_" + '0411'+ '.pkl')
res = getAUC(gbm_reshaped_RUS, X_reshaped, y, X_test_reshaped, y_test)

In sample 0.747628548369
In sample 0.688978426893


In [112]:
# Increase the depth of the GBM on this new, reshaped data
# We can increase the AUC a bit, perhaps not worth the problem
gbm = GradientBoostingClassifier(max_depth = 8, n_estimators= 200, max_features = 0.3)
gbm.fit(X_reshaped, y)
getAUC(gbm, X_reshaped, y, X_test_reshaped, y_test)

In sample 0.800087222605
In sample 0.690271903969


[0.80008722260458209, 0.69027190396855909]

# Mas-o-Menos
Mas-o-Menos is a very simple classification techniques by researchers at Harvard University. I read about this technique in the paper 50 years of Data Science as  an illustration of a simple, effective techique that trumps many other complicated algorithms in practice. Not surprisingly, this simple technique performs quite well for this dataset.

In [135]:
from sklearn.base import BaseEstimator

class MasOrMenos(BaseEstimator):
    def __init__(self):
        self.NumFeatures = 0
        self.coeffs = None
        self.model = LogisticRegression()
        
    def fit(self, X_train, y_train):
        self.NumFeatures = X_train.shape[1]
        sqrtNumFeatures = np.sqrt(self.NumFeatures)
        self.coeffs = [0.0] * self.NumFeatures
        for i in range(self.NumFeatures):
            self.model.fit(X_train.iloc[:, i:(i+1)], y_train)
            self.coeffs[i] = np.sign(self.model.coef_[0]) / sqrtNumFeatures
        
        self.coeffs = np.array(self.coeffs).reshape(-1,1)
    
    def sigmoid(self, x):
        return (1/ (1 + np.exp(-x)))

    def predict_proba(self, X_test):
        tmp = np.zeros((X_test.shape[0], 2))
        tmp[:, 1] = self.sigmoid(np.matmul(X_test, self.coeffs))[:,0]
    
        return tmp
    
    def predict(self, X_test):
        return self.predict_proba(X_test)[:, 1]
        
from sklearn.preprocessing import StandardScaler
scale = StandardScaler()

X_scaled = pd.DataFrame(scale.fit_transform(X_reduced))
X_test_scaled = scale.transform(X_test_reduced)

mm = MasOrMenos()    
mm.fit(X_scaled, y)

test = mm.predict_proba(X_test_scaled)
getAUC(mm, X_scaled, y, X_test_scaled, y_test)

In sample 0.67127240112
Out of sample 0.654865624042


[0.67127240111986919, 0.65486562404220094]