In [2]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import BaggingClassifier, BaggingRegressor, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedKFold
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides pandas/scikit-learn deprecation and
# chained-assignment warnings — confirm that blanket silencing is intended.
warnings.filterwarnings('ignore')
# Seed NumPy's global RNG so the np.random.randint draws used for the
# bootstrap below are repeatable on a fresh kernel.
np.random.seed(0)

def get_bootstrap_samples(data, count):
    """Draw `count` bootstrap resamples of `data`.

    Indices are sampled uniformly with replacement from NumPy's global RNG,
    so the result depends on the current `np.random` state.

    Parameters
    ----------
    data : array-like indexable by a 2-D integer index array
        Observations to resample from.
    count : int
        Number of resamples to draw.

    Returns
    -------
    The result of indexing `data` with a ``(count, len(data))`` index matrix;
    each row is one resample of the same length as `data`.
    """
    n_obs = len(data)
    # One row of indices per resample, each row as long as the original data.
    resample_idx = np.random.randint(0, n_obs, (count, n_obs))
    return data[resample_idx]

def stat_intervals(stat, alpha):
    """Return the two-sided percentile interval of `stat` at level `alpha`.

    Parameters
    ----------
    stat : array-like
        Sample of a statistic (e.g. bootstrap means).
    alpha : float
        Total tail mass to exclude; ``alpha / 2`` is cut from each side.

    Returns
    -------
    numpy.ndarray
        ``[lower, upper]`` percentiles of `stat`.
    """
    lower_pct = 100 * alpha / 2.
    upper_pct = 100 * (1 - alpha / 2.)
    return np.percentile(stat, [lower_pct, upper_pct])

def delete_nan(data):
    """Return a copy of `data` with missing values replaced by column medians.

    The input frame is left untouched: filling happens on a copy, so calling
    this on a slice of another DataFrame neither mutates the caller's object
    nor relies on chained assignment.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns support ``.median()``.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns; each NaN is replaced by the median
        of its own column.
    """
    filled = data.copy()
    for col in filled.columns:
        filled[col] = filled[col].fillna(filled[col].median())
    return filled

# Read the semicolon-separated credit-scoring sample and print its schema.
data = pd.read_csv('Data/credit_scoring_sample.csv', sep=';')
data.info()

# Everything except the target column is used as a feature.
target = 'SeriousDlqin2yrs'
features = [col for col in data.columns.values if col != target]
y = data[target]
# Median-impute missing feature values before modelling.
X = delete_nan(data[features])

# Class balance of the target, shown as fractions of all rows.
print('Distribution of target')
data[target].value_counts() / data.shape[0]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45063 entries, 0 to 45062
Data columns (total 8 columns):
SeriousDlqin2yrs                        45063 non-null int64
age                                     45063 non-null int64
NumberOfTime30-59DaysPastDueNotWorse    45063 non-null int64
DebtRatio                               45063 non-null float64
NumberOfTimes90DaysLate                 45063 non-null int64
NumberOfTime60-89DaysPastDueNotWorse    45063 non-null int64
MonthlyIncome                           36420 non-null float64
NumberOfDependents                      43946 non-null float64
dtypes: float64(3), int64(5)
memory usage: 2.8 MB
Distribution of target


0    0.777511
1    0.222489
Name: SeriousDlqin2yrs, dtype: float64

In [3]:
# Ages of the rows whose target equals 1.
target_1 = data.loc[data[target] == 1, 'age'].values
# Mean age of each of 1000 bootstrap resamples (resamples are consumed
# directly rather than stored, as the index matrix is large).
mean_scores = list(map(np.mean, get_bootstrap_samples(target_1, 1000)))
# alpha = 0.1 -> 5th and 95th percentiles of the bootstrap means.
stat_intervals(mean_scores, 0.1)

array([45.71379414, 46.12700479])

In [4]:
# Class-weighted logistic regression tuned over C = 10^-5 ... 10^1.
lr = LogisticRegression(class_weight='balanced', random_state=5)
parameters = {'C': [10**exponent for exponent in range(-5, 2)]}

# Shared shuffled 5-fold stratified splitter; later searches reuse `skf`.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=5)

gcv = GridSearchCV(
    estimator=lr,
    param_grid=parameters,
    scoring='roc_auc',
    cv=skf,
    verbose=1,
)
gcv.fit(X, y)
gcv.best_params_

Fitting 5 folds for each of 7 candidates, totalling 35 fits


[Parallel(n_jobs=1)]: Done  35 out of  35 | elapsed:    5.5s finished


{'C': 0.001}

In [5]:
# Std of the test ROC AUC across folds for the best candidate, scaled by 100.
100 * gcv.cv_results_['std_test_score'][gcv.best_index_]

0.6386161024680357

In [6]:
# Refit the logistic regression on all data using the C chosen by the grid
# search, instead of a hand-copied constant that can drift from the result.
lr = LogisticRegression(C=gcv.best_params_['C'], random_state=5, class_weight='balanced')
lr.fit(X, y)
# Normalise the coefficient matrix by its 2-norm and take its first row.
# NOTE(review): X is not standardised anywhere above, so coefficient sizes
# also reflect each feature's units — confirm this comparison is intended.
weights = (lr.coef_ / np.linalg.norm(lr.coef_, ord=2))[0]
print('Max weight:', X.columns[weights.argmax()])

Max weight: NumberOfTime30-59DaysPastDueNotWorse


In [7]:
def softmax(w, index):
    """Return the softmax share of ``w[index]`` within the vector `w`.

    Computes ``exp(w[index]) / sum(exp(w))``. The maximum of `w` is
    subtracted before exponentiating; this leaves the ratio unchanged
    mathematically but prevents ``np.exp`` from overflowing to ``inf``
    (and the result from becoming ``nan``) for large weights.

    Parameters
    ----------
    w : array-like of numbers
        Weight vector.
    index : int
        Position in `w` whose share is returned.

    Returns
    -------
    float
        Value in ``[0, 1]``; the shares over all indices sum to 1.
    """
    w = np.asarray(w, dtype=float)
    # Index first so an out-of-range `index` still raises IndexError.
    selected = w[index]
    peak = w.max()
    return np.exp(selected - peak) / np.exp(w - peak).sum()

# Softmax share of the DebtRatio coefficient among the normalised weights.
debt_ratio_pos = X.columns.get_loc('DebtRatio')
print('Contribution by DebtRatio:', '%.2f' % softmax(weights, debt_ratio_pos))

Contribution by DebtRatio: 0.11


In [9]:
# Class-weighted random forest, tuned on the same folds (`skf`) and metric
# as the logistic regression so the two best scores are comparable.
rf = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    oob_score=True,
    class_weight='balanced',
)
parameters = {
    'max_features': [1, 2, 4],
    'min_samples_leaf': [3, 5, 7, 9],
    'max_depth': [5, 10, 15],
}

gcv_tree = GridSearchCV(estimator=rf, param_grid=parameters, scoring='roc_auc', cv=skf)
gcv_tree.fit(X, y)

# Gain in best mean CV ROC AUC of the forest over the logistic regression.
auc_gain = gcv_tree.best_score_ - gcv.best_score_
print('%.2f' % auc_gain)

0.04


In [10]:
# Fit a forest on all data with fixed hyper-parameters and report the
# feature with the smallest importance.
# NOTE(review): these values are hard-coded; presumably they mirror
# gcv_tree.best_params_ — verify against the search result.
clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    max_features=2,
    min_samples_leaf=9,
    class_weight='balanced',
    oob_score=True,
    random_state=42,
    n_jobs=-1,
)
clf.fit(X, y)
least_important_pos = clf.feature_importances_.argmin()
print('Min importance:', X.columns[least_important_pos])


Min importance: NumberOfDependents


In [12]:
# Randomised search (20 sampled settings) over a bagging ensemble of 100
# logistic regressions, scored by ROC AUC on the shared `skf` folds.
# RandomizedSearchCV and BaggingClassifier are imported in the notebook's
# top import cell rather than mid-notebook.
parameters = {'max_features': [2, 3, 4], 'max_samples': [0.5, 0.7, 0.9], 
              "base_estimator__C": [0.0001, 0.001, 0.01, 1, 10, 100]}

# NOTE(review): newer scikit-learn releases rename `base_estimator` to
# `estimator` — confirm the installed version before upgrading.
bc = BaggingClassifier(base_estimator = lr, n_estimators = 100, random_state=42)
rsb = RandomizedSearchCV(estimator = bc, cv = skf, param_distributions = parameters, scoring = 'roc_auc',
                         n_iter=20, random_state=1)
rsb.fit(X, y)
print(rsb.best_score_, rsb.best_params_)

0.8076172570918905 {'max_samples': 0.7, 'max_features': 2, 'base_estimator__C': 0.001}
