In [1]:
import pandas as pd
import numpy as np
import feather

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import roc_auc_score, recall_score, f1_score, confusion_matrix


from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from tqdm import tqdm_notebook
import warnings
import gc

warnings.filterwarnings('ignore')

%matplotlib inline

In [2]:
def bining_age(age):
    """Map a numeric age to a coarse age-bin label.

    Bins (upper bounds inclusive):
        age <= 25        -> '0'
        25 < age <= 35   -> '2'
        35 < age <= 45   -> '3'
        45 < age <= 55   -> '4'
        otherwise        -> '5'

    The label '1' is intentionally left unused so the labels stay
    compatible with the original encoding.

    Parameters
    ----------
    age : int or float
        Customer age.

    Returns
    -------
    str
        The bin label.
    """
    if age <= 25:
        return '0'
    # The bins are contiguous: previously age == 35 (and any value in
    # (25, 26)) matched no range and fell through to the '5' bucket.
    elif age <= 35:
        return '2'
    elif age <= 45:
        return '3'
    elif age <= 55:
        return '4'
    else:
        # Ages above 55; values that fail every comparison (e.g. NaN)
        # also end up here.
        return '5'
    
    
def enc_job(job):
    """Collapse detailed job titles into coarser groups.

    Returns 'office worker' or 'other worker' for the listed titles and
    passes every other value through unchanged.
    """
    job_groups = (
        ('office worker', ('admin.', 'blue-collar', 'management', 'entrepreneur')),
        ('other worker', ('housemaid', 'services', 'technician', 'self-employed')),
    )
    for group_label, member_titles in job_groups:
        if job in member_titles:
            return group_label
    # Titles not listed in any group keep their original value.
    return job
    
def enc_education(education):
    """Group education levels into 'basic', 'higher' or 'university'.

    Values that match none of the known levels are returned unchanged.
    """
    basic_levels = ('basic.4y', 'basic.6y', 'basic.9y')
    specialised_levels = ('university.degree', 'professional.course')

    # Basic education
    if education in basic_levels:
        return 'basic'
    # Higher (high-school) education
    if education == 'high.school':
        return 'higher'
    # Specialised education
    if education in specialised_levels:
        return 'university'
    return education
    
def enc_marital(marital):
    """Reduce marital status to 'has partner' / 'single'.

    Any other value is returned unchanged.
    """
    # Has a partner
    if marital == 'married':
        return 'has partner'
    # No partner; everything else passes through as-is
    return 'single' if marital in ('single', 'divorced') else marital

In [3]:
def load_cleansing_data(test_file=None):
    """Load train and test data, stack them and fill missing values.

    Parameters
    ----------
    test_file : str or None
        When None, the feather files './input/train_data' and
        './input/test_data' are read (experiment setup). Otherwise
        './input/bank_marketing_train.csv' and './input/<test_file>' are
        read as CSV (production setup).

    Returns
    -------
    traintest : pandas.DataFrame
        Training rows followed by test rows, with 'y' mapped to 1 ('yes')
        or 0 (anything else), 'duration' removed and missing values filled.
    train_len : int
        Number of leading rows of ``traintest`` that came from the
        training set.
    """
    if test_file is None:
        # Experiment environment
        train_data = feather.read_dataframe('./input/train_data')
        test_data = feather.read_dataframe('./input/test_data')
    else:
        # Production environment
        train_data = pd.read_csv('./input/bank_marketing_train.csv')
        test_data = pd.read_csv('./input/' + test_file)

    # Stack train on top of test; train_len lets callers split them again.
    train_len = len(train_data)
    traintest = pd.concat([train_data, test_data])

    # Target map: 'yes' -> 1, everything else -> 0
    traintest['y'] = traintest['y'].map(lambda x: 1 if x == 'yes' else 0)

    # 'duration' is not used as a feature
    traintest = traintest.drop('duration', axis=1)

    # Split the feature columns into numeric and non-numeric ones.
    feature_cols = traintest.columns.drop('y')
    num_cols = traintest[feature_cols].select_dtypes(include='number').columns.tolist()
    cat_cols = [col for col in feature_cols if col not in num_cols]

    # Missing values are filled on the combined frame that is returned.
    # The previous version filled `train_data` after it had already been
    # copied by pd.concat, so the result was never actually cleaned.
    for col in num_cols:
        if traintest[col].isnull().any():
            # Fill with the mean of the training rows only, so that test
            # values do not influence the statistic.
            train_mean = traintest[col].iloc[:train_len].mean()
            traintest[col] = traintest[col].fillna(train_mean)

    for col in cat_cols:
        if traintest[col].isnull().any():
            # Fill with the literal category 'unknown'
            traintest[col] = traintest[col].fillna('unknown')

    return traintest, train_len

In [4]:
def cv_loop_prediction(train_data, test_data, model):
    """Run 5-fold stratified CV and average the per-fold test predictions.

    Parameters
    ----------
    train_data, test_data : pandas.DataFrame
        Feature frames that both contain the target column 'y'.
    model : estimator
        Object exposing ``fit`` and ``predict_proba``; the same instance is
        refit on every fold.

    Returns
    -------
    oof_preds_proba : numpy.ndarray
        Out-of-fold positive-class probabilities for the training rows.
    preds_proba : numpy.ndarray
        Positive-class probabilities for the test rows, averaged over folds.
    threshold : float
        Threshold chosen by ``threshold_search`` on the out-of-fold scores.
    """
    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    oof_preds_proba = np.zeros(len(train_data))
    preds_proba = np.zeros(len(test_data))

    features = train_data.drop(columns='y')
    target = train_data.y
    test_features = test_data.drop(columns='y')

    for fold_no, (fit_rows, holdout_rows) in enumerate(splitter.split(features, target), start=1):
        print('fold {}'.format(fold_no))
        holdout_target = target.iloc[holdout_rows]

        model.fit(features.iloc[fit_rows], target.iloc[fit_rows])

        # Score the held-out rows of this fold.
        oof_preds_proba[holdout_rows] = model.predict_proba(features.iloc[holdout_rows])[:, 1]
        fold_auc = roc_auc_score(holdout_target, oof_preds_proba[holdout_rows])
        print('auc: {:.4f}'.format(fold_auc))

        # Each fold contributes 1/n_splits of the final test prediction.
        preds_proba += model.predict_proba(test_features)[:, 1] / splitter.n_splits

    # Metrics at the default 0.5 cut-off.
    print('\n------OOF socre-------')
    calc_metrics(target, oof_preds_proba, 0.5)

    # Metrics at the F1-maximising cut-off found on the OOF predictions.
    print('\n--------Optimazed--------')
    search_result = threshold_search(target, oof_preds_proba)
    print('search_result', search_result)
    threshold = search_result['threshold']

    calc_metrics(target, oof_preds_proba, threshold)

    return oof_preds_proba, preds_proba, threshold

In [5]:
def calc_metrics(y_true, predict_proba, threshold):
    """Print AUC, F1, the confusion matrix and the expected profit.

    AUC is computed on the raw probabilities; the other figures use hard
    labels obtained with ``predict_proba >= threshold``.
    """
    auc_value = roc_auc_score(y_true, predict_proba)
    print('auc {:.4f}'.format(auc_value))

    hard_labels = (predict_proba >= threshold).astype(int)
    f1_value = f1_score(y_true, hard_labels)
    print('f1-score: {:.4f}'.format(f1_value))

    print('confution matarix \n', confusion_matrix(y_true, hard_labels))
    profit = calc_expected_profit(y_true, hard_labels)
    print('expect profit: {}'.format(profit))

In [6]:
def threshold_search(y_true, y_proba):
    """Grid-search the probability cut-off that maximises F1.

    Candidate thresholds are 0.00, 0.01, ..., 0.99. A sample is predicted
    positive when ``y_proba >= threshold``, which is the same rule
    ``calc_metrics`` uses when it applies the returned threshold.
    (The search previously used a strict ``>``, so the reported F1 could
    differ from the F1 obtained when the threshold was applied.)

    Parameters
    ----------
    y_true : array-like
        Ground-truth binary labels.
    y_proba : array-like
        Predicted positive-class probabilities.

    Returns
    -------
    dict
        ``{'threshold': best_threshold, 'f1': best_score}``; both are 0 when
        no candidate achieves an F1 above 0.
    """
    scores = np.asarray(y_proba)
    best_threshold = 0
    best_score = 0
    with warnings.catch_warnings():
        # Presumably silences the warnings f1_score emits for degenerate
        # predictions (e.g. no positive predictions at high thresholds).
        warnings.simplefilter("ignore")
        for threshold in (step * 0.01 for step in range(100)):
            score = f1_score(y_true=y_true, y_pred=scores >= threshold)
            # Strict '>' keeps the lowest threshold among ties.
            if score > best_score:
                best_threshold = threshold
                best_score = score

    return {'threshold': best_threshold, 'f1': best_score}

In [7]:
def calc_expected_profit(y_test, y_pred, profit_per_success=2000, cost_per_call=500):
    """Compute the profit of calling every customer predicted as positive.

    profit = (successful calls) * profit_per_success
             - (calls made) * cost_per_call

    Parameters
    ----------
    y_test : array-like
        Ground-truth binary outcomes (1 = success). Any array-like is
        accepted (pandas Series, NumPy array, list), not only objects that
        expose ``.values``.
    y_pred : array-like
        Hard predictions; customers with value 1 are called.
    profit_per_success : number, default 2000
        Revenue earned for each called customer who converts.
    cost_per_call : number, default 500
        Cost of each call made.

    Returns
    -------
    number
        The expected profit (0 when nobody is called).
    """
    outcomes = np.asarray(y_test)
    called = np.asarray(y_pred) == 1

    n_calls = np.count_nonzero(called)
    n_success = np.sum(outcomes[called])

    # successes * revenue - calls * cost
    return n_success * profit_per_success - n_calls * cost_per_call

## データのクレンジング
- 万が一の欠損値があったときのために
    - Numeric feature: fill mean
    - Category feature: fill 'unknown'
    
## 予測パート
- モデリングにおける評価指標はインバランスなのでAUC
- StratifiedKFold (k=5) でモデルを学習させる。各Foldのモデルでtest全体を予測して平均する
- 各Foldで得られたtrainの予測を使って、F1を最大化させる閾値を機械的に求める
    - もちろんtrainで最良の閾値なのでtestで最良とは限らないが、汎化していれば近い値で最適化できるはずと考えた結果
    - F1最大=profit最大とは限らないがそれでもだいたい比例関係にあるはず

In [8]:
# Load the production CSVs: `traintest` holds the training rows followed by the
# test rows, and `train_len` is the number of training rows used to split them later.
traintest, train_len = load_cleansing_data(test_file='bank_marketing_test-1.csv')

# Logistic Regression

- jobをカテゴライズしてOne-hot-Encodingしたのち、各ダミー変数にageを掛け合わせる（職種グループ×年齢の交互作用特徴量）
- pdaysは999を新規として、それ以外を既存としてカテゴリ変数化
- ドロップするカラム
    - 'age', 'job', 'education', 'marital', 'previous', 'loan', 'nr.employed', 'cons.conf.idx', 'cons.price.idx'

In [9]:
def preprocess_for_linear(traintest, train_len):
    """Build the feature matrix used by the linear model.

    Steps:
      1. One-hot encode the grouped job (``enc_job``) and multiply every
         indicator by the row's age.
      2. Recode 'pdays' into the two categories 'newly' (value 999) and
         'exisits' (anything else).
      3. Drop the columns that are not used by this model.
      4. Dummy-encode the remaining object columns (first level dropped).

    Parameters
    ----------
    traintest : pandas.DataFrame
        Training rows followed by test rows.
    train_len : int
        Number of leading rows that belong to the training set.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Independent copies of the training part and of the test part.
    """
    unused_columns = ['age', 'job', 'education', 'marital', 'previous', 'loan',
                      'nr.employed', 'cons.conf.idx', 'cons.price.idx']

    # Job-group indicators scaled by age, appended to the right of the frame.
    age_column = traintest['age'].values.reshape(-1, 1)
    job_indicators = pd.get_dummies(traintest['job'].map(enc_job))
    job_by_age = pd.DataFrame(job_indicators * age_column, columns=job_indicators.columns)
    frame = pd.concat([traintest, job_by_age], axis=1)

    # 999 marks the 'newly' category; every other value becomes 'exisits'.
    frame['pdays'] = frame['pdays'].map(lambda days: 'exisits' if days != 999 else 'newly')

    frame = frame.drop(columns=unused_columns)

    object_columns = frame.select_dtypes('object').columns
    other_columns = frame.select_dtypes(exclude='object').columns

    # Dummy-encode the categorical columns and put them in front of the rest.
    encoded = pd.concat(
        [pd.get_dummies(frame[object_columns], drop_first=True), frame[other_columns]],
        axis=1,
    )

    print('columns...\n', encoded.columns.tolist())

    # Split back into the training and test parts by position.
    train_part = encoded.iloc[:train_len].copy()
    test_part = encoded.iloc[train_len:].copy()
    return train_part, test_part

In [10]:
# Work on a copy so the shared `traintest` frame stays available, unchanged,
# for the random-forest section below.
traintest_linear = traintest.copy()

In [11]:
# Build the linear-model features; the function prints the resulting column list.
train_data, test_data =  preprocess_for_linear(traintest_linear, train_len)

columns...
 ['default_unknown', 'default_yes', 'housing_unknown', 'housing_yes', 'contact_telephone', 'month_aug', 'month_dec', 'month_jul', 'month_jun', 'month_mar', 'month_may', 'month_nov', 'month_oct', 'month_sep', 'day_of_week_mon', 'day_of_week_thu', 'day_of_week_tue', 'day_of_week_wed', 'pdays_newly', 'poutcome_nonexistent', 'poutcome_success', 'campaign', 'emp.var.rate', 'euribor3m', 'y', 'office worker', 'other worker', 'retired', 'student', 'unemployed', 'unknown']


In [12]:
# 5-fold CV with a default LogisticRegression: returns the out-of-fold train
# probabilities, the fold-averaged test probabilities and the tuned threshold.
lr_train_preds_proba, lr_preds_proba, threshold = cv_loop_prediction(train_data, test_data, LogisticRegression())

fold 1
auc: 0.6997
fold 2
auc: 0.7137
fold 3
auc: 0.7194
fold 4
auc: 0.6832
fold 5
auc: 0.7155

------OOF socre-------
auc 0.7061
f1-score: 0.0914
confution matarix 
 [[34228    93]
 [ 2611   136]]
expect profit: 157500

--------Optimazed--------
search_result {'threshold': 0.18, 'f1': 0.31869369369369366}
auc 0.7061
f1-score: 0.3187
confution matarix 
 [[32589  1732]
 [ 1898   849]]
expect profit: 407500


In [13]:
# Evaluate the fold-averaged logistic-regression predictions on the test set,
# using the threshold that was tuned on the out-of-fold training predictions.
calc_metrics(test_data.y, lr_preds_proba, threshold)

auc 0.5799
f1-score: 0.5934
confution matarix 
 [[ 771 1456]
 [ 480 1413]]
expect profit: 1391500


# Random Forest

- FEなしが結局最良だったのでラベルエンコードしてそのままモデルに与える

In [14]:
def preprocess_for_tree(data, train_len):
    """Label-encode every object column and split into train and test parts.

    Parameters
    ----------
    data : pandas.DataFrame
        Training rows followed by test rows. NOTE: the object columns of
        this frame are overwritten in place with their integer codes, so
        pass a copy if the original values are still needed.
    train_len : int
        Number of leading rows that belong to the training set.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Independent copies of the training part and of the test part.
    """
    # Label encoding: one encoder per column, fitted on train and test
    # together so both parts share the same codes.
    cat_cols = data.select_dtypes('object').columns
    for col in cat_cols:
        enc = LabelEncoder()
        data[col] = enc.fit_transform(data[col])

    # Report the columns of the frame actually being processed. This used
    # to read the notebook-level `traintest` variable instead of `data`.
    print('columns...\n', data.columns.tolist())

    # Split back into the training and test parts by position.
    train_data = data.iloc[:train_len, :].copy()
    test_data = data.iloc[train_len:, :].copy()

    return train_data, test_data

In [15]:
# preprocess_for_tree overwrites object columns in place, so hand it a copy
# and keep the shared `traintest` frame intact.
traintest_tree = traintest.copy()
# NOTE: this rebinds `train_data` / `test_data`, replacing the linear-model frames.
train_data, test_data = preprocess_for_tree(traintest_tree , train_len)

columns...
 ['age', 'job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'campaign', 'pdays', 'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed', 'y']


In [16]:
# Random forest with 500 trees, depth capped at 8 and a fixed seed for reproducibility.
model = RandomForestClassifier(n_estimators=500, max_depth=8, random_state=1024)

In [17]:
# Same CV procedure as for logistic regression.
# NOTE: `threshold` is rebound here to the random-forest threshold.
rf_train_preds_proba, rf_preds_proba, threshold = cv_loop_prediction(train_data, test_data, model)

fold 1
auc: 0.7131
fold 2
auc: 0.7150
fold 3
auc: 0.7254
fold 4
auc: 0.6952
fold 5
auc: 0.7373

------OOF socre-------
auc 0.7163
f1-score: 0.0870
confution matarix 
 [[34230    91]
 [ 2618   129]]
expect profit: 148000

--------Optimazed--------
search_result {'threshold': 0.17, 'f1': 0.3447607979856673}
auc 0.7163
f1-score: 0.3448
confution matarix 
 [[32795  1526]
 [ 1857   890]]
expect profit: 572000


In [18]:
# Evaluate the fold-averaged random-forest predictions on the test set with the
# threshold tuned on the random forest's out-of-fold training predictions.
calc_metrics(test_data.y, rf_preds_proba, threshold)

auc 0.6048
f1-score: 0.6298
confution matarix 
 [[   2 2225]
 [   0 1893]]
expect profit: 1727000


*********

In [20]:
# Compare the positive-class rate of the training and test sets; a large gap
# means a threshold tuned on train will not transfer to test.
print(train_data.y.mean())
print(test_data.y.mean())

0.07410704650911838
0.4594660194174757


テストデータは45%の顧客がクラス1＝成約している。  
つまり、ほとんどすべての人にアタックすることでROIを上げられることになる。  

**経済指標のような外的要因によってターゲットが大きく左右される場合は、モデルを作成した時点と実際に適用する時点とで外的要因が大きく異ならないかを確認することが重要**