# models

In [4]:
# warningの無視
import warnings
warnings.filterwarnings('ignore')

In [50]:
# ---------------------------------
# データ等の準備
# ----------------------------------
import numpy as np
import pandas as pd

In [51]:
# pickleで読込む
import pickle

# 前処理済みデータ取得
with open('../data/dataset/pre/pre_data.pickle','rb') as f:
    pre_data = pickle.load(f)

In [52]:
# 欠損値処理
pre_data=pre_data.fillna(pre_data.mean())

In [53]:
# 目的変数と説明変数の取得
y=pre_data['LoanStatus']
X=pre_data.drop(columns='LoanStatus')

In [1]:
# 学習データの標準化
scaler =StandardScaler()
tr_X=scaler.fit_transform(tr_X)
va_X=scaler.transform(va_X)
test_X=scaler.transform(test_X)

In [26]:
# 訓練データとテストデータの分割
from sklearn.model_selection import train_test_split
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, shuffle=True)
train_X, valid_X, train_y, valid_y = train_test_split(train_X, train_y, test_size=0.2, shuffle=True)

In [None]:
# cross validation
# KFold
from sklearn.model_selection import KFold
def predict_cv(model, train_x, train_y, test_x):
    preds = []
    preds_test = []
    va_idxes = []

    kf = KFold(n_splits=4, shuffle=True, random_state=71)

    # クロスバリデーションで学習・予測を行い、予測値とインデックスを保存する
    for i, (tr_idx, va_idx) in enumerate(kf.split(train_x)):
        tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
        tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]
        model.fit(tr_x, tr_y, va_x, va_y)
        pred = model.predict(va_x)
        preds.append(pred)
        pred_test = model.predict(test_x)
        preds_test.append(pred_test)
        va_idxes.append(va_idx)

    # バリデーションデータに対する予測値を連結し、その後元の順序に並べ直す
    va_idxes = np.concatenate(va_idxes)
    preds = np.concatenate(preds, axis=0)
    order = np.argsort(va_idxes)
    pred_train = preds[order]

    # テストデータに対する予測値の平均をとる
    preds_test = np.mean(preds_test, axis=0)

    return pred_train, preds_test

In [None]:
# グリッドサーチ
from sklearn.model_selection import GridSearchCV

params = {
    'max_depth': [10, 20, 30],
    'n_estimators': [10, 100, 1000]}

grid = GridSearchCV(
    estimator=RandomForestClassifier(random_state=0), # 識別器
    param_grid=params, #　最適化したいパラメータセット
    n_jobs=1,
    cv=5,#　5分割交差検証
    scoring='f1',
    verbose=2)
grid.fit(train_X,  train_y) 

In [None]:
# 最適解でモデルを作成
max_depth=grid.best_params_.get('max_depth')
n_estimators=grid.best_params_.get('n_estimators')
model = RandomForestClassifier(random_state=0, max_depth=max_depth, n_estimators=n_estimators)

In [11]:
def importances(model):
    """ 変数重要度
    """
    importances = model.feature_importances_
    indices = np.argsort(importances)

    plt.barh(range(len(indices)), importances[indices] , align='center')
    plt.yticks(range(len(indices)), feature_X[indices])
    plt.title('decision tree feature importance')
    plt.xlabel('feature importance')
    plt.ylabel('variable')
    plt.show()

In [37]:
#Linear SVC
from sklearn.svm import LinearSVC

clf = LinearSVC(random_state=0)
clf.fit(X_train, y_train)

print("prediction score:", clf.score(X_test, y_test))

prediction score: 0.9385964912280702


In [35]:
# SVC
from sklearn.svm import SVC

clf = SVC(random_state=0, probability=True)
clf.fit(X_train, y_train)

print("prediction score:", clf.score(X_test, y_test))

prediction score: 0.9298245614035088


In [43]:
# 線形回帰
from sklearn.linear_model import LinearRegression

clf = LinearRegression()

clf.fit(X_train, y_train)
print("prediction score:", clf.score(X_train,y_train))

clf.predict(X_test)
print("prediction score:", clf.score(X_test, y_test))

prediction score: 0.7770403608610561
prediction score: 0.7333104208963648


In [44]:
# ロジスティック回帰
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(random_state=0)

clf.fit(X_train, y_train)
print("prediction score:", clf.score(X_train,y_train))

clf.predict(X_test)
print("prediction score:", clf.score(X_test, y_test))

prediction score: 0.9516483516483516
prediction score: 0.9473684210526315


In [45]:
# 決定木
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier(random_state=0)

clf.fit(X_train, y_train)
print("prediction score:", clf.score(X_train,y_train))

clf.predict(X_test)
print("prediction score:", clf.score(X_test, y_test))

prediction score: 1.0
prediction score: 0.9122807017543859


In [46]:
# ランダムフォレスト
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(random_state=0)

clf.fit(X_train, y_train)
print("prediction score:", clf.score(X_train,y_train))

clf.predict(X_test)
print("prediction score:", clf.score(X_test, y_test))

prediction score: 1.0
prediction score: 0.9649122807017544


In [47]:
# 勾配ブースティング木
from sklearn.ensemble import GradientBoostingClassifier

clf = GradientBoostingClassifier(random_state=0)

clf.fit(X_train, y_train)
print("prediction score:", clf.score(X_train,y_train))

clf.predict(X_test)
print("prediction score:", clf.score(X_test, y_test))

prediction score: 1.0
prediction score: 0.9649122807017544


In [None]:
# k-NN
from sklearn.neighbors import  KNeighborsClassifier

def kNN(k,data):
    #全サンプルと観測データ間のユークリッド距離を計算する．
    #その際，距離を計算したサンプルのラベルも記憶しておく
    dist_list = []
    label_list = []
    for i,j in enumerate(dataset):
        dist_list.append([np.sqrt((float(j[0])-u[0])**2+(float(j[1])-u[1])**2),j[2]])
        #print(dist_list[i])

    #距離が小さい順に並べ替える
    dist_list.sort()
    #最近傍サンプルk個でクラス判別のための多数決を行う
    class_label = [0,0,0]
    for i in range(k):
        if dist_list[i][1] == '1': class_label[0] += 1
        elif dist_list[i][1] == '2': class_label[1] += 1
        elif dist_list[i][1] == '3': class_label[2] += 1

    print("最近傍サンプル",k,"個の内訳 : [赤(1),青(2),緑(3)] =",class_label)
    print()
    print("kNNによる判別の結果，観測データのクラスラベルは", class_label.index(max(class_label))+1, "です．")

    if class_label.index(max(class_label))+1 == 1: color = 'red'
    elif class_label.index(max(class_label))+1 == 2: color = 'blue'
    elif class_label.index(max(class_label))+1 == 3: color = 'green'

    fig = plt.figure(figsize=(12, 8))
    plt.scatter(C1x,C1y,color='red',s=10)
    plt.scatter(C2x,C2y,color='blue',s=10)
    plt.scatter(C3x,C3y,color='green',s=10)
    plt.scatter(unknown_data[0],unknown_data[1],color=color,s=100)
    plt.show()

In [49]:
# xgboostによるモデル
import xgboost as xgb
class model_Xgb:

    def __init__(self):
        self.model = None

    def fit(self, tr_X, tr_y, va_x, va_y):
        params = {
            'booster':'gbtree',
            'objective': 'multi:softmax',
            'eta': 0.5,
            'gamma': 0.0,
            #'alph': 0.0,
            #     'lambda': 1.0,
            #     'min_child_weight': 1,
            'max_depth': 2,
            #     'subsample': 0.8,
            #     'colsample_bytree': 0.8,
            #     'random_state': 1,
            'num_class': 5,
            'eval_metric': 'mlogloss'
                }
        num_round = 10
        
        dtrain = xgb.DMatrix(tr_X, label=tr_y)
        dvalid = xgb.DMatrix(va_x, label=va_y)
        
        watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
        
        # early_stopping_rounds / 
        # 連続して評価指標が改善しなかったら学習を中断する回数
        rounds=int(10 / params['eta'])
        
        self.model = xgb.train(params,
                               dtrain,
                               num_round,
                               evals=watchlist,
                               early_stopping_rounds=rounds)
#         # 検証結果の確認
#         print('Best Score:{0:.4f}, Iteratin:{1:d}, Ntree_Limit:{2:d}'.format(
#              self.model.best_score, model.best_iteration, model.best_ntree_limit))
    
    def predict(self, x):
        data = xgb.DMatrix(x)
        pred = self.model.predict(data)
        
        # 精度の確認
        accuracyScore = accuracy_score(test_y, pred)
        f1Score=f1_score(test_y,pred)
        print('accuracy_score:{0:.4f}'.format(accuracyScore))
        print('f1_score:{0:.4f}'.format(f1Score))
        return pred
    
    def plot_importance(self):
        return xgb.plot_importance(self,model,max_num_features=20)

In [50]:
clf=model_Xgb()
clf(train_X,train_y,test_X)

NameError: name 'train_X' is not defined

In [11]:
# lightgbmによるモデル
import lightgbm as lgb
class model_lightgbm:

    def __init__(self):
        self.model = None

    def fit(self, tr_X, tr_y, va_X, va_y):
        params ={'objective':'binary',
                 'seed':71,
                 'verbose':0,
                 'metrics':'binary_logloss'}
        num_round = 10
        
        lgb_train=lgb.Dataset(tr_X,tr_y)
        lgb_eval=lgb.Dataset(va_X,va_y)
        
        self.model = lgb.train(params,
                               lgb_train,
                               num_boost_round=num_round,
                               #categorical_feature=categorical_featutres,
                               valid_names=['train','valid'],
                               valid_sets=[lgb_train,lgb_eval])

    def predict(self, X):
#         data = lgb.Dataset(X)
#         pred = self.model.predict(data)
        pred = self.model.predict(X)
        logloss=log_loss(pred,X,eps=1e-7)
        accuracy_score=accuracy_score(pred, X)
        f1_score=f1_score(train_y, pred_train_1a)
        print(f'logloss_score:{logloss}')
        print(f'accuracy_score:{accuracy}')
        print(f'f1_score:{f1_score}')        
        return pred

In [37]:
# tensorflowの警告抑制
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'
import tensorflow as tf
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

In [None]:
# ニューラルネットによるモデル
from keras.models import Sequential
from keras.layers import Dense, Dropout
class model_NN:

    def __init__(self):
        self.model = None
        self.scaler = None

    def fit(self, tr_X, tr_y, va_x, va_y):
        self.scaler = StandardScaler()
        self.scaler.fit(tr_X)

        batch_size = 128
        epochs = 10

        tr_X = self.scaler.transform(tr_X)
        va_x = self.scaler.transform(va_x)
        model = Sequential()
        model.add(Dense(256, activation='relu', input_shape=(tr_X.shape[1],)))
        model.add(Dropout(0.2))
        model.add(Dense(256, activation='relu'))
        model.add(Dropout(0.2))
        model.add(Dense(1, activation='sigmoid'))

        model.compile(loss='binary_crossentropy', optimizer='adam')

        history = model.fit(tr_X, tr_y,
                            batch_size=batch_size, epochs=epochs,
                            verbose=1, validation_data=(va_x, va_y))
        self.model = model

    def predict(self, x):
        x = self.scaler.transform(x)
        pred = self.model.predict_proba(x).reshape(-1)
        return pred

#### stacking

In [56]:
# stacking
from sklearn.ensemble import StackingClassifier

estimators = [('logreg', LogisticRegression(random_state=0)),
                ('lsvc', LinearSVC(random_state=0))]

stacking_clf = StackingClassifier(estimators=estimators,
                                  final_estimator=DecisionTreeClassifier(random_state=0))

stacking_clf.fit(X_train, y_train)
print("prediction score:", stacking_clf.score(X_train,y_train))

stacking_clf.predict(X_test)
print("prediction score:", stacking_clf.score(X_test, y_test))

prediction score: 0.9472527472527472
prediction score: 0.9649122807017544


In [None]:
y_pred=

In [None]:
# pipe_line
"""一連の処理ステップをEstimatorとしてまとめることができる。
① 標準化 → ② 次元削減 → ③ ランダムフォレストで学習
の流れを、グリッドサーチ＋CVで検証する場合の例を記載。
"""
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# パイプライン生成
pipe = Pipeline([('scaler', StandardScaler()),
                 ('pca', PCA()),
                 ('rf', RandomForestClassifier(random_state=0))])

# グリッドサーチ用の探索ハイパーパラメータ設定
param_grid = {
    'pca__n_components': [2, 3, 4],
    'rf__n_estimators' : [2, 10, 100],
    'rf__max_depth' : [10, 100, 1000]
}

grid_search = GridSearchCV(pipe, param_grid , cv=5)
grid_search.fit(X_train, y_train)

print('test_score : {}'.format(grid_search.score(X_test, y_test)))
print('best_params : {}'.format(grid_search.best_params_))

In [None]:
# make_pipeline

from sklearn.pipeline import make_pipeline

pipe = make_pipeline(StandardScaler(), PCA(), RandomForestClassifier(random_state=0))

param_grid = {
    'pca__n_components': [2, 3, 4],
    'randomforestclassifier__n_estimators' : [2, 10, 100],
    'randomforestclassifier__max_depth' : [10, 100, 1000]
}

### 評価指標

https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter

分類

In [39]:
# classification_report
from sklearn.metrics import classification_report
classification_report(y_true, y_pred)

NameError: name 'y_true' is not defined

In [None]:
# 混同行列
from sklearn.metrics import confusion_matrix
confusion_matrix=confusion_matrix(test_y,pred_y)
print(f'accuracy_score : {confusion_matrix}')

In [21]:
# 正解率
# 結果がどの程度正解していたか
from sklearn.metrics import accuracy_score
accuracy_score=accuracy_score(test_y,pred_y)
print(f'accuracy_score : {accuracy_score}')

In [None]:
# 適合率
# Positiveと予想したデータの中で正解した割合
# 偽陽性の数を制限したい場合
from sklearn.metrics import precision_score
precision_score=precision_score(test_y,pred_y)
print(f'precision_score : {precision_score}')

In [None]:
# 再現率
# 陽性のうち、陽性と予測されたデータの割合
from sklearn.metrics import recall_score
recall_score=recall_score(test_y,pred_y)
print(f'recall_score : {recall_score}')

In [None]:
# F値
# 適合率と再現率の調和平均
from sklearn.metrics import f1_score
f1_score=f1_score(test_y,pred_y)
print(f'f1_score : {f1_score}')

回帰

In [None]:
# RMSE
from sklearn.metrics import mean_squared_error
from math import sqrt
RMSE=sqrt(mean_squared_error(test_y,pred_y))
print(f'RMSE : {RMSE}')

In [23]:
"""‘explained_variance’
metrics.explained_variance_score

‘max_error’
metrics.max_error

‘neg_mean_absolute_error’
metrics.mean_absolute_error

‘neg_mean_squared_error’
metrics.mean_squared_error

‘neg_root_mean_squared_error’
metrics.mean_squared_error

‘neg_mean_squared_log_error’
metrics.mean_squared_log_error

‘neg_median_absolute_error’
metrics.median_absolute_error

‘r2’
metrics.r2_score

‘neg_mean_poisson_deviance’
metrics.mean_poisson_deviance

‘neg_mean_gamma_deviance’
metrics.mean_gamma_deviance

‘neg_mean_absolute_percentage_error’
metrics.mean_absolute_percentage_error"""

'‘explained_variance’\nmetrics.explained_variance_score\n\n‘max_error’\nmetrics.max_error\n\n‘neg_mean_absolute_error’\nmetrics.mean_absolute_error\n\n‘neg_mean_squared_error’\nmetrics.mean_squared_error\n\n‘neg_root_mean_squared_error’\nmetrics.mean_squared_error\n\n‘neg_mean_squared_log_error’\nmetrics.mean_squared_log_error\n\n‘neg_median_absolute_error’\nmetrics.median_absolute_error\n\n‘r2’\nmetrics.r2_score\n\n‘neg_mean_poisson_deviance’\nmetrics.mean_poisson_deviance\n\n‘neg_mean_gamma_deviance’\nmetrics.mean_gamma_deviance\n\n‘neg_mean_absolute_percentage_error’\nmetrics.mean_absolute_percentage_error'

In [32]:
# モデルの保存
with open('../_submit/stacking_model_Xgb_NN.pickle', mode='wb') as f:
    pickle.dump(model_2, f)

### 提出用データの予測/作成

In [35]:
# テストデータの取得
with open('../data/dataset/raw/test_data.pickle','rb') as f:
    pred_data = pickle.load(f)

In [36]:
# モデルの取り出し
with open('../_submit/stacking_model_Xgb_NN.pickle', mode='rb') as f:
    model = pickle.load(f)

In [37]:
# 予測
pred_data = model.predict(pred_data)

ValueError: could not convert string to float: 'aaaavgcebciyrso'

In [None]:
# dump pred_data
filename= featureName + "_" + modelName + submitVer + ".pickle"

with open(filepath + filename','wb') as f:
    pickle.dump(pred_data,f)