# 試したもの保存場所

### lightgbmでほぼ全ての変数(employment_length以外)投入

In [None]:
# 特徴量の選択
col_x = ['loan_amnt', 'term','grade','credit_score', 'application_type','credit_card', 
         'debt_consolidation','home_improvement','house', 
         'major_purchase', 'medical','moving', 'other',
         'renewable_energy', 'small_business', 'vacation', 'wedding']
col_y = ['loan_status']
sc_light = StandardScaler()
sc_light_df = sc_light.fit_transform(df[col_x])
y = df[col_y]
# データの分割
x_train, x_val, y_train, y_val = train_test_split(sc_light_df, y, random_state=0, test_size=0.2)
# LightGBMを訓練
model = lgb.LGBMClassifier(n_estimators=200, max_depth=3, random_state=0, learning_rate=0.005, verbosity=-1)
# 後程グラフの縦軸に変数名を出させるために一旦データフレーム化
lgbm = pd.DataFrame(x_train, columns=col_x)
# 学習曲線を出すために、学習中の誤差を評価するための指標を決めておく
model.fit(lgbm, y_train, eval_set=[(x_train,y_train),(x_val,y_val)], verbose=False)
model.score(x_train,y_train),model.score(x_val,y_val)

### class_weightを調整したlightgbmで再挑戦。混同行列も確認しているので、汎化性の高めでは？

In [None]:
# 使用する特徴量の選択
col_x = ['loan_amnt','grade','credit_score','debt_consolidation']
col_y = ['loan_status']
sc_light = StandardScaler()
sc_light_df = sc_light.fit_transform(df[col_x])
y = df[col_y]
# データの分割
x_train, x_val, y_train, y_val = train_test_split(sc_light_df, y, random_state=0, test_size=0.2)
# LightGBMを訓練
model = lgb.LGBMClassifier(n_estimators=500, max_depth=3, random_state=0, class_weight='balanced' ,learning_rate=0.005, verbosity=-1)
# 後程グラフの縦軸に変数名を出させるために一旦データフレーム化
lgbm = pd.DataFrame(x_train, columns=col_x)
# 学習曲線を出すために、学習中の誤差を評価するための指標を決めておく
model.fit(lgbm, y_train, eval_set=[(x_train,y_train),(x_val,y_val)], verbose=False)

### optunaを利用してハイパラ最適化

In [None]:
# optuna実装
class Objective:
    def __init__(self, X, y):
        # 変数X,yの初期化
        self.X = X
        self.y = y

    def __call__(self, trial):
        # ハイパーパラメータの設定
        params = {
            #ブースティング実行の回数
            "n_estimators": trial.suggest_int("n_estimators", 100, 800),
            # 決定木の最大の深さ
            "max_depth": trial.suggest_int("max_depth", 1, 15),
            # 学習率、購買の修正具合
            "learning_rate": trial.suggest_float("learning_rate", 1e-4, 1e-1),
            #　ノードの最小データ数
            "min_child_samples": trial.suggest_int("min_child_samples", 5, 200),
            # 学習結果を非表示
            'verbosity': -1,
            'random_state':0,
            # データの偏りに対して強くする
            "class_weight":'balanced',
            # L2正則化を入れる
            "reg_lambda":trial.suggest_float("reg_lambda",0, 100),
            }

        model = lgb.LGBMClassifier(**params)

        # 評価指標として正解率の最大化を目指す
        scores = cross_validate(model,
                                X=self.X, 
                                y=self.y,
                                scoring='accuracy', # 正解率を指定（https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter）
                                n_jobs=-1) # 並行して実行するジョブの数（-1は全てのプロセッサを使用）
        return scores['test_score'].mean()

# ハイパーパラメータの探索
objective = Objective(x_train, y_train)
optuna.logging.set_verbosity(optuna.logging.WARNING)
study = optuna.create_study(direction='maximize') # 最大化
study.optimize(objective, timeout=180) #探索時間の最大化

# optunaの値で再度訓練
# LightGBMを訓練
op_model = lgb.LGBMClassifier(n_estimators=study.best_params['n_estimators'],
                              max_depth=study.best_params['max_depth'],
                              random_state=0,
                              class_weight='balanced',
                              learning_rate=study.best_params['learning_rate'],
                              min_child_samples=study.best_params['min_child_samples'],
                              reg_lambda = study.best_params['reg_lambda'],
                              verbosity=-1)
# 後程グラフの縦軸に変数名を出させるために一旦データフレーム化
op_lgbm = pd.DataFrame(x_train, columns=col_x)
# 学習曲線を出すために、学習中の誤差を評価するための指標を決めておく
op_model.fit(op_lgbm, y_train, eval_set=[(x_train,y_train),(x_val,y_val)],
             verbose=False, early_stopping_rounds=50)
print(op_model.score(x_train,y_train),op_model.score(x_val,y_val))
lgb.plot_metric(op_model)
plt.show()
lgb.plot_importance(op_model)
plt.show()
y_pred = op_model.predict(x_val)
print(classification_report(y_val, y_pred))

### ロジスティック回帰
正解率：0.64

In [1]:
# 使用した特徴量
col_x = ['loan_amnt', 'term','grade','credit_score', 'debt_consolidation']
col_y = ['loan_status']

# model
model = LogisticRegression(C=0.1,class_weight='balanced')
model.fit(sc_x_df,y)
y_pred = lgr.predict(x_val)
print(lgr.score(x_train,y_train))
print(classification_report(y_val, y_pred))

NameError: name 'LogisticRegression' is not defined

### 特徴量エンジニアリング

In [None]:
# 特徴量エンジニアリング
pf = PolynomialFeatures(degree=2, include_bias=False)
pf_col = ['loan_amnt', 'term', 'interest_rate', 'grade','credit_score',
          'application_type','debt_consolidation', 'home_improvement', 'house',
          'major_purchase','medical', 'moving', 'other', 'renewable_energy',
          'small_business','vacation', 'wedding']
pf_x = pf.fit_transform(df[pf_col])
pf_df = pd.DataFrame(pf_x ,columns=pf.get_feature_names())
df = pd.concat([df,pf_df], axis=1)

# 使用した特徴量
col_x = ['x3^2', 'x2^2', 'interest_rate', 'x2 x4', 'x3 x4', 'credit_score*grade',
       'grade', 'x4 x5', 'x2 x3', 'application_type', 'x5^2', 'term', 'x3 x12',
       'x2 x12', 'x1 x3', 'x4 x9', 'loan_amnt', 'x2 x10', 'x2 x16', 'x4 x13',
       'credit_score', 'x3 x10', 'x0 x2', 'x2 x8', 'x3 x8',
       'employment_length_bool', 'major_purchase', 'x9^2', 'x1 x14', 'x0 x3']

# model
model = lgb.LGBMClassifier(n_estimators=study.best_params['n_estimators'],
                              max_depth=study.best_params['max_depth'],
                              random_state=0,
                              class_weight='balanced',
                              learning_rate=study.best_params['learning_rate'],
                              min_child_samples=study.best_params['min_child_samples'],
                              reg_lambda = study.best_params['reg_lambda'],
                              verbosity=-1)

# パラメーター
params: {'n_estimators': 1244, 'max_depth': 9, 'learning_rate': 0.07870987685379975, 
         'min_child_samples': 109, 'reg_lambda': 84.39310868452604, 
         'reg_alpha': 5.315213290722924}
