<a href="https://colab.research.google.com/github/lightuse/AML/blob/master/Automated_Machine_Learning_Binary_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# supervised learning
## binary classification

Settings

In [95]:
# set pipelines for different algorithms
from sklearn.pipeline import Pipeline
import numpy as np
import lightgbm as lgb
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.decomposition import PCA
from sklearn.metrics import roc_auc_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
# Metric label -> scikit-learn scorer name (passed as ``scoring`` to
# GridSearchCV and cross_val_score).
evaluation_list = {
    'AUC': 'roc_auc',
    'F1': 'f1',
    'Recall': 'recall',
    'Precision': 'precision',
    'Accuracy': 'accuracy',
}
# Metric label -> metric function applied to (true labels, model output).
evaluation_function_list = {
    'AUC': roc_auc_score,
    'F1': f1_score,
    'Recall': recall_score,
    'Precision': precision_score,
    'Accuracy': accuracy_score,
}
# Algorithms offered by the scoring-time selector widget.
options_algorithm = [
    'knn',
    'logistic',
    'rsvc',
    'tree',
    'rf',
    'gb',
    'mlp',
    'lightgbm',
]
# Whether the scoring output should be predict_proba (positive-class
# probability) instead of predicted labels.
is_predict_proba = True
def _build_pipeline(estimator, scale=True):
    """Return a Pipeline of (optional StandardScaler) -> PCA -> estimator.

    Every call creates fresh StandardScaler and PCA instances, so no
    preprocessing step is shared between pipelines.
    """
    steps = [('scl', StandardScaler())] if scale else []
    steps.append(('pca', PCA(random_state=1)))
    steps.append(('est', estimator))
    return Pipeline(steps)


# Algorithm name -> pipeline. The insertion order is the order in which the
# models are trained; 'tree' and 'rf' are the only entries without scaling.
pipelines = {
    'lightgbm': _build_pipeline(lgb.LGBMClassifier(random_state=1)),
    'xgboost': _build_pipeline(xgb.XGBClassifier(random_state=1)),
    'knn': _build_pipeline(
        KNeighborsClassifier(n_neighbors=5, p=2, metric='minkowski')),
    'logistic': _build_pipeline(LogisticRegression(random_state=1)),
    'rsvc': _build_pipeline(
        SVC(C=1.0, kernel='rbf', class_weight='balanced',
            probability=is_predict_proba, random_state=1)),
    'tree': _build_pipeline(DecisionTreeClassifier(random_state=1),
                            scale=False),
    'rf': _build_pipeline(RandomForestClassifier(random_state=1),
                          scale=False),
    'gb': _build_pipeline(GradientBoostingClassifier(random_state=1)),
    'mlp': _build_pipeline(
        MLPClassifier(hidden_layer_sizes=(3,3), max_iter=10000,
                      random_state=1)),
}
# Algorithms whose RFE feature selection is driven by the random forest
# estimator instead of their own estimator.
feature_selection_rf_list = ['knn', 'logistic', 'rsvc', 'mlp']
# Algorithms for which feature importances are displayed after scoring.
feature_importances_algorithm_list = ['tree', 'rf', 'gb', 'lightgbm', 'xgboost']
# Algorithms that are tuned with GridSearchCV; alternative selections are kept
# below for quick switching.
#tuning_prarameter_list = ['lightgbm', 'knn', 'logistic']
#tuning_prarameter_list = ['knn', 'logistic']
#tuning_prarameter_list = ['logistic', 'rf', 'gb']
tuning_prarameter_list = ['knn', 'logistic', 'rf', 'gb', 'lightgbm']
# Parameter grids, keyed by algorithm name. Keys use the pipeline step prefix
# ('est__' for the estimator, 'pca__' for the PCA step).
tuning_prarameter = {
    'lightgbm':{
        'est__learning_rate': [0.01, 0.02],
        'est__n_estimators': [300, 400, 600, 800, 1000],
        # 'num_leaves':[4,8,16,32],
        'est__max_depth': [2, 3, 4, 5, 6],
        'est__boosting_type': ['gbdt'],
        #'est__objective': ['lambdarank'],
        'est__random_state': [1],
        # feature_fraction -> colsample_bytree
        # bagging_fraction -> subsample
        # bagging_freq -> subsample_freq
        'est__min_data_in_leaf': [10, 20],
        # NOTE: the former 'est__scoring': ['ndcg'] entry was removed. It is
        # not an estimator parameter; the metric optimised by the search is
        # chosen through GridSearchCV's own ``scoring`` argument.
        # 'colsample_bytree' : [0.25,0.5,0.6,0.7,0.8],
        # 'colsample_bytree' : [0.6,0.7,0.8,0.9],
        'est__feature_fraction': [1, 0.9, 0.8, 0.4],
        'est__subsample': [1, 0.9, 0.8, 0.5],
        'est__max_bin': [50, 100, 200],
        'est__is_unbalance': [True, False],
        # 'min_child_weight':[5,10,25,50],
        # 'n_jobs': [3]
    },
    'knn':{
        'est__n_neighbors':[1,2,3,4,5,],
        'est__weights':['uniform','distance'],
        'est__algorithm':['auto','ball_tree','kd_tree','brute'],
        'est__leaf_size':[1,10,20,30,40,50],
        'est__p':[1,2]
    },
    'logistic':{
        'pca__n_components':[5,7,9],
        'est__C':[0.1,1.0,10.0,100.0]
    },
    'rf':{
        'est__n_estimators':[5,10,20,50,100],
        'est__max_depth':[1,2,3,4,5],
    },
    'gb':{
        # NOTE(review): 'deviance' was renamed to 'log_loss' in newer
        # scikit-learn releases - confirm against the installed version.
        'est__loss':['deviance','exponential'],
        'est__n_estimators':[5,10,50,100,500],
    }
}

In [96]:
# Widen the pandas display so wide frames are not truncated.
import pandas as pd
pd.options.display.max_columns = 50

# Categorical columns that get one-hot encoded.
ohe_columns = [
    'sales',
    'salary',
]

# Column dtypes used when reading the CSV files; the categorical columns are
# read as object.
my_dtype = {
    'sales': object,
    'salary': object,
    'satisfaction_level': float,
    'last_evaluation': float,
}

# Identifier and target column names.
id_label = 'index'
target_label = 'left'

# Input files and the file that stores the training-time column names.
train_file_name = './data/final_hr_analysis_train.csv'
test_file_name = './data/final_hr_analysis_test.csv'
model_columns_file_name = './data/model_columns.csv'

# Number of features kept by feature selection.
n_features_to_select = 10

# Extension of the output file.
file_extention = 'csv'

# Whether to split off a hold-out validation set.
is_holdout = True

# Whether to run k-fold cross validation.
is_k_fold = False

In [97]:
def input_train_file(filename, my_dtype, id_label):
    """Read the training CSV and split it into features and target.

    Args:
        filename: Path of the training CSV file.
        my_dtype: Column-name -> dtype mapping passed to ``pd.read_csv``.
        id_label: Name of the identifier column; it is dropped.

    Returns:
        Tuple ``(X, y)``: ``X`` is the frame without the identifier and
        target columns, ``y`` is the target column (``target_label``).
    """
    # Read the file the caller asked for. Previously the module-level
    # train_file_name was read and the ``filename`` argument was ignored.
    df = pd.read_csv(filename, header=0, dtype=my_dtype)
    # Adjust to the layout of the data as needed.
    df = df.drop(id_label, axis=1)
    # Select the target by name, matching the name-based drop below, instead
    # of assuming it is the first remaining column.
    y = df[target_label]
    # Values outside this mapping become NaN.
    class_mapping = {0: 0, 1: 1}
    y = y.map(class_mapping)
    X = df.drop(target_label, axis=1)
    return X, y

One-hot encoding

In [98]:
# one-hot encoding
def one_hot_encoding(X, ohe_columns):
    """Return *X* with the columns in *ohe_columns* one-hot encoded.

    A separate indicator column is added for missing values
    (``dummy_na=True``); columns not listed are passed through unchanged.
    """
    return pd.get_dummies(X, columns=ohe_columns, dummy_na=True)

Null imputation

In [99]:
from sklearn.impute import SimpleImputer
# imputation
def imputation(X_ohe):
    """Mean-impute missing values and persist the fitted imputer.

    The fitted imputer is written to ``./data/imputer.joblib``.

    Returns:
        Tuple of the imputed DataFrame (same column names, new default
        index) and the array of column names.
    """
    column_names = X_ohe.columns.values
    imputer = SimpleImputer(strategy='mean').fit(X_ohe)
    dump(imputer, './data/imputer.joblib')
    imputed = pd.DataFrame(imputer.transform(X_ohe), columns=column_names)
    return imputed, column_names

Feature selection

In [100]:
from sklearn.feature_selection import RFE
# feature selection
def feature_selection(X, y, X_ohe_columns, algorithm_name, estimator):
    """Select features with recursive feature elimination (RFE).

    The fitted selector is written to
    ``./data/<algorithm_name>_selector.joblib``.

    Args:
        X: Feature DataFrame.
        y: Target values.
        X_ohe_columns: Array of the column names of ``X``; it is indexed
            with the selector's boolean support mask.
        algorithm_name: Name used in the selector's file name.
        estimator: Estimator that RFE uses to rank the features.

    Returns:
        ``X`` restricted to the selected columns.
    """
    rfe = RFE(estimator, n_features_to_select=n_features_to_select, step=.05)
    rfe.fit(X, y)
    dump(rfe, './data/' + algorithm_name + '_selector.joblib')
    kept_columns = X_ohe_columns[rfe.support_]
    return X.loc[:, kept_columns]

Holdout

In [101]:
from sklearn.model_selection import train_test_split
# holdout
def holdout(X_ohe, y):
    """Split the data 70/30 into training and validation parts.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``; the split is
        reproducible because ``random_state`` is fixed to 1.
    """
    return tuple(train_test_split(X_ohe, y, test_size=0.3, random_state=1))

Modeling

In [102]:
from joblib import dump
from sklearn.model_selection import GridSearchCV
# train
def _fit_and_persist(pipe_name, pipeline, X_fit, y_fit, evaluation):
    """Fit one pipeline (grid-searched if configured) and save it to disk.

    Algorithms listed in ``tuning_prarameter_list`` are tuned with a 3-fold
    GridSearchCV scored by ``evaluation_list[evaluation]``; all others are
    fitted directly. The fitted object is written to
    ``./data/<pipe_name>_classiffier.joblib`` and returned.
    """
    if pipe_name in tuning_prarameter_list:
        model = GridSearchCV(estimator=pipeline,
                             param_grid=tuning_prarameter[pipe_name],
                             scoring=evaluation_list[evaluation],
                             cv=3,
                             return_train_score=False)
        model.fit(X_fit, y_fit)
    else:
        model = pipeline.fit(X_fit, y_fit)
    dump(model, './data/' + pipe_name + '_classiffier.joblib')
    return model


def train_model(X, y, X_ohe_columns, evaluation):
    """Train and persist every pipeline in ``pipelines``.

    For each algorithm, features are selected with RFE on all of ``X``/``y``
    (before any hold-out split), then the pipeline is fitted on the training
    part (``is_holdout``) or on all selected data, and saved with joblib.

    Args:
        X: Imputed, one-hot encoded feature DataFrame.
        y: Target values.
        X_ohe_columns: Array of the column names of ``X``.
        evaluation: Key of ``evaluation_list`` naming the metric used by the
            grid search.

    Returns:
        Tuple ``(X_train, X_valid, y_train, y_valid)`` of the LAST pipeline
        processed; each pipeline has its own feature selection, so the
        columns only match that pipeline. ``X_valid`` and ``y_valid`` are
        ``None`` when ``is_holdout`` is false, and all four are ``None``
        when ``pipelines`` is empty.
    """
    X_train = X_valid = y_train = y_valid = None
    for pipe_name, pipeline in pipelines.items():
        print(pipe_name)
        # Some algorithms borrow the random forest estimator to rank
        # features; the others rank with their own estimator.
        if pipe_name in feature_selection_rf_list:
            selector_estimator = pipelines['rf'].named_steps['est']
        else:
            selector_estimator = pipeline.named_steps['est']
        X_featured = feature_selection(X, y, X_ohe_columns, pipe_name, selector_estimator)
        if is_holdout:
            X_train, X_valid, y_train, y_valid = holdout(X_featured, y)
        else:
            X_train, y_train = X_featured, y
            # No validation split exists in this mode. (The original assigned
            # the notebook-only variable ``_`` here, which is a NameError
            # outside an interactive session.)
            X_valid = y_valid = None
        model = _fit_and_persist(pipe_name, pipeline, X_train, y_train, evaluation)
        if is_holdout and pipe_name in tuning_prarameter_list:
            # Report the search that produced the saved model. The original
            # re-ran the whole grid search on the validation split before
            # printing, so the printed values did not describe the saved
            # model and the search time was doubled.
            print(pipe_name + ' Best Score:', model.best_score_)
            print(pipe_name + ' Best Params', model.best_params_)
    return X_train, X_valid, y_train, y_valid

Scoring

In [103]:
from joblib import load
def scoring(algorithm_name: str, X, is_predict_proba=False):
    """Predict with the persisted model of *algorithm_name*.

    The model is loaded from ``./data/<algorithm_name>_classiffier.joblib``.

    Returns:
        Column 1 of ``predict_proba(X)`` when *is_predict_proba* is true,
        otherwise the output of ``predict(X)``.
    """
    model = load('./data/' + algorithm_name + '_classiffier.joblib')
    if not is_predict_proba:
        return model.predict(X)
    return model.predict_proba(X)[:, 1]

In [104]:
def evaluation(scores, X_train, y_train, text, function_evaluation):
    """Score every persisted model on one data set and record the results.

    Args:
        scores: Dict updated in place; keys are ``(algorithm name, text)``.
        X_train: Feature DataFrame to predict on.
        y_train: True labels for ``X_train``.
        text: Label of the data set (for example 'train' or 'valid').
        function_evaluation: Metric function called as
            ``function_evaluation(y_true, model_output)``.
    """
    # ROC AUC ranks samples by score, so give it the positive-class
    # probability instead of hard 0/1 labels when probabilities are enabled.
    # All other metrics need predicted labels.
    use_proba = is_predict_proba and function_evaluation is roc_auc_score
    for pipe_name in pipelines:
        scores[(pipe_name, text)] = function_evaluation(
            y_train, scoring(pipe_name, X_train, use_proba))

In [105]:
def get_input(x):
    """Return *x* unchanged (identity helper)."""
    return x

In [106]:
from ipywidgets import interact, interactive,fixed, interact_manual
from IPython.display import display
import ipywidgets as widgets
def choice(options):
    """Display a radio-button widget for *options* and return the widget.

    The caller reads the current selection from the returned widget's
    ``value`` attribute.
    """
    selector = get_input(widgets.RadioButtons(options=options))
    display(selector)
    return selector

In [107]:
# Show the metric selector; the chosen label is read later through
# input_evaluation.value.
input_evaluation = choice(evaluation_list.keys())

RadioButtons(options=('AUC', 'F1', 'Recall', 'Precision', 'Accuracy'), value='AUC')

In [108]:
# Load the training data and split it into features X and target y.
X, y = input_train_file(train_file_name, my_dtype, id_label)
# One-hot encode the categorical columns listed in ohe_columns.
X_ohe = one_hot_encoding(X, ohe_columns)
# Fill missing values; the column names are returned alongside the data.
X_ohe, X_ohe_columns = imputation(X_ohe)
# Save the column names so the scoring data can be aligned to the same layout.
pd.DataFrame(X_ohe_columns).to_csv(model_columns_file_name, index=False)

In [109]:
# Train and persist all pipelines using the metric chosen in the selector
# widget; the returned split is reused by the scoring cell below.
X_train, X_valid, y_train, y_valid = train_model(X_ohe, y, X_ohe_columns, input_evaluation.value)

lightgbm


KeyboardInterrupt: ignored

CV Score

In [None]:
from sklearn.model_selection import cross_val_score, KFold
# Optional 5-fold cross validation of every pipeline with the selected metric.
# It runs on the full one-hot encoded matrix X_ohe, i.e. without the RFE
# feature selection used during training.
if is_k_fold:
    # Shuffled folds with a fixed seed.
    kf = KFold(n_splits=5, shuffle=True, random_state=0)
    for pipe_name, est in pipelines.items():
        cv_results = cross_val_score(est,
                                    X_ohe, y,
                                    cv=kf,
                                    scoring=evaluation_list[input_evaluation.value])
        # Per-algorithm report: fold scores, then their mean and standard deviation.
        print('----------')
        print('algorithm:', pipe_name)
        print('cv_results:', cv_results)
        print('avg +- std_dev', cv_results.mean(),'+-', cv_results.std())

In [None]:
# Score every persisted model on the training data and, in hold-out mode,
# on the validation data as well.
scores = {}
evaluation(scores, X_train, y_train, 'train', evaluation_function_list[input_evaluation.value])
if is_holdout:
    evaluation(scores, X_valid, y_valid, 'valid', evaluation_function_list[input_evaluation.value])
# Show one row per algorithm, highest training score first.
print('評価指標:' + input_evaluation.value)
display(pd.Series(scores).unstack().sort_values(by='train', ascending=False))

In [None]:
def input_test_file(filename, my_dtype, id_label):
    """Read the scoring CSV and separate the feature columns.

    Args:
        filename: Path of the CSV file to score.
        my_dtype: Column-name -> dtype mapping passed to ``pd.read_csv``.
        id_label: Name of the identifier column.

    Returns:
        Tuple ``(df_s, X_s)``: ``df_s`` is the frame without the target
        column (the identifier is kept), ``X_s`` is ``df_s`` without the
        identifier column.
    """
    df_s = pd.read_csv(filename, header=0, dtype=my_dtype)
    # A scoring file may legitimately have no target column, so a missing
    # target is tolerated instead of raising KeyError.
    df_s = df_s.drop(target_label, axis=1, errors='ignore')
    X_s = df_s.drop(id_label, axis=1)
    return df_s, X_s

Preprocessing

In [None]:
from joblib import load
# preprocessing
def preprocessing(algorithm_name, X_s, X_ohe_s):
    """Give scoring data the training-time layout, then impute and select.

    Args:
        algorithm_name: Algorithm whose persisted feature selector
            (``./data/<algorithm_name>_selector.joblib``) is applied.
        X_s: Unused; kept so existing callers keep working.
        X_ohe_s: One-hot encoded scoring features.

    Returns:
        DataFrame holding only the selected features, with a new default
        index.
    """
    model_columns = pd.read_csv(model_columns_file_name)
    X_ohe_columns = model_columns.values.flatten()
    # One reindex replaces the former concat/drop/fillna sequence: columns
    # unknown to the model are dropped, model columns missing from the
    # scoring data are added filled with 0, and the training column order is
    # restored. NaN values already present are left for the imputer.
    X_aligned = X_ohe_s.reindex(columns=X_ohe_columns, fill_value=0)
    # Hand the imputer a uniformly numeric frame.
    X_aligned = X_aligned.astype(float)
    imp = load('./data/imputer.joblib')
    X_imputed = pd.DataFrame(imp.transform(X_aligned), columns=X_ohe_columns)
    selector = load('./data/' + algorithm_name + '_selector.joblib')
    return X_imputed.loc[:, X_ohe_columns[selector.support_]]

In [None]:
import datetime
def output_file(df, id_label: str, y, model_name: str, extension: str, header=True):
    """Write the identifier column next to the predictions *y*.

    The file is created as
    ``./data/submittion_<model_name>_<YYYYmmddHHMMSS>.<extension>``; it is
    tab-separated when *extension* is 'tsv' and comma-separated otherwise.
    """
    timestamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
    file_name = './data/' + 'submittion_' + model_name + '_' + timestamp + '.' + extension
    separator = '\t' if extension == 'tsv' else ','
    submission = pd.DataFrame(df[id_label]).join(pd.DataFrame(y))
    submission.to_csv(file_name, index=False, sep=separator, header=header)

In [None]:
# Show the algorithm selector; main() reads the choice through
# input_algorithm.value.
input_algorithm = choice(options_algorithm)

In [None]:
# Loads the persisted artefacts and scores with the selected model, so
# train_model only has to be run once per evaluation metric.
def main():
    """Score the test file with the selected algorithm and write the result.

    For algorithms in ``feature_importances_algorithm_list`` the feature
    importances of the persisted model are displayed as well.
    """
    algorithm_name = input_algorithm.value
    df_s, X_s = input_test_file(test_file_name, my_dtype, id_label)
    X_ohe_s = one_hot_encoding(X_s, ohe_columns)
    X_predicted = preprocessing(algorithm_name, X_s, X_ohe_s)
    predict = scoring(algorithm_name, X_predicted, is_predict_proba)
    output_file(df_s, id_label, predict, algorithm_name, file_extention, header=False)
    if algorithm_name in feature_importances_algorithm_list:
        from joblib import load

        # Read the importances from the persisted model. The in-memory
        # ``pipelines`` entry is not fitted for grid-searched algorithms
        # (GridSearchCV fits clones) nor in a fresh session.
        model = load('./data/' + algorithm_name + '_classiffier.joblib')
        # A grid search wraps the fitted pipeline in ``best_estimator_``.
        fitted_pipeline = getattr(model, 'best_estimator_', model)
        feature_importances = fitted_pipeline.named_steps['est'].feature_importances_
        # NOTE: these pipelines run PCA before the estimator, so each
        # importance belongs to a principal component; the column names are
        # positional labels only.
        feature_importances = pd.Series(feature_importances, index=X_predicted.columns.values.tolist())
        sorted_feature_importances = sorted(feature_importances.items(), key=lambda x:-x[1])
        display(pd.DataFrame(sorted_feature_importances))

In [None]:
# Entry point: run the scoring workflow when this module is executed directly.
if __name__ == '__main__':
    main()