<a href="https://colab.research.google.com/github/lightuse/AML/blob/master/Automated_Machine_Learning_Binary_Classification_with_function.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install optuna

In [None]:
!pip install catboost

In [None]:
from common import function

# supervised learning
## binary classification

In [None]:
from sklearn.pipeline import Pipeline
import numpy as np
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.decomposition import PCA
from sklearn.metrics import roc_auc_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

Setting

In [None]:
# Display name of each evaluation metric -> scikit-learn `scoring` string
# (used as `scoring=` for GridSearchCV and cross_val_score later in the notebook).
evaluation_list = {'AUC':'roc_auc',
                   'F1':'f1',
                   'Recall':'recall',
                   'Precision':'precision',
                   'Accuracy':'accuracy'}
# Same display names -> the metric function called directly by `evaluation()`.
evaluation_function_list = {'AUC':roc_auc_score,
                            'F1':f1_score,
                            'Recall':recall_score,
                            'Precision':precision_score,
                            'Accuracy':accuracy_score}
# Choices offered to the user through `function.choice(...)`.
options_evaluation = ['Accuracy', 'AUC', 'F1', 'Recall', 'Precision']
options_algorithm = ['lightgbm', 'knn', 'rsvc', 'logistic', 'rf', 'gb', 'mlp', 'xgboost', 'catboost']
# Whether scoring should output predict_proba (positive-class probability)
# instead of predicted labels; also forwarded to SVC(probability=...).
is_predict_proba = False
# When False, several pipelines are removed from `pipelines` after it is built.
is_one_hot_encoding = True
# NOTE(review): not read anywhere in the visible part of this notebook.
is_imputation = False
# Algorithms removed from `pipelines` after it is built (i.e. skipped entirely).
exception_algorithm_list = ['tree', 'knn', 'xgboost', 'logistic', 'rsvc', 'rf', 'gb', 'mlp', 'catboost']
def _build_pipeline(estimator, scale=False):
    """Wrap `estimator` in a Pipeline of [StandardScaler ->] PCA -> estimator.

    Step names are 'scl' (only when `scale` is True), 'pca' and 'est'; the
    rest of the notebook addresses steps and tuning parameters by these names.
    """
    steps = [('scl', StandardScaler())] if scale else []
    steps.append(('pca', PCA(random_state=1)))
    steps.append(('est', estimator))
    return Pipeline(steps)


# One pipeline per algorithm name. Only knn / logistic / rsvc / mlp are
# preceded by a StandardScaler; every pipeline runs PCA before its estimator.
pipelines = {
    'lightgbm': _build_pipeline(lgb.LGBMClassifier(random_state=1)),
    'xgboost': _build_pipeline(xgb.XGBClassifier(random_state=1)),
    'catboost': _build_pipeline(CatBoostClassifier(random_state=1)),
    'knn': _build_pipeline(
        KNeighborsClassifier(n_neighbors=5, p=2, metric='minkowski'),
        scale=True),
    'logistic': _build_pipeline(LogisticRegression(random_state=1), scale=True),
    'rsvc': _build_pipeline(
        SVC(C=1.0, kernel='rbf', class_weight='balanced',
            probability=is_predict_proba, random_state=1),
        scale=True),
    'tree': _build_pipeline(DecisionTreeClassifier(random_state=1)),
    'rf': _build_pipeline(RandomForestClassifier(random_state=1)),
    'gb': _build_pipeline(GradientBoostingClassifier(random_state=1)),
    'mlp': _build_pipeline(
        MLPClassifier(hidden_layer_sizes=(3,3), max_iter=10000, random_state=1),
        scale=True),
}

# Without one-hot encoding these algorithms are not trained at all
# (presumably because they cannot consume the un-encoded categorical columns).
if not is_one_hot_encoding:
    for ohe_only_name in ('lightgbm', 'xgboost', 'knn', 'logistic', 'rsvc', 'mlp'):
        pipelines.pop(ohe_only_name, None)

# Drop every algorithm that was explicitly excluded in the settings.
for algorithm in exception_algorithm_list:
    pipelines.pop(algorithm, None)

# Algorithms whose feature selection is driven by the RandomForest estimator
# (pipelines['rf']) instead of their own estimator; see train_model().
feature_selection_rf_list = ['knn', 'logistic', 'rsvc', 'mlp']
# Algorithms for which main() exports `feature_importances_`.
feature_importances_algorithm_list = ['tree', 'rf', 'gb', 'lightgbm', 'xgboost', 'catboost']
# Algorithms tuned with GridSearchCV in train_model().
# NOTE: the second assignment overrides the first, so grid search is currently
# disabled; delete the empty-list line to enable it for the algorithms above it.
tuning_prarameter_list = ['gb', 'lightgbm', 'rf']
tuning_prarameter_list = []
# Parameter grids for GridSearchCV, keyed by algorithm name. Parameter names
# use the pipeline step prefixes ('est__' for the estimator, 'pca__' for PCA).
tuning_prarameter = {
    'lightgbm':{
        'est__learning_rate': [0.1,0.05,0.01],
        'est__n_estimators':[1000,2000],
        'est__num_leaves':[31,15,7,3],
        'est__max_depth':[4,8,16]
    },
    'tree':{
        "est__min_samples_split": [10, 20, 40],
        "est__max_depth": [2, 6, 8],
        "est__min_samples_leaf": [20, 40, 100],
        "est__max_leaf_nodes": [5, 20, 100],
    },
    'rf':{
        'est__n_estimators':[5,10,20,50,100],
        'est__max_depth':[1,2,3,4,5],
    },
    'knn':{
        'est__n_neighbors':[1,2,3,4,5,],
        'est__weights':['uniform','distance'],
        'est__algorithm':['auto','ball_tree','kd_tree','brute'],
        'est__leaf_size':[1,10,20,30,40,50],
        'est__p':[1,2]
    },
    'logistic':{
        'pca__n_components':[5,7,9],
        'est__C':[0.1,1.0,10.0,100.0]
    },
    # NOTE(review): recent scikit-learn releases renamed the 'deviance' loss;
    # confirm this value against the installed version before enabling 'gb'.
    'gb':{
        'est__loss':['deviance','exponential'],
        'est__n_estimators':[5,10,50,100,500],
    }
}

In [None]:
# Change pandas display options
import pandas as pd
pd.set_option('display.max_rows', 100)
# Categorical columns: one-hot encoded and frequency/count encoded later on
ohe_columns = ['lobby-mode',
               'mode',
               'stage',
               'A1-weapon',
               'A1-rank',
               'A2-weapon',
               'A2-rank',
               'A3-weapon',
               'A3-rank',
               'A4-weapon',
               'A4-rank',
               'B1-weapon',
               'B1-rank',
               'B2-weapon',
               'B2-rank',
               'B3-weapon',
               'B3-rank',
               'B4-weapon',
               'B4-rank',
            ]
# dtype map for pd.read_csv: categorical columns are read as object,
# the player level columns as float
my_dtype = {'game-ver':object,
               'lobby-mode':object,
               'lobby':object,
               'mode':object,
               'stage':object,
               'A1-weapon':object,
               'A1-rank':object,
               'A2-weapon':object,
               'A2-rank':object,
               'A3-weapon':object,
               'A3-rank':object,
               'A4-weapon':object,
               'A4-rank':object,
               'B1-weapon':object,
               'B1-rank':object,
               'B2-weapon':object,
               'B2-rank':object,
               'B3-weapon':object,
               'B3-rank':object,
               'B4-weapon':object,
               'B4-rank':object,
               'A1-level':float,
               'A2-level':float,
               'A3-level':float,
               'A4-level':float,
               'B1-level':float,
               'B2-level':float,
               'B3-level':float,
               'B4-level':float,
            }
# Name of the row-identifier column (dropped before modelling)
id_label = 'id'
# Name of the target column
target_label = 'y'
# Columns removed at the end of transform_data()
drop_columns = ['game-ver', 'lobby']
# Directory holding the input CSVs; models and reports are written here too.
# NOTE(review): hard-coded Google Drive path - only valid on Colab with Drive mounted.
out_put_data_dir = "/content/drive/My Drive/Colab Notebooks/game_winner/data/"
train_file_name = out_put_data_dir + 'train_data.csv'
test_file_name = out_put_data_dir + 'test_data.csv'
model_columns_file_name = out_put_data_dir + 'model_columns.csv'
# Number of features kept by feature selection.
# NOTE(review): the original note suggested a range of 50-100, but the value is 200.
n_features_to_select = 200
# Extension of the output (prediction) file
file_extention = 'csv'
# Whether to split off a hold-out validation set
is_holdout = False
# Whether to run k-fold cross-validation
is_k_fold = True

In [None]:
# When True, the Optuna LightGBM integration is used for training/tuning and
# scoring/main take their Optuna-specific branches.
is_optuna = False
# Forwarded as `header=` to function.output_file() when the predictions are written.
is_header = True

In [None]:
def input_train_file(filename, my_dtype):
    """Load the training CSV and split it into features and target.

    Args:
        filename: path of the training CSV (first row is the header).
        my_dtype: column -> dtype mapping forwarded to ``pd.read_csv``.

    Returns:
        (X, y): ``X`` is the frame without the id and target columns, with a
        fresh 0..n-1 index; ``y`` is the target column mapped through
        ``{0: 0, 1: 1}`` (any other value becomes NaN).
    """
    # Read the file the caller asked for; previously the `filename` argument
    # was ignored and the global `train_file_name` was always loaded.
    df = pd.read_csv(filename, header=0, dtype=my_dtype)
    # Adjust the steps below to the layout of the data as needed.
    df = df.drop(id_label, axis=1)
    # Select the target by name so that it is always the same column that is
    # removed from X below (it used to be taken positionally as the last column).
    y = df[target_label]
    class_mapping = {0:0, 1:1}
    y = y.map(class_mapping)
    X = df.drop(target_label, axis=1)
    X = X.reset_index(drop=True)
    print('欠損個数（数値変数の欠損補完前）:input_train_file', X.isnull().sum().sum())
    return X, y

In [None]:
import pandas as pd
import datetime as dt
import re
import numpy as np
def transform_data(X: pd.DataFrame):
    """Derive date and categorical-encoding features and drop unused columns.

    The passed frame is modified in place while the new columns are added
    (callers in this notebook rebind the result: ``X = transform_data(X)``).

    Steps:
      * 'year', 'month', 'day', 'week' (weekday, Monday=0) are derived from
        the 'period' timestamp, interpreted in UTC.
      * 'period' itself is replaced by its Julian date.
      * For each column in ``ohe_columns`` a frequency, count and label-count
        encoding column is added via the ``function`` helper module.
      * The columns listed in ``drop_columns`` are removed and the index reset.

    Returns:
        The transformed DataFrame.
    """
    # Parse the timestamps BEFORE 'period' is overwritten. Previously the
    # calendar parts were parsed from the already-converted Julian-day floats,
    # which pd.to_datetime reads as nanoseconds since the epoch, so every row
    # ended up with year=1970, month=1, day=1.
    published_at = pd.to_datetime(X['period'], utc=True)
    X['period'] = pd.to_datetime(X['period']).map(pd.Timestamp.to_julian_date)
    X["year"] = published_at.dt.year
    X["month"] = published_at.dt.month
    X["day"] = published_at.dt.day
    X["week"] = published_at.dt.weekday
    for column in ohe_columns:
        X['frequency_encode_' + column] = function.convert_to_frequency_encode(X, column)
        X['count_encode_' + column] = function.convert_to_count_encode(X, column)
        X['label_count_encode_' + column] = function.convert_to_label_count_encode(X, column)
    # Remove the columns that are not used as features
    X = X.drop(columns=drop_columns)
    X = X.reset_index(drop=True)
    return X

Modeling

In [None]:
import optuna.integration.lightgbm as lgb
from joblib import dump
from sklearn.model_selection import GridSearchCV
# train
def train_model(X, y, X_ohe_columns, evaluation):
    """Fit every pipeline in ``pipelines`` and persist each model with joblib.

    For each pipeline: select features, optionally split off a hold-out set,
    then train through one of three paths (GridSearchCV when the algorithm is
    in ``tuning_prarameter_list``, the Optuna LightGBM integration when
    ``is_optuna`` is set and the algorithm is 'lightgbm', a plain
    ``pipeline.fit`` otherwise). The trained object is dumped to
    ``<out_put_data_dir><pipe_name>_classiffier.joblib``.

    Args:
        X: feature frame (after one-hot encoding / imputation).
        y: target values aligned with ``X``.
        X_ohe_columns: forwarded unchanged to ``function.feature_selection``.
        evaluation: key of ``evaluation_list``; selects the GridSearchCV scoring.

    Returns:
        (X_train, X_valid, y_train, y_valid) of the LAST pipeline processed.
        Without hold-out, train and valid are the same objects.
        ``pipelines`` must not be empty.
    """
    for pipe_name, pipeline in pipelines.items():
        print(pipe_name)
        if pipe_name in feature_selection_rf_list:
            # These algorithms borrow the random forest for feature selection.
            # 'rf' may have been removed from `pipelines` by the exclusion
            # list; fall back to an identically configured estimator instead
            # of failing with a KeyError.
            if 'rf' in pipelines:
                selection_estimator = pipelines['rf'].named_steps['est']
            else:
                selection_estimator = RandomForestClassifier(random_state=1)
        else:
            selection_estimator = pipeline.named_steps['est']
        X_featured = function.feature_selection(out_put_data_dir, n_features_to_select, X, y, X_ohe_columns, pipe_name, selection_estimator)
        if is_holdout:
            X_train, X_valid, y_train, y_valid = function.holdout(X_featured, y)
        else:
            X_train, X_valid, y_train, y_valid = X_featured, X_featured, y, y
        model_path = out_put_data_dir + pipe_name + '_classiffier.joblib'
        if pipe_name in tuning_prarameter_list:
            gs = GridSearchCV(estimator=pipeline,
                        param_grid=tuning_prarameter[pipe_name],
                        scoring=evaluation_list[evaluation],
                        cv=3,
                        return_train_score=False)
            gs.fit(X_train, y_train)
            dump(gs, model_path)
            # NOTE(review): the search is re-run on the validation data, so the
            # score/params printed below describe that second fit, not the
            # model that was just saved - confirm this is intended.
            gs.fit(X_valid, y_valid)
            # Best score and parameters found by the search
            print(pipe_name + ' Best Score:', gs.best_score_)
            print(pipe_name + ' Best Params', gs.best_params_)
        elif is_optuna and pipe_name == 'lightgbm':
            # Exact name match: the former substring test
            # (`pipe_name in 'lightgbm'`) also matched 'gb'.
            lgb_train = lgb.Dataset(X_train, y_train)
            lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train)
            params = {
                'objective': 'binary',
                'metric': 'binary_logloss',
            }
            best = lgb.train(params,
                        lgb_train,
                        valid_sets=[lgb_train, lgb_eval],
                        verbose_eval=0)
            dump(best, model_path)
        else:
            clf = pipeline.fit(X_train, y_train)
            dump(clf, model_path)
            if is_holdout:
                # NOTE(review): refits the in-memory pipeline on the validation
                # split after the train-fitted model was saved; the saved file
                # is unaffected, but `pipelines[...]` now holds the
                # validation-fitted estimator - confirm this is intended.
                clf = pipeline.fit(X_valid, y_valid)
    return X_train, X_valid, y_train, y_valid

Scoring

In [None]:
from joblib import load
def scoring(algorithm_name :str, X, is_predict_proba = False):
    """Load the persisted model for `algorithm_name` and predict on `X`.

    Args:
        algorithm_name: name used in ``<name>_classiffier.joblib`` under
            ``out_put_data_dir``.
        X: feature frame to score.
        is_predict_proba: when True, return the second column of
            ``predict_proba`` (the class at index 1) instead of ``predict``.

    Returns:
        The array returned by the model's ``predict`` or ``predict_proba``.
    """
    # NOTE: joblib.load unpickles the file; only load model files you created.
    clf = load(out_put_data_dir + algorithm_name + '_classiffier.joblib')
    # In Optuna mode the saved object may be whatever `lgb.train` returned
    # (presumably a LightGBM Booster, which offers only `predict` - TODO
    # confirm). Only such objects take the plain `predict` path; previously
    # `is_optuna` made every estimator ignore `is_predict_proba`.
    if is_optuna and not hasattr(clf, 'predict_proba'):
        return clf.predict(X)
    if is_predict_proba:
        return clf.predict_proba(X)[:, 1]
    return clf.predict(X)

In [None]:
def evaluation(scores, X_train, y_train, text, evaluation_function_list, input_evaluation, is_predict_proba):
    """Score every pipeline's saved model on one data set and record the result.

    Args:
        scores: dict updated in place; keys are ``(pipe_name, text)`` tuples.
        X_train: features to score (despite the name, also used for validation data).
        y_train: true labels aligned with ``X_train``.
        text: label of the data set, e.g. 'train' or 'valid'.
        evaluation_function_list: metric name -> metric function.
        input_evaluation: object whose ``.value`` is the selected metric name.
        is_predict_proba: forwarded to ``scoring``.
    """
    metric_name = input_evaluation.value
    metric = evaluation_function_list[metric_name]
    for pipe_name in pipelines:
        predictions = scoring(pipe_name, X_train, is_predict_proba)
        # Only AUC can consume probabilities. Every other metric needs hard
        # labels, so round whenever the metric is not AUC (previously only
        # Accuracy was rounded and F1/Recall/Precision failed on probability
        # output). Rounding leaves 0/1 labels unchanged.
        if metric_name != 'AUC':
            predictions = predictions.round()
        scores[(pipe_name, text)] = metric(y_train, predictions)

In [None]:
# Pick the evaluation metric; later cells read the selection via `input_evaluation.value`.
input_evaluation = function.choice(options_evaluation)

In [None]:
# Load the training data as features X and target y.
X, y = input_train_file(train_file_name, my_dtype)

In [None]:
# Add the date and categorical-encoding features and drop the unused columns.
# Not idempotent: re-running this cell alone would transform X a second time.
X = transform_data(X)

In [None]:
# One-hot encode the categorical columns and keep a copy of the result on disk.
X_ohe = function.one_hot_encoding(X, ohe_columns)
X_ohe.to_csv(out_put_data_dir + "X_ohe.csv", index=False, header=True)
# Report missing values before imputation: grand total first, then per column.
missing_per_column = X_ohe.isnull().sum()
print('欠損個数（数値変数の欠損補完前）', missing_per_column.sum())
print('')
print(missing_per_column)
# Impute; the helper also returns the model's column list.
X_ohe, X_ohe_columns = function.imputation(out_put_data_dir, model_columns_file_name, X_ohe)

Train

In [None]:
# Train and persist every remaining pipeline; the returned splits are those of
# the last pipeline processed.
X_train, X_valid, y_train, y_valid = train_model(X_ohe, y, X_ohe_columns, input_evaluation.value)

CV Score

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
# Text report: starts with the feature-selection setting, then gets one
# section per algorithm appended below.
str_all_print = 'n_features_to_select:' + str(n_features_to_select) + '\n'
if is_k_fold:
    print('評価指標:' + input_evaluation.value)
    str_print = ''
    # 5-fold stratified CV with a fixed shuffle seed so runs are repeatable
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    for pipe_name, est in pipelines.items():
        cv_results = cross_val_score(
            est, X_ohe, y,
            cv=kf,
            scoring=evaluation_list[input_evaluation.value],
        )
        report_lines = [
            '----------',
            'algorithm:' + str(pipe_name),
            'cv_results:' + str(cv_results),
            'avg +- std_dev ' + str(cv_results.mean()) + '+-' + str(cv_results.std()),
        ]
        str_print = '\n'.join(report_lines) + '\n'
        print(str_print)
        str_all_print += str_print
    # Save the whole report to a timestamped text file in the data directory
    import datetime
    cv_report_path = out_put_data_dir + 'cv_results' + '_' + datetime.datetime.now().strftime('%Y%m%d%H%M%S') + '.txt'
    with open(cv_report_path, mode='w') as f:
        f.write(str_all_print)

In [None]:
# Optional: tune LightGBM hyper-parameters with Optuna's stepwise CV tuner.
if is_optuna:
    import optuna.integration.lightgbm as lgb
    from sklearn.model_selection import StratifiedKFold
    dtrain = lgb.Dataset(X_ohe, label=y)
    params = dict(
        objective="binary",
        metric="binary_logloss",
        verbosity=-1,
        boosting_type="gbdt",
    )
    tuner = lgb.LightGBMTunerCV(
        params,
        dtrain,
        verbose_eval=100,
        early_stopping_rounds=100,
        folds=StratifiedKFold(n_splits=3),
    )
    tuner.run()
    print("Best score:", tuner.best_score)
    best_params = tuner.best_params
    print("Best params:", best_params)
    print("  Params: ")
    for key, value in best_params.items():
        print(f"    {key}: {value}")

In [None]:
# Score every saved model on the training data and, with hold-out enabled,
# on the validation data as well; then show the results as a sorted table.
scores = {}
evaluation(scores, X_train, y_train, 'train', evaluation_function_list, input_evaluation, is_predict_proba)
if is_holdout:
    evaluation(scores, X_valid, y_valid, 'valid', evaluation_function_list, input_evaluation, is_predict_proba)
print('評価指標:' + input_evaluation.value)
display(pd.Series(scores).unstack().sort_values(by=['train', 'valid'] if is_holdout else ['train']))

In [None]:
def input_test_file(filename, my_dtype, id_label):
    """Read the test CSV and return it both with and without the id column.

    Args:
        filename: path of the CSV file (first row is the header).
        my_dtype: column -> dtype mapping forwarded to ``pd.read_csv``.
        id_label: name of the identifier column to leave out of the features.

    Returns:
        (df_s, X_s): the full frame with a fresh 0..n-1 index, and the same
        data without the ``id_label`` column.
    """
    loaded = pd.read_csv(filename, header=0, dtype=my_dtype)
    features = loaded.drop(columns=id_label)
    return loaded.reset_index(drop=True), features

In [None]:
# Pick the algorithm used for the final predictions; main() reads `input_algorithm.value`.
input_algorithm = function.choice(options_algorithm)

In [None]:
import datetime
# Loads the persisted artefacts: for a given evaluation metric train_model only
# needs to be run once, after which scoring is done with the selected model.
def main():
    """Score the test data with the selected algorithm and write the results.

    Reads the test CSV, applies the same transformation / one-hot encoding as
    for training, aligns the columns through ``function.preprocessing``,
    predicts with the persisted model and writes the output file. For
    algorithms listed in ``feature_importances_algorithm_list`` the feature
    importances of the in-memory pipeline are additionally saved as a
    timestamped CSV and displayed. The Optuna-trained 'lightgbm' model is
    skipped there because it is not trained through the in-memory pipeline.
    """
    algorithm_name = input_algorithm.value
    df_s, X_s = input_test_file(test_file_name, my_dtype, id_label)
    X_s = transform_data(X_s)
    X_ohe_s = function.one_hot_encoding(X_s, ohe_columns)
    X_predicted = function.preprocessing(out_put_data_dir, model_columns_file_name, algorithm_name, X_ohe, X_ohe_s)
    predict = scoring(algorithm_name, X_predicted, is_predict_proba)
    function.output_file(out_put_data_dir, n_features_to_select, target_label, df_s, id_label, predict, algorithm_name, file_extention, header=is_header)
    print(input_evaluation.value + ' selected')
    print(algorithm_name + ' selected')
    # The original code had a bare `;` as the body of the Optuna branch, which
    # is a SyntaxError; the condition is folded into a single guard instead.
    trained_via_optuna = is_optuna and algorithm_name == 'lightgbm'
    if algorithm_name in feature_importances_algorithm_list and not trained_via_optuna:
        feature_importances = pipelines[algorithm_name]['est'].feature_importances_
        feature_importances = pd.Series(feature_importances, index=X_predicted.columns.values.tolist())
        # Most important feature first
        sorted_feature_importances = sorted(feature_importances.items(), key=lambda x:-x[1])
        sorted_feature_importances = pd.DataFrame(sorted_feature_importances)
        sorted_feature_importances.to_csv(out_put_data_dir + 'feature_importances_' + algorithm_name + '_' + datetime.datetime.now().strftime('%Y%m%d%H%M%S') + '.csv', index=False)
        display(sorted_feature_importances)

In [None]:
# Run the scoring step. In a notebook kernel `__name__` is '__main__', so this
# executes whenever the cell is run.
if __name__ == '__main__':
    main()