<a href="https://colab.research.google.com/github/lightuse/AML/blob/master/Automated_Machine_Learning_Binary_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# supervised learning
## binary classification

Setting

In [124]:
# set pipelines for different algorithms
from sklearn.pipeline import Pipeline
import numpy as np
import lightgbm as lgb
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.decomposition import PCA
from sklearn.metrics import roc_auc_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
# Algorithm names offered in the final selection widget; they match the keys
# of the `pipelines` dict defined below.
options_algorithm = ['knn', 'logistic', 'rsvc', 'lsvc', 'tree', 'rf', 'gb', 'mlp', 'lightgbm']
# Display name of each metric -> scoring string passed to GridSearchCV.
evaluation_list = {'AUC':'roc_auc',
                   'F1':'f1',
                   'Recall':'recall',
                   'Precision':'precision'}
# Display name of each metric -> sklearn.metrics function used to score the
# saved models after training.
evaluation_function_list = {'AUC':roc_auc_score,
                            'F1':f1_score,
                            'Recall':recall_score,
                            'Precision':precision_score}
# One scikit-learn Pipeline per algorithm name. Every pipeline ends in a step
# called 'est' (the classifier) and contains a 'pca' step; 'tree', 'rf' and
# 'gb' have no 'scl' (StandardScaler) step, the others do.
# All stochastic components are seeded with random_state=1.
pipelines = {
    'knn':
        Pipeline([('scl', StandardScaler()),
                  ('pca', PCA(random_state=1)),
                  ('est', KNeighborsClassifier(n_neighbors=5, p=2, metric='minkowski'))]),
    'logistic':
        Pipeline([('scl', StandardScaler()),
                  ('pca', PCA(random_state=1)),
                  ('est', LogisticRegression(random_state=1))]),
    'rsvc':
        Pipeline([('scl', StandardScaler()),
                  ('pca', PCA(random_state=1)),
                  ('est', SVC(C=1.0, kernel='rbf', class_weight='balanced', random_state=1))]),
    'lsvc':
        Pipeline([('scl', StandardScaler()),
                  ('pca', PCA(random_state=1)),
                  ('est', LinearSVC(C=1.0, class_weight='balanced', random_state=1, max_iter=10000))]),
    'tree':
        Pipeline([('pca', PCA(random_state=1)),
                  ('est', DecisionTreeClassifier(random_state=1))]),
    'rf':
        Pipeline([('pca', PCA(random_state=1)),
                  ('est', RandomForestClassifier(random_state=1))]),
    'gb':
        Pipeline([('pca', PCA(random_state=1)),
                  ('est', GradientBoostingClassifier(random_state=1))]),
    'mlp':
        Pipeline([('scl', StandardScaler()),
                  ('pca', PCA(random_state=1)),
                  ('est', MLPClassifier(hidden_layer_sizes=(3,3), max_iter=10000, random_state=1))]),
    'lightgbm':
        Pipeline([('scl', StandardScaler()),
                  ('pca', PCA(random_state=1)),
                  ('est', lgb.LGBMClassifier(random_state=1))])
}
# Pipelines whose features are selected by RFE driven by the random forest
# estimator (see train_model) instead of by their own estimator.
feature_selection_rf_list = ['knn',
                             'logistic',
                             'rsvc',
                             'lsvc',
                             'mlp']
# Pipelines whose final estimator exposes `feature_importances_`; main()
# displays the importances for these.
feature_importances_algorithm_list = ['tree',
                                       'rf',
                                       'gb',
                                       'lightgbm']
# Pipelines that train_model wraps in GridSearchCV; each name must have an
# entry in `tuning_prarameter` below.
tuning_prarameter_list = ['knn',
                          'logistic']
# Parameter grids for GridSearchCV, keyed by pipeline name.
# Grid keys use the '<step>__<parameter>' form of the pipeline steps.
tuning_prarameter = {
    'knn':{
        'est__n_neighbors':[1,2,3,4,5,],
        'est__weights':['uniform','distance'],
        'est__algorithm':['auto','ball_tree','kd_tree','brute'],
        'est__leaf_size':[1,10,20,30,40,50],
        'est__p':[1,2]
    },
    'logistic':{
        'pca__n_components':[5,7,9],
        'est__C':[0.1,1.0,10.0,100.0]
    }
}
# Categorical columns to one-hot encode.
ohe_columns = ['Dependents',
               'Gender',
               'Married',
               'Education',
               'Self_Employed',
               'Property_Area']
# dtype overrides so the categorical columns are read as object dtype.
my_dtype = {'Dependents':object,
            'Gender':object,
            'Married':object,
            'Education':object,
            'Self_Employed':object,
            'Property_Area':object}
# Display options.
import pandas as pd
pd.options.display.max_columns = 50
# Name of the identifier column (dropped from the features, kept for output).
id_label = "Loan_ID"
# Input files, relative to the notebook's working directory.
train_file_name = './data/av_loan_u6lujuX_CVtuZ9i.csv'
test_file_name = './data/av_loan_test_Y3wMUE5_7gLdaTN.csv'
# Number of features RFE keeps in feature_selection().
n_features_to_select = 10
# Extension of the output file ('tsv' switches the separator to a tab).
file_extention = 'csv'
# Whether to hold out part of the training data for validation.
is_holdout = True
# Whether the output file contains predict_proba scores instead of labels.
is_predict_proba = True
# Filled by train_model: pipeline name -> fitted RFE selector.
dict_selector = {}

In [125]:
def input_train_file(filename, my_dtype, id_label):
    """Load the training CSV and split it into features and a binary target.

    Parameters
    ----------
    filename : str
        Path of the training CSV (header on the first row).
    my_dtype : dict
        Column-name -> dtype mapping forwarded to ``pd.read_csv``.
    id_label : str
        Name of the identifier column; it is dropped from the features.

    Returns
    -------
    X : pandas.DataFrame
        Every column except the last one and ``id_label``.
    y : pandas.Series
        The last column mapped 'N' -> 1 and 'Y' -> 0 (any other value
        becomes NaN).
    """
    # Read the file the caller asked for, not the module-level
    # ``train_file_name`` global.
    df = pd.read_csv(filename, header=0, dtype=my_dtype)
    # Layout assumption: the target is the last column. Adjust to the data
    # format when reusing this loader on another dataset.
    X = df.iloc[:, :-1].drop(id_label, axis=1)
    y = df.iloc[:, -1].map({'N': 1, 'Y': 0})
    return X, y

One-hot encoding

In [126]:
import numpy as np
# one-hot encoding
def one_hot_encoding(X, ohe_columns):
    """One-hot encode the given columns and record the resulting layout.

    ``ohe_columns`` are expanded with ``pd.get_dummies``; ``dummy_na=True``
    adds an extra indicator column per encoded column for missing values.
    As a side effect the encoded column names are saved with ``np.save``
    under the name 'one_hot_encoding' in the current working directory,
    overwriting any previous save.

    Returns the encoded DataFrame.
    """
    encoded = pd.get_dummies(X, columns=ohe_columns, dummy_na=True)
    column_names = np.array(encoded.columns.values)
    np.save('one_hot_encoding', column_names)
    return encoded

Null imputation

In [127]:
from sklearn.impute import SimpleImputer
# imputation
def imputation(X_ohe):
    """Fill missing values with per-column means.

    Fits a ``SimpleImputer(strategy='mean')`` on ``X_ohe`` and applies it to
    the same frame.

    Returns
    -------
    imp : SimpleImputer
        The fitted imputer, for reuse on scoring data.
    X_ohe : pandas.DataFrame
        Imputed copy of the input, rebuilt with the original column names
        (the index is a fresh default index).
    X_ohe_columns : numpy.ndarray
        Column names of the input frame.
    """
    X_ohe_columns = X_ohe.columns.values
    imp = SimpleImputer(strategy='mean').fit(X_ohe)
    filled = pd.DataFrame(imp.transform(X_ohe), columns=X_ohe_columns)
    return imp, filled, X_ohe_columns

Feature selection

In [128]:
from sklearn.feature_selection import RFE
# feature selection
def feature_selection(X_ohe, y, X_ohe_columns, estimator):
    """Select features with recursive feature elimination (RFE).

    RFE is driven by ``estimator`` and keeps the module-level
    ``n_features_to_select`` features, using ``step=0.05``.

    Parameters
    ----------
    X_ohe : pandas.DataFrame
        Feature matrix.
    y : array-like
        Target values.
    X_ohe_columns : numpy.ndarray
        Column names of ``X_ohe``; indexed with the selector's boolean mask.
    estimator : estimator object
        Estimator RFE uses to rank the features.

    Returns
    -------
    (X_fin, selector) : the frame restricted to the selected columns and the
    fitted RFE object.
    """
    selector = RFE(estimator, n_features_to_select=n_features_to_select, step=0.05)
    selector.fit(X_ohe, y)
    kept_columns = X_ohe_columns[selector.support_]
    return X_ohe.loc[:, kept_columns], selector

Holdout

In [129]:
from sklearn.model_selection import train_test_split
# holdout
def holdout(X_ohe, y):
    """Split features and target into train and validation parts.

    30% of the rows go to the second (validation) part; the split is seeded
    with ``random_state=1``.

    Returns ``(X_train, X_test, y_train, y_test)``.
    """
    parts = train_test_split(X_ohe, y, test_size=0.3, random_state=1)
    return tuple(parts)

Modeling

In [130]:
from joblib import dump
from sklearn.model_selection import GridSearchCV
# train
def train_model(X, y, X_ohe_columns, evaluation, dict_selector):
    """Select features, fit and persist every pipeline in ``pipelines``.

    For each pipeline:

    1. Features are selected with RFE. Pipelines listed in
       ``feature_selection_rf_list`` use the random forest estimator for the
       ranking; the others use their own final estimator.
    2. The fitted selector is stored in ``dict_selector[pipe_name]``.
    3. If ``is_holdout`` is set, the selected data is split with
       ``holdout``; otherwise all rows are used for fitting.
    4. Pipelines listed in ``tuning_prarameter_list`` are fitted through
       ``GridSearchCV`` (3-fold, scored with ``evaluation_list[evaluation]``);
       the others are fitted directly.
    5. The fitted model is dumped to '<pipe_name>_classiffier.joblib'.

    Parameters
    ----------
    X : pandas.DataFrame
        Encoded and imputed feature matrix.
    y : pandas.Series
        Target values.
    X_ohe_columns : numpy.ndarray
        Column names of ``X``.
    evaluation : str
        Key of ``evaluation_list`` naming the metric used for tuning.
    dict_selector : dict
        Mutated in place: pipeline name -> fitted RFE selector.

    Returns
    -------
    (X_train, X_valid, y_train, y_valid) from the LAST pipeline processed.
    ``X_valid`` and ``y_valid`` are None when ``is_holdout`` is False, and
    all four are None when ``pipelines`` is empty.

    NOTE(review): the returned frames carry the feature subset chosen for
    the last pipeline only, while each model was fitted on its own subset;
    callers scoring every model on these frames should confirm that is
    intended. Feature selection also sees the full data before the holdout
    split.
    """
    # Defined up front so the return statement is valid even when
    # `pipelines` is empty.
    X_train = X_valid = y_train = y_valid = None

    for pipe_name, pipeline in pipelines.items():
        if pipe_name in feature_selection_rf_list:
            ranking_estimator = pipelines['rf'].named_steps['est']
        else:
            ranking_estimator = pipeline.named_steps['est']
        X_featured, selector = feature_selection(X, y, X_ohe_columns, ranking_estimator)
        dict_selector[pipe_name] = selector

        if is_holdout:
            X_train, X_valid, y_train, y_valid = holdout(X_featured, y)
        else:
            # No validation part. Use explicit None rather than IPython's
            # `_` (last cell output), which is hidden notebook state.
            X_train, X_valid, y_train, y_valid = X_featured, None, y, None

        if pipe_name in tuning_prarameter_list:
            model = GridSearchCV(estimator=pipeline,
                                 param_grid=tuning_prarameter[pipe_name],
                                 scoring=evaluation_list[evaluation],
                                 cv=3,
                                 return_train_score=False)
        else:
            model = pipeline
        # Fit on the training part only; the model is never refitted on the
        # validation data.
        model.fit(X_train, y_train)
        dump(model, pipe_name + '_classiffier.joblib')

    return X_train, X_valid, y_train, y_valid

In [131]:
def input_test_file(filename, my_dtype, id_label):
    """Load the scoring CSV.

    Parameters
    ----------
    filename : str
        Path of the CSV to score (header on the first row).
    my_dtype : dict
        Column-name -> dtype mapping forwarded to ``pd.read_csv``.
    id_label : str
        Name of the identifier column.

    Returns
    -------
    df_s : pandas.DataFrame
        The file as read, including the identifier column.
    X_s : pandas.DataFrame
        ``df_s`` without the ``id_label`` column.
    """
    df_s = pd.read_csv(filename, header=0, dtype=my_dtype)
    X_s = df_s.drop(id_label, axis=1)
    return df_s, X_s

Preprocessing

In [132]:
# preprocessing
def preprocessing(X_s, X_ohe, X_ohe_s, imp, X_ohe_columns, selector):
    """Bring encoded scoring data into the layout the model was trained on.

    Parameters
    ----------
    X_s : pandas.DataFrame
        Raw scoring features. Unused; kept for interface compatibility.
    X_ohe : pandas.DataFrame
        Encoded training features; only its column labels are used.
    X_ohe_s : pandas.DataFrame
        Encoded scoring features.
    imp : fitted imputer
        Object whose ``transform`` fills missing values.
    X_ohe_columns : numpy.ndarray
        Training column names, in training order.
    selector : fitted selector
        Object whose boolean ``support_`` mask marks the selected columns.

    Returns
    -------
    pandas.DataFrame
        Imputed scoring data restricted to the selected columns.
    """
    # Reindexing on the training columns does three things at once: it drops
    # columns that were not seen during training, adds training columns that
    # are missing from the scoring data (filled with 0), and puts everything
    # in training order. NaNs in columns present on both sides are left
    # untouched so the imputer can fill them.
    X_aligned = X_ohe_s.reindex(columns=X_ohe.columns.values, fill_value=0)
    X_imputed = pd.DataFrame(imp.transform(X_aligned), columns=X_ohe_columns)
    return X_imputed.loc[:, X_ohe_columns[selector.support_]]

Scoring

In [133]:
from joblib import load
def scoring(algorithm_name :str, X, is_predict_proba = False):
    """Load a saved model and score ``X`` with it.

    Parameters
    ----------
    algorithm_name : str
        Pipeline name; the model is read from
        '<algorithm_name>_classiffier.joblib'.
    X : pandas.DataFrame
        Features laid out as the model expects.
    is_predict_proba : bool, default False
        When True return a continuous score per row instead of a label.

    Returns
    -------
    numpy.ndarray
        * ``is_predict_proba`` False: predicted labels.
        * ``is_predict_proba`` True: second column of ``predict_proba``
          when the model provides it; otherwise the ``decision_function``
          output (a margin, not a probability); otherwise predicted labels.
    """
    # SECURITY: joblib files are pickle-based and execute code on load.
    # Only load files written by this notebook, never untrusted ones.
    clf = load(algorithm_name + '_classiffier.joblib')
    if not is_predict_proba:
        return clf.predict(X)
    if hasattr(clf, 'predict_proba'):
        return clf.predict_proba(X)[:, 1]
    # Some estimators in `pipelines` (LinearSVC, SVC without
    # probability=True) have no predict_proba; fall back to their margin
    # instead of raising AttributeError.
    if hasattr(clf, 'decision_function'):
        return clf.decision_function(X)
    return clf.predict(X)

In [134]:
def evaluation(scores, X_train, y_train, text, function_evaluation):
    """Score every saved model on one dataset and record the results.

    For each pipeline name in ``pipelines`` the saved model is loaded through
    ``scoring`` (label predictions) and compared with ``y_train`` using
    ``function_evaluation(y_true, y_pred)``. The result is stored in
    ``scores`` under the key ``(pipe_name, text)``; nothing is returned.

    NOTE(review): every model is scored on the same ``X_train`` frame;
    confirm its columns match what each model was fitted on.
    """
    for pipe_name in pipelines:
        predicted = scoring(pipe_name, X_train)
        scores[(pipe_name, text)] = function_evaluation(y_train, predicted)

In [135]:
def get_input(x):
    """Identity hook: hand back ``x`` unchanged."""
    value = x
    return value

In [136]:
from ipywidgets import interact, interactive,fixed, interact_manual
from IPython.display import display
import ipywidgets as widgets
def choice(options):
    """Show a radio-button widget for ``options`` and return the widget.

    The widget (not the selected value) is returned; read its ``.value``
    later to get the current selection.
    """
    radio_buttons = get_input(widgets.RadioButtons(options=options))
    display(radio_buttons)
    return radio_buttons

In [137]:
import datetime
def output_file(df, id_label :str, y :float, model_name :str, extension :str, header=True):
    """Write the identifiers and predictions to a timestamped file.

    The file is created in the current working directory and named
    'submittion_<model_name>_<YYYYmmddHHMMSS>.<extension>'. It holds two
    columns: ``df[id_label]`` and the predictions ``y`` (column name 'y').
    The separator is a tab when ``extension`` is 'tsv' and a comma
    otherwise; ``header`` controls whether the header row is written.
    """
    timestamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
    file_name = 'submittion_' + model_name + '_' + timestamp + '.' + extension
    separator = '\t' if extension == 'tsv' else ','
    predictions = pd.DataFrame(y, columns=['y'])
    submission = pd.concat([df[id_label], predictions], axis=1)
    submission.to_csv(file_name, index=False, sep=separator, header=header)

In [138]:
# Radio buttons for the evaluation metric; later cells read
# `input_evaluation.value`, so pick the metric before running them.
input_evaluation = choice(evaluation_list.keys())

RadioButtons(options=('AUC', 'F1', 'Recall', 'Precision'), value='AUC')

In [139]:
# Training run: load -> one-hot encode -> impute -> select features, fit and
# save every pipeline.
X, y = input_train_file(train_file_name, my_dtype, id_label)
X_ohe = one_hot_encoding(X, ohe_columns)
# X_ohe is rebound to its imputed version here.
imp, X_ohe, X_ohe_columns = imputation(X_ohe)
# Also fills `dict_selector` with one fitted RFE selector per pipeline.
X_train, X_valid, y_train, y_valid = train_model(X_ohe, y, X_ohe_columns, input_evaluation.value, dict_selector)

In [140]:
# Score every saved model with the selected metric: always on the training
# part, and on the validation part too when a holdout split was made.
scores = {}
evaluation(scores, X_train, y_train, 'train', evaluation_function_list[input_evaluation.value])
if is_holdout:
    evaluation(scores, X_valid, y_valid, 'valid', evaluation_function_list[input_evaluation.value])
# One row per model, one column per dataset, best training score first.
display(pd.Series(scores).unstack().sort_values(by='train', ascending=False))

Unnamed: 0,train,valid
lightgbm,1.0,0.705843
tree,0.772914,0.674379
rf,0.761092,0.672924
gb,0.736846,0.677353
knn,0.721874,0.652631
rsvc,0.721412,0.673057
mlp,0.716802,0.696854
logistic,0.715124,0.684625
lsvc,0.702956,0.693747


In [141]:
# Radio buttons for the algorithm used to score the test file; main() reads
# `input_algorithm.value`.
input_algorithm = choice(options_algorithm)

RadioButtons(options=('knn', 'logistic', 'rsvc', 'lsvc', 'tree', 'rf', 'gb', 'mlp', 'lightgbm'), value='knn')

In [142]:
def main():
    """Score the test file with the selected algorithm and write the result.

    Steps: read the test CSV, one-hot encode it, align it with the training
    layout (impute and apply the selector stored for the chosen algorithm),
    score it with the saved model and write the output file without a
    header row.

    For algorithms listed in ``feature_importances_algorithm_list`` the
    importances of the fitted estimator are displayed, largest first.

    Relies on module-level state produced by the earlier cells
    (``input_algorithm``, ``X_ohe``, ``imp``, ``X_ohe_columns``,
    ``dict_selector`` and the fitted ``pipelines``).
    """
    algorithm_name = input_algorithm.value
    df_s, X_s = input_test_file(test_file_name, my_dtype, id_label)
    X_ohe_s = one_hot_encoding(X_s, ohe_columns)
    X_predicted = preprocessing(X_s, X_ohe, X_ohe_s, imp, X_ohe_columns, dict_selector[algorithm_name])
    predict = scoring(algorithm_name, X_predicted, is_predict_proba)
    output_file(df_s, id_label, predict, algorithm_name, file_extention, header=False)

    if algorithm_name in feature_importances_algorithm_list:
        steps = pipelines[algorithm_name].named_steps
        feature_importances = steps['est'].feature_importances_
        if 'pca' in steps:
            # The estimator is fitted on the output of the 'pca' step, so its
            # importances describe principal components, not the original
            # input columns. Label them accordingly.
            labels = ['PC%d' % (i + 1) for i in range(len(feature_importances))]
        else:
            labels = X_predicted.columns.values.tolist()
        feature_importances = pd.Series(feature_importances, index=labels)
        sorted_feature_importances = sorted(feature_importances.items(), key=lambda item: -item[1])
        display(pd.DataFrame(sorted_feature_importances))

In [143]:
# Entry point: run the scoring step.
if __name__ == '__main__':
    main()