<a href="https://colab.research.google.com/github/lightuse/AML/blob/master/Automated_Machine_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# supervised learning
## binary classification

Setting

In [0]:
options_evaluation = ['AUC', 'Recall', 'Precision', 'F1']
options_algorithm = ['knn', 'logistic', 'rsvc', 'lsvc', 'tree', 'rf', 'gb', 'mlp', 'lightgbm']
# カテゴリ変数をリストで設定
ohe_columns = ['Dependents',
               'Gender',
               'Married',
               'Education',
               'Self_Employed',
               'Property_Area']
# カテゴリ変数をobject型で読み込むための準備
my_dtype = {'Dependents':object,
            'Gender':object,
            'Married':object,
            'Education':object,
            'Self_Employed':object,
            'Property_Area':object}
# 表示オプションの変更
import pandas as pd
pd.options.display.max_columns = 50
id_label = "Loan_ID"
train_file_name = './data/av_loan_u6lujuX_CVtuZ9i.csv'
test_file_name = './data/av_loan_test_Y3wMUE5_7gLdaTN.csv'
# ファイル出力拡張子
file_extention = 'csv'
# ホールドアウト有無
is_holdout = True

In [0]:
def input_train_file(filename, my_dtype, id_label):
    df = pd.read_csv(train_file_name,
                    header=0,
                    dtype=my_dtype)
    # データの形式に合わせて適時修正
    X  = df.iloc[:,:-1]
    ID = X.iloc[:,[0]]
    X  = X.drop('Loan_ID', axis=1)
    y  = df.iloc[:,-1]
    class_mapping = {'N':1, 'Y':0}
    y = y.map(class_mapping)
    return X, y

In [0]:
# one-hot encoding
def one_hot(X, ohe_columns):
    X_ohe = pd.get_dummies(X,
                          dummy_na=True,
                          columns=ohe_columns)
    return X_ohe

In [0]:
# imputation
from sklearn.impute import SimpleImputer
def imputation(X_ohe):
    imp = SimpleImputer(strategy='mean')
    imp.fit(X_ohe)
    X_ohe_columns = X_ohe.columns.values
    X_ohe = pd.DataFrame(imp.transform(X_ohe), columns=X_ohe_columns)
    return imp, X_ohe, X_ohe_columns

In [0]:
# train
from sklearn.feature_selection import RFE
import numpy as np
import lightgbm as lgb
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
def train_model(X_train, y_train, score, text, function_evaluation):
    # set pipelines for different algorithms
    pipelines = {
        'knn':
            Pipeline([('scl', StandardScaler()),
                      ('reduct', PCA(random_state=1)),
                      ('est', KNeighborsClassifier())]),
        'logistic':
            Pipeline([('scl', StandardScaler()),
                      ('reduct', PCA(random_state=1)),
                      ('est', LogisticRegression(random_state=1))]),
        'rsvc':
            Pipeline([('scl', StandardScaler()),
                      ('reduct', PCA(random_state=1)),
                      ('est', SVC(C=1.0, kernel='rbf', class_weight='balanced', random_state=1))]),
        'lsvc':
            Pipeline([('scl', StandardScaler()),
                      ('reduct', PCA(random_state=1)),
                      ('est', LinearSVC(C=1.0, class_weight='balanced', random_state=1, max_iter=10000))]),
        'tree':
            Pipeline([('scl', StandardScaler()),
                      ('reduct', PCA(random_state=1)),
                      ('est', DecisionTreeClassifier(random_state=1))]),
        'rf':
            Pipeline([('scl', StandardScaler()),
                      ('reduct', PCA(random_state=1)),
                      ('est', RandomForestClassifier(random_state=1))]),
        'gb':
            Pipeline([('scl', StandardScaler()),
                      ('reduct', PCA(random_state=1)),
                      ('est', GradientBoostingClassifier(random_state=1))]),
        'mlp':
            Pipeline([('scl', StandardScaler()),
                      ('reduct', PCA(random_state=1)),
                      ('est', MLPClassifier(hidden_layer_sizes=(3,3), max_iter=10000, random_state=1))]),
        'lightgbm':
            Pipeline([('scl', StandardScaler()),
                      ('reduct', PCA(random_state=1)),
                      ('est', lgb.LGBMClassifier(random_state=1))])
    }
    from joblib import dump
    # fit & evaluation
    for pipe_name, pipeline in pipelines.items():
        clf = pipeline.fit(X_train, y_train)
        if text == 'train':
            dump(clf, pipe_name + '_classiffier.joblib')
        scores[(pipe_name,text)] = function_evaluation(y_train, pipeline.predict(X_train))

In [0]:
def input_test_file(filename, my_dtype, id_label):
    df_s = pd.read_csv(filename,
                      header=0,
                      dtype=my_dtype)
    ID_s = df_s.iloc[:,[0]]
    X_s  = df_s.drop(id_label, axis=1)
    return df_s, X_s

In [0]:
# preprocessing
def preprocessing(X_s, X_ohe, X_ohe_s, imp, X_ohe_columns):
    cols_model = set(X_ohe.columns.values)
    cols_score = set(X_ohe_s.columns.values)
    diff1 = cols_model - cols_score
    diff2 = cols_score - cols_model
    df1 = pd.DataFrame([[1,2,3]], columns=['c1','c2','c3'])
    df2 = pd.DataFrame([[3,2,1]], columns=['c1','c2','c3'])
    df_all = pd.concat([df1, df2])
    df3 = pd.DataFrame([[0,1,2,3]],columns=['c0','c1','c3','c4'])
    df_all = pd.concat([df_all, df3])
    df_cols_m = pd.DataFrame(None,
                         columns=X_ohe_columns,
                         dtype=float)
    X_ohe_s2 = pd.concat([df_cols_m, X_ohe_s])
    set_Xm = set(X_ohe.columns.values)
    set_Xs = set(X_ohe_s.columns.values)
    X_ohe_s3 = X_ohe_s2.drop(list(set_Xs-set_Xm),axis=1)
    X_ohe_s3.loc[:,list(set_Xm-set_Xs)] = X_ohe_s3.loc[:,list(set_Xm-set_Xs)].fillna(0,axis=1)
    test = pd.DataFrame([[1,2,3]], columns=['c1','c2','c3'])
    test = test.reindex(['c2','c3','c1'], axis=1)
    X_ohe_s3 = X_ohe_s3.reindex(X_ohe.columns.values,axis=1)
    X_ohe_s4 = pd.DataFrame(imp.transform(X_ohe_s3), columns=X_ohe_columns)
    return X_ohe_s4

In [0]:
# holdout
from sklearn.model_selection import train_test_split
def holdout(X_ohe, y):
    X_train, X_test, y_train, y_test = train_test_split(X_ohe,
                                                y,
                                                test_size=0.3,
                                                random_state=1)
    return X_train, X_test, y_train, y_test

In [0]:
from joblib import load
def scoring(algorithm_name, X):
    clf = load(algorithm_name + '_classiffier.joblib')
    return clf.predict(X)

In [0]:
def get_input(x):
    return x

In [0]:
from ipywidgets import interact,interactive,fixed,interact_manual
from IPython.display import display
import ipywidgets as widgets
def choice(options):
    input = get_input(widgets.RadioButtons(options=options))
    display(input)
    return input

In [0]:
import datetime
def output_file(df, id_label, y, model_name, extension, header=True):
    file_name = "submittion_" + model_name + "_" + datetime.datetime.now().strftime('%Y%m%d%H%M%S') + "." + extension
    separator = ','
    if extension == 'tsv':
        separator = '\t'
    pd.concat([df[id_label], pd.DataFrame(y, columns=["y"])], axis=1).to_csv(file_name, index=False, sep=separator, header=header)

In [164]:
input_evaluation = choice(options_evaluation)

RadioButtons(options=('AUC', 'Recall', 'Precision', 'F1'), value='AUC')

In [0]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
function_evaluation = accuracy_score
if input_evaluation.value == 'AUC':
    function_evaluation = accuracy_score
elif input_evaluation.value == 'Recall':
    function_evaluation = recall_score
elif input_evaluation.value == 'Precision':
    function_evaluation = precision_score
elif input_evaluation.value == 'F1':
    function_evaluation = f1_score

In [166]:
X, y = input_train_file(train_file_name, my_dtype, id_label)
X_ohe = one_hot(X, ohe_columns)
imp, X_ohe, X_ohe_columns = imputation(X_ohe)
scores = {}
if is_holdout:
    X_train, X_valid, y_train, y_valid = holdout(X_ohe, y)
    train_model(X_train, y_train, scores, 'train', function_evaluation)
    train_model(X_valid, y_valid, scores, 'valid', function_evaluation)
else:
    train_model(X_train, y_train, scores, 'train', function_evaluation)
# sort score
#sorted_score = sorted(scores.items(), key=lambda x:-x[1])
display(pd.Series(scores).unstack().sort_values(by='train', ascending=False))

Unnamed: 0,train,valid
rf,1.0,1.0
tree,1.0,1.0
gb,0.958042,1.0
lightgbm,0.867133,0.897297
rsvc,0.862471,0.891892
logistic,0.822844,0.843243
mlp,0.818182,0.864865
knn,0.806527,0.8
lsvc,0.771562,0.794595


In [167]:
input_algorithm = choice(options_algorithm)

RadioButtons(options=('knn', 'logistic', 'rsvc', 'lsvc', 'tree', 'rf', 'gb', 'mlp', 'lightgbm'), value='knn')

In [0]:
df_s, X_s = input_test_file(test_file_name, my_dtype, id_label)
X_ohe_s = one_hot(X_s, ohe_columns)
X_predicted = preprocessing(X_s, X_ohe, X_ohe_s, imp, X_ohe_columns)

In [0]:
def main():
    algorithm_name = input_algorithm.value
    predict = scoring(algorithm_name, X_predicted);
    output_file(df_s, id_label, predict, algorithm_name, file_extention, header=False)
    print(algorithm_name + ' selected')
    print(input_evaluation.value + ' selected')

In [170]:
if __name__ == '__main__':
    main()

knn selected
AUC selected
