<a href="https://colab.research.google.com/github/lightuse/AML/blob/master/Automated_Machine_Learning_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# supervised learning
## Regression

Setting

In [0]:
options_evaluation = ['R2', 'MAE:neg_mean_absolute_error',
                      'RMSE:neg_root_mean_squared_error',
                      'MAPE:Mean Absolute Persentage Error',
                      'RMSPE:Root Mean Squared Persentage Error']
options_evaluation = ['RMSE', 'MAE', 'R2']
options_algorithm = ['ols', 'ridge', 'tree', 'rf', 'gbr1', 'gbr2', 'lightgbm']
import numpy as np
import lightgbm as lgb
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression,Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE
# set pipelines for different algorithms
pipelines = {
    'ols': Pipeline([('scl', StandardScaler()),
                      ('reduct', PCA(random_state=1)),
                      ('est', LinearRegression())]),
    'ridge':Pipeline([('scl', StandardScaler()),
                      ('reduct', PCA(random_state=1)),
                      ('est', Ridge(random_state=0))]),
    'tree': Pipeline([('scl', StandardScaler()),
                      ('reduct', PCA(random_state=1)),
                      ('est', DecisionTreeRegressor(random_state=0))]),
    'rf': Pipeline([('scl', StandardScaler()),
                    ('reduct', PCA(random_state=1)),
                    ('est', RandomForestRegressor(random_state=0))]),
    'gbr1': Pipeline([('scl', StandardScaler()),
                      ('reduct', PCA(random_state=1)),
                      ('est', GradientBoostingRegressor(random_state=0))]),
    'gbr2': Pipeline([('scl', StandardScaler()),
                      ('reduct', PCA(random_state=1)),
                      ('est', GradientBoostingRegressor(n_estimators=250, random_state=0))]),
    'lightgbm': Pipeline([('scl', StandardScaler()),
                          ('reduct', PCA(random_state=1)),
                          ('est', lgb.LGBMRegressor(random_state=0))])     
}
# カテゴリ変数をリストで設定
ohe_columns = ['bed_type',
               'cancellation_policy',
               'city',
               'room_type',
               'cleaning_fee',
               #'property_type',
               'instant_bookable',
               'host_identity_verified',
               'host_has_profile_pic']
# カテゴリ変数をobject型で読み込むための準備
my_dtype = {'bed_type':object,
            'cancellation_policy':object,
            'city':object,
            'room_type':object,
            'cleaning_fee':object,
            #'property_type':object,
            'instant_bookable':object,
            'host_identity_verified':object,
            'host_has_profile_pic':object}
# 表示オプションの変更
import pandas as pd
pd.options.display.max_columns = 50
id_label = 'id'
drop_columns = []
train_file_name = 'train.csv'
test_file_name = 'test.csv'
out_put_dir = ''
# ファイル出力拡張子
file_extention = 'csv'
# ホールドアウト有無
is_holdout = True

In [0]:
import pandas as pd
import numpy as np
def input_train_file(filename, my_dtype, id_label):
    # set data by role
    df = pd.read_csv(train_file_name, header=0, dtype=my_dtype)
    # データの形式に合わせて適時修正
    X  = df.iloc[:,:-1]
    X = X.drop(id_label, axis=1)
    ID = X.iloc[:,[0]]
    y  = df.iloc[:,-1]
    return X, y

In [0]:
# one-hot encoding
def one_hot(X, ohe_columns):
    X_ohe = pd.get_dummies(X,
                          dummy_na=True,
                          columns=ohe_columns)
    return X_ohe

In [0]:
# imputation
from sklearn.impute import SimpleImputer
def imputation(X_ohe):
    imp = SimpleImputer(strategy='mean')
    imp.fit(X_ohe)
    X_ohe_columns = X_ohe.columns.values
    X_ohe = pd.DataFrame(imp.transform(X_ohe), columns=X_ohe_columns)
    return imp, X_ohe, X_ohe_columns

In [0]:
# train
from joblib import dump
def train_model(X_train, y_train):
    # fit
    for pipe_name, pipeline in pipelines.items():
        clf = pipeline.fit(X_train, y_train)
        dump(clf, pipe_name + '_regressor.joblib')

In [0]:
def input_test_file(filename, my_dtype, id_label):
    df_s = pd.read_csv(filename,
                      header=0,
                      dtype=my_dtype)
    ID_s = df_s.iloc[:,[0]]
    X_s  = df_s.drop(id_label, axis=1)
    return df_s, X_s

In [0]:
# preprocessing
def preprocessing(X_s, X_ohe, X_ohe_s, imp, X_ohe_columns):
    cols_model = set(X_ohe.columns.values)
    cols_score = set(X_ohe_s.columns.values)
    diff1 = cols_model - cols_score
    diff2 = cols_score - cols_model
    df1 = pd.DataFrame([[1,2,3]], columns=['c1','c2','c3'])
    df2 = pd.DataFrame([[3,2,1]], columns=['c1','c2','c3'])
    df_all = pd.concat([df1, df2])
    df3 = pd.DataFrame([[0,1,2,3]], columns=['c0','c1','c3','c4'])
    df_all = pd.concat([df_all, df3])
    df_cols_m = pd.DataFrame(None, columns=X_ohe_columns, dtype=float)
    X_ohe_s2 = pd.concat([df_cols_m, X_ohe_s])
    set_Xm = set(X_ohe.columns.values)
    set_Xs = set(X_ohe_s.columns.values)
    X_ohe_s3 = X_ohe_s2.drop(list(set_Xs-set_Xm), axis=1)
    X_ohe_s3.loc[:,list(set_Xm-set_Xs)] = X_ohe_s3.loc[:,list(set_Xm-set_Xs)].fillna(0, axis=1)
    test = pd.DataFrame([[1,2,3]], columns=['c1','c2','c3'])
    test = test.reindex(['c2','c3','c1'], axis=1)
    X_ohe_s3 = X_ohe_s3.reindex(X_ohe.columns.values,axis=1)
    X_ohe_s4 = pd.DataFrame(imp.transform(X_ohe_s3), columns=X_ohe_columns)
    return X_ohe_s4

In [0]:
import pandas as pd
import datetime as dt
def transform_data(X):
    X = X.drop('amenities', axis=1)
    X = X.drop('description', axis=1)
    X = X.drop('name', axis=1)
    X = X.drop('neighbourhood', axis=1)
    X = X.drop('thumbnail_url', axis=1)
    X = X.drop('zipcode', axis=1)
    X = X.drop('property_type', axis=1)
    #X = X.drop('room_type', axis=1)
    #X = X.drop('host_since', axis=1)
    X['first_review'] = pd.to_datetime(X['first_review']).map(pd.Timestamp.to_julian_date)
    X['last_review'] = pd.to_datetime(X['last_review']).map(pd.Timestamp.to_julian_date)
    X['host_since'] = pd.to_datetime(X['host_since']).map(pd.Timestamp.to_julian_date)
    X['host_response_rate'] = X['host_response_rate'].str.strip('%')
    return X

In [0]:
# holdout
from sklearn.model_selection import train_test_split
def holdout(X_ohe, y):
    X_train, X_test, y_train, y_test = train_test_split(X_ohe,
                                                y,
                                                test_size=0.3,
                                                random_state=1)
    return X_train, X_test, y_train, y_test

In [0]:
from joblib import load
def scoring(algorithm_name, X):
    clf = load(algorithm_name + '_regressor.joblib')
    return clf.predict(X)

In [0]:
def evaluation(scores, X_train, y_train, text, function_evaluation):
    for pipe_name, pipeline in pipelines.items():
        scores[(pipe_name,text)] = function_evaluation(y_train, scoring(pipe_name, X_train))

In [0]:
def get_input(x):
    return x

In [0]:
from ipywidgets import interact,interactive,fixed,interact_manual
from IPython.display import display
import ipywidgets as widgets
def choice(options):
    input = get_input(widgets.RadioButtons(options=options))
    display(input)
    return input

In [0]:
import datetime
def output_file(df, id_label, y, model_name, extension, header=True):
    file_name = "submittion_" + model_name + "_" + datetime.datetime.now().strftime('%Y%m%d%H%M%S') + "." + extension
    separator = ','
    if extension == 'tsv':
        separator = '\t'
    if id_label != '':
        pd.concat([df[id_label], pd.DataFrame(y, columns=["y"])], axis=1).to_csv(file_name, index=False, sep=separator, header=header)
    else:
        pd.concat([df, pd.DataFrame(y, columns=["y"])], axis=1).to_csv(file_name, index=False, sep=separator, header=header)

In [17]:
X, y = input_train_file(train_file_name, my_dtype, id_label)
X = transform_data(X)
X_ohe = one_hot(X, ohe_columns)
imp, X_ohe, X_ohe_columns = imputation(X_ohe)
if is_holdout:
    X_train, X_valid, y_train, y_valid = holdout(X_ohe, y)
    train_model(X_train, y_train)
    train_model(X_valid, y_valid)
else:
    train_model(X_train, y_train)

欠損個数（数値変数の欠損補完前）:input_train_file 62827
欠損個数（数値変数の欠損補完前1）:hokan 50806


In [18]:
input_evaluation = choice(options_evaluation)

RadioButtons(options=('RMSE', 'MAE', 'R2'), value='RMSE')

In [0]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
function_evaluation = mean_squared_error
if input_evaluation.value == 'RMSE':
    function_evaluation = mean_squared_error
elif input_evaluation.value == 'MAE':
    function_evaluation = mean_absolute_error
elif input_evaluation.value == 'R2':
    function_evaluation = r2_score

In [20]:
scores = {}
if is_holdout:
    evaluation(scores, X_train, y_train, 'train', function_evaluation)
    evaluation(scores, X_valid, y_valid, 'valid', function_evaluation)
else:
    evaluation(scores, X_train, y_train, 'train', function_evaluation)
# sort score
#sorted_score = sorted(scores.items(), key=lambda x:-x[1])
ascending = True
if input_evaluation.value == 'R2':
    ascending = False
display(pd.Series(scores).unstack().sort_values(by='train', ascending=ascending))

Unnamed: 0,train,valid
lightgbm,13597.441826,7888.959
gbr2,13825.065761,9356.464
rf,14080.251321,2042.016
gbr1,14214.259734,11793.2
ols,16116.768526,16226.24
ridge,16318.685603,16377.08
tree,30174.622211,1.341655e-31


In [21]:
input_algorithm = choice(options_algorithm)

RadioButtons(options=('ols', 'ridge', 'tree', 'rf', 'gbr1', 'gbr2', 'lightgbm'), value='ols')

In [22]:
df_s, X_s = input_test_file(test_file_name, my_dtype, id_label)
X_s = hokan(X_s)
X_ohe_s = one_hot(X_s, ohe_columns)
X_predicted = preprocessing(X_s, X_ohe, X_ohe_s, imp, X_ohe_columns)

欠損個数（数値変数の欠損補完前1）:hokan 16892
モデルのみに存在する項目: set()
スコアのみに存在する項目: set()
欠損個数（数値変数の欠損補完前） 50806


In [0]:
def main():
    algorithm_name = input_algorithm.value
    predict = scoring(algorithm_name, X_predicted);
    output_file(df_s, id_label, predict, algorithm_name, file_extention, header=False)
    print(algorithm_name + ' selected')
    print(input_evaluation.value + ' selected')

In [24]:
if __name__ == '__main__':
    main()

ols selected
RMSE selected
