In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture

import catboost
import xgboost
import lightgbm
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.neighbors import KNeighborsRegressor

import optuna

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


### データの前処理

In [None]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
sample_submission = pd.read_csv("sample_submission.csv")

# trainから id = 170514を削除する (外れ値)
train = train[train["id"] != 170514]
train = train.reset_index(drop = True)

# 説明変数と目的変数の分離
# 列の抜き出し
features = train.columns
features = features.drop(["id", "target"])

# 分離
train_target = train["target"]
train_id = train["id"]
test_id = test["id"]
train = train[features]
test = test[features]

In [4]:
# KMeans法によるラベリング(特徴量エンジニアリング)
km = KMeans(n_clusters=2, random_state=0)
km.fit(train)
km_train = pd.DataFrame(km.predict(train), columns = ["km_cluster"])
km_test = pd.DataFrame(km.predict(test), columns = ["km_cluster"])

# 結合
train = pd.concat([train, km_train], axis = 1)
test = pd.concat([test, km_test], axis = 1)

In [5]:
## 決定木系モデル用のデータを作成
train_dt = train.copy()
test_dt = test.copy()

# 混合ガウス分布による特徴量のラベリング(特徴量エンジニアリング)
def get_gmm_class_feature(feat, train_df, test_df, n):
    gmm = GaussianMixture(n_components=n, random_state=1234)
    gmm.fit(train_df[feat].values.reshape(-1, 1))
    train_df[f'{feat}_class'] = gmm.predict(train_df[feat].values.reshape(-1, 1))
    test_df[f'{feat}_class'] = gmm.predict(test_df[feat].values.reshape(-1, 1))

get_gmm_class_feature('cont1', train_dt, test_dt, 4)
get_gmm_class_feature('cont2', train_dt, test_dt, 10)
get_gmm_class_feature('cont3', train_dt, test_dt, 6)
get_gmm_class_feature('cont4', train_dt, test_dt, 4)
get_gmm_class_feature('cont5', train_dt, test_dt, 3)
get_gmm_class_feature('cont6', train_dt, test_dt, 2)
get_gmm_class_feature('cont7', train_dt, test_dt, 3)
get_gmm_class_feature('cont8', train_dt, test_dt, 4)
get_gmm_class_feature('cont9', train_dt, test_dt, 4)
get_gmm_class_feature('cont10', train_dt, test_dt, 8)
get_gmm_class_feature('cont11', train_dt, test_dt, 5)
get_gmm_class_feature('cont12', train_dt, test_dt, 4)
get_gmm_class_feature('cont13', train_dt, test_dt, 6)
get_gmm_class_feature('cont14', train_dt, test_dt, 6)    

In [6]:
# 線形回帰、K近傍法用にデータを加工する
train_l = train_dt.copy()
test_l = test_dt.copy()

# ダミー変数化
for i in range(1,15):
    temp = pd.get_dummies(train_l["cont{}_class".format(str(i))], drop_first = True)
    train_l = pd.concat([train_l, temp], axis = 1)
    del train_l["cont{}_class".format(str(i))]

for i in range(1,15):
    temp = pd.get_dummies(test_l["cont{}_class".format(str(i))], drop_first = True)
    test_l = pd.concat([test_l, temp], axis = 1)
    del test_l["cont{}_class".format(str(i))]

In [7]:
# train_test_split
X_train_dt, X_val_dt, y_train_dt, y_val_dt = train_test_split(train_dt, train_target, test_size = 0.25, random_state = 1234)
X_train_l, X_val_l, y_train_l, y_val_l = train_test_split(train_l, train_target, test_size = 0.25, random_state = 1234)

### CatBoostのハイパーパラメーターチューニング

In [50]:
def objective_cat(trial, X_train = X_train_dt, y_train = y_train_dt, X_val = X_val_dt, y_val = y_val_dt):
    
    params = {
              "learning_rate" : trial.suggest_categorical("learning_rate", [0.01, 0.03, 0.05, 0.1, 0.15, 0.2]),
              "n_estimators" : trial.suggest_categorical("n_estimators", [100, 500, 1000, 2000, 3000, 4000]),
              "max_depth" : trial.suggest_int("max_depth", 5,16),
              "random_state" : 1234
    }

    model = catboost.CatBoostRegressor(**params)
    model.fit(X_train, y_train, eval_set = [(X_val, y_val)], early_stopping_rounds = 100, verbose = False)

    preds = model.predict(X_val)

    rmse = mean_squared_error(y_val, preds, squared = False)

    return rmse

In [51]:
study = optuna.create_study(direction='minimize')
study.optimize(objective_cat, n_trials=10)

[I 2021-01-23 15:54:20,219] Trial 0 finished with value: 0.7055257106294528 and parameters: {'learning_rate': 0.2, 'n_estimators': 4000, 'max_depth': 13}. Best is trial 0 with value: 0.7055257106294528.
[I 2021-01-23 15:54:53,560] Trial 1 finished with value: 0.7036941201692 and parameters: {'learning_rate': 0.15, 'n_estimators': 4000, 'max_depth': 13}. Best is trial 1 with value: 0.7036941201692.
[I 2021-01-23 15:55:05,355] Trial 2 finished with value: 0.6988152020458548 and parameters: {'learning_rate': 0.05, 'n_estimators': 500, 'max_depth': 9}. Best is trial 2 with value: 0.6988152020458548.
[I 2021-01-23 15:56:12,249] Trial 3 finished with value: 0.6979947963790938 and parameters: {'learning_rate': 0.05, 'n_estimators': 3000, 'max_depth': 11}. Best is trial 3 with value: 0.6979947963790938.
[I 2021-01-23 15:57:01,971] Trial 4 finished with value: 0.6972927606997429 and parameters: {'learning_rate': 0.05, 'n_estimators': 1000, 'max_depth': 10}. Best is trial 4 with value: 0.6972927

### XGBoostのハイパーパラメーターチューニング

In [None]:
def objective_xgb(trial, X_train = X_train_dt, y_train = y_train_dt, X_val = X_val_dt, y_val = y_val_dt):
    
    params = {
              "learning_rate" : trial.suggest_categorical("learning_rate", [0.01, 0.03, 0.05, 0.1, 0.15, 0.2]),
              "n_estimators" : trial.suggest_categorical("n_estimators", [100, 500, 1000, 2000, 3000, 4000]),
              "max_depth" : trial.suggest_int("max_depth", 5, 30),
              "alpha" : trial.suggest_uniform("alpha", 0, 10),
              "random_state" : 1234
    }

    model = xgboost.XGBRegressor(**params)
    model.fit(X_train, y_train, eval_set = [(X_val, y_val)], early_stopping_rounds = 100, verbose = False)

    preds = model.predict(X_val)

    rmse = mean_squared_error(y_val, preds, squared = False)

    return rmse

In [None]:
study = optuna.create_study(direction='minimize')
study.optimize(objective_xgb, n_trials=10)

### LightGBMのハイパーパラメーターチューニング

In [29]:
def objective_lgbm(trial, X_train = X_train_dt, y_train = y_train_dt, X_val = X_val_dt, y_val = y_val_dt):
    
    params = {
              "learning_rate" : trial.suggest_categorical("learning_rate", [0.01, 0.03, 0.05, 0.1, 0.15, 0.2]),
              "n_estimators" : trial.suggest_categorical("n_estimators", [100, 500, 1000, 2000, 3000, 4000]),
              "max_depth" : trial.suggest_int("max_depth", 5, 30),
              "alpha" : trial.suggest_uniform("alpha", 0, 10),
              "random_state" : 1234,
              "metric":"root_mean_squared_error",
    }

    model = lightgbm.LGBMRegressor(**params)
    model.fit(X_train, y_train, eval_set = [(X_val, y_val)], early_stopping_rounds = 100, verbose = False)

    preds = model.predict(X_val)

    rmse = mean_squared_error(y_val, preds, squared = False)

    return rmse

In [30]:
study = optuna.create_study(direction='minimize')
study.optimize(objective_lgbm, n_trials=10)

[I 2021-01-22 19:29:46,039] Trial 0 finished with value: 0.6974868774145178 and parameters: {'learning_rate': 0.05, 'n_estimators': 4000, 'max_depth': 30, 'alpha': 2.921690167332931}. Best is trial 0 with value: 0.6974868774145178.
[I 2021-01-22 19:29:53,833] Trial 1 finished with value: 0.6980852122646505 and parameters: {'learning_rate': 0.03, 'n_estimators': 1000, 'max_depth': 13, 'alpha': 8.97815542887104}. Best is trial 0 with value: 0.6974868774145178.
[I 2021-01-22 19:29:55,175] Trial 2 finished with value: 0.7009180454213799 and parameters: {'learning_rate': 0.1, 'n_estimators': 100, 'max_depth': 21, 'alpha': 8.340898003363415}. Best is trial 0 with value: 0.6974868774145178.
[I 2021-01-22 19:29:58,850] Trial 3 finished with value: 0.6981740435318077 and parameters: {'learning_rate': 0.1, 'n_estimators': 500, 'max_depth': 20, 'alpha': 0.26926869611194393}. Best is trial 0 with value: 0.6974868774145178.
[I 2021-01-22 19:29:59,800] Trial 4 finished with value: 0.7007079070352253

### RandomForestのハイパーパラメーターチューニング

In [37]:
def objective_rf(trial, X_train = X_train_dt, y_train = y_train_dt, X_val = X_val_dt, y_val = y_val_dt):
    params = {
              "n_estimators" : trial.suggest_categorical("n_estimators", [100, 500, 1000]),
              "max_depth" : trial.suggest_int("max_depth", 5, 30),
              "random_state":1234
    }

    model = RandomForestRegressor(**params)
    model.fit(X_train, y_train)

    preds = model.predict(X_val)

    rmse = mean_squared_error(y_val, preds, squared = False)

    return rmse

In [38]:
study = optuna.create_study(direction='minimize')
study.optimize(objective_rf, n_trials=10)

[I 2021-01-23 01:18:32,238] Trial 0 finished with value: 0.7028004404296736 and parameters: {'n_estimators': 4000, 'max_depth': 29}. Best is trial 0 with value: 0.7028004404296736.
[I 2021-01-23 03:54:20,432] Trial 1 finished with value: 0.704466528401772 and parameters: {'n_estimators': 3000, 'max_depth': 15}. Best is trial 0 with value: 0.7028004404296736.
[I 2021-01-23 05:49:52,766] Trial 2 finished with value: 0.7093919895180603 and parameters: {'n_estimators': 3000, 'max_depth': 10}. Best is trial 0 with value: 0.7028004404296736.
[I 2021-01-23 07:03:33,536] Trial 3 finished with value: 0.703145226993087 and parameters: {'n_estimators': 1000, 'max_depth': 30}. Best is trial 0 with value: 0.7028004404296736.
[I 2021-01-23 07:10:54,113] Trial 4 finished with value: 0.7060202677296563 and parameters: {'n_estimators': 100, 'max_depth': 30}. Best is trial 0 with value: 0.7028004404296736.
[I 2021-01-23 08:36:31,734] Trial 5 finished with value: 0.7151406373192797 and parameters: {'n_es

### Ridge回帰のハイパーパラメーターチューニング

In [9]:
alphas = np.arange(0, 100, 0.1)
results = []
for i in alphas:
    ri = Ridge(alpha = i)
    ri.fit(X_train_l, y_train_l)
    y_pred = ri.predict(X_val_l)
    result = np.sqrt(mean_squared_error(y_pred, y_val_l))
    print("alpha{} : ".format(i), result)
    results.append(result)
    
print("Best alpha : "+str(np.argmin(results)/10))

alpha0.0 :  0.7175550145346463
alpha0.1 :  0.7175543341602432
alpha0.2 :  0.7175536656022568
alpha0.30000000000000004 :  0.7175530086647104
alpha0.4 :  0.7175523631556658
alpha0.5 :  0.7175517288871229
alpha0.6000000000000001 :  0.717551105674925
alpha0.7000000000000001 :  0.7175504933386635
alpha0.8 :  0.7175498917015884
alpha0.9 :  0.7175493005905181
alpha1.0 :  0.7175487198357551
alpha1.1 :  0.717548149271001
alpha1.2000000000000002 :  0.717547588733276
alpha1.3 :  0.7175470380628396
alpha1.4000000000000001 :  0.7175464971031136
alpha1.5 :  0.7175459657006076
alpha1.6 :  0.7175454437048461
alpha1.7000000000000002 :  0.7175449309682976
alpha1.8 :  0.7175444273463064
alpha1.9000000000000001 :  0.7175439326970245
alpha2.0 :  0.7175434468813481
alpha2.1 :  0.717542969762852
alpha2.2 :  0.7175425012077301
alpha2.3000000000000003 :  0.7175420410847341
alpha2.4000000000000004 :  0.7175415892651149
alpha2.5 :  0.7175411456225662
alpha2.6 :  0.717540710033169
alpha2.7 :  0.7175402823753375
a

alpha22.400000000000002 :  0.7175311206244785
alpha22.5 :  0.7175312527142386
alpha22.6 :  0.7175313855934
alpha22.700000000000003 :  0.7175315192544742
alpha22.8 :  0.7175316536900492
alpha22.900000000000002 :  0.7175317888927903
alpha23.0 :  0.7175319248554388
alpha23.1 :  0.7175320615708097
alpha23.200000000000003 :  0.717532199031793
alpha23.3 :  0.7175323372313505
alpha23.400000000000002 :  0.7175324761625166
alpha23.5 :  0.7175326158183968
alpha23.6 :  0.7175327561921664
alpha23.700000000000003 :  0.7175328972770703
alpha23.8 :  0.7175330390664216
alpha23.900000000000002 :  0.7175331815536015
alpha24.0 :  0.7175333247320574
alpha24.1 :  0.717533468595303
alpha24.200000000000003 :  0.7175336131369172
alpha24.3 :  0.717533758350543
alpha24.400000000000002 :  0.7175339042298873
alpha24.5 :  0.7175340507687196
alpha24.6 :  0.7175341979608711
alpha24.700000000000003 :  0.7175343458002353
alpha24.8 :  0.7175344942807652
alpha24.900000000000002 :  0.7175346433964741
alpha25.0 :  0.71753

alpha44.7 :  0.7175714643087159
alpha44.800000000000004 :  0.7175716697239259
alpha44.900000000000006 :  0.7175718752201111
alpha45.0 :  0.7175720807961605
alpha45.1 :  0.7175722864509718
alpha45.2 :  0.7175724921834503
alpha45.300000000000004 :  0.7175726979925093
alpha45.400000000000006 :  0.7175729038770706
alpha45.5 :  0.717573109836063
alpha45.6 :  0.7175733158684235
alpha45.7 :  0.7175735219730967
alpha45.800000000000004 :  0.7175737281490351
alpha45.900000000000006 :  0.7175739343951986
alpha46.0 :  0.7175741407105547
alpha46.1 :  0.717574347094078
alpha46.2 :  0.7175745535447512
alpha46.300000000000004 :  0.7175747600615638
alpha46.400000000000006 :  0.7175749666435127
alpha46.5 :  0.7175751732896024
alpha46.6 :  0.7175753799988439
alpha46.7 :  0.7175755867702561
alpha46.800000000000004 :  0.7175757936028643
alpha46.900000000000006 :  0.7175760004957011
alpha47.0 :  0.717576207447806
alpha47.1 :  0.7175764144582255
alpha47.2 :  0.7175766215260129
alpha47.300000000000004 :  0.71

alpha66.7 :  0.7176172302551797
alpha66.8 :  0.7176174364992796
alpha66.9 :  0.7176176427000437
alpha67.0 :  0.7176178488572347
alpha67.10000000000001 :  0.7176180549706166
alpha67.2 :  0.7176182610399559
alpha67.3 :  0.7176184670650199
alpha67.4 :  0.7176186730455784
alpha67.5 :  0.7176188789814021
alpha67.60000000000001 :  0.7176190848722636
alpha67.7 :  0.7176192907179373
alpha67.8 :  0.7176194965181992
alpha67.9 :  0.7176197022728267
alpha68.0 :  0.7176199079815987
alpha68.10000000000001 :  0.7176201136442961
alpha68.2 :  0.7176203192607009
alpha68.3 :  0.7176205248305968
alpha68.4 :  0.7176207303537695
alpha68.5 :  0.7176209358300055
alpha68.60000000000001 :  0.7176211412590935
alpha68.7 :  0.717621346640823
alpha68.8 :  0.7176215519749859
alpha68.9 :  0.7176217572613749
alpha69.0 :  0.7176219624997845
alpha69.10000000000001 :  0.7176221676900105
alpha69.2 :  0.7176223728318505
alpha69.3 :  0.7176225779251032
alpha69.4 :  0.7176227829695687
alpha69.5 :  0.7176229879650492
alpha69.

alpha89.5 :  0.7176628129752681
alpha89.60000000000001 :  0.7176630055935471
alpha89.7 :  0.7176631981431794
alpha89.80000000000001 :  0.717663390624135
alpha89.9 :  0.7176635830363844
alpha90.0 :  0.7176637753798991
alpha90.10000000000001 :  0.7176639676546499
alpha90.2 :  0.717664159860609
alpha90.30000000000001 :  0.7176643519977485
alpha90.4 :  0.7176645440660411
alpha90.5 :  0.7176647360654598
alpha90.60000000000001 :  0.7176649279959783
alpha90.7 :  0.7176651198575702
alpha90.80000000000001 :  0.7176653116502102
alpha90.9 :  0.7176655033738726
alpha91.0 :  0.7176656950285327
alpha91.10000000000001 :  0.7176658866141662
alpha91.2 :  0.7176660781307486
alpha91.30000000000001 :  0.7176662695782565
alpha91.4 :  0.7176664609566664
alpha91.5 :  0.7176666522659555
alpha91.60000000000001 :  0.7176668435061012
alpha91.7 :  0.7176670346770815
alpha91.80000000000001 :  0.7176672257788744
alpha91.9 :  0.7176674168114585
alpha92.0 :  0.717667607774813
alpha92.10000000000001 :  0.7176677986689

### K近傍法のハイパーパラメーターチューニング

In [None]:
list_k = []
list_rmse = []
for k in range(1, 101, 5):
    
    # KNeighborsRegressor
    knr = KNeighborsRegressor(n_neighbors=k)
    knr.fit(X_train_l, y_train_l)

    # 予測　
    y_pred = knr.predict(X_val_l)

    # 評価
    # 平方根平均二乗誤差（RMSE）
    score_rmse = np.sqrt(mean_squared_error(y_val_l, y_pred))
    
    print("k={} : {}".format(str(k), str(score_rmse)))

    list_k.append(k)
    list_rmse.append(score_rmse)

# プロット
plt.ylim(0, 1)
plt.xlabel("k")
plt.ylabel("rmse")
plt.plot(list_k, list_rmse)

k=1: 0.9910552962539824
k=6: 0.764377675327636
k=11: 0.7404047552594443
k=16: 0.7307220303645556
k=21: 0.7260794055504859
k=26: 0.7229850051419421
k=31: 0.721076632493803
k=36: 0.7194649298735017
k=41: 0.7183885666187045
k=46: 0.7175773420650757
k=51: 0.7168963910897955
k=56: 0.7163284921948015
k=61: 0.7158891841808471
k=66: 0.7158008961420407
k=71: 0.7153724421910779
k=76: 0.7151673039668361
k=81: 0.7150436617022765
k=86: 0.714986451661307
k=91: 0.7148046583295764
k=96: 0.7146541615382922
k=101: 0.7145271193704931
k=106: 0.7144399023635173
k=111: 0.7142694385439168
k=116: 0.714130360314477
k=121: 0.7140499505438874
k=126: 0.7139537938890705
k=131: 0.7138279946421198
k=136: 0.713802720528425
k=141: 0.7137931004139653
k=146: 0.7137013577387942
k=151: 0.7136137653345764
k=156: 0.7135621171240117
k=161: 0.7135588826570078
