In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler

import catboost
import lightgbm
import xgboost

import optuna

### データの前処理

In [None]:
# Read the train, test and sample-submission CSVs.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
sample_submission = pd.read_csv("sample_submission.csv")

# Drop the row with id 170514 from train (outlier) and renumber the index.
train = train.loc[train["id"] != 170514].reset_index(drop=True)

# Feature columns: every train column except the identifier and the target.
features = train.columns.drop(["id", "target"])

# Keep the target and the id columns aside, then reduce both frames
# to the feature columns only.
train_target, train_id, test_id = train["target"], train["id"], test["id"]
train, test = train[features], test[features]

In [None]:
# Label rows with a KMeans cluster id (feature engineering).
# The clustering is fitted on train and then applied to both frames.
km = KMeans(n_clusters=2, random_state=0).fit(train)
km_train = pd.DataFrame({"km_cluster": km.predict(train)})
km_test = pd.DataFrame({"km_cluster": km.predict(test)})

# Append the cluster column to each frame.
train = pd.concat([train, km_train], axis=1)
test = pd.concat([test, km_test], axis=1)

In [None]:
## Build the data used by the tree-based models
train_dt, test_dt = train.copy(), test.copy()

# Label a feature with Gaussian-mixture component ids (feature engineering)
def get_gmm_class_feature(feat, train_df, test_df, n):
    """Add a ``{feat}_class`` column holding Gaussian-mixture component labels.

    A one-dimensional ``GaussianMixture`` with ``n`` components is fitted on
    ``train_df[feat]`` only; the fitted model then labels both frames.

    Parameters
    ----------
    feat : name of the column to discretise.
    train_df : frame the mixture is fitted on; modified in place.
    test_df : frame labelled with the fitted mixture; modified in place.
    n : number of mixture components.

    Returns
    -------
    None. The new column is written directly into ``train_df`` and ``test_df``.
    """
    mixture = GaussianMixture(n_components=n, random_state=1234)
    class_col = f'{feat}_class'

    # The estimator expects 2-D input, so present the column as (n_rows, 1).
    train_values = train_df[feat].values.reshape(-1, 1)
    test_values = test_df[feat].values.reshape(-1, 1)

    mixture.fit(train_values)
    train_df[class_col] = mixture.predict(train_values)
    test_df[class_col] = mixture.predict(test_values)

# (feature, number of mixture components) pairs, applied in this order.
gmm_components = [
    ('cont1', 4),
    ('cont2', 10),
    ('cont3', 6),
    ('cont4', 4),
    ('cont5', 3),
    ('cont6', 2),
    ('cont7', 3),
    ('cont8', 4),
    ('cont9', 4),
    ('cont10', 8),
    ('cont11', 5),
    ('cont12', 4),
    ('cont13', 6),
    ('cont14', 6),
]

# Adds one `<feature>_class` column per entry to train_dt and test_dt in place.
for feat_name, n_components in gmm_components:
    get_gmm_class_feature(feat_name, train_dt, test_dt, n_components)

In [None]:
# Prepare the data for the linear-regression and k-nearest-neighbours models
train_l = train_dt.copy()
test_l = test_dt.copy()

# One-hot encode the GMM class columns.
class_cols = ["cont{}_class".format(i) for i in range(1, 15)]

# Give each class column one shared set of categories (train ∪ test) so that
# both frames produce exactly the same dummy columns, in the same order, even
# if some class does not occur in one of them.
for col in class_cols:
    shared_dtype = pd.CategoricalDtype(
        categories=np.union1d(train_l[col].unique(), test_l[col].unique())
    )
    train_l[col] = train_l[col].astype(shared_dtype)
    test_l[col] = test_l[col].astype(shared_dtype)

# Encoding through `columns=` prefixes every dummy with its source column
# (e.g. "cont1_class_1"). Encoding each Series on its own produced bare
# integer labels (1, 2, 3, ...) that were duplicated across features and
# mixed int/str column names. The encoded source columns are dropped by
# get_dummies itself; drop_first removes the first category of each.
train_l = pd.get_dummies(train_l, columns=class_cols, drop_first=True)
test_l = pd.get_dummies(test_l, columns=class_cols, drop_first=True)

In [None]:
# train/validation split: 25% of the rows are held out.
# Both feature sets are split with the same options (including the seed).
split_options = {"test_size": 0.25, "random_state": 1234}
X_train_dt, X_val_dt, y_train_dt, y_val_dt = train_test_split(train_dt, train_target, **split_options)
X_train_l, X_val_l, y_train_l, y_val_l = train_test_split(train_l, train_target, **split_options)

### CatBoostのハイパーパラメーターチューニング

In [None]:
def objective_cat(trial, X_train = X_train_dt, y_train = y_train_dt, X_val = X_val_dt, y_val = y_val_dt):
    """Optuna objective: validation RMSE of a CatBoostRegressor.

    Parameters
    ----------
    trial : Optuna trial used to sample ``learning_rate``, ``n_estimators``
        and ``max_depth``.
    X_train, y_train : training features and target. The defaults are bound
        when this cell is executed, so re-run it after re-creating the split.
    X_val, y_val : validation features and target; used both as the
        early-stopping eval set and to compute the returned score.

    Returns
    -------
    float
        RMSE of the fitted model on ``(X_val, y_val)``.
    """
    params = {
        "learning_rate": trial.suggest_categorical("learning_rate", [0.01, 0.03, 0.05, 0.1, 0.15, 0.2]),
        "n_estimators": trial.suggest_categorical("n_estimators", [100, 500, 1000, 2000, 3000, 4000]),
        "max_depth": trial.suggest_int("max_depth", 5, 16),
        "random_state": 1234,
    }

    model = catboost.CatBoostRegressor(**params)
    model.fit(X_train, y_train, eval_set = [(X_val, y_val)], early_stopping_rounds = 100, verbose = False)

    preds = model.predict(X_val)

    # sqrt(MSE) instead of `squared=False`: that keyword is deprecated and
    # dropped in recent scikit-learn, and this form matches the Ridge/KNN cells.
    return float(np.sqrt(mean_squared_error(y_val, preds)))

In [None]:
# Seed the sampler so the CatBoost search gives the same trials on every
# Restart & Run All (TPE is Optuna's default single-objective sampler).
study_cat = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=1234))
study_cat.optimize(objective_cat, n_trials=10)

# `study` is rebound by the tuning cells that follow; `study_cat` keeps
# this result reachable afterwards.
study = study_cat

### XGBoostのハイパーパラメーターチューニング

In [None]:
def objective_xgb(trial, X_train = X_train_dt, y_train = y_train_dt, X_val = X_val_dt, y_val = y_val_dt):
    """Optuna objective: validation RMSE of an XGBRegressor.

    Parameters
    ----------
    trial : Optuna trial used to sample ``learning_rate``, ``n_estimators``,
        ``max_depth`` and ``alpha``.
    X_train, y_train : training features and target. The defaults are bound
        when this cell is executed, so re-run it after re-creating the split.
    X_val, y_val : validation features and target; used both as the
        early-stopping eval set and to compute the returned score.

    Returns
    -------
    float
        RMSE of the fitted model on ``(X_val, y_val)``.
    """
    params = {
        "learning_rate": trial.suggest_categorical("learning_rate", [0.01, 0.03, 0.05, 0.1, 0.15, 0.2]),
        "n_estimators": trial.suggest_categorical("n_estimators", [100, 500, 1000, 2000, 3000, 4000]),
        "max_depth": trial.suggest_int("max_depth", 5, 30),
        # suggest_float replaces the deprecated suggest_uniform (same range).
        "alpha": trial.suggest_float("alpha", 0, 10),
        "random_state": 1234,
    }

    model = xgboost.XGBRegressor(**params)
    # NOTE(review): recent xgboost releases expect early_stopping_rounds on the
    # constructor rather than in fit() — confirm against the installed version.
    model.fit(X_train, y_train, eval_set = [(X_val, y_val)], early_stopping_rounds = 100, verbose = False)

    preds = model.predict(X_val)

    # sqrt(MSE) instead of `squared=False`: that keyword is deprecated and
    # dropped in recent scikit-learn, and this form matches the Ridge/KNN cells.
    return float(np.sqrt(mean_squared_error(y_val, preds)))

In [None]:
# Seed the sampler so the XGBoost search gives the same trials on every
# Restart & Run All (TPE is Optuna's default single-objective sampler).
study_xgb = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=1234))
study_xgb.optimize(objective_xgb, n_trials=10)

# `study` is rebound by every tuning cell; `study_xgb` keeps this result
# reachable afterwards.
study = study_xgb

### LightGBMのハイパーパラメーターチューニング

In [None]:
def objective_lgbm(trial, X_train = X_train_dt, y_train = y_train_dt, X_val = X_val_dt, y_val = y_val_dt):
    """Optuna objective: validation RMSE of an LGBMRegressor.

    Parameters
    ----------
    trial : Optuna trial used to sample ``learning_rate``, ``n_estimators``,
        ``max_depth`` and ``alpha`` (applied as L1 regularisation).
    X_train, y_train : training features and target. The defaults are bound
        when this cell is executed, so re-run it after re-creating the split.
    X_val, y_val : validation features and target; used both as the
        early-stopping eval set and to compute the returned score.

    Returns
    -------
    float
        RMSE of the fitted model on ``(X_val, y_val)``.
    """
    params = {
        "learning_rate": trial.suggest_categorical("learning_rate", [0.01, 0.03, 0.05, 0.1, 0.15, 0.2]),
        "n_estimators": trial.suggest_categorical("n_estimators", [100, 500, 1000, 2000, 3000, 4000]),
        "max_depth": trial.suggest_int("max_depth", 5, 30),
        # In LightGBM `alpha` is the Huber/quantile loss parameter, not L1
        # regularisation as in XGBoost, so passing it under that name left the
        # sampled value without effect. The trial parameter keeps the name
        # "alpha" so study.best_params keys are unchanged; map it back to
        # `reg_alpha` when re-using best_params.
        "reg_alpha": trial.suggest_float("alpha", 0, 10),
        "random_state": 1234,
        "metric": "root_mean_squared_error",
    }

    model = lightgbm.LGBMRegressor(**params)
    # NOTE(review): recent lightgbm releases take early stopping / logging via
    # callbacks instead of fit() keywords — confirm against the installed version.
    model.fit(X_train, y_train, eval_set = [(X_val, y_val)], early_stopping_rounds = 100, verbose = False)

    preds = model.predict(X_val)

    # sqrt(MSE) instead of `squared=False`: that keyword is deprecated and
    # dropped in recent scikit-learn, and this form matches the Ridge/KNN cells.
    return float(np.sqrt(mean_squared_error(y_val, preds)))

In [None]:
# Seed the sampler so the LightGBM search gives the same trials on every
# Restart & Run All (TPE is Optuna's default single-objective sampler).
study_lgbm = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=1234))
study_lgbm.optimize(objective_lgbm, n_trials=10)

# `study` is rebound by every tuning cell; `study_lgbm` keeps this result
# reachable afterwards.
study = study_lgbm

### RandomForestのハイパーパラメーターチューニング

In [None]:
def objective_rf(trial, X_train = X_train_dt, y_train = y_train_dt, X_val = X_val_dt, y_val = y_val_dt):
    """Optuna objective: validation RMSE of a RandomForestRegressor.

    Parameters
    ----------
    trial : Optuna trial used to sample ``n_estimators`` and ``max_depth``.
    X_train, y_train : training features and target. The defaults are bound
        when this cell is executed, so re-run it after re-creating the split.
    X_val, y_val : validation features and target used to compute the score.

    Returns
    -------
    float
        RMSE of the fitted model on ``(X_val, y_val)``.
    """
    params = {
        "n_estimators": trial.suggest_categorical("n_estimators", [100, 500, 1000]),
        "max_depth": trial.suggest_int("max_depth", 5, 30),
        "random_state": 1234,
    }

    model = RandomForestRegressor(**params)
    model.fit(X_train, y_train)

    preds = model.predict(X_val)

    # sqrt(MSE) instead of `squared=False`: that keyword is deprecated and
    # dropped in recent scikit-learn, and this form matches the Ridge/KNN cells.
    return float(np.sqrt(mean_squared_error(y_val, preds)))

In [None]:
# Seed the sampler so the RandomForest search gives the same trials on every
# Restart & Run All (TPE is Optuna's default single-objective sampler).
study_rf = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=1234))
study_rf.optimize(objective_rf, n_trials=10)

# `study` is rebound by every tuning cell; `study_rf` keeps this result
# reachable afterwards.
study = study_rf

### Ridge回帰のハイパーパラメーターチューニング

In [None]:
# Grid-search the Ridge regularisation strength on the validation split.
alphas = np.arange(0, 100, 0.1)
results = []
for alpha in alphas:
    ri = Ridge(alpha = alpha)
    ri.fit(X_train_l, y_train_l)
    y_pred = ri.predict(X_val_l)
    # (y_true, y_pred) argument order, as in the other evaluation cells.
    result = np.sqrt(mean_squared_error(y_val_l, y_pred))
    print("alpha{} : ".format(alpha), result)
    results.append(result)

# Read the winning value from the grid itself. Deriving it as `argmin / 10`
# is only right while the grid starts at 0 with step 0.1. The rounding just
# strips the float noise that np.arange accumulates (e.g. 3.7000000000000006).
best_alpha = alphas[int(np.argmin(results))]
print("Best alpha : "+str(round(float(best_alpha), 10)))

### K近傍法のハイパーパラメーターチューニング

In [None]:
# Scan the number of neighbours and record the validation RMSE for each k.
list_k = []
list_rmse = []
for k in range(1, 101, 5):

    # KNeighborsRegressor
    knr = KNeighborsRegressor(n_neighbors=k)
    knr.fit(X_train_l, y_train_l)

    # Predict on the validation split
    y_pred = knr.predict(X_val_l)

    # Evaluate: root mean squared error (RMSE)
    score_rmse = np.sqrt(mean_squared_error(y_val_l, y_pred))

    print("k={} : {}".format(str(k), str(score_rmse)))

    list_k.append(k)
    list_rmse.append(score_rmse)

# Plot RMSE against k on an explicit Axes; `plt` comes from the notebook's
# import cell (matplotlib.pyplot).
fig, ax = plt.subplots()
ax.plot(list_k, list_rmse)
ax.set_ylim(0, 1)
ax.set_xlabel("k")
ax.set_ylabel("rmse")
ax.set_title("KNN validation RMSE by k");