# sorafune

In [30]:
import numpy as np 
import pandas as pd 
import re
import typing as tp

import matplotlib.pyplot as plt
import seaborn as sns
#from matplotlib_venn import venn2
%matplotlib inline

import string
from sklearn.metrics import mean_squared_log_error
import lightgbm as lgb
import xgboost as xgb
from tqdm import tqdm

import os
from glob import glob

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
import gensim

from collections import Counter
pd.set_option('display.max_columns', 100)


from catboost import CatBoost
from catboost import CatBoostClassifier
from catboost import CatBoostRegressor
from catboost import Pool
from catboost import cv
import optuna

import category_encoders as ce
from sklearn.model_selection import KFold, StratifiedKFold

from xfeat import (SelectCategorical, LabelEncoder, Pipeline, ConcatCombination, SelectNumerical, 
                   ArithmeticCombinations, TargetEncoder, aggregation, GBDTFeatureSelector, GBDTFeatureExplorer)

import warnings
warnings.filterwarnings('ignore')

In [31]:
# Base directories, used as string prefixes when building input and output file paths.
input_dir = "../input/"
output_dir = "../output/"

In [32]:
from contextlib import contextmanager
from time import time

@contextmanager
def timer(logger=None, format_str='{:.3f}[s]', prefix=None, suffix=None):
    """Time the body of a ``with`` block and report the elapsed seconds.

    Args:
        logger: Object exposing ``info(msg)``; when falsy the message is printed.
        format_str: ``str.format`` template that receives the elapsed seconds.
        prefix: Optional value prepended (through ``str``) to the template.
        suffix: Optional value appended (through ``str``) to the template.

    Nothing is reported when the body raises, because the statements after
    ``yield`` are skipped in that case.
    """
    template = format_str
    if prefix:
        template = str(prefix) + template
    if suffix:
        template = template + str(suffix)

    started_at = time()
    yield
    message = template.format(time() - started_at)

    report = logger.info if logger else print
    report(message)
        
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place to reduce its memory usage.

    * ``object`` columns are converted to ``category``.
    * Signed-integer columns are cast to the smallest of int8/16/32/64 whose
      range strictly contains the column's min and max.
    * Float columns are cast to float16 or float32 when the values fit
      strictly inside that type's range, otherwise to float64.
      WARNING: float16/float32 casts are lossy (reduced precision).
    * Every other dtype (bool, unsigned int, datetime, category, extension
      dtypes, ...) is left untouched, so calling the function again on its
      own output is safe.

    Args:
        df: DataFrame to shrink; it is modified in place.

    Returns:
        The same DataFrame object, after conversion.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy signed-int / float columns are downcast; min/max
        # comparisons against iinfo/finfo are meaningless for anything else.
        if not isinstance(col_type, np.dtype) or col_type.kind not in ('i', 'f'):
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'i':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Nothing smaller fits (or the column is all-NaN).
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the percentage against a zero-sized starting frame.
    decreased = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decreased))

    return df

In [33]:
# Load the training set, the evaluation (test) set and the submission template from input_dir.
train_df = pd.read_csv(input_dir+"TrainDataSet.csv")
test_df = pd.read_csv(input_dir+"EvaluationData.csv")
submission = pd.read_csv(input_dir+"UploadFileTemplate.csv")

In [34]:
# Custom reducers passed to aggregation() through agg_methods
def max_min(x):
    """Return the spread of ``x``: its maximum minus its minimum."""
    highest = x.max()
    lowest = x.min()
    return highest - lowest

def q75_q25(x):
    """Return the interquartile range of ``x`` (75th minus 25th percentile)."""
    upper_quartile = x.quantile(q=0.75)
    lower_quartile = x.quantile(q=0.25)
    return upper_quartile - lower_quartile

def q25(x):
    """Return the 25th percentile of ``x``."""
    lower_quartile = x.quantile(q=0.25)
    return lower_quartile

def q50(x):
    """Return the 50th percentile of ``x``."""
    middle = x.quantile(q=0.5)
    return middle
def q75(x):
    """Return the 75th percentile of ``x``."""
    upper_quartile = x.quantile(q=0.75)
    return upper_quartile

def aggregation_cumfeat(input_df, group_key, group_values):
    """Build, per group, the running mean of each column's *previous* rows.

    For every column in ``group_values`` the value is shifted by one row
    inside its ``group_key`` group, and the cumulative sum of those shifted
    values is divided by the row's 0-based position in the group. The first
    row of each group is therefore NaN.

    Args:
        input_df: Source frame; must contain ``group_key``, the columns in
            ``group_values`` and ``AverageLandPrice``. It is not modified.
        group_key: Name of the column to group by.
        group_values: Names of the columns to turn into features.

    Returns:
        DataFrame indexed like ``input_df`` with one column per entry of
        ``group_values``. The column name prefix is ``cum_feat`` when the
        minimum of ``AverageLandPrice`` equals 1 and ``all_cum_feat``
        otherwise.
        NOTE(review): the reason for keying the prefix on that minimum is not
        visible here -- confirm with the author.
    """
    if len(group_values) == 0:
        return pd.DataFrame(index=input_df.index)

    # The prefix and the within-group row position do not depend on the column.
    prefix = "cum_feat" if input_df.AverageLandPrice.min() == 1 else "all_cum_feat"
    keys = input_df[group_key]
    position_in_group = input_df.groupby(group_key).cumcount()

    features = []
    for col in group_values:
        # Work on a local Series rather than writing a scratch "lag" column
        # into the caller's frame.
        lagged = input_df.groupby(group_key)[col].shift(1)
        running_sum = lagged.groupby(keys).cumsum()
        feature = running_sum / position_in_group
        features.append(feature.rename(f"{prefix}_{col}_grpby_{group_key}"))

    return pd.concat(features, axis=1)

def get_agg_cumfeat_features(input_df):
    """Return the per-PlaceID cumulative features for MeanLight and SumLight.

    Thin wrapper around ``aggregation_cumfeat`` with ``PlaceID`` as the group
    key. The previously computed (and never used) Area-augmented copy of the
    input has been dropped.
    """
    return aggregation_cumfeat(input_df,
                               group_key="PlaceID",
                               group_values=["MeanLight", "SumLight"])

# Within-group differences
def diff_aggregation(input_df, group_key, group_values, num_diffs):
    """Compute ``diff(n)`` of ``group_values`` inside each ``group_key`` group.

    Args:
        input_df: Source frame.
        group_key: Column to group by.
        group_values: List of columns to difference.
        num_diffs: Iterable of periods ``n``; one set of columns is produced
            per period.

    Returns:
        DataFrame with columns named ``diff={n}_{col}_grpby_{group_key}``.
    """
    grouped = input_df.groupby(group_key)[group_values]

    def _one_period(period):
        frame = grouped.diff(period)
        frame.columns = [f'diff={period}_{name}_grpby_{group_key}' for name in group_values]
        return frame

    return pd.concat([_one_period(period) for period in num_diffs], axis=1)

# Within-group shifts
def shift_aggregation(input_df, group_key, group_values, num_shifts):
    """Compute ``shift(n)`` of ``group_values`` inside each ``group_key`` group.

    Args:
        input_df: Source frame.
        group_key: Column to group by.
        group_values: List of columns to shift.
        num_shifts: Iterable of periods ``n``; one set of columns is produced
            per period.

    Returns:
        DataFrame with columns named ``shift={n}_{col}_grpby_{group_key}``.
    """
    grouped = input_df.groupby(group_key)[group_values]

    def _one_period(period):
        frame = grouped.shift(period)
        frame.columns = [f'shift={period}_{name}_grpby_{group_key}' for name in group_values]
        return frame

    return pd.concat([_one_period(period) for period in num_shifts], axis=1)

# Features used as-is
def get_raw_features(input_df):
    """Return an independent copy of the MeanLight, SumLight and Year columns."""
    raw_columns = ("MeanLight", "SumLight", "Year")
    selected = input_df[list(raw_columns)]
    return selected.copy()

# Area
def get_area_feature(input_df):
    """Return a one-column frame ``Area`` = SumLight / (MeanLight + 1e-3).

    The 1e-3 term keeps the division finite when MeanLight is 0.
    """
    denominator = input_df["MeanLight"] + 1e-3
    area = input_df["SumLight"] / denominator
    return area.to_frame("Area")

# Aggregation by PlaceID
def get_agg_place_id_features(input_df):
    """Return group statistics of MeanLight, SumLight and Area per PlaceID.

    The Area column is appended to a copy of the input first; the statistics
    are then produced by ``aggregation`` (imported from xfeat at the top of
    the file), which returns the result frame and the names of the new
    columns.
    """
    enriched_df = pd.concat([input_df, get_area_feature(input_df)], axis=1)

    statistics = ["min", "max", "median", "mean", "std", "var",
                  max_min, q75_q25, q25, q50, q75]
    aggregated_df, feature_names = aggregation(
        enriched_df,
        group_key='PlaceID',
        group_values=["MeanLight", "SumLight", "Area"],
        agg_methods=statistics,
    )
    return aggregated_df[feature_names]

# Aggregation by Year
def get_agg_year_features(input_df):
    """Return group statistics of MeanLight, SumLight and Area per Year.

    The Area column is appended to a copy of the input first; the statistics
    are then produced by ``aggregation`` (imported from xfeat at the top of
    the file), which returns the result frame and the names of the new
    columns.
    """
    enriched_df = pd.concat([input_df, get_area_feature(input_df)], axis=1)

    statistics = ["min", "max", "median", "mean", "std", "var",
                  max_min, q75_q25, q25, q50, q75]
    aggregated_df, feature_names = aggregation(
        enriched_df,
        group_key='Year',
        group_values=["MeanLight", "SumLight", "Area"],
        agg_methods=statistics,
    )
    return aggregated_df[feature_names]

# Within-group differences keyed on PlaceID
def get_diff_agg_place_id_features(input_df):
    """Return diff(1)..diff(21) of MeanLight and SumLight within each PlaceID."""
    periods = list(range(1, 22))
    return diff_aggregation(
        input_df,
        group_key="PlaceID",
        group_values=["MeanLight", "SumLight"],
        num_diffs=periods,
    )

# Within-group shifts keyed on PlaceID
def get_shift_agg_place_id_features(input_df):
    """Return shift(1)..shift(21) of MeanLight and SumLight within each PlaceID."""
    periods = list(range(1, 22))
    return shift_aggregation(
        input_df,
        group_key="PlaceID",
        group_values=["MeanLight", "SumLight"],
        num_shifts=periods,
    )

# Features built from PlaceID x Year pivot tables
def get_place_id_vecs_features(input_df):
    """Return per-PlaceID yearly vectors and their PCA projections.

    Area, MeanLight and SumLight are each pivoted to a PlaceID x Year table.
    The output holds, for every input row (matched on PlaceID):

    * the three pivot tables side by side,
    * up to 64 PCA components of the combined, standardised table,
    * up to 16 PCA components of each standardised individual table.

    Missing pivot cells are filled with 0 before scaling. The number of
    components is capped at what the table's shape allows, so small inputs
    no longer fail inside PCA.

    Returns:
        DataFrame with one row per row of ``input_df`` (PlaceID column removed).
    """
    def _standardized_pca(pivot_df, n_components, prefix):
        # Scale, project, and label the components "<prefix>=000", "<prefix>=001", ...
        scaled = StandardScaler().fit_transform(pivot_df.fillna(0))
        n_components = min(n_components, *scaled.shape)
        projector = PCA(n_components=n_components, random_state=2021)
        components = projector.fit_transform(scaled)
        names = [f"{prefix}={i:03}" for i in range(components.shape[1])]
        # Index by the pivot's own PlaceIDs, not by another table's index.
        return pd.DataFrame(components, index=pivot_df.index, columns=names)

    _input_df = pd.concat([input_df, get_area_feature(input_df)], axis=1)

    # pivot tables
    area_df = pd.pivot_table(_input_df, index="PlaceID", columns="Year", values="Area").add_prefix("Area=")
    mean_light_df = pd.pivot_table(_input_df, index="PlaceID", columns="Year", values="MeanLight").add_prefix("MeanLight=")
    sum_light_df = pd.pivot_table(_input_df, index="PlaceID", columns="Year", values="SumLight").add_prefix("SumLight=")
    all_df = pd.concat([area_df, mean_light_df, sum_light_df], axis=1)

    pca_all_df = _standardized_pca(all_df, 64, "PlaceID_all_PCA")
    pca_area_df = _standardized_pca(area_df, 16, "PlaceID_Area_PCA")
    pca_mean_light_df = _standardized_pca(mean_light_df, 16, "PlaceID_MeanLight_PCA")
    pca_sum_light_df = _standardized_pca(sum_light_df, 16, "PlaceID_SumLight_PCA")

    df = pd.concat([all_df, pca_all_df, pca_area_df, pca_mean_light_df, pca_sum_light_df], axis=1)
    output_df = pd.merge(_input_df[["PlaceID"]], df, left_on="PlaceID", right_index=True, how="left")
    return output_df.drop("PlaceID", axis=1)

# Within-group correlation coefficients keyed on PlaceID
def get_corr_features(input_df):
    """Return, per PlaceID, the correlation of Year with each light metric.

    For MeanLight, SumLight and Area the correlation with Year is computed
    inside every PlaceID group and broadcast back to the input rows.

    The wanted entry is picked from the unstacked correlation matrix by
    label ``(first, second)`` rather than by column position, so the result
    does not depend on how ``unstack`` orders its columns (a positional pick
    can land on a self-correlation, which is constantly 1).

    Returns:
        DataFrame aligned with ``input_df`` with columns
        ``Corr=Year-MeanLight``, ``Corr=Year-SumLight`` and ``Corr=Year-Area``.
    """
    _input_df = pd.concat([input_df, get_area_feature(input_df)], axis=1)
    group_key = "PlaceID"
    column_pairs = [
        ("Year", "MeanLight"),
        ("Year", "SumLight"),
        ("Year", "Area"),
    ]

    corr_frames = []
    for first, second in column_pairs:
        corr_matrix = _input_df.groupby(group_key)[[first, second]].corr()
        # After unstack the columns are (matrix column, matrix row) pairs.
        pair_corr = corr_matrix.unstack()[(first, second)]
        corr_frames.append(pair_corr.rename(f"Corr={first}-{second}").to_frame())

    corr_df = pd.concat(corr_frames, axis=1)
    merged = pd.merge(_input_df[[group_key]], corr_df,
                      left_on=group_key, right_index=True, how="left")
    return merged.drop(group_key, axis=1)
    
# count 63
def get_count63_feature(input_df):
    """Return a one-column frame ``count63``.

    For each row it holds how many rows of the same PlaceID have
    MeanLight == 63; PlaceIDs with no such row get 0.
    """
    rows_at_63 = input_df.loc[input_df['MeanLight'] == 63]
    hits_per_place = rows_at_63.groupby('PlaceID').size()

    counts = input_df['PlaceID'].map(hits_per_place).fillna(0)
    return counts.to_frame('count63')

In [35]:
# Run every feature-building function in turn and assemble the result
def to_features(train, test):
    """Build the feature matrices for the train and test frames.

    Train and test are stacked (train first) so group statistics see both,
    every feature function is applied to the stacked frame, and the combined
    result is split back by row position.

    Args:
        train: Training DataFrame.
        test: Test DataFrame.

    Returns:
        Tuple ``(train_x, test_x)``; ``test_x`` has its index reset to start
        at 0.
    """
    input_df = pd.concat([train, test]).reset_index(drop=True)

    processes = [
        get_raw_features,
        get_area_feature,
        get_agg_place_id_features,
        get_agg_year_features,
        get_diff_agg_place_id_features,
        get_shift_agg_place_id_features,
        get_place_id_vecs_features,
        get_corr_features,
        get_count63_feature,
        get_agg_cumfeat_features
    ]

    # Collect the pieces and concatenate once; concatenating inside the loop
    # re-copies the growing frame on every iteration.
    feature_frames = []
    for func in tqdm(processes):
        _df = func(input_df)
        # Sanity check: every feature function must return one row per input row.
        assert len(_df) == len(input_df), func.__name__
        feature_frames.append(_df)
    output_df = pd.concat(feature_frames, axis=1)

    train_x = output_df.iloc[:len(train)]
    test_x = output_df.iloc[len(train):].reset_index(drop=True)
    return train_x, test_x

In [36]:
# Name of the target column in the training data.
target_data = "AverageLandPrice" 

# Build the train/test feature matrices, then take the target on a log1p scale
# (predictions are mapped back with expm1 in the prediction cell).
train_x, test_x = to_features(train_df, test_df)
train_ys = train_df[target_data]
train_ys = np.log1p(train_ys)

100%|██████████| 10/10 [00:19<00:00,  1.98s/it]


In [37]:
def fit_lgbm(X, y, cv, params: dict=None, verbose: int=50):
    """Train one LightGBM regressor per CV fold and collect out-of-fold predictions.

    Args:
        X: 2-D NumPy array of features (indexed positionally by the fold indices).
        y: 1-D target values (array or Series); converted to a NumPy array so
            fold indices are always applied positionally.
        cv: Iterable of ``(train_indices, valid_indices)`` pairs.
        params: Keyword arguments for ``lgb.LGBMRegressor``; ``None`` means defaults.
        verbose: Logging period passed to ``fit``; the same value is also used
            as ``early_stopping_rounds``.

    Returns:
        Tuple ``(oof_pred, models)``: the out-of-fold prediction array and the
        list of fitted models, one per fold.

    The printed score is the RMSE on ``y``; the "RMSLE" label is only accurate
    when ``y`` is already log1p-transformed.
    NOTE(review): newer LightGBM releases move early stopping and logging to
    callbacks -- confirm the ``fit`` keywords against the installed version.
    """
    metric_func = mean_squared_error
    if params is None:
        params = {}

    y = np.asarray(y)
    models = []
    # np.float was removed from NumPy; use the explicit 64-bit float type.
    oof_pred = np.zeros_like(y, dtype=np.float64)

    for i, (idx_train, idx_valid) in enumerate(cv): 
        x_train, y_train = X[idx_train], y[idx_train]
        x_valid, y_valid = X[idx_valid], y[idx_valid]

        clf = lgb.LGBMRegressor(**params)

        with timer(prefix='fit fold={} '.format(i + 1)):
            clf.fit(x_train, y_train, 
                    eval_set=[(x_valid, y_valid)],  
                    early_stopping_rounds=verbose,
                    verbose=verbose)

        pred_i = clf.predict(x_valid)

        oof_pred[idx_valid] = pred_i
        models.append(clf)

        print(f'Fold {i} RMSLE: {metric_func(y_valid, pred_i)**.5 :.4f}')

    score = metric_func(y, oof_pred)**.5 
    print('FINISHED | Whole RMSLE: {:.4f}'.format(score))
    return oof_pred, models

# XGB
def fit_xgb(X, y, cv, params: dict=None, verbose: int=50):
    """Train one XGBoost regressor per CV fold and collect out-of-fold predictions.

    Args:
        X: 2-D NumPy array of features (indexed positionally by the fold indices).
        y: 1-D target values (array or Series); converted to a NumPy array so
            fold indices are always applied positionally.
        cv: Iterable of ``(train_indices, valid_indices)`` pairs.
        params: Keyword arguments for ``xgb.XGBRegressor``; ``None`` means defaults.
        verbose: Accepted for signature parity with the other ``fit_*``
            helpers; training itself is run silently.

    Returns:
        Tuple ``(oof_pred, models)``: the out-of-fold prediction array and the
        list of fitted models, one per fold.

    The printed score is the RMSE on ``y``; the "RMSLE" label is only accurate
    when ``y`` is already log1p-transformed.
    """
    metric_func = mean_squared_error
    if params is None:
        params = {}

    y = np.asarray(y)
    models = []
    # np.float was removed from NumPy; use the explicit 64-bit float type.
    oof_pred = np.zeros_like(y, dtype=np.float64)

    for i, (idx_train, idx_valid) in enumerate(cv): 
        x_train, y_train = X[idx_train], y[idx_train]
        x_valid, y_valid = X[idx_valid], y[idx_valid]
        
        model_xgb = xgb.XGBRegressor(**params)

        with timer(prefix='fit fold={} '.format(i + 1)):
            # verbose=-1 is a LightGBM convention; XGBoost treats the value as
            # a logging period, so silence is requested with False.
            model_xgb.fit(x_train, y_train, eval_set=[(x_valid, y_valid)], verbose=False)
            
        pred_i = model_xgb.predict(x_valid)

        oof_pred[idx_valid] = pred_i
        models.append(model_xgb)

        print(f'Fold {i} RMSLE: {metric_func(y_valid, pred_i)**.5 :.4f}')

    score = metric_func(y, oof_pred)**.5 
    print('FINISHED | Whole RMSLE: {:.4f}'.format(score))
    return oof_pred, models

# Catboost
def fit_cb(X, y, cv, params: dict=None, verbose: int=50):
    """Train one CatBoost regressor per CV fold and collect out-of-fold predictions.

    Args:
        X: 2-D NumPy array of features (indexed positionally by the fold indices).
        y: 1-D target values (array or Series); converted to a NumPy array so
            fold indices are always applied positionally.
        cv: Iterable of ``(train_indices, valid_indices)`` pairs.
        params: Keyword arguments for ``CatBoostRegressor``; ``None`` means defaults.
        verbose: Accepted for signature parity with the other ``fit_*``
            helpers; training is run with ``silent=True``.

    Returns:
        Tuple ``(oof_pred, models)``: the out-of-fold prediction array and the
        list of fitted models, one per fold.

    Each fold is fitted with the validation pool as ``eval_set`` and
    ``use_best_model=True``; the model's best score is printed after fitting.
    The printed fold score is the RMSE on ``y``; the "RMSLE" label is only
    accurate when ``y`` is already log1p-transformed.
    """
    metric_func = mean_squared_error
    if params is None:
        params = {}

    y = np.asarray(y)
    models = []
    # np.float was removed from NumPy; use the explicit 64-bit float type.
    oof_pred = np.zeros_like(y, dtype=np.float64)

    for i, (idx_train, idx_valid) in enumerate(cv): 
        x_train, y_train = X[idx_train], y[idx_train]
        x_valid, y_valid = X[idx_valid], y[idx_valid]
        
        train_pool = Pool(x_train, label = y_train)
        valid_pool = Pool(x_valid, label = y_valid)
        
        model_cb = CatBoostRegressor(**params)

        with timer(prefix='fit fold={} '.format(i + 1)):
            model_cb.fit(train_pool,
              # valid_data
              eval_set = valid_pool,
              use_best_model = True,
              silent = True,
              plot = False)
            
        print(model_cb.get_best_score())
        
        pred_i = model_cb.predict(x_valid)

        oof_pred[idx_valid] = pred_i
        models.append(model_cb)

        print(f'Fold {i} RMSLE: {metric_func(y_valid, pred_i)**.5 :.4f}')

    score = metric_func(y, oof_pred)**.5 
    print('FINISHED | Whole RMSLE: {:.4f}'.format(score))
    return oof_pred, models

In [38]:
class GroupKFold:
    """
    Group-wise K-fold splitter with optional shuffling and a sklearn-like interface.

    The unique group ids are split with ``KFold``; every row then follows its
    group, so a group never appears in both the training and validation part
    of the same fold.
    """

    def __init__(self, n_splits=5, shuffle=True, random_state=42):
        # Number of folds, whether the group ids are shuffled before
        # splitting, and the seed used for that shuffle.
        self.n_splits = n_splits
        self.shuffle = shuffle
        self.random_state = random_state

    def get_n_splits(self, X=None, y=None, group=None):
        """Return the number of folds; all arguments are ignored."""
        return self.n_splits

    def split(self, X=None, y=None, group=None):
        """Yield ``(train_idx, val_idx)`` positional index arrays per fold.

        Args:
            X: Ignored; present for interface compatibility.
            y: Ignored; present for interface compatibility.
            group: Group label of every row (pandas Series or any 1-D
                array-like).

        Raises:
            ValueError: If ``group`` is not provided.
        """
        if group is None:
            raise ValueError("group must be provided to GroupKFold.split")
        groups = group if isinstance(group, pd.Series) else pd.Series(group)

        # KFold rejects a random_state when shuffle is disabled, so only
        # forward the seed when it is actually used.
        seed = self.random_state if self.shuffle else None
        kf = KFold(n_splits=self.n_splits, shuffle=self.shuffle, random_state=seed)
        unique_ids = groups.unique()
        for tr_group_idx, va_group_idx in kf.split(unique_ids):
            # split group
            tr_group, va_group = unique_ids[tr_group_idx], unique_ids[va_group_idx]
            train_idx = np.where(groups.isin(tr_group))[0]
            val_idx = np.where(groups.isin(va_group))[0]
            yield train_idx, val_idx


# Group K-fold keyed on PlaceID
def make_gkf(X, y, n_splits=5, random_state=2020, groups=None):
    """Return the list of ``(train_idx, val_idx)`` folds of a shuffled group K-fold.

    Args:
        X: Feature matrix; forwarded to the splitter.
        y: Target values; forwarded to the splitter.
        n_splits: Number of folds.
        random_state: Seed for shuffling the group ids.
        groups: Group label per row. When omitted, the module-level
            ``train_df["PlaceID"]`` is used, as before.
    """
    if groups is None:
        groups = train_df["PlaceID"]
    gkf = GroupKFold(n_splits=n_splits, random_state=random_state)
    return list(gkf.split(X, y, groups))


In [39]:
# Three sets of 5-fold group splits that differ only in the shuffle seed (2020 default, 71, 23).
group_cv =make_gkf(train_x, train_ys)
group_cv2 =make_gkf(train_x, train_ys, 5, 71)
group_cv3 =make_gkf(train_x, train_ys, 5, 23)

In [40]:
# LightGBM hyper-parameters shared by all LightGBM runs below.
# n_estimators is an upper bound: fit_lgbm fits with early stopping on the validation fold.
lgm_params = {  
    "n_estimators": 20000,
    "objective": 'rmse',
    "learning_rate": 0.01,
    "num_leaves": 36,
    "random_state": 2021,
    "n_jobs": -1,
    "importance_type": "gain",
    'colsample_bytree': .5,
    "reg_lambda": 5,
    "max_depth":7,
    }

# First LightGBM run, on the group_cv splits.
oof, models = fit_lgbm(train_x.values, train_ys,group_cv , params=lgm_params)

Training until validation scores don't improve for 50 rounds
[50]	valid_0's rmse: 0.774816
[100]	valid_0's rmse: 0.642579
[150]	valid_0's rmse: 0.586299
[200]	valid_0's rmse: 0.561054
[250]	valid_0's rmse: 0.548692
[300]	valid_0's rmse: 0.542027
[350]	valid_0's rmse: 0.538238
[400]	valid_0's rmse: 0.535607
[450]	valid_0's rmse: 0.534053
[500]	valid_0's rmse: 0.533171
[550]	valid_0's rmse: 0.532535
[600]	valid_0's rmse: 0.532228
[650]	valid_0's rmse: 0.531672
[700]	valid_0's rmse: 0.531506
[750]	valid_0's rmse: 0.531592
Early stopping, best iteration is:
[717]	valid_0's rmse: 0.531379
fit fold=1 19.921[s]
Fold 0 RMSLE: 0.5314
Training until validation scores don't improve for 50 rounds
[50]	valid_0's rmse: 0.902036
[100]	valid_0's rmse: 0.722727
[150]	valid_0's rmse: 0.628569
[200]	valid_0's rmse: 0.580601
[250]	valid_0's rmse: 0.554605
[300]	valid_0's rmse: 0.540055
[350]	valid_0's rmse: 0.532119
[400]	valid_0's rmse: 0.527588
[450]	valid_0's rmse: 0.524288
[500]	valid_0's rmse: 0.5211

In [41]:
# Second LightGBM run: same parameters, group_cv2 splits.
oof_lgb2, models_lgb2 = fit_lgbm(train_x.values, train_ys, group_cv2 , params=lgm_params)

Training until validation scores don't improve for 50 rounds
[50]	valid_0's rmse: 0.886802
[100]	valid_0's rmse: 0.726178
[150]	valid_0's rmse: 0.646051
[200]	valid_0's rmse: 0.605451
[250]	valid_0's rmse: 0.585236
[300]	valid_0's rmse: 0.574223
[350]	valid_0's rmse: 0.567214
[400]	valid_0's rmse: 0.563434
[450]	valid_0's rmse: 0.560632
[500]	valid_0's rmse: 0.558563
[550]	valid_0's rmse: 0.557083
[600]	valid_0's rmse: 0.555698
[650]	valid_0's rmse: 0.554825
[700]	valid_0's rmse: 0.554159
[750]	valid_0's rmse: 0.55385
[800]	valid_0's rmse: 0.553719
[850]	valid_0's rmse: 0.553502
[900]	valid_0's rmse: 0.553286
Early stopping, best iteration is:
[872]	valid_0's rmse: 0.553163
fit fold=1 23.297[s]
Fold 0 RMSLE: 0.5532
Training until validation scores don't improve for 50 rounds
[50]	valid_0's rmse: 0.917375
[100]	valid_0's rmse: 0.759833
[150]	valid_0's rmse: 0.684258
[200]	valid_0's rmse: 0.648135
[250]	valid_0's rmse: 0.62935
[300]	valid_0's rmse: 0.619899
[350]	valid_0's rmse: 0.61395


In [42]:
# Third LightGBM run: same parameters, group_cv3 splits.
oof_lgb3, models_lgb3 = fit_lgbm(train_x.values, train_ys, group_cv3 , params=lgm_params)

Training until validation scores don't improve for 50 rounds
[50]	valid_0's rmse: 0.906771
[100]	valid_0's rmse: 0.756035
[150]	valid_0's rmse: 0.679403
[200]	valid_0's rmse: 0.641067
[250]	valid_0's rmse: 0.619688
[300]	valid_0's rmse: 0.606533
[350]	valid_0's rmse: 0.59839
[400]	valid_0's rmse: 0.593554
[450]	valid_0's rmse: 0.590801
[500]	valid_0's rmse: 0.588081
[550]	valid_0's rmse: 0.586414
[600]	valid_0's rmse: 0.585412
[650]	valid_0's rmse: 0.584017
[700]	valid_0's rmse: 0.58332
[750]	valid_0's rmse: 0.58284
[800]	valid_0's rmse: 0.581913
[850]	valid_0's rmse: 0.581177
[900]	valid_0's rmse: 0.580432
[950]	valid_0's rmse: 0.580203
[1000]	valid_0's rmse: 0.579953
[1050]	valid_0's rmse: 0.579636
[1100]	valid_0's rmse: 0.579278
[1150]	valid_0's rmse: 0.57908
[1200]	valid_0's rmse: 0.578864
[1250]	valid_0's rmse: 0.578724
[1300]	valid_0's rmse: 0.578753
Early stopping, best iteration is:
[1252]	valid_0's rmse: 0.578722
fit fold=1 29.358[s]
Fold 0 RMSLE: 0.5787
Training until validat

In [43]:
# CatBoost hyper-parameters shared by all CatBoost runs below.
cb_params = {
    'loss_function': 'RMSE',
    'num_boost_round': 10000,
    'depth':7,
    'learning_rate':0.01,
    "random_state": 2021,
    }

# First CatBoost run, on the group_cv splits.
oof_cb, models_cb = fit_cb(train_x.values, train_ys,group_cv , params=cb_params)

fit fold=1 275.551[s]
{'learn': {'RMSE': 0.038081387717376645}, 'validation': {'RMSE': 0.5292038555969685}}
Fold 0 RMSLE: 0.5292
fit fold=2 277.454[s]
{'learn': {'RMSE': 0.039919377108343576}, 'validation': {'RMSE': 0.52063268870213}}
Fold 1 RMSLE: 0.5206
fit fold=3 277.837[s]
{'learn': {'RMSE': 0.038318644618509}, 'validation': {'RMSE': 0.5552293181402578}}
Fold 2 RMSLE: 0.5552
fit fold=4 278.456[s]
{'learn': {'RMSE': 0.038390233328619454}, 'validation': {'RMSE': 0.538640816713981}}
Fold 3 RMSLE: 0.5386
fit fold=5 277.980[s]
{'learn': {'RMSE': 0.03804498663929287}, 'validation': {'RMSE': 0.5691340492006972}}
Fold 4 RMSLE: 0.5691
FINISHED | Whole RMSLE: 0.5429


In [None]:
# Second CatBoost run: same parameters, group_cv2 splits.
oof_cb2, models_cb2 = fit_cb(train_x.values, train_ys,group_cv2 , params=cb_params)

fit fold=1 278.235[s]
{'learn': {'RMSE': 0.03936371031462618}, 'validation': {'RMSE': 0.5494590505841918}}
Fold 0 RMSLE: 0.5495
fit fold=2 276.255[s]
{'learn': {'RMSE': 0.03828292978694559}, 'validation': {'RMSE': 0.599951119057503}}
Fold 1 RMSLE: 0.6000
fit fold=3 277.021[s]
{'learn': {'RMSE': 0.038658509903143776}, 'validation': {'RMSE': 0.5637353311454404}}
Fold 2 RMSLE: 0.5637


In [None]:
# Third CatBoost run: same parameters, group_cv3 splits.
oof_cb3, models_cb3 = fit_cb(train_x.values, train_ys,group_cv3 , params=cb_params)

In [None]:
# XGBoost hyper-parameters shared by all XGBoost runs below.
# NOTE(review): fit_xgb passes this dict to xgb.XGBRegressor(**params); the
# scikit-learn wrapper sizes the ensemble with `n_estimators`, so
# 'num_boost_round' is presumably ignored and the default number of trees is
# used -- confirm against the installed XGBoost version.
xgb_params = {
    'booster': 'gbtree',
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'num_boost_round': 10000,
    'max_depth':7,
    'eta':0.03,
    "random_state": 2021,
    "verbosity":1
    }

# First XGBoost run, on the group_cv splits.
oof_xgb, models_xgb = fit_xgb(train_x.values, train_ys,group_cv , params=xgb_params)

In [None]:
# Second XGBoost run: same parameters, group_cv2 splits.
oof_xgb2, models_xgb2 = fit_xgb(train_x.values, train_ys, group_cv2 , params=xgb_params)

In [None]:
# Third XGBoost run: same parameters, group_cv3 splits.
oof_xgb3, models_xgb3 = fit_xgb(train_x.values, train_ys, group_cv3 , params=xgb_params)

In [None]:
def visualize_importance(models, feat_train_df):
    """Plot the feature importances of a list of fitted models.

    The spread across CV folds is shown as a boxen plot for the 50 features
    with the largest importance summed over all models.

    args:
        models:
            List of fitted models exposing ``feature_importances_``
            (e.g. the LightGBM models returned per fold)
        feat_train_df:
            DataFrame used for training; its columns label the features

    returns:
        The matplotlib ``(fig, ax)`` pair of the plot.
    """
    per_fold_frames = [
        pd.DataFrame({
            'feature_importance': fitted.feature_importances_,
            'column': feat_train_df.columns,
            'fold': fold_number,
        })
        for fold_number, fitted in enumerate(models, start=1)
    ]
    # Seeding with an empty frame keeps the result defined for an empty model list.
    feature_importance_df = pd.concat([pd.DataFrame(), *per_fold_frames], axis=0, ignore_index=True)

    totals = feature_importance_df.groupby('column').sum()
    ranked = totals[['feature_importance']].sort_values('feature_importance', ascending=False)
    order = ranked.index[:50]

    width = max(6, len(order) * .4)
    fig, ax = plt.subplots(figsize=(width, 7))
    sns.boxenplot(data=feature_importance_df, x='column', y='feature_importance', order=order, ax=ax, palette='viridis')
    ax.tick_params(axis='x', rotation=90)
    ax.grid()
    fig.tight_layout()
    return fig, ax

In [None]:
# Plot the importances of the first LightGBM run (the `models` list).
fig, ax = visualize_importance(models, train_x)

# pred

In [None]:
# Shared prediction pipeline for every model list
def _mean_fold_price(fold_models):
    """Average the fold models' test predictions and map them back to prices.

    Predictions are averaged on the log1p scale, negative values are set to
    0, and ``expm1`` undoes the log1p applied to the training target.
    """
    fold_preds = np.array([model.predict(test_x.values) for model in fold_models])
    mean_log_pred = np.mean(fold_preds, axis=0)
    mean_log_pred = np.where(mean_log_pred < 0, 0, mean_log_pred)
    return np.expm1(mean_log_pred)


# predict lgb
pred_lgb1 = _mean_fold_price(models)
pred_lgb2 = _mean_fold_price(models_lgb2)
pred_lgb3 = _mean_fold_price(models_lgb3)

# seed average lgb
pred_lgb = (pred_lgb1+pred_lgb2+pred_lgb3)/3

# predict catboost
pred_cb1 = _mean_fold_price(models_cb)
pred_cb2 = _mean_fold_price(models_cb2)
pred_cb3 = _mean_fold_price(models_cb3)

# seed average catboost
pred_cb = (pred_cb1+pred_cb2+pred_cb3)/3

# predict xgb
pred_xgb1 = _mean_fold_price(models_xgb)
pred_xgb2 = _mean_fold_price(models_xgb2)
pred_xgb3 = _mean_fold_price(models_xgb3)

# seed average xgb
pred_xgb = (pred_xgb1+pred_xgb2+pred_xgb3)/3


# equal-weight ensemble of the three model families
pred_em = (pred_lgb+pred_cb+pred_xgb)/3

In [None]:
# Peek at the first rows of the submission template.
submission.head()

In [None]:
# Write the submission file. The file name is output_dir + the literal text
# "date" + the date tag + "_simple_submission.csv".
# NOTE(review): the LightGBM seed average (pred_lgb) is submitted here, not
# the three-family ensemble pred_em -- confirm this is intended.
date="20210402_2"
submission["LandPrice"] = pred_lgb
submission.to_csv(output_dir + "date" + date +'_simple_submission.csv', index=False)

In [None]:
# Compare the distribution of the test predictions with the out-of-fold
# predictions of the first LightGBM run, both on the log1p scale
# (pred_lgb is in price units, so it is log1p-transformed again here).
fig, ax = plt.subplots(figsize=(8, 8))
sns.histplot(np.log1p(pred_lgb), label='Test Predict', ax=ax, color='black')
sns.histplot(oof, label='Out Of Fold', ax=ax, color='C1')
ax.legend()
ax.grid()