In [2]:
import pandas as pd
from custom.feature_generators import ReviewedMonth,MinNightsAvailability,PreprocessName,RulebaseName
from lilac.core.blocks import BlocksRunner

features_dir = "data/features"
seed = 18
target_col = "y"
evaluator_flag = "rmsle"

trainer_params = {
    "model_str": "basic",
    "target_col": target_col,
    "params": None,
    "seed" : seed
}

folds_gen_settings={
    "fold_num" : 5,
    "model_str" : "group", # kfold, stratified, group, stratified_group
    "seed": seed,
    "params" : {"target_col": "y", "key_col": "host_id"}
}

model_params  = {
    "model_str": "lgbm_rmsle",
    "target_col": target_col, 
    "params":None, 
    "depth":8,
    "n_estimators":100000, 
    "seed":seed
}


custom_members={
    "reviewed_month": ReviewedMonth,
    "min_nights_availability": MinNightsAvailability,
    "rulebase_name":RulebaseName,
    "preprocess_name":  PreprocessName
}


unused_cols = ["id","host_id","name","last_review","last_review_day","station_name","neighbourhood"]

settings = [
    # カテゴリ系
    #{
    #    "model_str": "pipeline", 
    #    "params": {
    #        "feature_generators":[
    #            {"model_str": "extra_join", "params": {"csv_path":"data/nearest_station.csv","join_on":"id"}},
    #            {"model_str": "target", "params": {"encoder_str":"catb", "target_col": "y","input_cols": ["station_name","room_type","neighbourhood"]}}
    #        ]
    #    }
    #}, 
    #{"model_str": "category", "params":{"encoder_str": "count","input_cols": ["host_id"]}},
    # 自然言語処理
    {"model_str": "datetime","params":{"input_col": "last_review"}},
    {
        "model_str": "pipeline", 
        "params": {
            "feature_generators":[
                {"model_str": "preprocess_name"},
                {"model_str": "wc_vec", "params": {"input_col": "name_preprocessed", "vectorizer_str":"tfidf", "decomposer_str":"svd", "n_components":32,"random_state":42}},
            ]
        }
    },
    {
        "model_str": "pipeline", 
        "params": {
            "feature_generators":[
                {"model_str": "preprocess_name"},
                {"model_str": "wc_vec", "params": {"input_col": "name_preprocessed", "vectorizer_str":"tfidf", "decomposer_str":"nmf", "n_components":10,"random_state":42}},
            ]
        }
    },
    {
        "model_str": "pipeline", 
        "params": {
            "feature_generators":[
                {"model_str": "preprocess_name"},
                {"model_str": "wc_vec", "params": {"input_col": "name_preprocessed", "vectorizer_str":"bow", "decomposer_str":"svd", "n_components":32,"random_state":42}},
            ]
        }
    },
    {
        "model_str": "pipeline", 
        "params": {
            "feature_generators":[
                {"model_str": "preprocess_name"},
                {"model_str": "wc_vec", "params": {"input_col": "name_preprocessed", "vectorizer_str":"bow", "decomposer_str":"nmf", "n_components":10,"random_state":42}},
            ]
        }
    },
    #{"model_str": "wc_vec", "params": {"input_col": "name", "vectorizer_str":"tfidf", "decomposer_str":"svd", "n_components":32,"random_state":42}},
    #{"model_str": "wc_vec", "params": {"input_col": "name", "vectorizer_str":"tfidf", "decomposer_str":"nmf", "n_components":10,"random_state":42}},
    #{"model_str": "wc_vec", "params": {"input_col": "name", "vectorizer_str":"bow", "decomposer_str":"svd", "n_components":32,"random_state":42}},
    #{"model_str": "wc_vec", "params": {"input_col": "name", "vectorizer_str":"bow", "decomposer_str":"nmf", "n_components":10,"random_state":42}},
    #{"model_str": "bert", "params": {"input_col": "name",  "max_len":128}},
    {
        "model_str": "pipeline", 
        "params": {
            "use_previous_cols": True,
            "feature_generators":[
                #{"model_str": "preprocess_name"},
                {"model_str": "bert", "params": {"input_col": "name",  "max_len":128}},
                {"model_str": "dec", "params": {"decomposer_str": "pca","n_components":20,"random_state":42,"col_mark":"bert"}},
            ]
        }
    },
    {
        "model_str": "pipeline", 
        "params": {
            "use_previous_cols": True,
            "feature_generators":[
                #{"model_str": "preprocess_name"},
                {"model_str": "bert", "params": {"input_col": "name",  "max_len":128}},
                {"model_str": "dec", "params": {"decomposer_str": "umap","n_components":3,"random_state":42,"col_mark":"bert"}},
            ]
        }
    },
    {
        "model_str": "pipeline", 
        "params": {
            "feature_generators":[
                {"model_str": "preprocess_name"},
                {"model_str": "rulebase_name"}
            ]
        }
    },
    # 駅関連
    {"model_str": "extra_join", "params": {"csv_path":"data/nearest_station.csv","join_on":"id"}},
    {"model_str": "extra_join", "params": {"csv_path":"data/dist_under_counts.csv","join_on":"id"}},
    {"model_str": "extra_join", "params": {"csv_path":"data/nearest_terminal.csv","join_on":"id"}},
    #{
    #    "model_str": "pipeline", 
    #    "params": {
    #        "use_previous_cols": True,
    #        "feature_generators":[
    #            {"model_str": "extra_join", "params": {"csv_path":"data/dist_under_counts.csv","join_on":"id"}},
    #            {"model_str": "pca", "params": {"n_components":3,"random_state":42,"col_mark":"dist_under_counts"}},
    #        ]
    #    }
    #},
    {"model_str": "cluster", "params": {"input_cols": ["latitude","longitude"], "model_str":"gmm", "n_clusters":10,"random_state":42}},
    {"model_str": "cluster", "params": {"input_cols": ["latitude","longitude"], "model_str":"kmeans", "n_clusters":10,"random_state":42}},
    {
        "model_str": "pipeline", 
        "params": {
            "feature_generators":[
                {"model_str": "extra_join", "params": {"csv_path":"data/nearest_station.csv","join_on":"id"}},
                {"model_str": "cluster", "params": {"input_cols": ["sta_latitude","sta_longitude"], "model_str":"kmeans", "n_clusters":10,"random_state":42}}
            ]
        }
    },
    {
        "model_str": "pipeline", 
        "params": {
            "feature_generators":[
                {"model_str": "extra_join", "params": {"csv_path":"data/nearest_station.csv","join_on":"id"}},
                {"model_str": "cluster", "params": {"input_cols": ["sta_latitude","sta_longitude"], "model_str":"gmm", "n_clusters":10,"random_state":42}}
            ]
        }
    },
    # 集約特徴量
    #{"model_str": "group", "params":{"group_key": "neighbourhood","input_cols": ["minimum_nights", "number_of_reviews", "reviews_per_month","availability_365"]}},
    #{
    #    "model_str": "pipeline", 
    #    "params": {
    #        "feature_generators":[
    #            {"model_str": "extra_join", "params": {"csv_path":"data/nearest_station.csv","join_on":"id"}},
    #            {"model_str": "group", "params":{"group_key": "station_name","input_cols": ["minimum_nights", "number_of_reviews", "reviews_per_month","availability_365"]}},
    #        ]
    #    }
    #},
    #{"model_str": "group", "params":{"group_key": "room_type","input_cols": ["minimum_nights", "number_of_reviews", "reviews_per_month","availability_365"]}},
    #{"model_str": "group", "params":{"group_key": "host_id","input_cols": ["minimum_nights", "number_of_reviews", "reviews_per_month","availability_365"]}},
    {
        "model_str": "pipeline", 
        "params": {
            "use_previous_cols": True,
            "feature_generators":[
                {"model_str": "group", "params":{"group_key": "host_id","input_cols": ["minimum_nights", "number_of_reviews", "availability_365"]}},
                {"model_str": "dec", "params": {"decomposer_str": "umap","n_components":10,"random_state":42,"col_mark":"host_group"}},
            ]
        }
    },
    {
        "model_str": "pipeline", 
        "params": {
            "use_previous_cols": [False,True],
            "feature_generators":[
                {"model_str": "extra_join", "params": {"csv_path":"data/nearest_station.csv","join_on":"id"}},
                {"model_str": "group", "params":{"group_key": "station_name","input_cols": ["minimum_nights", "number_of_reviews", "availability_365"]}},
                {"model_str": "dec", "params": {"decomposer_str": "umap","n_components":10,"random_state":42,"col_mark":"station_group"}},
            ]
        }
    },
    # その他
    {"model_str": "reviewed_month"},
    #{"model_str": "min_nights_availability"},
]


train=pd.read_csv("data/train_data.csv")
test=pd.read_csv("data/test_data.csv")

blocks_runner=BlocksRunner(target_col=target_col,features_dir=features_dir,custom_members=custom_members, features_settings= settings,unused_cols=unused_cols, 
 folds_gen_settings=folds_gen_settings,model_params=model_params,trainer_params=trainer_params,evaluator_flag=evaluator_flag)

output=blocks_runner.run(train,test)
output["score"]


[1/18] DatetimeFeatures (90d1adf771e1730a7212eb9a7a7b0d3e)
Loading DatetimeFeatures_90d1adf771e1730a7212eb9a7a7b0d3e (train)...
Loading DatetimeFeatures_90d1adf771e1730a7212eb9a7a7b0d3e (test)...
[2/18] FeaturesPipeline (d9cb50f6ef3110f7d6a365883a02c485)
Loading PreprocessName_99914b932bd37a50b983c5e7c90ae93b (train)...
Loading PreprocessName_99914b932bd37a50b983c5e7c90ae93b (test)...
Loading WordCountVectorizer_94c0a90739d9f25d7e291daab63b3d06 (train)...
Loading WordCountVectorizer_94c0a90739d9f25d7e291daab63b3d06 (test)...
[3/18] FeaturesPipeline (2786ca5caf7c70f751af1209f785a76b)
Loading PreprocessName_99914b932bd37a50b983c5e7c90ae93b (train)...
Loading PreprocessName_99914b932bd37a50b983c5e7c90ae93b (test)...
Loading WordCountVectorizer_6e5ac083789403e0d9a0c8b137e60a61 (train)...
Loading WordCountVectorizer_6e5ac083789403e0d9a0c8b137e60a61 (test)...
[4/18] FeaturesPipeline (cb9c50185036432e84b6b094b7d5a3a9)
Loading PreprocessName_99914b932bd37a50b983c5e7c90ae93b (train)...
Loading 



[100]	train's rmse: 0.44065	valid's rmse: 0.747533
[200]	train's rmse: 0.329322	valid's rmse: 0.749162
Fold : 2




[100]	train's rmse: 0.446489	valid's rmse: 0.797435
[200]	train's rmse: 0.332839	valid's rmse: 0.798663
Fold : 3




[100]	train's rmse: 0.441901	valid's rmse: 0.765683
[200]	train's rmse: 0.328505	valid's rmse: 0.764742
[300]	train's rmse: 0.257633	valid's rmse: 0.767594
Fold : 4




[100]	train's rmse: 0.443963	valid's rmse: 0.831828
[200]	train's rmse: 0.332197	valid's rmse: 0.836304
Fold : 5




[100]	train's rmse: 0.426362	valid's rmse: 1.03825


0.8384338084447452

In [3]:
from lilac.ensemble.ensemble_runner_factory import EnsembleRunnerFactory

ensemble_params = {
    "target_col":target_col,
    "unused_cols":unused_cols,
    "folds_gen_settings":folds_gen_settings, 
    "trainer_params":trainer_params,
    "use_original_cols": False
}

ensemble_runner=EnsembleRunnerFactory().run(model_str="linear_rmsle",params=ensemble_params)

hoge=ensemble_runner.run([output,output], train, test)
hoge["score"]

Fold : 1
Fold : 2
Fold : 3
Fold : 4
Fold : 5


0.8398010649612698

In [21]:
from lilac.ensemble.stacking_runner import StackingRunner

ensemble_params = {
    "target_col":target_col,
    "unused_cols":unused_cols,
    "folds_gen_settings":folds_gen_settings, 
    "trainer_params":trainer_params,
    "use_original_cols": False
}

stacking_settings = [["avg_rmsle","linear_rmsle"], ["avg_rmsle"]]
stacking_runner=StackingRunner(settings=stacking_settings,shared_params=ensemble_params)

stack_output=stacking_runner.run([output,output])

Layer 1
Fold : 1
Fold : 2
Fold : 3
Fold : 4
Fold : 5
Fold : 1
Fold : 2
Fold : 3
Fold : 4
Fold : 5
Layer 2
Fold : 1
Fold : 2
Fold : 3
Fold : 4
Fold : 5


In [22]:
stack_output[-1][0]

{'oof_pred': [14322.428359608472,
  11509.543701317305,
  16048.71609278716,
  12005.957133586606,
  19301.355678270975,
  26888.93155220242,
  18014.425984131056,
  12597.218230198134,
  5491.8337494918,
  11290.463083092076,
  10945.193125393493,
  13603.419368741048,
  16216.12250445781,
  14085.591735005542,
  14028.818556552807,
  9846.415890196113,
  18250.813623022223,
  27413.946367159857,
  13814.310239144444,
  3641.0965465113973,
  12864.262281321748,
  13682.713832671157,
  16875.746529063756,
  16834.906475545802,
  18613.49515101949,
  18444.196722898865,
  4707.389679419972,
  10479.507289278063,
  10538.302970527257,
  19732.702564591476,
  18339.96117781652,
  18749.188608297452,
  21129.544142912782,
  37546.64122818798,
  13903.760770810444,
  11309.370186918015,
  8221.229960340577,
  13642.740296331787,
  11737.478711528498,
  17338.930452071596,
  9266.367912081654,
  19059.615039221542,
  19502.66533411476,
  11325.389926811848,
  39972.320854451216,
  10317.3621