In [6]:
import pandas as pd
from custom.feature_generators import ReviewedMonth,MinNightsAvailability,PreprocessName,RulebaseName
from tools.core.blocks import BlocksRunner

def main(seed, settings, depth, unused_cols, train, test):
    """Configure a ``BlocksRunner`` for one experiment and run it on train/test.

    Args:
        seed: Seed written into the trainer, fold-generator and model settings.
        settings: Feature-generator settings, forwarded as ``features_settings``.
        depth: Value of the model's ``depth`` parameter.
        unused_cols: Forwarded unchanged as the runner's ``unused_cols``.
        train: Training data handed to ``BlocksRunner.run``.
        test: Test data handed to ``BlocksRunner.run``.

    Returns:
        Whatever ``BlocksRunner.run(train, test)`` returns.
    """
    target_col = "y"

    runner = BlocksRunner(
        target_col=target_col,
        features_dir="data/features",
        # Project-specific generators, addressable by these keys from `settings`.
        custom_members={
            "reviewed_month": ReviewedMonth,
            "min_nights_availability": MinNightsAvailability,
            "rulebase_name": RulebaseName,
            "preprocess_name": PreprocessName,
        },
        features_settings=settings,
        unused_cols=unused_cols,
        folds_gen_factory_settings={
            "model_str": "group",  # kfold, stratified, group, stratified_group
            "params": {
                "fold_num": 5,
                "seed": seed,
                # Literal kept as written; it currently equals `target_col`.
                "target_col": "y",
                "key_col": "host_id",
            },
        },
        model_factory_settings={
            "model_str": "lgbm_rmsle",
            "params": {
                "depth": depth,
                "n_estimators": 100000,
                "seed": seed,
            },
        },
        trainer_factory_settings={
            "model_str": "basic",
            "params": {
                "target_col": target_col,
                "seed": seed,
            },
        },
        evaluator_flag="rmsle",
    )

    return runner.run(train, test)



In [7]:
seed = 18
depth = 8
unused_cols = ["id", "host_id", "name", "last_review", "last_review_day"]


def _preprocess_step():
    """Return a fresh ``preprocess_name`` generator spec (a new dict on every call)."""
    return {"model_str": "preprocess_name"}


def _pipeline(feature_generators, use_prev_only=None):
    """Wrap generator specs in a ``pipeline`` entry.

    ``use_prev_only`` is written only when given, and always before
    ``feature_generators`` so the key order matches the hand-written entries.
    """
    params = {}
    if use_prev_only is not None:
        params["use_prev_only"] = use_prev_only
    params["feature_generators"] = feature_generators
    return {"model_str": "pipeline", "params": params}


def _name_wc_vec(vectorizer_str, decomposer_str, n_components):
    """Pipeline: ``preprocess_name`` followed by ``wc_vec`` on ``name_preprocessed``."""
    return _pipeline([
        _preprocess_step(),
        {"model_str": "wc_vec", "params": {"input_col": "name_preprocessed", "vectorizer_str": vectorizer_str, "decomposer_str": decomposer_str, "n_components": n_components, "random_state": 42}},
    ])


def _name_bert(decomposer_str, n_components):
    """Pipeline (``use_prev_only`` True): ``preprocess_name`` -> ``bert`` -> ``dec`` with col_mark "bert"."""
    return _pipeline(
        [
            _preprocess_step(),
            {"model_str": "bert", "params": {"model_name": "bert-base-multilingual-uncased", "input_col": "name_preprocessed", "max_len": 128}},
            {"model_str": "dec", "params": {"decomposer_str": decomposer_str, "n_components": n_components, "random_state": 42, "col_mark": "bert"}},
        ],
        use_prev_only=True,
    )


def _extra_join(csv_path):
    """``extra_join`` spec that joins ``csv_path`` on the ``id`` column."""
    return {"model_str": "extra_join", "params": {"csv_path": csv_path, "join_on": "id"}}


def _cluster(input_cols, model_str):
    """``cluster`` spec over ``input_cols`` with 10 clusters and random_state 42."""
    return {"model_str": "cluster", "params": {"input_cols": input_cols, "model_str": model_str, "n_clusters": 10, "random_state": 42}}


# Entry order and the key order inside every dict are kept exactly as they were
# written by hand. NOTE(review): the cached feature names in the run log look
# hash-derived, so reordering keys may invalidate the cache — confirm before changing.
settings = [
    # --- Categorical features (all currently disabled) ---
    # _pipeline([_extra_join("data/nearest_station.csv"), {"model_str": "target", "params": {"encoder_str":"catb", "target_col": "y","input_cols": ["station_name","room_type","neighbourhood"]}}]),
    # {"model_str": "category", "params":{"encoder_str": "count","input_cols": ["host_id"]}},

    # --- Natural language processing ---
    {"model_str": "datetime", "params": {"input_col": "last_review"}},
    _name_wc_vec("tfidf", "svd", 32),
    _name_wc_vec("tfidf", "nmf", 10),
    _name_wc_vec("bow", "svd", 32),
    _name_wc_vec("bow", "nmf", 10),
    # Disabled variants that read the raw "name" column instead of "name_preprocessed":
    # {"model_str": "wc_vec", "params": {"input_col": "name", "vectorizer_str":"tfidf", "decomposer_str":"svd", "n_components":32,"random_state":42}},
    # {"model_str": "wc_vec", "params": {"input_col": "name", "vectorizer_str":"tfidf", "decomposer_str":"nmf", "n_components":10,"random_state":42}},
    # {"model_str": "wc_vec", "params": {"input_col": "name", "vectorizer_str":"bow", "decomposer_str":"svd", "n_components":32,"random_state":42}},
    # {"model_str": "wc_vec", "params": {"input_col": "name", "vectorizer_str":"bow", "decomposer_str":"nmf", "n_components":10,"random_state":42}},
    # {"model_str": "bert", "params": {"input_col": "name",  "max_len":128}},
    _name_bert("pca", 20),
    _name_bert("umap", 10),
    _pipeline([_preprocess_step(), {"model_str": "rulebase_name"}]),

    # --- Station-related ---
    _extra_join("data/nearest_station.csv"),
    _extra_join("data/dist_under_counts.csv"),
    _extra_join("data/nearest_terminal.csv"),
    # Disabled:
    # {"model_str": "pipeline", "params": {"use_previous_cols": True, "feature_generators": [_extra_join("data/dist_under_counts.csv"), {"model_str": "pca", "params": {"n_components":3,"random_state":42,"col_mark":"dist_under_counts"}}]}},
    _cluster(["latitude", "longitude"], "gmm"),
    _cluster(["latitude", "longitude"], "kmeans"),
    _pipeline([
        _extra_join("data/nearest_station.csv"),
        _cluster(["sta_latitude", "sta_longitude"], "kmeans"),
    ]),
    _pipeline([
        _extra_join("data/nearest_station.csv"),
        _cluster(["sta_latitude", "sta_longitude"], "gmm"),
    ]),

    # --- Aggregate features ---
    # Disabled:
    # {"model_str": "group", "params":{"group_key": "neighbourhood","input_cols": ["minimum_nights", "number_of_reviews", "reviews_per_month","availability_365"]}},
    # _pipeline([_extra_join("data/nearest_station.csv"), {"model_str": "group", "params":{"group_key": "station_name","input_cols": ["minimum_nights", "number_of_reviews", "reviews_per_month","availability_365"]}}]),
    # {"model_str": "group", "params":{"group_key": "room_type","input_cols": ["minimum_nights", "number_of_reviews", "reviews_per_month","availability_365"]}},
    # {"model_str": "group", "params":{"group_key": "host_id","input_cols": ["minimum_nights", "number_of_reviews", "reviews_per_month","availability_365"]}},
    # _pipeline([{"model_str": "group", "params":{"group_key": "host_id","input_cols": ["minimum_nights", "number_of_reviews", "availability_365"]}}, {"model_str": "dec", "params": {"decomposer_str": "umap","n_components":10,"random_state":42,"col_mark":"host_group"}}], use_prev_only=True),
    _pipeline(
        [
            _extra_join("data/nearest_station.csv"),
            {"model_str": "group", "params": {"group_key": "station_name", "input_cols": ["minimum_nights", "number_of_reviews", "availability_365"]}},
            {"model_str": "dec", "params": {"decomposer_str": "umap", "n_components": 10, "random_state": 42, "col_mark": "station_group"}},
        ],
        use_prev_only=[False, True],
    ),

    # --- Others ---
    {"model_str": "reviewed_month"},
    # {"model_str": "min_nights_availability"},
]
# Read the train and test tables (in that order); later cells reuse both frames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("data/train_data.csv", "data/test_data.csv")
)


# Disabled: the same run driven by the notebook-level `seed`.
#output=main(seed=seed,settings=settings,depth=depth,unused_cols=unused_cols,train=train,test=test)
#output["score"]

In [8]:
# Run the BlocksRunner experiment defined in `main` with the settings configured above.
# NOTE(review): this call pins seed=1 rather than the notebook-level `seed` — confirm that is intentional.
output = main(
    seed=1,
    settings=settings,
    depth=depth,
    unused_cols=unused_cols,
    train=train,
    test=test,
)

[1/17] DatetimeFeatures
Loading DatetimeFeatures_90d1adf771e1730a7212eb9a7a7b0d3e (train)...
Loading DatetimeFeatures_90d1adf771e1730a7212eb9a7a7b0d3e (test)...
[2/17] FeaturesPipeline
Loading WordCountVectorizer_by_pipeline_5c2e5e9f6d8264592dcca4f3af6d1b50 (train)...
Loading WordCountVectorizer_by_pipeline_5c2e5e9f6d8264592dcca4f3af6d1b50 (test)...
[3/17] FeaturesPipeline
Loading WordCountVectorizer_by_pipeline_87d2e943b5b0f59fd4da386de6f0ba51 (train)...
Loading WordCountVectorizer_by_pipeline_87d2e943b5b0f59fd4da386de6f0ba51 (test)...
[4/17] FeaturesPipeline
Loading WordCountVectorizer_by_pipeline_37472898aeef2a25348b77118eb04d65 (train)...
Loading WordCountVectorizer_by_pipeline_37472898aeef2a25348b77118eb04d65 (test)...
[5/17] FeaturesPipeline
Loading WordCountVectorizer_by_pipeline_03cf258cfb18d14d5b780696814ac6ae (train)...
Loading WordCountVectorizer_by_pipeline_03cf258cfb18d14d5b780696814ac6ae (test)...
[6/17] FeaturesPipeline
Loading DecompositionFeatures_by_pipeline_f1bdab22b



[100]	train's rmse: 0.453196	valid's rmse: 0.728179
Fold : 2




[100]	train's rmse: 0.429483	valid's rmse: 0.980015
Fold : 3




[100]	train's rmse: 0.447819	valid's rmse: 0.763117
[200]	train's rmse: 0.337735	valid's rmse: 0.765358
Fold : 4




[100]	train's rmse: 0.447434	valid's rmse: 0.821161
[200]	train's rmse: 0.337396	valid's rmse: 0.82245
Fold : 5




[100]	train's rmse: 0.444912	valid's rmse: 0.771827
[200]	train's rmse: 0.333498	valid's rmse: 0.77039


In [12]:
# Display the "score" entry of the result returned by main.
# NOTE(review): presumably the cross-validation RMSLE, since main passes evaluator_flag="rmsle" — confirm.
output["score"]

0.8185436632898353

In [4]:
from tools.ensemble.stacking_runner import StackingRunner

# Results fed to the stacker.
# NOTE(review): both slots hold the same run, and the displayed stacked score equals
# the single-run score — presumably a placeholder until a second model exists; confirm.
output_list = [output] * 2

target_col = "y"

# Trainer and fold settings shared by the stacking models.
_stack_trainer_settings = {
    "model_str": "basic",
    "params": {
        "target_col": target_col,
        "seed": seed,
    },
}
_stack_folds_settings = {
    "model_str": "group",  # kfold, stratified, group, stratified_group
    "params": {
        "fold_num": 5,
        "seed": seed,
        "target_col": "y",
        "key_col": "host_id",
    },
}

ensemble_params = {
    "target_col": target_col,
    "unused_cols": None,
    "trainer_factory_settings": _stack_trainer_settings,
    "folds_gen_factory_settings": _stack_folds_settings,
    "use_original_cols": False,
}

# A list of lists with the single entry "avg_rmsle".
# NOTE(review): presumably one inner list per stacking layer — confirm against StackingRunner.
stacking_settings = [["avg_rmsle"]]
stacking_runner = StackingRunner(
    settings=stacking_settings,
    shared_params=ensemble_params,
)

stack_output = stacking_runner.run(output_list, train, test)

# Score of the first entry in the last element of the stacking result.
stack_output[-1][0]["score"]

Layer 1
Fold : 1
Fold : 2
Fold : 3
Fold : 4
Fold : 5


0.8185436632898353

In [5]:
from pathlib import Path

# Submission column: the "pred" values of the first entry in the last element of stack_output.
submit_df = pd.DataFrame({'y': stack_output[-1][0]["pred"]})

# The DataFrame index is written as the "id" column.
# NOTE(review): unless "pred" carries its own index this is the positional 0..n-1 range,
# not the ids from the test table — confirm this matches the expected submission format.
submit_df.index.name = 'id'

# to_csv does not create missing directories, so make sure the target folder exists first.
submit_path = Path('data/subs/test.csv')
submit_path.parent.mkdir(parents=True, exist_ok=True)
submit_df.to_csv(submit_path)