In [1]:
TRAIN_SALES_DATA_PATH = "data/train_sales_data.csv"
TRAIN_SEARCH_DATA_PATH = "data/train_search_data.csv"
TRAIN_USER_REPLY_DATA_PATH = "data/train_user_reply_data.csv"
TEST_PATH = "data/evaluation_public.csv"

# import re
# import gc
# import os
# import csv
import time
import math
import random
import pickle
import datetime
import pandas as pd
import numpy as np
from tqdm import tqdm, tqdm_notebook, trange
from sklearn import preprocessing
from scipy import stats

from sklearn.tree import ExtraTreeRegressor
import lightgbm as lgb
import xgboost as xgb
# import catboost as cb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
def get_data(df, data_type):
    if data_type == "train":
        return df[df.regYear == 2016]
    elif data_type == "test":
        return df[df.regYear == 2017]

In [3]:
train_sales = pd.read_csv(TRAIN_SALES_DATA_PATH)
train_search = pd.read_csv(TRAIN_SEARCH_DATA_PATH)
train_user = pd.read_csv(TRAIN_USER_REPLY_DATA_PATH)
test_data = pd.read_csv(TEST_PATH)

train_sales.salesVolume = train_sales.salesVolume.apply(lambda x: np.log(1+x))

In [4]:
# 特征工程
def cal_basic_fea(df:pd.DataFrame, cal_col:str, stat_dim:list, data_type:str) -> pd.DataFrame:
    """
    计算原始特征、周期特征、趋势特征
    """
    train_sales_data = get_data(train_sales, data_type)

    name_prefix = "_".join(stat_dim) + "_%s"%cal_col
    drop_name = "level_%d"%len(stat_dim)

    # 原始特征
    feature_data = train_sales_data.groupby(stat_dim)[cal_col].apply(lambda x: x.sum()).unstack(level=-1)
    feature_data.columns = [name_prefix + "_%d"%x for x in feature_data.columns.ravel()]
    feature_data = feature_data.reset_index()

    # 周期特征
    ## shift_div
    tmp_df = train_sales_data.groupby(stat_dim)[cal_col].apply(lambda x: x.sum()).groupby(stat_dim[:-1]).apply(lambda x: x / x.shift(1)).reset_index()

    tmp_df = tmp_df.rename(columns={"salesVolume":"shift_div"})

    train_sales_data = pd.merge(train_sales_data, tmp_df, on=stat_dim, how="left")

    tmp_df = train_sales_data.dropna().groupby(stat_dim).shift_div.apply(lambda x: x.sum()).unstack(level=-1)

    tmp_df.columns = [name_prefix + "_shift_div_%d"%x for x in tmp_df.columns.ravel()]

    feature_data = pd.merge(feature_data, tmp_df, on=stat_dim[:-1], how="left")

    ## shift_sub
    tmp_df = train_sales_data.groupby(stat_dim)[cal_col].apply(lambda x: x.sum()).groupby(stat_dim[:-1]).apply(lambda x: x - x.shift(1)).reset_index()

    tmp_df = tmp_df.rename(columns={"salesVolume":"shift_sub"})

    train_sales_data = pd.merge(train_sales_data, tmp_df, on=stat_dim, how="left")

    tmp_df = train_sales_data.dropna().groupby(stat_dim).shift_sub.apply(lambda x: x.sum()).unstack(level=-1)

    tmp_df.columns = [name_prefix + "_shift_sub_%d"%x for x in tmp_df.columns.ravel()]

    feature_data = pd.merge(feature_data, tmp_df, on=stat_dim[:-1], how="left")

    # 趋势特征
    ## shift_div
    tmp_df = train_sales_data.groupby(stat_dim)["shift_div"].apply(lambda x: x.sum()).groupby(stat_dim[:-1]).apply(lambda x: x / x.shift(1)).reset_index()

    tmp_df = tmp_df.rename(columns={"shift_div":"shift_2_div"})

    train_sales_data = pd.merge(train_sales_data, tmp_df, on=stat_dim, how="left")

    tmp_df = train_sales_data.dropna().groupby(stat_dim).shift_div.apply(lambda x: x.sum()).unstack(level=-1)

    tmp_df.columns = [name_prefix + "_shift_2_div_%d"%x for x in tmp_df.columns.ravel()]

    feature_data = pd.merge(feature_data, tmp_df, on=stat_dim[:-1], how="left")

    ## shift_sub
    tmp_df = train_sales_data.groupby(stat_dim)["shift_sub"].apply(lambda x: x.sum()).groupby(stat_dim[:-1]).apply(lambda x: x - x.shift(1)).reset_index()

    tmp_df = tmp_df.rename(columns={"shift_sub":"shift_2_sub"})

    train_sales_data = pd.merge(train_sales_data, tmp_df, on=stat_dim, how="left")

    tmp_df = train_sales_data.dropna().groupby(stat_dim).shift_sub.apply(lambda x: x.sum()).unstack(level=-1)

    tmp_df.columns = [name_prefix + "_shift_2_sub_%d"%x for x in tmp_df.columns.ravel()]

    feature_data = pd.merge(feature_data, tmp_df, on=stat_dim[:-1], how="left")
    return feature_data


In [5]:
def cal_windows_fea(df:pd.DataFrame, cal_col:str, stat_dim:list, data_type:str) -> pd.DataFrame:
    """
    计算滑窗特征
    """
    train_sales_data = get_data(df, data_type)

    name_prefix = "_".join(stat_dim) + "_%s"%cal_col

    # 滑窗特征
    ## mean
    feature_data = train_sales_data.groupby(stat_dim)[cal_col].apply(lambda x: x.sum()).groupby(stat_dim[:-1]).rolling(3).mean()

    feature_data = feature_data.dropna().unstack(level=-1)

    if len(stat_dim) == 3:
        feature_data.index = feature_data.index.droplevel(0)
        feature_data.index = feature_data.index.droplevel(0)
    elif len(stat_dim) == 2:
        feature_data.index = feature_data.index.droplevel(0)


    feature_data.reset_index(inplace=True)
    feature_data = feature_data.rename(columns={k:"%s_rolling_mean_%d"%(name_prefix, k) for k in range(13)})

    ## std
    tmp_df = train_sales_data.groupby(stat_dim)[cal_col].apply(lambda x: x.sum()).groupby(stat_dim[:-1]).rolling(3).std()

    tmp_df = tmp_df.dropna().unstack(level=-1)

    if len(stat_dim) == 3:
        tmp_df.index = tmp_df.index.droplevel(0)
        tmp_df.index = tmp_df.index.droplevel(0)
    elif len(stat_dim) == 2:
        tmp_df.index = tmp_df.index.droplevel(0)


    tmp_df.reset_index(inplace=True)
    tmp_df = tmp_df.rename(columns={k:"%s_rolling_std_%d"%(name_prefix, k) for k in range(13)})

    feature_data = pd.merge(feature_data, tmp_df, on=stat_dim[:-1], how="left")
    
    ## var
    tmp_df = train_sales_data.groupby(stat_dim)[cal_col].apply(lambda x: x.sum()).groupby(stat_dim[:-1]).rolling(3).var()

    tmp_df = tmp_df.dropna().unstack(level=-1)

    if len(stat_dim) == 3:
        tmp_df.index = tmp_df.index.droplevel(0)
        tmp_df.index = tmp_df.index.droplevel(0)
    elif len(stat_dim) == 2:
        tmp_df.index = tmp_df.index.droplevel(0)


    tmp_df.reset_index(inplace=True)
    tmp_df = tmp_df.rename(columns={k:"%s_rolling_std_%d"%(name_prefix, k) for k in range(13)})

    feature_data = pd.merge(feature_data, tmp_df, on=stat_dim[:-1], how="left")
    

    ## sum
    tmp_df = train_sales_data.groupby(stat_dim)[cal_col].apply(lambda x: x.sum()).groupby(stat_dim[:-1]).rolling(3).sum()

    tmp_df = tmp_df.dropna().unstack(level=-1)

    if len(stat_dim) == 3:
        tmp_df.index = tmp_df.index.droplevel(0)
        tmp_df.index = tmp_df.index.droplevel(0)
    elif len(stat_dim) == 2:
        tmp_df.index = tmp_df.index.droplevel(0)

    tmp_df.reset_index(inplace=True)
    tmp_df = tmp_df.rename(columns={k:"%s_rolling_sum_%d"%(name_prefix, k) for k in range(13)})

    feature_data = pd.merge(feature_data, tmp_df, on=stat_dim[:-1], how="left")
    
    # 滑窗特征2222222222222222
    ## mean
    feature_data = train_sales_data.groupby(stat_dim)[cal_col].apply(lambda x: x.sum()).groupby(stat_dim[:-1]).rolling(4).mean()

    feature_data = feature_data.dropna().unstack(level=-1)

    if len(stat_dim) == 3:
        feature_data.index = feature_data.index.droplevel(0)
        feature_data.index = feature_data.index.droplevel(0)
    elif len(stat_dim) == 2:
        feature_data.index = feature_data.index.droplevel(0)


    feature_data.reset_index(inplace=True)
    feature_data = feature_data.rename(columns={k:"%s_rolling_mean_%d"%(name_prefix, k) for k in range(13)})

    ## std
    tmp_df = train_sales_data.groupby(stat_dim)[cal_col].apply(lambda x: x.sum()).groupby(stat_dim[:-1]).rolling(4).std()

    tmp_df = tmp_df.dropna().unstack(level=-1)

    if len(stat_dim) == 3:
        tmp_df.index = tmp_df.index.droplevel(0)
        tmp_df.index = tmp_df.index.droplevel(0)
    elif len(stat_dim) == 2:
        tmp_df.index = tmp_df.index.droplevel(0)


    tmp_df.reset_index(inplace=True)
    tmp_df = tmp_df.rename(columns={k:"%s_rolling_std_%d"%(name_prefix, k) for k in range(13)})

    feature_data = pd.merge(feature_data, tmp_df, on=stat_dim[:-1], how="left")
    


    ## sum
    tmp_df = train_sales_data.groupby(stat_dim)[cal_col].apply(lambda x: x.sum()).groupby(stat_dim[:-1]).rolling(4).sum()

    tmp_df = tmp_df.dropna().unstack(level=-1)

    if len(stat_dim) == 3:
        tmp_df.index = tmp_df.index.droplevel(0)
        tmp_df.index = tmp_df.index.droplevel(0)
    elif len(stat_dim) == 2:
        tmp_df.index = tmp_df.index.droplevel(0)

    tmp_df.reset_index(inplace=True)
    tmp_df = tmp_df.rename(columns={k:"%s_rolling_sum_%d"%(name_prefix, k) for k in range(13)})

    feature_data = pd.merge(feature_data, tmp_df, on=stat_dim[:-1], how="left")
    return feature_data

In [6]:
# 建立车型与型号的对应字典
model2type = train_sales[["model", "bodyType"]].drop_duplicates().set_index("model").to_dict()["bodyType"]
# 城市+车
train_basic_fea = cal_basic_fea(train_sales, "salesVolume", ["adcode", "model", "regMonth"], "train")
# 城市
tmp_df = cal_basic_fea(train_sales, "salesVolume", ["adcode", "regMonth"], "train")
train_basic_fea = pd.merge(train_basic_fea, tmp_df, on="adcode", how="left")
# 车
tmp_df = cal_basic_fea(train_sales, "salesVolume", ["model", "regMonth"], "train")
train_basic_fea = pd.merge(train_basic_fea, tmp_df, on="model", how="left")
# 城市+车型
tmp_df = cal_basic_fea(train_sales, "salesVolume", ["adcode", "bodyType", "regMonth"], "train")
train_basic_fea["bodyType"] = train_basic_fea.model.apply(lambda x: model2type[x])
train_basic_fea = pd.merge(train_basic_fea, tmp_df, on=["adcode", "bodyType"], how="left")


# cal_windows_fea
# 城市+车
train_windows_fea = cal_windows_fea(train_sales, cal_col="salesVolume", stat_dim=["adcode", "model", "regMonth"], data_type="train")
# 城市
tmp_df = cal_windows_fea(train_sales, "salesVolume", ["adcode", "regMonth"], "train")
train_windows_fea = pd.merge(train_windows_fea, tmp_df, on="adcode", how="left")
# 车
tmp_df = cal_windows_fea(train_sales, "salesVolume", ["model", "regMonth"], "train")
train_windows_fea = pd.merge(train_windows_fea, tmp_df, on="model", how="left")
# 城市+车型
tmp_df = cal_windows_fea(train_sales, "salesVolume", ["adcode", "bodyType", "regMonth"], "train")
train_windows_fea["bodyType"] = train_windows_fea.model.apply(lambda x: model2type[x])
train_windows_fea = pd.merge(train_windows_fea, tmp_df, on=["adcode", "bodyType"], how="left")

In [7]:
train_data = pd.merge(train_basic_fea, train_windows_fea, on=["adcode", "model", "bodyType"], how="left")
train_data.head()

Unnamed: 0,adcode,model,adcode_model_regMonth_salesVolume_1,adcode_model_regMonth_salesVolume_2,adcode_model_regMonth_salesVolume_3,adcode_model_regMonth_salesVolume_4,adcode_model_regMonth_salesVolume_5,adcode_model_regMonth_salesVolume_6,adcode_model_regMonth_salesVolume_7,adcode_model_regMonth_salesVolume_8,...,adcode_bodyType_regMonth_salesVolume_rolling_std_12,adcode_bodyType_regMonth_salesVolume_rolling_sum_4,adcode_bodyType_regMonth_salesVolume_rolling_sum_5,adcode_bodyType_regMonth_salesVolume_rolling_sum_6,adcode_bodyType_regMonth_salesVolume_rolling_sum_7,adcode_bodyType_regMonth_salesVolume_rolling_sum_8,adcode_bodyType_regMonth_salesVolume_rolling_sum_9,adcode_bodyType_regMonth_salesVolume_rolling_sum_10,adcode_bodyType_regMonth_salesVolume_rolling_sum_11,adcode_bodyType_regMonth_salesVolume_rolling_sum_12
0,110000,02aab221aabc03b9,6.684612,6.052089,6.598509,6.302619,6.340359,6.472346,6.618739,6.742881,...,8.657722,639.799585,636.69988,655.841951,654.532863,655.573478,655.266455,647.766958,648.454342,659.300918
1,110000,04e66e578f653ab9,4.912655,4.060443,5.081404,4.955827,4.976734,5.062595,5.111988,5.068904,...,8.657722,639.799585,636.69988,655.841951,654.532863,655.573478,655.266455,647.766958,648.454342,659.300918
2,110000,06880909932890ca,6.771936,5.288267,6.204558,6.063785,6.22059,6.003887,6.376727,6.395262,...,8.657722,639.799585,636.69988,655.841951,654.532863,655.573478,655.266455,647.766958,648.454342,659.300918
3,110000,0797526c057dcf5b,5.736572,4.672829,6.12905,6.363028,6.429719,6.498282,6.548219,6.612041,...,8.657722,639.799585,636.69988,655.841951,654.532863,655.573478,655.266455,647.766958,648.454342,659.300918
4,110000,12f8b7e14947c34d,6.390241,5.393628,5.849325,5.83773,5.926926,5.849325,6.063785,5.986452,...,8.657722,639.799585,636.69988,655.841951,654.532863,655.573478,655.266455,647.766958,648.454342,659.300918


In [26]:
vaild_data.shape

(1320, 14)

In [8]:
# 合并特征
train_data = pd.merge(train_basic_fea, train_windows_fea, on=["adcode", "model", "bodyType"], how="left")
vaild_data = get_data(train_sales, "test").groupby(["adcode", "model"])["salesVolume"].apply(lambda x: pd.DataFrame(np.array(x)).T).reset_index().drop("level_2", axis=1)

In [9]:




# 模型
le_model = preprocessing.LabelEncoder()
le_bodyType = preprocessing.LabelEncoder()

le_model.fit(train_data.model)
le_bodyType.fit(train_data.bodyType)

lgb_params = {
    "num_leaves":32,
    "reg_alpha":1,
    "reg_lambda":0.1,
    "objective":'mse',
    "max_depth": 4,
    "learning_rate":0.01,
    "min_child_samples":5,
    "random_state":2019,
    "n_estimators":4000,
    "subsample":0.8,
    "colsample_bytree":0.8
}

df_train_columns = [c for c in train_data.columns if c not in []]
print("df_train_columns:",df_train_columns)
cate_fea = ["adcode", "model", "bodyType"]
train_data.model = le_model.transform(train_data.model)
train_data.bodyType = le_bodyType.transform(train_data.bodyType)
print(train_data.shape)
print(len(df_train_columns))


y_score = []    # 交叉验证
cv_pred = []    # 各折的预测值
predictions = 0
feature_importance_df = pd.DataFrame()
skf = KFold(n_splits=5, random_state=random.randint(100, 10000), shuffle=True)


label_0 = vaild_data[0]
label_1 = vaild_data[1]
label_2 = vaild_data[2]
label_3 = vaild_data[3]


for fold_, (trn_idx, val_idx) in enumerate(skf.split(train_data, label_0)):
    print("fold {}".format(fold_))
    trn_data = lgb.Dataset(train_data.iloc[trn_idx][df_train_columns], label=label_0.iloc[trn_idx])
    val_data = lgb.Dataset(train_data.iloc[val_idx][df_train_columns], label=label_0.iloc[val_idx])

    result_df = train_data.iloc[val_idx][["adcode", "model"]]

    gbm_1 = lgb.train(lgb_params,
                    trn_data,
                    # init_model=gbm,
                    num_boost_round=150000,
                    valid_sets=[trn_data, val_data],
                    early_stopping_rounds=200,
                    verbose_eval=200,
                    categorical_feature=cate_fea)     # 训练

    trn_data = lgb.Dataset(train_data.iloc[trn_idx][df_train_columns], label=label_1.iloc[trn_idx])
    val_data = lgb.Dataset(train_data.iloc[val_idx][df_train_columns], label=label_1.iloc[val_idx])

    gbm_2 = lgb.train(lgb_params,
                    trn_data,
                    # init_model=gbm,
                    num_boost_round=150000,
                    valid_sets=[trn_data, val_data],
                    early_stopping_rounds=200,
                    verbose_eval=200,
                    categorical_feature=cate_fea)     # 训练

    trn_data = lgb.Dataset(train_data.iloc[trn_idx][df_train_columns], label=label_2.iloc[trn_idx])
    val_data = lgb.Dataset(train_data.iloc[val_idx][df_train_columns], label=label_2.iloc[val_idx])

    gbm_3 = lgb.train(lgb_params,
                    trn_data,
                    # init_model=gbm,
                    num_boost_round=3000,
                    valid_sets=[trn_data, val_data],
                    early_stopping_rounds=200,
                    verbose_eval=200,
                    categorical_feature=cate_fea)     # 训练

    trn_data = lgb.Dataset(train_data.iloc[trn_idx][df_train_columns], label=label_3.iloc[trn_idx])
    val_data = lgb.Dataset(train_data.iloc[val_idx][df_train_columns], label=label_3.iloc[val_idx])

    gbm_4 = lgb.train(lgb_params,
                    trn_data,
                    # init_model=gbm,
                    num_boost_round=150000,
                    valid_sets=[trn_data, val_data],
                    early_stopping_rounds=200,
                    verbose_eval=200,
                    categorical_feature=cate_fea)     # 训练

    result_df["y_pred_1"] = gbm_1.predict(train_data.iloc[val_idx][df_train_columns])
    result_df["y_pred_2"] = gbm_2.predict(train_data.iloc[val_idx][df_train_columns])
    result_df["y_pred_3"] = gbm_3.predict(train_data.iloc[val_idx][df_train_columns])
    result_df["y_pred_4"] = gbm_4.predict(train_data.iloc[val_idx][df_train_columns])
    # break

result_df["y_true_1"] = label_0
result_df["y_true_2"] = label_1
result_df["y_true_3"] = label_2
result_df["y_true_4"] = label_3

# 预测
# 基础特征
# 城市+车
test_basic_fea = cal_basic_fea(train_sales, "salesVolume", ["adcode", "model", "regMonth"], "test")
# 城市
tmp_df = cal_basic_fea(train_sales, "salesVolume", ["adcode", "regMonth"], "test")
test_basic_fea = pd.merge(test_basic_fea, tmp_df, on="adcode", how="left")
# 车
tmp_df = cal_basic_fea(train_sales, "salesVolume", ["model", "regMonth"], "test")
test_basic_fea = pd.merge(test_basic_fea, tmp_df, on="model", how="left")
# 城市+车型
tmp_df = cal_basic_fea(train_sales, "salesVolume", ["adcode", "bodyType", "regMonth"], "test")
test_basic_fea["bodyType"] = test_basic_fea.model.apply(lambda x: model2type[x])
test_basic_fea = pd.merge(test_basic_fea, tmp_df, on=["adcode", "bodyType"], how="left")

# 滑窗特征
# 城市+车
test_windows_fea = cal_windows_fea(train_sales, cal_col="salesVolume", stat_dim=["adcode", "model", "regMonth"], data_type="test")
# 城市
tmp_df = cal_windows_fea(train_sales, "salesVolume", ["adcode", "regMonth"], "test")
test_windows_fea = pd.merge(test_windows_fea, tmp_df, on="adcode", how="left")
# 车
tmp_df = cal_windows_fea(train_sales, "salesVolume", ["model", "regMonth"], "test")
test_windows_fea = pd.merge(test_windows_fea, tmp_df, on="model", how="left")
# 城市+车型
tmp_df = cal_windows_fea(train_sales, "salesVolume", ["adcode", "bodyType", "regMonth"], "test")
test_windows_fea["bodyType"] = test_windows_fea.model.apply(lambda x: model2type[x])
test_windows_fea = pd.merge(test_windows_fea, tmp_df, on=["adcode", "bodyType"], how="left")

# 合并
test_data = pd.merge(test_basic_fea, test_windows_fea, on=["adcode", "model", "bodyType"], how="left")

test_data.model = le_model.transform(test_data.model)
test_data.bodyType = le_bodyType.transform(test_data.bodyType)

y_pred_1 = gbm_1.predict(test_data[df_train_columns])
y_pred_2 = gbm_2.predict(test_data[df_train_columns])
y_pred_3 = gbm_3.predict(test_data[df_train_columns])
y_pred_4 = gbm_4.predict(test_data[df_train_columns])

y_pred_1 = (np.e ** y_pred_1 - 1).astype(int)
y_pred_2 = (np.e ** y_pred_2 - 1).astype(int)
y_pred_3 = (np.e ** y_pred_3 - 1).astype(int)
y_pred_4 = (np.e ** y_pred_4 - 1).astype(int)

result_df = test_basic_fea[["adcode", "model"]]
result_df["y_pred_1"] = y_pred_1
result_df["y_pred_2"] = y_pred_2
result_df["y_pred_3"] = y_pred_3
result_df["y_pred_4"] = y_pred_4



df_train_columns: ['adcode', 'model', 'adcode_model_regMonth_salesVolume_1', 'adcode_model_regMonth_salesVolume_2', 'adcode_model_regMonth_salesVolume_3', 'adcode_model_regMonth_salesVolume_4', 'adcode_model_regMonth_salesVolume_5', 'adcode_model_regMonth_salesVolume_6', 'adcode_model_regMonth_salesVolume_7', 'adcode_model_regMonth_salesVolume_8', 'adcode_model_regMonth_salesVolume_9', 'adcode_model_regMonth_salesVolume_10', 'adcode_model_regMonth_salesVolume_11', 'adcode_model_regMonth_salesVolume_12', 'adcode_model_regMonth_salesVolume_shift_div_2', 'adcode_model_regMonth_salesVolume_shift_div_3', 'adcode_model_regMonth_salesVolume_shift_div_4', 'adcode_model_regMonth_salesVolume_shift_div_5', 'adcode_model_regMonth_salesVolume_shift_div_6', 'adcode_model_regMonth_salesVolume_shift_div_7', 'adcode_model_regMonth_salesVolume_shift_div_8', 'adcode_model_regMonth_salesVolume_shift_div_9', 'adcode_model_regMonth_salesVolume_shift_div_10', 'adcode_model_regMonth_salesVolume_shift_div_11',


(1320, 335)
335
fold 0


New categorical_feature is ['adcode', 'bodyType', 'model']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 200 rounds.
[200]	training's l2: 0.0791586	valid_1's l2: 0.127653
[400]	training's l2: 0.0256935	valid_1's l2: 0.0640422
[600]	training's l2: 0.0162166	valid_1's l2: 0.0516776
[800]	training's l2: 0.0120997	valid_1's l2: 0.0475752
[1000]	training's l2: 0.00948978	valid_1's l2: 0.0460382
[1200]	training's l2: 0.00776839	valid_1's l2: 0.0455201
[1400]	training's l2: 0.00658673	valid_1's l2: 0.0451363
[1600]	training's l2: 0.00562753	valid_1's l2: 0.0449572
[1800]	training's l2: 0.00489398	valid_1's l2: 0.0448303
[2000]	training's l2: 0.0043544	valid_1's l2: 0.04474
[2200]	training's l2: 0.00398942	valid_1's l2: 0.0447454
Early stopping, best iteration is:
[2056]	training's l2: 0.00424143	valid_1's l2: 0.0447138
Training until validation scores don't improve for 200 rounds.
[200]	training's l2: 0.0696984	valid_1's l2: 0.103014
[400]	training's l2: 0.0235283	valid_1's l2: 0.056339
[600]	training's l2: 0.0158953	valid_1's l2: 0.0474621
[800

[1000]	training's l2: 0.00897308	valid_1's l2: 0.060204
[1200]	training's l2: 0.00739045	valid_1's l2: 0.0598475
[1400]	training's l2: 0.00624387	valid_1's l2: 0.0598654
[1600]	training's l2: 0.00538173	valid_1's l2: 0.0597002
Early stopping, best iteration is:
[1529]	training's l2: 0.00565825	valid_1's l2: 0.0596569
Training until validation scores don't improve for 200 rounds.
[200]	training's l2: 0.0705325	valid_1's l2: 0.109221
[400]	training's l2: 0.0226614	valid_1's l2: 0.0591424
[600]	training's l2: 0.0149151	valid_1's l2: 0.052471
[800]	training's l2: 0.0115468	valid_1's l2: 0.0508696
[1000]	training's l2: 0.00940199	valid_1's l2: 0.0503871
[1200]	training's l2: 0.00783556	valid_1's l2: 0.0505419
Early stopping, best iteration is:
[1010]	training's l2: 0.00931228	valid_1's l2: 0.0503751
Training until validation scores don't improve for 200 rounds.
[200]	training's l2: 0.0707011	valid_1's l2: 0.0926085
[400]	training's l2: 0.0239635	valid_1's l2: 0.0456264
[600]	training's l2: 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_gui

In [27]:




# 模型
le_model = preprocessing.LabelEncoder()
le_bodyType = preprocessing.LabelEncoder()

le_model.fit(train_data.model)
le_bodyType.fit(train_data.bodyType)

lgb_params = {
    "num_leaves":32,
    "reg_alpha":1,
    "reg_lambda":0.1,
    "objective":'mse',
    "max_depth": 4,
    "learning_rate":0.01,
    "min_child_samples":5,
    "random_state":random.randint(100, 10000),
    "n_estimators":5000,
    "subsample":0.8,
    "colsample_bytree":0.8
}

df_train_columns = [c for c in train_data.columns if c not in []]
print("df_train_columns:",df_train_columns)
cate_fea = ["adcode", "model", "bodyType"]
train_data.model = le_model.transform(train_data.model)
train_data.bodyType = le_bodyType.transform(train_data.bodyType)
print(train_data.shape)
print(len(df_train_columns))


y_score = []    # 交叉验证
cv_pred = []    # 各折的预测值
predictions = 0
feature_importance_df = pd.DataFrame()
skf = KFold(n_splits=5, random_state=random.randint(100, 10000), shuffle=True)


label_0 = vaild_data[0]
label_1 = vaild_data[1]
label_2 = vaild_data[2]
label_3 = vaild_data[3]


for fold_, (trn_idx, val_idx) in enumerate(skf.split(train_data, label_0)):
    print("fold {}".format(fold_))
    trn_data = lgb.Dataset(train_data.iloc[trn_idx][df_train_columns], label=label_0.iloc[trn_idx])
    val_data = lgb.Dataset(train_data.iloc[val_idx][df_train_columns], label=label_0.iloc[val_idx])

    result_df = train_data.iloc[val_idx][["adcode", "model"]]

    gbm_1 = lgb.train(lgb_params,
                    trn_data,
                    # init_model=gbm,
                    num_boost_round=150000,
                    valid_sets=[trn_data, val_data],
                    early_stopping_rounds=200,
                    verbose_eval=200,
                    categorical_feature=cate_fea)     # 训练

    trn_data = lgb.Dataset(train_data.iloc[trn_idx][df_train_columns], label=label_1.iloc[trn_idx])
    val_data = lgb.Dataset(train_data.iloc[val_idx][df_train_columns], label=label_1.iloc[val_idx])

    gbm_2 = lgb.train(lgb_params,
                    trn_data,
                    # init_model=gbm,
                    num_boost_round=150000,
                    valid_sets=[trn_data, val_data],
                    early_stopping_rounds=200,
                    verbose_eval=200,
                    categorical_feature=cate_fea)     # 训练

    trn_data = lgb.Dataset(train_data.iloc[trn_idx][df_train_columns], label=label_2.iloc[trn_idx])
    val_data = lgb.Dataset(train_data.iloc[val_idx][df_train_columns], label=label_2.iloc[val_idx])

    gbm_3 = lgb.train(lgb_params,
                    trn_data,
                    # init_model=gbm,
                    num_boost_round=3000,
                    valid_sets=[trn_data, val_data],
                    early_stopping_rounds=200,
                    verbose_eval=200,
                    categorical_feature=cate_fea)     # 训练

    trn_data = lgb.Dataset(train_data.iloc[trn_idx][df_train_columns], label=label_3.iloc[trn_idx])
    val_data = lgb.Dataset(train_data.iloc[val_idx][df_train_columns], label=label_3.iloc[val_idx])

    gbm_4 = lgb.train(lgb_params,
                    trn_data,
                    # init_model=gbm,
                    num_boost_round=150000,
                    valid_sets=[trn_data, val_data],
                    early_stopping_rounds=200,
                    verbose_eval=200,
                    categorical_feature=cate_fea)     # 训练

    result_df["y_pred_1"] = gbm_1.predict(train_data.iloc[val_idx][df_train_columns])
    result_df["y_pred_2"] = gbm_2.predict(train_data.iloc[val_idx][df_train_columns])
    result_df["y_pred_3"] = gbm_3.predict(train_data.iloc[val_idx][df_train_columns])
    result_df["y_pred_4"] = gbm_4.predict(train_data.iloc[val_idx][df_train_columns])
    # break

result_df["y_true_1"] = label_0
result_df["y_true_2"] = label_1
result_df["y_true_3"] = label_2
result_df["y_true_4"] = label_3

# 预测
# 基础特征
# 城市+车
test_basic_fea = cal_basic_fea(train_sales, "salesVolume", ["adcode", "model", "regMonth"], "test")
# 城市
tmp_df = cal_basic_fea(train_sales, "salesVolume", ["adcode", "regMonth"], "test")
test_basic_fea = pd.merge(test_basic_fea, tmp_df, on="adcode", how="left")
# 车
tmp_df = cal_basic_fea(train_sales, "salesVolume", ["model", "regMonth"], "test")
test_basic_fea = pd.merge(test_basic_fea, tmp_df, on="model", how="left")
# 城市+车型
tmp_df = cal_basic_fea(train_sales, "salesVolume", ["adcode", "bodyType", "regMonth"], "test")
test_basic_fea["bodyType"] = test_basic_fea.model.apply(lambda x: model2type[x])
test_basic_fea = pd.merge(test_basic_fea, tmp_df, on=["adcode", "bodyType"], how="left")

# 滑窗特征
# 城市+车
test_windows_fea = cal_windows_fea(train_sales, cal_col="salesVolume", stat_dim=["adcode", "model", "regMonth"], data_type="test")
# 城市
tmp_df = cal_windows_fea(train_sales, "salesVolume", ["adcode", "regMonth"], "test")
test_windows_fea = pd.merge(test_windows_fea, tmp_df, on="adcode", how="left")
# 车
tmp_df = cal_windows_fea(train_sales, "salesVolume", ["model", "regMonth"], "test")
test_windows_fea = pd.merge(test_windows_fea, tmp_df, on="model", how="left")
# 城市+车型
tmp_df = cal_windows_fea(train_sales, "salesVolume", ["adcode", "bodyType", "regMonth"], "test")
test_windows_fea["bodyType"] = test_windows_fea.model.apply(lambda x: model2type[x])
test_windows_fea = pd.merge(test_windows_fea, tmp_df, on=["adcode", "bodyType"], how="left")

# 合并
test_data = pd.merge(test_basic_fea, test_windows_fea, on=["adcode", "model", "bodyType"], how="left")

test_data.model = le_model.transform(test_data.model)
test_data.bodyType = le_bodyType.transform(test_data.bodyType)

y_pred_1 = gbm_1.predict(test_data[df_train_columns])
y_pred_2 = gbm_2.predict(test_data[df_train_columns])
y_pred_3 = gbm_3.predict(test_data[df_train_columns])
y_pred_4 = gbm_4.predict(test_data[df_train_columns])

y_pred_1 = (np.e ** y_pred_1 - 1).astype(int)
y_pred_2 = (np.e ** y_pred_2 - 1).astype(int)
y_pred_3 = (np.e ** y_pred_3 - 1).astype(int)
y_pred_4 = (np.e ** y_pred_4 - 1).astype(int)

result_df = test_basic_fea[["adcode", "model"]]
result_df["y_pred_1"] = y_pred_1
result_df["y_pred_2"] = y_pred_2
result_df["y_pred_3"] = y_pred_3
result_df["y_pred_4"] = y_pred_4



df_train_columns: ['adcode', 'model', 'adcode_model_regMonth_salesVolume_1', 'adcode_model_regMonth_salesVolume_2', 'adcode_model_regMonth_salesVolume_3', 'adcode_model_regMonth_salesVolume_4', 'adcode_model_regMonth_salesVolume_5', 'adcode_model_regMonth_salesVolume_6', 'adcode_model_regMonth_salesVolume_7', 'adcode_model_regMonth_salesVolume_8', 'adcode_model_regMonth_salesVolume_9', 'adcode_model_regMonth_salesVolume_10', 'adcode_model_regMonth_salesVolume_11', 'adcode_model_regMonth_salesVolume_12', 'adcode_model_regMonth_salesVolume_shift_div_2', 'adcode_model_regMonth_salesVolume_shift_div_3', 'adcode_model_regMonth_salesVolume_shift_div_4', 'adcode_model_regMonth_salesVolume_shift_div_5', 'adcode_model_regMonth_salesVolume_shift_div_6', 'adcode_model_regMonth_salesVolume_shift_div_7', 'adcode_model_regMonth_salesVolume_shift_div_8', 'adcode_model_regMonth_salesVolume_shift_div_9', 'adcode_model_regMonth_salesVolume_shift_div_10', 'adcode_model_regMonth_salesVolume_shift_div_11',


(1320, 347)
347
fold 0


New categorical_feature is ['adcode', 'bodyType', 'model']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 200 rounds.
[200]	training's l2: 0.0784653	valid_1's l2: 0.117199
[400]	training's l2: 0.0234233	valid_1's l2: 0.0624278
[600]	training's l2: 0.0146224	valid_1's l2: 0.05196
[800]	training's l2: 0.0110052	valid_1's l2: 0.0484078
[1000]	training's l2: 0.00881217	valid_1's l2: 0.0465347
[1200]	training's l2: 0.00730667	valid_1's l2: 0.0455504
[1400]	training's l2: 0.00614076	valid_1's l2: 0.0451977
[1600]	training's l2: 0.00525662	valid_1's l2: 0.0448401
[1800]	training's l2: 0.00459806	valid_1's l2: 0.0445541
[2000]	training's l2: 0.00413173	valid_1's l2: 0.0445176
[2200]	training's l2: 0.00400036	valid_1's l2: 0.0445261
Early stopping, best iteration is:
[2030]	training's l2: 0.00407556	valid_1's l2: 0.0445
Training until validation scores don't improve for 200 rounds.
[200]	training's l2: 0.070053	valid_1's l2: 0.107385
[400]	training's l2: 0.0226529	valid_1's l2: 0.0600138
[600]	training's l2: 0.0150841	valid_1's l2: 0.0512898
[800]	

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_gui

In [10]:
test_data = pd.read_csv(TEST_PATH)
test_data = test_data.drop("forecastVolum", axis=1)
test_data_1 = pd.merge(test_data.loc[test_data.regMonth == 1], result_df[["adcode", "model", "y_pred_1"]].rename(columns={"y_pred_1":"forecastVolum"}),\
                       how="left", on=["adcode", "model"])
test_data_2 = pd.merge(test_data.loc[test_data.regMonth == 2], result_df[["adcode", "model", "y_pred_2"]].rename(columns={"y_pred_2":"forecastVolum"}),\
                       how="left", on=["adcode", "model"])
test_data_3 = pd.merge(test_data.loc[test_data.regMonth == 3], result_df[["adcode", "model", "y_pred_3"]].rename(columns={"y_pred_3":"forecastVolum"}),\
                       how="left", on=["adcode", "model"])
test_data_4 = pd.merge(test_data.loc[test_data.regMonth == 4], result_df[["adcode", "model", "y_pred_4"]].rename(columns={"y_pred_4":"forecastVolum"}),\
                       how="left", on=["adcode", "model"])
result = pd.concat([test_data_1, test_data_2, test_data_3, test_data_4]).reset_index(drop=True)
result.forecastVolum = result.forecastVolum.astype(int)
result.loc[(result.forecastVolum < 0), "forecastVolum"] = 1
print((result.forecastVolum < 0 ).sum())
result[["id", "forecastVolum"]].to_csv("submit/evaluation_public_2019091602_lgb.csv", index=False)

0
