In [38]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.cluster import KMeans
from sklearn.metrics import mean_squared_error as mse
from tqdm import tqdm, tqdm_notebook
import warnings
warnings.filterwarnings('ignore')

In [50]:


# Read the three training tables and the public evaluation set from the working directory.
train_sales_data = pd.read_csv('train_sales_data.csv')
train_search_data = pd.read_csv('train_search_data.csv')
train_user_reply_data = pd.read_csv('train_user_reply_data.csv')
test = pd.read_csv('evaluation_public.csv')

# Left-join search data (per province/adcode/model/month) and user-reply data
# (per model/month) onto the sales rows, so every sales row is kept.
search_keys = ['province', 'adcode', 'model', 'regYear', 'regMonth']
reply_keys = ['model', 'regYear', 'regMonth']
data = train_sales_data.merge(train_search_data, how='left', on=search_keys)
data = data.merge(train_user_reply_data, how='left', on=reply_keys)

In [51]:
def score(data):
    """Return ``1 - mean(NRMSE)`` over all (adcode, model) groups.

    For every (adcode, model) group the RMSE between ``pred_label`` and
    ``label`` is divided by the group's mean ``label``; the score is one minus
    the average of those normalised errors.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``adcode``, ``model``, ``pred_label`` and
        ``label``.

    Returns
    -------
    float
        ``1 - mean(NRMSE)``; ``nan`` when ``data`` has no rows.

    Raises
    ------
    ValueError
        If ``pred_label`` or ``label`` contains missing values.
    """
    # Fail loudly on missing values instead of letting groupby silently skip them.
    if data[['pred_label', 'label']].isnull().values.any():
        raise ValueError("score() requires 'pred_label' and 'label' without missing values")

    # One grouping pass: mean squared error and mean label per (adcode, model).
    grouped = data.assign(
        _sq_err=(data['pred_label'] - data['label']) ** 2
    ).groupby(['adcode', 'model'])
    rmse = np.sqrt(grouped['_sq_err'].mean())
    label_mean = grouped['label'].mean()

    # np.mean on the raw array propagates nan/inf (e.g. a group whose mean label is 0).
    nrmse = (rmse / label_mean).to_numpy()
    return 1 - np.mean(nrmse)

In [52]:
# For col, col2 and col3: values lying more than 1.5 * IQR outside the quartiles are
# treated as outliers and replaced with the mean of the lower and upper quartiles.
col, col2, col3 = ['popularity', 'carCommentVolum', 'newsReplyVolum']


def _replace_iqr_outliers(frame, column, whisker=1.5):
    """Overwrite IQR outliers of ``frame[column]`` in place.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame that is modified in place.
    column : str
        Name of the numeric column to clean.
    whisker : float, default 1.5
        Multiple of the inter-quartile range that defines the accepted band.

    Returns
    -------
    numpy.ndarray
        The (25th, 75th) percentiles used for the band and the fill value.
    """
    # nanpercentile: a missing value must not turn both quartiles into NaN,
    # which would overwrite the entire column with NaN.
    quartiles = np.nanpercentile(frame[column], (25, 75))
    margin = whisker * (quartiles[1] - quartiles[0])
    values = frame[column]
    # Comparisons with NaN are False, so missing entries are left untouched.
    outside = (values < quartiles[0] - margin) | (values > quartiles[1] + margin)
    frame.loc[outside, column] = quartiles.mean()
    return quartiles


for outlier_col in (col, col2, col3):
    _replace_iqr_outliers(data, outlier_col)

# Sales statistics: mean salesVolume per (bodyType, regYear), (adcode, regYear)
# and (model, regYear), broadcast back onto every row of the group.
for stat_col, group_key in (('bt_ry_mean', 'bodyType'),
                            ('ad_ry_mean', 'adcode'),
                            ('md_ry_mean', 'model')):
    data[stat_col] = data.groupby([group_key, 'regYear'])['salesVolume'].transform('mean')

# data.head()
# data_temp = data[['adcode','model','regMonth','regYear','bt_ry_mean','ad_ry_mean','md_ry_mean']].to_csv("sanlie.csv",index = False)

In [53]:
# Stack the evaluation rows under the training rows so both share one feature pipeline.
data = pd.concat([data, test], ignore_index=True)
data['label'] = data['salesVolume']
data['id'] = data['id'].fillna(0).astype(int)
data = data.drop(columns=['salesVolume', 'forecastVolum'])

# Fill in the body type of the test rows by looking it up per model in the sales table.
body_type_by_model = train_sales_data.drop_duplicates('model').set_index('model')['bodyType']
data['bodyType'] = data['model'].map(body_type_by_model)

# Integer-encode bodyType and model in order of first appearance.
for cat_col in ['bodyType', 'model']:
    levels = data[cat_col].unique()
    codes = dict(zip(levels, range(data[cat_col].nunique())))
    data[cat_col] = data[cat_col].map(codes)

# Month counter relative to 2016 (January 2016 -> 1).
data['mt'] = data['regYear'].sub(2016).mul(12).add(data['regMonth'])

# Names of the lagged-label features; filled by the training cells.
shift_feat = []
# Numeric keys identifying a (region, model) series and a (region, model, month) cell.
data['model_adcode'] = data['adcode'] + data['model']
data['model_adcode_mt'] = data['model_adcode'] * 100 + data['mt']

In [43]:
def Cluster(df_, n_clusters=3, random_state=2019):
    """Add a KMeans ``cluster_label`` column to ``df_`` and return it.

    Rows are clustered on the columns (model, adcode, mt, label); a missing
    ``label`` is treated as 0 for the purpose of clustering only.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Frame with the columns ``model``, ``adcode``, ``mt`` and ``label``.
        It is modified in place.
    n_clusters : int, default 3
        Number of KMeans clusters.
    random_state : int or None, default 2019
        Seed passed to KMeans so the cluster ids are reproducible between
        runs; pass ``None`` for unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        The same ``df_`` object, now carrying ``cluster_label``.
    """
    # NOTE(review): the clustering input includes `label`, so using `cluster_label`
    # as a model feature exposes target information - confirm this is intended.
    cluster_input = df_[['model', 'adcode', 'mt', 'label']].copy()
    # Numeric 0, not the string '0': keeps the column float instead of object dtype.
    cluster_input['label'] = cluster_input['label'].fillna(0)
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
    kmeans.fit(cluster_input.values)
    df_['cluster_label'] = kmeans.labels_
    return df_
data = Cluster(data)

In [34]:
# Original code
# Fill in the feature values that are missing for the test rows.
is_jan_apr = data['regMonth'].isin([1,2,3,4])
rows_2016 = data['regYear'].isin([2016]) & is_jan_apr
rows_2017 = data['regYear'].isin([2017]) & is_jan_apr
for col in ['carCommentVolum','newsReplyVolum','popularity','bt_ry_mean','ad_ry_mean', 'md_ry_mean']:
    lgb_col_na = pd.isnull(data[col])
    # Zeros become 1 so the year-over-year ratio below never divides by zero.
    data[col] = data[col].replace(0,1)
    # Skip when nothing is missing (e.g. the cell is re-run): assigning the
    # extrapolated array to an empty selection raises a length-mismatch error.
    if lgb_col_na.any():
        values_2016 = data.loc[rows_2016, col].values
        values_2017 = data.loc[rows_2017, col].values
        # Extrapolation: (2017 / 2016 ratio) * 2017 value * 1.03, assigned by position.
        # NOTE(review): assumes the missing rows line up one-to-one, in the same
        # order, with the Jan-Apr rows of 2016 and 2017 - confirm.
        data.loc[lgb_col_na,col] = (values_2017 / values_2016 * values_2017 * 1.03).round()

# Flag the month in which the New Year falls in each year
# (presumably the Lunar New Year: Feb 2016, Jan 2017, Feb 2018).
data['happyNY'] = 0
data.loc[(data['regYear'].isin([2016,2018])&data['regMonth'].isin([2])),'happyNY'] = 1
data.loc[(data['regYear'].isin([2017])&data['regMonth'].isin([1])),'happyNY'] = 1

# Lag the label by i months (here 4) within each (region, model) series, so the
# test months receive the label observed i months earlier.
for i in [4]:
    shift_col = 'shift_model_adcode_mt_label_{0}'.format(i)
    # Register the feature once only; a plain append would add a duplicate
    # column name every time this cell is executed again.
    if shift_col not in shift_feat:
        shift_feat.append(shift_col)
    data['model_adcode_mt_{0}'.format(i)] = data['model_adcode_mt'] + i
    data_last = data[~data.label.isnull()].set_index('model_adcode_mt_{0}'.format(i))
    data[shift_col] = data['model_adcode_mt'].map(data_last['label'])

# Rows without a lagged label get (2016 label / 2017 label) * 2016 label, using the
# Jan-Apr rows of each year, assigned by position.
# NOTE(review): assumes the rows lacking a lag are exactly the Jan-Apr 2016 rows - confirm.
label_2016 = data.loc[rows_2016, 'label'].values
label_2017 = data.loc[rows_2017, 'label'].values
shift_na = pd.isnull(data['shift_model_adcode_mt_label_4'])
data.loc[shift_na, 'shift_model_adcode_mt_label_4'] = (label_2016 / label_2017 * label_2016).round()

# Month-dependent weight value: a for January-April, b for May-December.
a = 6; b = 4
data['weightMonth'] = data['regMonth'].map({month: (a if month <= 4 else b) for month in range(1, 13)})

df_lgb = pd.DataFrame({'id': test['id']})

col_add = ['ad_ry_mean', 'md_ry_mean', 'bt_ry_mean']
# Columns fed to the model.
num_feat = ['regYear']+shift_feat
cate_feat = ['adcode', 'bodyType', 'model', 'regMonth', 'happyNY','cluster_label']
features = num_feat + cate_feat + ['popularity', 'carCommentVolum', 'newsReplyVolum', 'weightMonth'] + col_add

train_idx = (data['mt'] <= 20) # months 1-20 are the training set
valid_idx = (data['mt'].between(21, 24)) # months 21-24 are the validation set
test_idx = (data['mt'] > 24) # months after 24 are the test set

# label
data['model_mean'] = data.groupby('model')['label'].transform('mean') # mean
# The model is trained on the natural log of the label.
data['n_label'] = np.log(data['label'])

train_x = data[train_idx][features]
train_y = data[train_idx]['n_label']

valid_x = data[valid_idx][features]
valid_y = data[valid_idx]['n_label']

############################### lgb ###################################
lgb_model = lgb.LGBMRegressor(
    num_leaves=64, reg_alpha=1, reg_lambda=0.1, objective='mse',
    max_depth=-1, learning_rate=0.05, min_child_samples=5, random_state=2019,
    n_estimators=8000, subsample=0.8, colsample_bytree=0.8)

# NOTE(review): `early_stopping_rounds` / `verbose` as fit() keywords were removed in
# LightGBM 4.0 in favour of callbacks - pin the installed version or migrate.
lgb_model.fit(train_x, train_y, eval_set=[(valid_x, valid_y)],
              categorical_feature=cate_feat, early_stopping_rounds=100, verbose=300)
# Undo the log transform to get predictions on the original scale.
data['pred_label'] = np.exp(lgb_model.predict(data[features]))
model = lgb_model
# Feature importance
print ('lgb特征重要程度：',sorted(dict(zip(train_x.columns,model.feature_importances_)).items(),key=lambda x: x[1], reverse=True))
print('NRMSE的均值:',score(data = data[valid_idx]))
# Refit on train + validation with the number of trees chosen by early stopping.
model.n_estimators = model.best_iteration_
model.fit(data[~test_idx][features], data[~test_idx]['n_label'], categorical_feature=cate_feat)
data['forecastVolum'] = np.exp(model.predict(data[features]))
# Explicit copy: avoids writing into a chained selection of `data`.
sub = data.loc[test_idx, ['id']].copy()
sub['forecastVolum'] = data.loc[test_idx, 'forecastVolum'].clip(lower=0).round().astype(int)
sub_lgb = sub.reset_index(drop=True)
sub_lgb = sub_lgb[['id','forecastVolum']]
print('lgb中forecastVolmn的0值数量：',(sub_lgb['forecastVolum']==0).sum())
# df_lgb[col_add] = sub_lgb['forecastVolum']

# In this variant a single model uses all three aggregate features at once, so
# the forecast lives in sub_lgb; df_lgb only holds the id column here.


Training until validation scores don't improve for 100 rounds.
[300]	valid_0's l2: 0.14122
[600]	valid_0's l2: 0.135829
[900]	valid_0's l2: 0.13352
[1200]	valid_0's l2: 0.132251
[1500]	valid_0's l2: 0.131462
[1800]	valid_0's l2: 0.131124
[2100]	valid_0's l2: 0.130648
[2400]	valid_0's l2: 0.130294
[2700]	valid_0's l2: 0.130049
[3000]	valid_0's l2: 0.129932
[3300]	valid_0's l2: 0.129812
[3600]	valid_0's l2: 0.129648
Early stopping, best iteration is:
[3739]	valid_0's l2: 0.129605
lgb特征重要程度： [('model', 32639), ('popularity', 32297), ('shift_model_adcode_mt_label_4', 31693), ('carCommentVolum', 25426), ('newsReplyVolum', 24611), ('adcode', 23430), ('regMonth', 20612), ('ad_ry_mean', 15035), ('md_ry_mean', 12406), ('cluster_label', 4965), ('bodyType', 4694), ('happyNY', 3686), ('bt_ry_mean', 2645), ('regYear', 605), ('weightMonth', 368)]
NRMSE的均值: 0.7135838921170896
lgb中forecastVolmn的0值数量： 0


In [139]:
# Bare literal left in a scratch cell; evaluating it only echoes the value.
711

711

In [49]:
# Display the last rows of the id / forecastVolum frame built by the training cell.
sub_lgb.tail()

Unnamed: 0,id,forecastVolum
5275,5364,188
5276,5365,204
5277,5366,172
5278,5367,228
5279,5368,156


In [54]:
# Original code
# Fill in the feature values that are missing for the test rows.
is_jan_apr = data['regMonth'].isin([1,2,3,4])
rows_2016 = data['regYear'].isin([2016]) & is_jan_apr
rows_2017 = data['regYear'].isin([2017]) & is_jan_apr
for col in ['carCommentVolum','newsReplyVolum','popularity','bt_ry_mean','ad_ry_mean', 'md_ry_mean']:
    lgb_col_na = pd.isnull(data[col])
    # Zeros become 1 so the year-over-year ratio below never divides by zero.
    data[col] = data[col].replace(0,1)
    # Skip when nothing is missing (e.g. the cell is re-run): assigning the
    # extrapolated array to an empty selection raises a length-mismatch error.
    if lgb_col_na.any():
        values_2016 = data.loc[rows_2016, col].values
        values_2017 = data.loc[rows_2017, col].values
        # Extrapolation: (2017 / 2016 ratio) * 2017 value * 1.03, assigned by position.
        # NOTE(review): assumes the missing rows line up one-to-one, in the same
        # order, with the Jan-Apr rows of 2016 and 2017 - confirm.
        data.loc[lgb_col_na,col] = (values_2017 / values_2016 * values_2017 * 1.03).round()

# Flag the month in which the New Year falls in each year
# (presumably the Lunar New Year: Feb 2016, Jan 2017, Feb 2018).
data['happyNY'] = 0
data.loc[(data['regYear'].isin([2016,2018])&data['regMonth'].isin([2])),'happyNY'] = 1
data.loc[(data['regYear'].isin([2017])&data['regMonth'].isin([1])),'happyNY'] = 1

# Lag the label by i months (here 4) within each (region, model) series, so the
# test months receive the label observed i months earlier.
for i in [4]:
    shift_col = 'shift_model_adcode_mt_label_{0}'.format(i)
    # Register the feature once only; a plain append would add a duplicate
    # column name every time this cell is executed again.
    if shift_col not in shift_feat:
        shift_feat.append(shift_col)
    data['model_adcode_mt_{0}'.format(i)] = data['model_adcode_mt'] + i
    data_last = data[~data.label.isnull()].set_index('model_adcode_mt_{0}'.format(i))
    data[shift_col] = data['model_adcode_mt'].map(data_last['label'])

# Rows without a lagged label get (2016 label / 2017 label) * 2016 label, using the
# Jan-Apr rows of each year, assigned by position.
# NOTE(review): assumes the rows lacking a lag are exactly the Jan-Apr 2016 rows - confirm.
label_2016 = data.loc[rows_2016, 'label'].values
label_2017 = data.loc[rows_2017, 'label'].values
shift_na = pd.isnull(data['shift_model_adcode_mt_label_4'])
data.loc[shift_na, 'shift_model_adcode_mt_label_4'] = (label_2016 / label_2017 * label_2016).round()

# Month-dependent weight value: a for January-April, b for May-December.
a = 6; b = 4
data['weightMonth'] = data['regMonth'].map({month: (a if month <= 4 else b) for month in range(1, 13)})

df_lgb = pd.DataFrame({'id': test['id']})

# Everything below that does not depend on the aggregate feature is computed
# once, ahead of the loop.
num_feat = shift_feat
cate_feat = ['adcode', 'bodyType', 'model', 'regYear', 'regMonth', 'happyNY']

train_idx = (data['mt'] <= 20) # months 1-20 are the training set
valid_idx = (data['mt'].between(21, 24)) # months 21-24 are the validation set
test_idx = (data['mt'] > 24) # months after 24 are the test set

# label
data['model_mean'] = data.groupby('model')['label'].transform('mean') # mean
# The models are trained on the natural log of the label.
data['n_label'] = np.log(data['label'])

# Train one model per aggregate feature; each run adds one forecast column to df_lgb.
for col_add in ['ad_ry_mean', 'md_ry_mean', 'bt_ry_mean']:
    # Columns fed to the model in this run.
    features = num_feat + cate_feat + ['popularity', 'carCommentVolum', 'newsReplyVolum', 'weightMonth'] + [col_add]

    train_x = data[train_idx][features]
    train_y = data[train_idx]['n_label']

    valid_x = data[valid_idx][features]
    valid_y = data[valid_idx]['n_label']

    ############################### lgb ###################################
    lgb_model = lgb.LGBMRegressor(
        num_leaves=64, reg_alpha=1, reg_lambda=0.1, objective='mse',
        max_depth=-1, learning_rate=0.05, min_child_samples=5, random_state=2019,
        n_estimators=8000, subsample=0.8, colsample_bytree=0.8)

    # NOTE(review): `early_stopping_rounds` / `verbose` as fit() keywords were removed in
    # LightGBM 4.0 in favour of callbacks - pin the installed version or migrate.
    lgb_model.fit(train_x, train_y, eval_set=[(valid_x, valid_y)],
                  categorical_feature=cate_feat, early_stopping_rounds=100, verbose=300)
    # Undo the log transform to get predictions on the original scale.
    data['pred_label'] = np.exp(lgb_model.predict(data[features]))
    model = lgb_model
    # Feature importance
    print ('lgb特征重要程度：',sorted(dict(zip(train_x.columns,model.feature_importances_)).items(),key=lambda x: x[1], reverse=True))
    print('NRMSE的均值:',score(data = data[valid_idx]))
    # Refit on train + validation with the number of trees chosen by early stopping.
    model.n_estimators = model.best_iteration_
    model.fit(data[~test_idx][features], data[~test_idx]['n_label'], categorical_feature=cate_feat)
    data['forecastVolum'] = np.exp(model.predict(data[features]))
    # Explicit copy: avoids writing into a chained selection of `data`.
    sub = data.loc[test_idx, ['id']].copy()
    sub['forecastVolum'] = data.loc[test_idx, 'forecastVolum'].clip(lower=0).round().astype(int)
    sub_lgb = sub.reset_index(drop=True)
    sub_lgb = sub_lgb[['id','forecastVolum']]
    print('lgb中forecastVolmn的0值数量：',(sub_lgb['forecastVolum']==0).sum())
    df_lgb[col_add] = sub_lgb['forecastVolum']

# df_lgb now holds three forecast columns; per the original author's note,
# submitting any single one of them scored about 0.57. Good luck!


Training until validation scores don't improve for 100 rounds.
[300]	valid_0's l2: 0.142269
[600]	valid_0's l2: 0.135737
[900]	valid_0's l2: 0.133294
[1200]	valid_0's l2: 0.132254
[1500]	valid_0's l2: 0.13127
[1800]	valid_0's l2: 0.130648
Early stopping, best iteration is:
[1800]	valid_0's l2: 0.130648
lgb特征重要程度： [('model', 19275), ('shift_model_adcode_mt_label_4', 14571), ('popularity', 14495), ('adcode', 13942), ('carCommentVolum', 13376), ('newsReplyVolum', 12911), ('regMonth', 10838), ('ad_ry_mean', 7635), ('bodyType', 2429), ('regYear', 2155), ('happyNY', 1323), ('weightMonth', 450)]
NRMSE的均值: 0.710305044566558
lgb中forecastVolmn的0值数量： 0
Training until validation scores don't improve for 100 rounds.
[300]	valid_0's l2: 0.141162
[600]	valid_0's l2: 0.135974
[900]	valid_0's l2: 0.133285
[1200]	valid_0's l2: 0.13214
[1500]	valid_0's l2: 0.131065
[1800]	valid_0's l2: 0.130741
[2100]	valid_0's l2: 0.130348
[2400]	valid_0's l2: 0.129981
[2700]	valid_0's l2: 0.129803
[3000]	valid_0's l2: 

In [56]:
# Display the last rows of df_lgb, which holds one forecast column per aggregate feature.
df_lgb.tail()

Unnamed: 0,id,ad_ry_mean,md_ry_mean,bt_ry_mean
5275,5364,101,101,91
5276,5365,104,100,93
5277,5366,87,93,89
5278,5367,205,238,223
5279,5368,70,75,70


In [57]:
# Blend the three models: unweighted mean of their forecasts, assigned by position.
model_cols = ['ad_ry_mean', 'md_ry_mean', 'bt_ry_mean']
df_lgb['forecastVolum'] = df_lgb[model_cols].to_numpy().sum(axis=1) / 3
df_lgb.head()
# df_lgb.to_csv("submit/lgb570.csv", index=False) 

Unnamed: 0,id,ad_ry_mean,md_ry_mean,bt_ry_mean,forecastVolum
0,1,278,280,247,268.333333
1,2,364,328,325,339.0
2,3,173,176,175,174.666667
3,4,347,344,309,333.333333
4,5,435,408,405,416.0


In [60]:
# Keep only the two columns that are written to the submission file.
sub = df_lgb.loc[:, ['id','forecastVolum']]
# The file gets integer-rounded forecasts; `sub` itself keeps the unrounded averages,
# which is why the preview below still shows fractional values.
sub.round().astype(int).to_csv('submit/lgb570.csv', index=False)
sub.head()


Unnamed: 0,id,forecastVolum
0,1,268.333333
1,2,339.0
2,3,174.666667
3,4,333.333333
4,5,416.0
