In [1]:
# import sys
import numpy as np
import pandas as pd
# import os 
# import gc
from tqdm import tqdm, tqdm_notebook
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.metrics import mean_squared_error as mse
from sklearn.preprocessing import LabelEncoder
import datetime
import time
import lightgbm as lgb
import xgboost as xgb
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
# path  = './ccf_car/'
train_sales  = pd.read_csv('train_sales_data.csv')
train_search = pd.read_csv('train_search_data.csv')
train_user   = pd.read_csv('train_user_reply_data.csv')
evaluation_public = pd.read_csv('evaluation_public.csv')
submit_example    = pd.read_csv('submit_example.csv')
data = pd.concat([train_sales, evaluation_public], ignore_index=True)
data = data.merge(train_search, 'left', on=['province', 'adcode', 'model', 'regYear', 'regMonth'])
data = data.merge(train_user, 'left', on=['model', 'regYear', 'regMonth'])
data['label'] = data['salesVolume']
data['id'] = data['id'].fillna(0).astype(int)



data['bodyType'] = data['model'].map(train_sales.drop_duplicates('model').set_index('model')['bodyType'])
#索引-->value

# data['model_2'] = data['model']
#LabelEncoder
for i in ['bodyType', 'model']:
    data[i] = data[i].map(dict(zip(data[i].unique(), range(data[i].nunique()))))
    #key-->value
data['mt'] = (data['regYear'] - 2016) * 12 + data['regMonth']


In [8]:
data.head()

Unnamed: 0,adcode,bodyType,forecastVolum,id,model,province,regMonth,regYear,salesVolume,popularity,carCommentVolum,newsReplyVolum,label,bodyType_2,model_2,mt,model_weight,n_label
0,310000,SUV,,0,3c974920a76ac9c1,上海,1,2016,292.0,1479.0,11.0,106.0,292.0,0,0,1,444.518939,0.65689
1,530000,SUV,,0,3c974920a76ac9c1,云南,1,2016,466.0,1594.0,11.0,106.0,466.0,0,0,1,444.518939,1.048324
2,150000,SUV,,0,3c974920a76ac9c1,内蒙古,1,2016,257.0,1479.0,11.0,106.0,257.0,0,0,1,444.518939,0.578153
3,110000,SUV,,0,3c974920a76ac9c1,北京,1,2016,408.0,2370.0,11.0,106.0,408.0,0,0,1,444.518939,0.917846
4,510000,SUV,,0,3c974920a76ac9c1,四川,1,2016,610.0,3562.0,11.0,106.0,610.0,0,0,1,444.518939,1.37227


In [3]:
data['model_weight'] = data.groupby('model')['label'].transform('mean')
data['n_label'] = data['label'] / data['model_weight']

In [182]:
data[data['mt']>=26].head()

Unnamed: 0,adcode,bodyType,forecastVolum,id,model,province,regMonth,regYear,salesVolume,popularity,carCommentVolum,newsReplyVolum,label,mt,model_weight,n_label
33000,310000,0,,1343,0,上海,2,2018,,,,,,26,444.518939,
33001,530000,0,,1344,0,云南,2,2018,,,,,,26,444.518939,
33002,150000,0,,1345,0,内蒙古,2,2018,,,,,,26,444.518939,
33003,110000,0,,1346,0,北京,2,2018,,,,,,26,444.518939,
33004,510000,0,,1347,0,四川,2,2018,,,,,,26,444.518939,


In [13]:
def get_stat_feature(df_):   
    df = data.copy()
    stat_feat = []
    df['model_adcode'] = df['adcode'] + df['model']
    df['model_adcode_mt'] = df['model_adcode'] * 100 + df['mt'] #省份/车型/时间  集合特征
    
    data['model_weight'] = data.groupby('model')['label'].transform('mean')
    data['n_label'] = data['label'] / data['model_weight']
    
    for col in tqdm(['n_label','popularity','carCommentVolum','newsReplyVolum']):
        # shift
        for i in [1,2,3,4,5,6,7,8,9,10,11]:
            stat_feat.append('shift_model_adcode_mt_{}_{}'.format(col,i))
            df['model_adcode_mt_{}_{}'.format(col,i)] = df['model_adcode_mt'] + i #平移月份
            df_last = df[~df[col].isnull()].set_index('model_adcode_mt_{}_{}'.format(col,i))
            #df_last把
            df['shift_model_adcode_mt_{}_{}'.format(col,i)] = df['model_adcode_mt'].map(df_last[col])
            #map在这里设置了一个索引和value，df['model_adcode_mt']实际上变成了索引，返回的是df_last[col]对应的value
            
#         df['sum_model_adcode_mt_{}_{}'.format(col,1)] = df['shift_model_adcode_mt_{}_{}'.format(col,1)]
#         for i in [2,3,4,5,6,7,8,9,10,11,12]:
#             if i >=5:
#                 stat_feat.append('sum_model_adcode_mt_{}_{}'.format(col,i))
# #             df['model_adcode_mt_{}_{}'.format(col,i)] = df['model_adcode_mt'] + i #平移月份
# #             df_last = df[~df[col].isnull()].set_index('model_adcode_mt_{}_{}'.format(col,i))
#             #df_last把
#             df['sum_model_adcode_mt_{}_{}'.format(col,i)] = df['sum_model_adcode_mt_{}_{}'.format(col,i-1)]+df['shift_model_adcode_mt_{}_{}'.format(col,i)]
            
        for i in [2,3,4,5,6,7,8,9,10,11]:
            stat_feat.append('differ_model_adcode_mt_{}_{}'.format(col,i))
#             df['model_adcode_mt_{}_{}'.format(col,i)] = df['model_adcode_mt'] + i #平移月份
#             df_last = df[~df[col].isnull()].set_index('model_adcode_mt_{}_{}'.format(col,i))
            #df_last把
            df['differ_model_adcode_mt_{}_{}'.format(col,i)] = df['shift_model_adcode_mt_{}_{}'.format(col,1)]-df['shift_model_adcode_mt_{}_{}'.format(col,i)]
        for i in [2,3,4,5,6,7,8,9,10,11]:
            stat_feat.append('differ2_model_adcode_mt_{}_{}'.format(col,i))
#             df['model_adcode_mt_{}_{}'.format(col,i)] = df['model_adcode_mt'] + i #平移月份
#             df_last = df[~df[col].isnull()].set_index('model_adcode_mt_{}_{}'.format(col,i))
            #df_last把
            df['differ2_model_adcode_mt_{}_{}'.format(col,i)] = df['differ_model_adcode_mt_{}_{}'.format(col,i)]-df['differ_model_adcode_mt_{}_{}'.format(col,i+1)]
        
#         for i in [2,3,4,5,6,7,8,9,10,11,12]:
#             stat_feat.append('divide_model_adcode_mt_{}_{}'.format(col,i))
# #             df['model_adcode_mt_{}_{}'.format(col,i)] = df['model_adcode_mt'] + i #平移月份
# #             df_last = df[~df[col].isnull()].set_index('model_adcode_mt_{}_{}'.format(col,i))
#             #df_last把
#             df['divide_model_adcode_mt_{}_{}'.format(col,i)] = df['shift_model_adcode_mt_{}_{}'.format(col,1)]/df['shift_model_adcode_mt_{}_{}'.format(col,i)]
       
    return df,stat_feat

# data_df, stat_feat = get_stat_feature(data)


In [5]:
def score(data, pred='pred_label', label='label', group='model'):
    data['pred_label'] = data['pred_label'].apply(lambda x: 0 if x < 0 else x).round().astype(int)
    data_agg = data.groupby('model').agg({
        pred:  list,
        label: [list, 'mean']
    }).reset_index()
    data_agg.columns = ['_'.join(col).strip() for col in data_agg.columns]
    nrmse_score = []
    for raw in data_agg[['{0}_list'.format(pred), '{0}_list'.format(label), '{0}_mean'.format(label)]].values:
        nrmse_score.append(
            mse(raw[0], raw[1]) ** 0.5 / raw[2]
        )
    print(1 - np.mean(nrmse_score))
    return 1 - np.mean(nrmse_score)

In [6]:
def get_model_type(train_x,train_y,valid_x,valid_y,m_type='lgb'):   
    if m_type == 'lgb':
        model = lgb.LGBMRegressor(
                                num_leaves=2**5-2, reg_alpha=0.1, reg_lambda=0.1, objective='mae',
                                max_depth=-1, learning_rate=0.01, min_child_samples=15, random_state=1000,
                                n_estimators=2500, subsample=0.9, colsample_bytree=0.8,
                                )
        model.fit(train_x, train_y, 
              eval_set=[(train_x, train_y),(valid_x, valid_y)], 
              categorical_feature=cate_feat, 
              early_stopping_rounds=100, verbose=100)      
    elif m_type == 'xgb':
        model = xgb.XGBRegressor(
                                max_depth=5 , learning_rate=0.05, n_estimators=2000, 
                                objective='reg:gamma', tree_method = 'hist',subsample=0.9, 
                                colsample_bytree=0.7, min_child_samples=5,eval_metric = 'rmse' 
                                )
        model.fit(train_x, train_y, 
              eval_set=[(train_x, train_y),(valid_x, valid_y)], 
              early_stopping_rounds=100, verbose=100)   
    return model

In [7]:

def get_train_model(df_, m, m_type='lgb'):
    df = df_.copy()
    # 数据集划分
    st = 6
    all_idx   = (df['mt'].between(st , m-1))
    train_idx = (df['mt'].between(st , m-5))
    valid_idx = (df['mt'].between(m-4, m-4))
    test_idx  = (df['mt'].between(m  , m  ))
    print('all_idx  :',st ,m-1)
    print('train_idx:',st ,m-5)
    print('valid_idx:',m-4,m-4)
    print('test_idx :',m  ,m  )  
    # 最终确认
    train_x = df[train_idx][features]
    train_y = df[train_idx]['n_label']
    valid_x = df[valid_idx][features]
    valid_y = df[valid_idx]['n_label']

    
    
    
    print("---------start training label--------------")
    # get model
    model = get_model_type(train_x,train_y,valid_x,valid_y,m_type)  
    # offline
    df['pred_label'] = model.predict(df[features])*df['model_weight']
    best_score = score(df[valid_idx]) 
    # online
    if m_type == 'lgb':
        model.n_estimators = model.best_iteration_ + 100
        model.fit(df[all_idx][features], df[all_idx]['n_label'], categorical_feature=cate_feat)
    elif m_type == 'xgb':
        model.n_estimators = model.best_iteration + 100
        model.fit(df[all_idx][features], df[all_idx]['label'])
    df['forecastVolum'] = model.predict(df[features])*df['model_weight']
    
#     print('valid mean:',df[valid_idx]['pred_label'].mean())
#     print('true  mean:',df[valid_idx]['label'].mean())
#     print('test  mean:',df[test_idx]['forecastVolum'].mean())
    # 阶段结果
    sub = df[test_idx][['id']]
    sub['forecastVolum'] = df[test_idx]['forecastVolum'].apply(lambda x: 0 if x < 0 else x).round().astype(int) 
    

    return sub,df[valid_idx]['pred_label']

In [78]:
# data_df.head()
data[(data.regMonth==2)&(data.regYear==2018)].head(10)

Unnamed: 0,adcode,bodyType,forecastVolum,id,model,province,regMonth,regYear,salesVolume,popularity,carCommentVolum,newsReplyVolum,label,mt,model_weight,n_label
33000,310000,0,,1343,0,上海,2,2018,193.0,,,,193.0,26,424.001623,0.455187
33001,530000,0,,1344,0,云南,2,2018,230.0,,,,230.0,26,424.001623,0.542451
33002,150000,0,,1345,0,内蒙古,2,2018,138.0,,,,138.0,26,424.001623,0.32547
33003,110000,0,,1346,0,北京,2,2018,278.0,,,,278.0,26,424.001623,0.655658
33004,510000,0,,1347,0,四川,2,2018,269.0,,,,269.0,26,424.001623,0.634432
33005,340000,0,,1348,0,安徽,2,2018,150.0,,,,150.0,26,424.001623,0.353772
33006,370000,0,,1349,0,山东,2,2018,303.0,,,,303.0,26,424.001623,0.71462
33007,140000,0,,1350,0,山西,2,2018,152.0,,,,152.0,26,424.001623,0.358489
33008,440000,0,,1351,0,广东,2,2018,1780.0,,,,1780.0,26,424.001623,4.198097
33009,450000,0,,1352,0,广西,2,2018,238.0,,,,238.0,26,424.001623,0.561319


In [14]:
for month in [25,26,27,28]: 
    m_type = 'lgb' 
    
    data_df, stat_feat = get_stat_feature(data)
    
    num_feat = ['regYear','regMonth','popularity','carCommentVolum','newsReplyVolum'] + stat_feat
    cate_feat = ['adcode','bodyType','model']
    
    if m_type == 'lgb':
        for i in cate_feat:
            data_df[i] = data_df[i].astype('category')
    elif m_type == 'xgb':
        lbl = LabelEncoder()  
        for i in tqdm(cate_feat):
            data_df[i] = lbl.fit_transform(data_df[i].astype(str))
           
    features = num_feat + cate_feat
    print(len(features), len(set(features)))   
    
    sub,val_pred = get_train_model(data_df, month, m_type)   
    data.loc[(data.regMonth==(month-24))&(data.regYear==2018), 'salesVolume'] = sub['forecastVolum'].values
    data.loc[(data.regMonth==(month-24))&(data.regYear==2018), 'label'      ] = sub['forecastVolum'].values
sub = data.loc[(data.regMonth>=1)&(data.regYear==2018), ['id','salesVolume']]
sub.columns = ['id','forecastVolum']
# sub[['id','forecastVolum']].round().astype(int).to_csv('CCF_sales.csv', index=False)

  0%|                                                                                            | 0/4 [00:00<?, ?it/s]


KeyError: 'differ_model_adcode_mt_n_label_12'

In [116]:
for month in [25,26,27,28]: 
    m_type = 'lgb' 
    
    data_df, stat_feat = get_stat_feature(data)
    
    num_feat = ['regYear','regMonth','popularity'] + stat_feat
    cate_feat = ['adcode','bodyType','model']
    
    if m_type == 'lgb':
        for i in cate_feat:
            data_df[i] = data_df[i].astype('category')
    elif m_type == 'xgb':
        lbl = LabelEncoder()  
        for i in tqdm(cate_feat):
            data_df[i] = lbl.fit_transform(data_df[i].astype(str))
           
    features = num_feat + cate_feat
    print(len(features), len(set(features)))   
    
    sub,val_pred = get_train_model(data_df, month, m_type)   
    data.loc[(data.regMonth==(month-24))&(data.regYear==2018), 'salesVolume'] = sub['forecastVolum'].values
    data.loc[(data.regMonth==(month-24))&(data.regYear==2018), 'label'      ] = sub['forecastVolum'].values
sub = data.loc[(data.regMonth>=1)&(data.regYear==2018), ['id','salesVolume']]
sub.columns = ['id','forecastVolum']
# sub[['id','forecastVolum']].round().astype(int).to_csv('CCF_sales.csv', index=False)

100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:05<00:00,  1.38s/it]


162 162
all_idx  : 6 24
train_idx: 6 20
valid_idx: 21 21
test_idx : 25 25
---------start training label--------------
Training until validation scores don't improve for 100 rounds.
[100]	training's l2: 0.209106	valid_1's l2: 0.232636
[200]	training's l2: 0.085862	valid_1's l2: 0.101744
[300]	training's l2: 0.0557997	valid_1's l2: 0.0782164
[400]	training's l2: 0.0437689	valid_1's l2: 0.0721713
[500]	training's l2: 0.0364557	valid_1's l2: 0.0703275
[600]	training's l2: 0.0312732	valid_1's l2: 0.0696519
[700]	training's l2: 0.0276509	valid_1's l2: 0.0692414
[800]	training's l2: 0.0249314	valid_1's l2: 0.0689236
[900]	training's l2: 0.0227552	valid_1's l2: 0.068443
[1000]	training's l2: 0.0209873	valid_1's l2: 0.0681127
[1100]	training's l2: 0.0195109	valid_1's l2: 0.0679325
[1200]	training's l2: 0.0182234	valid_1's l2: 0.0678292
[1300]	training's l2: 0.0170996	valid_1's l2: 0.0675032
[1400]	training's l2: 0.0161392	valid_1's l2: 0.0673245
[1500]	training's l2: 0.0152783	valid_1's l2: 0.0

100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:06<00:00,  1.66s/it]


162 162
all_idx  : 6 25
train_idx: 6 21
valid_idx: 22 22
test_idx : 26 26
---------start training label--------------
Training until validation scores don't improve for 100 rounds.
[100]	training's l2: 0.208465	valid_1's l2: 0.184131
[200]	training's l2: 0.0856249	valid_1's l2: 0.101754
[300]	training's l2: 0.0561021	valid_1's l2: 0.0975244
[400]	training's l2: 0.0441389	valid_1's l2: 0.0996361
Early stopping, best iteration is:
[316]	training's l2: 0.0536453	valid_1's l2: 0.0972717
0.7482434177293765


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:05<00:00,  1.44s/it]


162 162
all_idx  : 6 26
train_idx: 6 22
valid_idx: 23 23
test_idx : 27 27
---------start training label--------------
Training until validation scores don't improve for 100 rounds.
[100]	training's l2: 0.208824	valid_1's l2: 0.176452
[200]	training's l2: 0.0875295	valid_1's l2: 0.0668913
[300]	training's l2: 0.058359	valid_1's l2: 0.0643222
Early stopping, best iteration is:
[240]	training's l2: 0.0719292	valid_1's l2: 0.0619564
0.7990290813881417


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:05<00:00,  1.41s/it]


162 162
all_idx  : 6 27
train_idx: 6 23
valid_idx: 24 24
test_idx : 28 28
---------start training label--------------
Training until validation scores don't improve for 100 rounds.
[100]	training's l2: 0.20944	valid_1's l2: 0.69299
[200]	training's l2: 0.087448	valid_1's l2: 0.428658
[300]	training's l2: 0.0584939	valid_1's l2: 0.368447
[400]	training's l2: 0.0466636	valid_1's l2: 0.351382
[500]	training's l2: 0.0391777	valid_1's l2: 0.343116
[600]	training's l2: 0.033742	valid_1's l2: 0.336577
[700]	training's l2: 0.0300141	valid_1's l2: 0.337403
Early stopping, best iteration is:
[616]	training's l2: 0.0330363	valid_1's l2: 0.336381
0.7131454226912151


In [121]:
sub = data.loc[(data.regMonth>=1)&(data.regYear==2018), ['id','salesVolume']]
sub.columns = ['id','forecastVolum']
sub[['id','forecastVolum']].round().astype(int).head(100)

Unnamed: 0,id,forecastVolum
31680,1,213
31681,2,266
31682,3,138
31683,4,201
31684,5,331
31685,6,145
31686,7,349
31687,8,166
31688,9,2143
31689,10,274


In [90]:
sub2 = pd.read_csv('rongheguize_0902.csv')
sub2.head()

Unnamed: 0,id,forecastVolum
0,1,288
1,2,376
2,3,154
3,4,332
4,5,402


In [122]:
sub['forecastVolum'] = sub['forecastVolum'].values*0.6+sub2['forecastVolum'].values*0.4
sub.head()

Unnamed: 0,id,forecastVolum
31680,1,243.0
31681,2,310.0
31682,3,144.4
31683,4,253.4
31684,5,359.4


In [141]:
sub[['id','forecastVolum']].round().astype(int).to_csv('lucky_0915.csv', index=False)#0.5439

In [8]:
for month in [25,26,27,28]: 
    m_type = 'lgb' 
    data_df, stat_feat = get_stat_feature(data)
#     num_feat = ['regYear'] + stat_feat
#     cate_feat = ['adcode','bodyType','model','regMonth']
    num_feat = ['regYear','regMonth','popularity'] + stat_feat
    cate_feat = ['adcode','bodyType','model']
    if m_type == 'lgb':
        for i in cate_feat:
            data_df[i] = data_df[i].astype('category')
    elif m_type == 'xgb':
        lbl = LabelEncoder()  
        for i in tqdm(cate_feat):
            data_df[i] = lbl.fit_transform(data_df[i].astype(str))
           
    features = num_feat + cate_feat
    print(len(features), len(set(features)))   
    
    sub,val_pred = get_train_model(data_df, month, m_type)   
    data.loc[(data.regMonth==(month-24))&(data.regYear==2018), 'salesVolume'] = sub['forecastVolum'].values
    data.loc[(data.regMonth==(month-24))&(data.regYear==2018), 'label'      ] = sub['forecastVolum'].values




100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:07<00:00,  1.92s/it]


138 138
all_idx  : 13 24
train_idx: 13 20
valid_idx: 21 21
test_idx : 25 25
---------start training label--------------
Training until validation scores don't improve for 100 rounds.
[100]	training's l2: 0.136651	valid_1's l2: 0.22993
[200]	training's l2: 0.0532087	valid_1's l2: 0.114122
[300]	training's l2: 0.0328597	valid_1's l2: 0.0900612
[400]	training's l2: 0.0248102	valid_1's l2: 0.0818536
[500]	training's l2: 0.0199401	valid_1's l2: 0.0781726
[600]	training's l2: 0.0167994	valid_1's l2: 0.0766571
[700]	training's l2: 0.0146022	valid_1's l2: 0.0758024
[800]	training's l2: 0.0129124	valid_1's l2: 0.075525
[900]	training's l2: 0.01157	valid_1's l2: 0.0754902
Early stopping, best iteration is:
[826]	training's l2: 0.0125437	valid_1's l2: 0.0754409
0.7726256389621242


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:07<00:00,  1.93s/it]


138 138
all_idx  : 13 25
train_idx: 13 21
valid_idx: 22 22
test_idx : 26 26
---------start training label--------------
Training until validation scores don't improve for 100 rounds.
[100]	training's l2: 0.141454	valid_1's l2: 0.160104
[200]	training's l2: 0.0551148	valid_1's l2: 0.0965854
[300]	training's l2: 0.0342739	valid_1's l2: 0.0954502
Early stopping, best iteration is:
[256]	training's l2: 0.0407739	valid_1's l2: 0.0947775
0.7224654603298788


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:07<00:00,  1.91s/it]


138 138
all_idx  : 13 26
train_idx: 13 22
valid_idx: 23 23
test_idx : 27 27
---------start training label--------------
Training until validation scores don't improve for 100 rounds.
[100]	training's l2: 0.146209	valid_1's l2: 0.2246
[200]	training's l2: 0.0586061	valid_1's l2: 0.101806
[300]	training's l2: 0.0373873	valid_1's l2: 0.0753228
[400]	training's l2: 0.0286197	valid_1's l2: 0.0680127
[500]	training's l2: 0.0232232	valid_1's l2: 0.0654478
[600]	training's l2: 0.019664	valid_1's l2: 0.0643805
[700]	training's l2: 0.0171709	valid_1's l2: 0.0637988
[800]	training's l2: 0.0152988	valid_1's l2: 0.0634715
[900]	training's l2: 0.0138246	valid_1's l2: 0.0632198
[1000]	training's l2: 0.0126029	valid_1's l2: 0.0628556
[1100]	training's l2: 0.011622	valid_1's l2: 0.0627427
[1200]	training's l2: 0.0107555	valid_1's l2: 0.0626321
[1300]	training's l2: 0.00999392	valid_1's l2: 0.0624862
[1400]	training's l2: 0.00932711	valid_1's l2: 0.0624621
[1500]	training's l2: 0.00872164	valid_1's l2: 

100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:07<00:00,  1.91s/it]


138 138
all_idx  : 13 27
train_idx: 13 23
valid_idx: 24 24
test_idx : 28 28
---------start training label--------------
Training until validation scores don't improve for 100 rounds.
[100]	training's l2: 0.149048	valid_1's l2: 0.815322
[200]	training's l2: 0.0594166	valid_1's l2: 0.566807
[300]	training's l2: 0.0380772	valid_1's l2: 0.500916
[400]	training's l2: 0.0294834	valid_1's l2: 0.480994
[500]	training's l2: 0.0241464	valid_1's l2: 0.471352
[600]	training's l2: 0.0205394	valid_1's l2: 0.464178
[700]	training's l2: 0.0180459	valid_1's l2: 0.463276
Early stopping, best iteration is:
[678]	training's l2: 0.0185385	valid_1's l2: 0.462726
0.633967777020353


In [None]:
sub = data.loc[(data.regMonth>=1)&(data.regYear==2018), ['id','salesVolume']]
sub.columns = ['id','forecastVolum']

In [14]:
for month in [25,26,27,28]: 
    m_type = 'lgb' 
    
    data_df, stat_feat = get_stat_feature(data)
    
    num_feat = ['regYear','regMonth','popularity'] + stat_feat
    cate_feat = ['adcode','bodyType','model']
    
    if m_type == 'lgb':
        for i in cate_feat:
            data_df[i] = data_df[i].astype('category')
    elif m_type == 'xgb':
        lbl = LabelEncoder()  
        for i in tqdm(cate_feat):
            data_df[i] = lbl.fit_transform(data_df[i].astype(str))
           
    features = num_feat + cate_feat
    sub,val_pred = get_train_model(data_df, month, m_type)   
    data.loc[(data.regMonth==(month-24))&(data.regYear==2018), 'salesVolume'] = sub['forecastVolum'].values
    data.loc[(data.regMonth==(month-24))&(data.regYear==2018), 'label'      ] = sub['forecastVolum'].values
sub = data.loc[(data.regMonth>=1)&(data.regYear==2018), ['id','salesVolume']]
sub.columns = ['id','forecastVolum']
    



100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:02<00:00,  1.35it/s]


all_idx  : 13 24
train_idx: 13 20
valid_idx: 21 21
test_idx : 25 25
---------start training label--------------
Training until validation scores don't improve for 100 rounds.
[100]	training's l2: 0.136651	valid_1's l2: 0.22993
[200]	training's l2: 0.0532087	valid_1's l2: 0.114122
[300]	training's l2: 0.0328597	valid_1's l2: 0.0900612
[400]	training's l2: 0.0248102	valid_1's l2: 0.0818536
[500]	training's l2: 0.0199401	valid_1's l2: 0.0781726
[600]	training's l2: 0.0167994	valid_1's l2: 0.0766571
[700]	training's l2: 0.0146022	valid_1's l2: 0.0758024
[800]	training's l2: 0.0129124	valid_1's l2: 0.075525
[900]	training's l2: 0.01157	valid_1's l2: 0.0754902
Early stopping, best iteration is:
[826]	training's l2: 0.0125437	valid_1's l2: 0.0754409
0.7726256389621242


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:02<00:00,  1.36it/s]


all_idx  : 13 25
train_idx: 13 21
valid_idx: 22 22
test_idx : 26 26
---------start training label--------------
Training until validation scores don't improve for 100 rounds.
[100]	training's l2: 0.141454	valid_1's l2: 0.160104
[200]	training's l2: 0.0551148	valid_1's l2: 0.0965854
[300]	training's l2: 0.0342739	valid_1's l2: 0.0954502
Early stopping, best iteration is:
[256]	training's l2: 0.0407739	valid_1's l2: 0.0947775
0.7224654603298788


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:02<00:00,  1.37it/s]


all_idx  : 13 26
train_idx: 13 22
valid_idx: 23 23
test_idx : 27 27
---------start training label--------------
Training until validation scores don't improve for 100 rounds.
[100]	training's l2: 0.146209	valid_1's l2: 0.2246
[200]	training's l2: 0.0586061	valid_1's l2: 0.101806
[300]	training's l2: 0.0373873	valid_1's l2: 0.0753228
[400]	training's l2: 0.0286197	valid_1's l2: 0.0680127
[500]	training's l2: 0.0232232	valid_1's l2: 0.0654478
[600]	training's l2: 0.019664	valid_1's l2: 0.0643805
[700]	training's l2: 0.0171709	valid_1's l2: 0.0637988
[800]	training's l2: 0.0152988	valid_1's l2: 0.0634715
[900]	training's l2: 0.0138246	valid_1's l2: 0.0632198
[1000]	training's l2: 0.0126029	valid_1's l2: 0.0628556
[1100]	training's l2: 0.011622	valid_1's l2: 0.0627427
[1200]	training's l2: 0.0107555	valid_1's l2: 0.0626321
[1300]	training's l2: 0.00999392	valid_1's l2: 0.0624862
[1400]	training's l2: 0.00932711	valid_1's l2: 0.0624621
[1500]	training's l2: 0.00872164	valid_1's l2: 0.062431

100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:02<00:00,  1.36it/s]


all_idx  : 13 27
train_idx: 13 23
valid_idx: 24 24
test_idx : 28 28
---------start training label--------------
Training until validation scores don't improve for 100 rounds.
[100]	training's l2: 0.149048	valid_1's l2: 0.815322
[200]	training's l2: 0.0594166	valid_1's l2: 0.566807
[300]	training's l2: 0.0380772	valid_1's l2: 0.500916
[400]	training's l2: 0.0294834	valid_1's l2: 0.480994
[500]	training's l2: 0.0241464	valid_1's l2: 0.471352
[600]	training's l2: 0.0205394	valid_1's l2: 0.464178
[700]	training's l2: 0.0180459	valid_1's l2: 0.463276
Early stopping, best iteration is:
[678]	training's l2: 0.0185385	valid_1's l2: 0.462726
0.633967777020353


In [20]:
sub = data.loc[(data.regMonth>=4)&(data.regYear==2018), ['id','salesVolume']]
sub.head()

Unnamed: 0,id,salesVolume
35640,4027,0.0
35641,4028,0.0
35642,4029,0.0
35643,4030,0.0
35644,4031,0.0


In [9]:
data.head()

Unnamed: 0,adcode,bodyType,forecastVolum,id,model,province,regMonth,regYear,salesVolume,popularity,carCommentVolum,newsReplyVolum,label,bodyType_2,model_2,mt,model_weight,n_label
0,310000,SUV,,0,3c974920a76ac9c1,上海,1,2016,292.0,1479.0,11.0,106.0,292.0,0,0,1,444.518939,0.65689
1,530000,SUV,,0,3c974920a76ac9c1,云南,1,2016,466.0,1594.0,11.0,106.0,466.0,0,0,1,444.518939,1.048324
2,150000,SUV,,0,3c974920a76ac9c1,内蒙古,1,2016,257.0,1479.0,11.0,106.0,257.0,0,0,1,444.518939,0.578153
3,110000,SUV,,0,3c974920a76ac9c1,北京,1,2016,408.0,2370.0,11.0,106.0,408.0,0,0,1,444.518939,0.917846
4,510000,SUV,,0,3c974920a76ac9c1,四川,1,2016,610.0,3562.0,11.0,106.0,610.0,0,0,1,444.518939,1.37227


In [36]:
data[(data.regMonth==4)&(data.regYear==2018)].head(10)

Unnamed: 0,adcode,bodyType,forecastVolum,id,model,province,regMonth,regYear,salesVolume,popularity,carCommentVolum,newsReplyVolum,label,bodyType_2,model_2,mt,model_weight,n_label
35640,310000,,,4027,3c974920a76ac9c1,上海,4,2018,0.0,,,,0.0,0,0,28,444.518939,
35641,530000,,,4028,3c974920a76ac9c1,云南,4,2018,0.0,,,,0.0,0,0,28,444.518939,
35642,150000,,,4029,3c974920a76ac9c1,内蒙古,4,2018,0.0,,,,0.0,0,0,28,444.518939,
35643,110000,,,4030,3c974920a76ac9c1,北京,4,2018,0.0,,,,0.0,0,0,28,444.518939,
35644,510000,,,4031,3c974920a76ac9c1,四川,4,2018,0.0,,,,0.0,0,0,28,444.518939,
35645,340000,,,4032,3c974920a76ac9c1,安徽,4,2018,0.0,,,,0.0,0,0,28,444.518939,
35646,370000,,,4033,3c974920a76ac9c1,山东,4,2018,0.0,,,,0.0,0,0,28,444.518939,
35647,140000,,,4034,3c974920a76ac9c1,山西,4,2018,0.0,,,,0.0,0,0,28,444.518939,
35648,440000,,,4035,3c974920a76ac9c1,广东,4,2018,0.0,,,,0.0,0,0,28,444.518939,
35649,450000,,,4036,3c974920a76ac9c1,广西,4,2018,0.0,,,,0.0,0,0,28,444.518939,


In [101]:
# sub[sub_r['id']>=4027]=0
sub[sub['id']>=4027].head()


Unnamed: 0,id,forecastVolum
35640,4027,263.0
35641,4028,275.0
35642,4029,169.0
35643,4030,303.0
35644,4031,346.0


In [86]:
sub_r = pd.read_csv("rongheguize_0902.csv")
sub_r.head()

Unnamed: 0,id,forecastVolum
0,1,288
1,2,376
2,3,154
3,4,332
4,5,402


In [87]:
sub_r[sub_r['id']>=4027].head()

Unnamed: 0,id,forecastVolum
3960,4027,248
3961,4028,226
3962,4029,157
3963,4030,272
3964,4031,290


In [105]:
sub[sub['id']>=4027].head()

Unnamed: 0,id,forecastVolum
35640,4027,252.5
35641,4028,240.7
35642,4029,160.6
35643,4030,281.3
35644,4031,306.8


In [111]:
sub.loc[sub['id']>=4027, 'forecastVolum'].shape

(1320,)

In [99]:
b = sub_r.loc[sub_r['id']>=4027, 'forecastVolum'].values

In [100]:
b

array([248, 226, 157, ..., 104, 227,  66], dtype=int64)

In [102]:
a = a*0.3+b*0.7
a

array([252.5, 240.7, 160.6, ..., 106.1, 226.4,  69. ])

In [132]:
sub.loc[sub['id']>=4027, 'forecastVolum'] = a

In [133]:
sub[['id','forecastVolum']].round().astype(int).to_csv('091402.csv', index=False)

In [124]:
train_sales[(train_sales['regYear']==2017)&((train_sales['regMonth']==4))]['salesVolume'].head()

19800    264
19801    243
19802    176
19803    340
19804    337
Name: salesVolume, dtype: int64

In [122]:
a= evaluation_public[(evaluation_public['regYear']==2018)&((evaluation_public['regMonth']==4))]['salesVolume']

KeyError: 'salesVolume'

In [68]:
for month in [25,26,27,28]: 
    m_type = 'lgb' 
    
    data_df, stat_feat = get_stat_feature(data)
    
    num_feat = ['regYear','regMonth','popularity'] + stat_feat
    cate_feat = ['adcode','bodyType','model']
    
    if m_type == 'lgb':
        for i in cate_feat:
            data_df[i] = data_df[i].astype('category')
    elif m_type == 'xgb':
        lbl = LabelEncoder()  
        for i in tqdm(cate_feat):
            data_df[i] = lbl.fit_transform(data_df[i].astype(str))
           
    features = num_feat + cate_feat
#     print(len(features), len(set(features)))   
    
    sub,val_pred = get_train_model(data_df, month, m_type)   
    data.loc[(data.regMonth==(month-24))&(data.regYear==2018), 'salesVolume'] = sub['forecastVolum'].values
    data.loc[(data.regMonth==(month-24))&(data.regYear==2018), 'label'      ] = sub['forecastVolum'].values
    
    
    
    
    
sub = data.loc[(data.regMonth>=1)&(data.regYear==2018), ['id','salesVolume']]
sub.columns = ['id','forecastVolum']
# sub[['id','forecastVolum']].round().astype(int).to_csv('CCF_sales.csv', index=False)


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:02<00:00,  1.37it/s]


all_idx  : 13 24
train_idx: 13 20
valid_idx: 21 21
test_idx : 25 25
Training until validation scores don't improve for 100 rounds.
[100]	training's l2: 86584.9	valid_1's l2: 151719
[200]	training's l2: 27784.3	valid_1's l2: 71704.5
[300]	training's l2: 14902.5	valid_1's l2: 57361.3
[400]	training's l2: 10243.5	valid_1's l2: 52224.7
[500]	training's l2: 7869.38	valid_1's l2: 50304.7
[600]	training's l2: 6385.06	valid_1's l2: 49569.7
[700]	training's l2: 5380.65	valid_1's l2: 48911.9
[800]	training's l2: 4634.41	valid_1's l2: 48274
[900]	training's l2: 4071.47	valid_1's l2: 48026.2
[1000]	training's l2: 3625	valid_1's l2: 47641.9
[1100]	training's l2: 3253.2	valid_1's l2: 47425.4
[1200]	training's l2: 2928.61	valid_1's l2: 47143.3
[1300]	training's l2: 2653.04	valid_1's l2: 46870.8
[1400]	training's l2: 2414.07	valid_1's l2: 46669.9
[1500]	training's l2: 2207.68	valid_1's l2: 46487.6
[1600]	training's l2: 2028.06	valid_1's l2: 46336.7
[1700]	training's l2: 1876.96	valid_1's l2: 46208.4
[

100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:02<00:00,  1.41it/s]


all_idx  : 13 25
train_idx: 13 21
valid_idx: 22 22
test_idx : 26 26
Training until validation scores don't improve for 100 rounds.
[100]	training's l2: 90083.5	valid_1's l2: 118412
[200]	training's l2: 28475.8	valid_1's l2: 59906.1
[300]	training's l2: 15309.6	valid_1's l2: 52080.3
[400]	training's l2: 10785.7	valid_1's l2: 50404.3
[500]	training's l2: 8324.54	valid_1's l2: 49632.1
[600]	training's l2: 6828.3	valid_1's l2: 49046.4
[700]	training's l2: 5813.9	valid_1's l2: 48997.8
Early stopping, best iteration is:
[648]	training's l2: 6301.83	valid_1's l2: 48762.4
0.7352932759641437


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:02<00:00,  1.40it/s]


all_idx  : 13 26
train_idx: 13 22
valid_idx: 23 23
test_idx : 27 27
Training until validation scores don't improve for 100 rounds.
[100]	training's l2: 93591.6	valid_1's l2: 135289
[200]	training's l2: 29582.9	valid_1's l2: 52574.8
[300]	training's l2: 15864.8	valid_1's l2: 39181.2
[400]	training's l2: 11154.7	valid_1's l2: 36023.5
[500]	training's l2: 8739.67	valid_1's l2: 35331.5
[600]	training's l2: 7173.39	valid_1's l2: 34526.5
[700]	training's l2: 6126.06	valid_1's l2: 34402.5
[800]	training's l2: 5347.97	valid_1's l2: 34349
[900]	training's l2: 4770.82	valid_1's l2: 34426
Early stopping, best iteration is:
[806]	training's l2: 5308.31	valid_1's l2: 34325.1
0.7811424903161602


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:02<00:00,  1.38it/s]


all_idx  : 13 27
train_idx: 13 23
valid_idx: 24 24
test_idx : 28 28
Training until validation scores don't improve for 100 rounds.
[100]	training's l2: 96023.9	valid_1's l2: 552065
[200]	training's l2: 29709.3	valid_1's l2: 354551
[300]	training's l2: 16067.9	valid_1's l2: 299015
[400]	training's l2: 11490.5	valid_1's l2: 282547
[500]	training's l2: 9067.98	valid_1's l2: 274322
[600]	training's l2: 7522.51	valid_1's l2: 271372
[700]	training's l2: 6477.28	valid_1's l2: 271395
Early stopping, best iteration is:
[658]	training's l2: 6879.88	valid_1's l2: 270441
0.6226761089064834


In [69]:
sub[['id','forecastVolum']].round().astype(int).to_csv('CCF_sales0914.csv', index=False)

In [38]:
for month in [25,26,27,28]: 
    m_type = 'lgb' 
    
    data_df, stat_feat = get_stat_feature(data)
    
    num_feat = ['regYear'] + stat_feat
    cate_feat = ['adcode','bodyType','model','regMonth']
    
    if m_type == 'lgb':
        for i in cate_feat:
            data_df[i] = data_df[i].astype('category')
    elif m_type == 'xgb':
        lbl = LabelEncoder()  
        for i in tqdm(cate_feat):
            data_df[i] = lbl.fit_transform(data_df[i].astype(str))
           
    features = num_feat + cate_feat
    print(len(features), len(set(features)))   
    
    sub,val_pred = get_train_model(data_df, month, m_type)   
    data.loc[(data.regMonth==(month-24))&(data.regYear==2018), 'salesVolume'] = sub['forecastVolum'].values
    data.loc[(data.regMonth==(month-24))&(data.regYear==2018), 'label'      ] = sub['forecastVolum'].values
sub = data.loc[(data.regMonth>=1)&(data.regYear==2018), ['id','salesVolume']]
sub.columns = ['id','forecastVolum']
# sub[['id','forecastVolum']].round().astype(int).to_csv('CCF_sales.csv', index=False)


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:05<00:00,  1.38s/it]


53 53
all_idx  : 13 24
train_idx: 13 20
valid_idx: 21 21
test_idx : 25 25
Training until validation scores don't improve for 100 rounds.
[100]	training's l2: 7291.26	valid_1's l2: 37932.3
[200]	training's l2: 3715.93	valid_1's l2: 35716.9
[300]	training's l2: 2515.27	valid_1's l2: 35194.8
[400]	training's l2: 1831.38	valid_1's l2: 34788.8
[500]	training's l2: 1397.64	valid_1's l2: 34608.6
[600]	training's l2: 1065.51	valid_1's l2: 34527.1
[700]	training's l2: 849.095	valid_1's l2: 34415.2
[800]	training's l2: 679.238	valid_1's l2: 34333.1
[900]	training's l2: 545.247	valid_1's l2: 34326.4
Early stopping, best iteration is:
[838]	training's l2: 624.086	valid_1's l2: 34316.6
0.7559303097480222
valid mean: 602.8972278482273
true  mean: 649.3121212121212
test  mean: 493.9801201875369


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:05<00:00,  1.35s/it]


53 53
all_idx  : 13 25
train_idx: 13 21
valid_idx: 22 22
test_idx : 26 26
Training until validation scores don't improve for 100 rounds.
[100]	training's l2: 7936.91	valid_1's l2: 41558.5
Early stopping, best iteration is:
[75]	training's l2: 10761.5	valid_1's l2: 41117.3
0.7414955122487169
valid mean: 623.4500376146684
true  mean: 616.5537878787878
test  mean: 324.6451813135614


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:05<00:00,  1.36s/it]


53 53
all_idx  : 13 26
train_idx: 13 22
valid_idx: 23 23
test_idx : 27 27
Training until validation scores don't improve for 100 rounds.
[100]	training's l2: 8653.84	valid_1's l2: 32560.4
[200]	training's l2: 4627.91	valid_1's l2: 32133.2
Early stopping, best iteration is:
[155]	training's l2: 5757.15	valid_1's l2: 31853.4
0.7794578141087838
valid mean: 639.6227869473659
true  mean: 673.0143939393939
test  mean: 479.55100257280156


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:05<00:00,  1.40s/it]


53 53
all_idx  : 13 27
train_idx: 13 23
valid_idx: 24 24
test_idx : 28 28
Training until validation scores don't improve for 100 rounds.
[100]	training's l2: 9306.24	valid_1's l2: 302435
[200]	training's l2: 5107.21	valid_1's l2: 292129
[300]	training's l2: 3583.18	valid_1's l2: 289373
[400]	training's l2: 2709.06	valid_1's l2: 288389
[500]	training's l2: 2162.77	valid_1's l2: 287983
[600]	training's l2: 1746.75	valid_1's l2: 287504
[700]	training's l2: 1446.22	valid_1's l2: 287029
[800]	training's l2: 1196.94	valid_1's l2: 286429
[900]	training's l2: 1008.26	valid_1's l2: 286030
Early stopping, best iteration is:
[856]	training's l2: 1089.91	valid_1's l2: 286004
0.588139278724323
valid mean: 645.3917897243205
true  mean: 899.8204545454546
test  mean: 468.0356962498267
