In [1]:
# import sys
# import gc
import datetime
import os
import time
import warnings

import lightgbm as lgb
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm, tqdm_notebook
# Silence warnings for the whole session: FutureWarning first, then a
# blanket "ignore" filter that covers every other category as well.
warnings.simplefilter('ignore', FutureWarning)
warnings.filterwarnings(action='ignore')

# Remove pandas' display limits so rendered frames show every column and row.
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, None)

In [2]:
# Raw tables are read from the working directory
# (an earlier layout kept them under './ccf_car/').
train_sales = pd.read_csv('train_sales_data.csv')
train_search = pd.read_csv('train_search_data.csv')
train_user = pd.read_csv('train_user_reply_data.csv')
evaluation_public = pd.read_csv('evaluation_public.csv')
submit_example = pd.read_csv('submit_example.csv')

# Stack the training rows on top of the evaluation rows, then left-join the
# search table and the user-reply table onto them.
data = (
    pd.concat([train_sales, evaluation_public], ignore_index=True)
    .merge(train_search, how='left',
           on=['province', 'adcode', 'model', 'regYear', 'regMonth'])
    .merge(train_user, how='left', on=['model', 'regYear', 'regMonth'])
)

# `label` mirrors salesVolume; rows without an id get id 0.
data['label'] = data['salesVolume']
data['id'] = data['id'].fillna(0).astype(int)

# Look up each model's bodyType from the training table
# (Series indexed by model -> bodyType value).
body_type_by_model = train_sales.drop_duplicates('model').set_index('model')['bodyType']
data['bodyType'] = data['model'].map(body_type_by_model)

# Hand-rolled label encoding: each distinct value is replaced by its
# order of first appearance (value -> integer code).
for cat_col in ('bodyType', 'model'):
    distinct_values = data[cat_col].unique()
    codes = range(data[cat_col].nunique())
    data[cat_col] = data[cat_col].map(dict(zip(distinct_values, codes)))

# Running month index: January 2016 -> 1, January 2017 -> 13, and so on.
data['mt'] = 12 * (data['regYear'] - 2016) + data['regMonth']


In [3]:
def get_stat_feature(df_, cols=('label', 'popularity'), shifts=range(1, 13)):
    """Add lagged copies of ``cols`` for every (adcode, model) monthly series.

    For each column in ``cols`` and each lag ``i`` in ``shifts`` a feature
    ``shift_model_adcode_mt_<col>_<i>`` is created that holds the value the
    same series had ``i`` months earlier (NaN when that month is missing or
    its value is null).

    Args:
        df_: frame with integer-like ``adcode``, ``model`` and ``mt`` columns
            plus every column named in ``cols``. It is not modified.
        cols: columns to lag. Defaults to ``('label', 'popularity')``.
        shifts: lags, in months. Defaults to 1..12.

    Returns:
        ``(df, stat_feat)``: a copy of ``df_`` with the new columns, and the
        list of generated feature names in creation order. The copy also
        carries the helper key columns ``model_adcode``, ``model_adcode_mt``
        and ``model_adcode_mt_<col>_<i>``.

    Raises:
        ValueError: if ``mt + shift`` can reach 100, which would make the
            ``* 100 + mt`` key of one series run into the next one.
    """
    df = df_.copy()
    shifts = list(shifts)
    stat_feat = []

    # Composite integer keys. adcode + model identifies one series
    # (assumes adcode values are spaced wider than the largest model code --
    # TODO confirm); series * 100 + mt identifies one month of that series.
    df['model_adcode'] = df['adcode'] + df['model']
    df['model_adcode_mt'] = df['model_adcode'] * 100 + df['mt']

    if shifts and df['mt'].max() + max(shifts) >= 100:
        raise ValueError(
            'mt + shift must stay below 100 so that shifted keys remain unique'
        )

    for col in tqdm(cols):
        # Only rows with a known value can act as a lag source; filter once
        # per column and keep just the two columns that are needed.
        known = df.loc[df[col].notnull(), ['model_adcode_mt', col]]
        for i in shifts:
            key_name = 'model_adcode_mt_{}_{}'.format(col, i)
            feat_name = 'shift_' + key_name
            stat_feat.append(feat_name)
            # Helper key column, retained because earlier versions returned it.
            df[key_name] = df['model_adcode_mt'] + i
            # Re-key the known values i months forward: looking a row up by
            # its own key then yields the value from i months before it.
            lagged = known.set_index(known['model_adcode_mt'] + i)[col]
            df[feat_name] = df['model_adcode_mt'].map(lagged)
    return df, stat_feat

In [4]:
def score(data, pred='pred_label', label='label', group='model'):
    """Return ``1 - mean(NRMSE)`` of the predictions, averaged over groups.

    For every distinct value of ``group`` the RMSE between prediction and
    label is divided by that group's mean label; the score is one minus the
    plain average of those ratios. The score is printed as well as returned.

    Predictions are clipped at zero and rounded to integers before scoring.
    This happens on a private copy, so ``data`` itself is left untouched.
    Rows whose label is missing do not contribute to the group means.

    Args:
        data: frame holding the ``group``, ``pred`` and ``label`` columns.
        pred: name of the prediction column.
        label: name of the ground-truth column.
        group: name of the column that defines the groups.

    Returns:
        The score as a float (``nan``/``inf`` if a group's mean label is 0).
    """
    frame = data[[group, pred, label]].copy()
    frame[pred] = frame[pred].clip(lower=0).round().astype(int)

    keys = frame[group]
    squared_error = (frame[pred] - frame[label]) ** 2
    # observed=True keeps unused categories of a categorical key from
    # showing up as empty groups.
    rmse = np.sqrt(squared_error.groupby(keys, observed=True).mean())
    label_mean = frame[label].groupby(keys, observed=True).mean()

    # np.mean on the raw array so a nan/inf ratio propagates instead of
    # being silently dropped.
    result = 1 - np.mean((rmse / label_mean).to_numpy())
    print(result)
    return result

In [10]:
def get_model_type(train_x,train_y,valid_x,valid_y,m_type='lgb'):
    """Build a gradient-boosting regressor and fit it with early stopping.

    Both the training and the validation split are passed as evaluation
    sets, progress is logged every 100 rounds and training stops after 100
    rounds without improvement.

    The LightGBM branch reads the module-level ``cate_feat`` list for its
    ``categorical_feature`` argument.

    Args:
        train_x, train_y: training features and target.
        valid_x, valid_y: validation features and target.
        m_type: ``'lgb'`` for LightGBM or ``'xgb'`` for XGBoost.

    Returns:
        The fitted regressor.

    Raises:
        ValueError: if ``m_type`` is neither ``'lgb'`` nor ``'xgb'``.
    """
    # Fail before any training; previously an unknown m_type fell through to
    # `return model` and raised UnboundLocalError.
    if m_type not in ('lgb', 'xgb'):
        raise ValueError(
            "Unsupported m_type {!r}; expected 'lgb' or 'xgb'".format(m_type)
        )

    eval_set = [(train_x, train_y), (valid_x, valid_y)]
    # NOTE(review): whether fit() accepts `early_stopping_rounds` / `verbose`
    # depends on the installed lightgbm / xgboost versions -- confirm against
    # the environment this notebook is pinned to.
    if m_type == 'lgb':
        model = lgb.LGBMRegressor(
                                num_leaves=2**5-1, reg_alpha=0.25, reg_lambda=0.25, objective='mse',
                                max_depth=-1, learning_rate=0.05, min_child_samples=5, random_state=500,
                                n_estimators=2000, subsample=0.9, colsample_bytree=0.7,
                                )
        model.fit(train_x, train_y,
              eval_set=eval_set,
              categorical_feature=cate_feat,
              early_stopping_rounds=100, verbose=100)
    else:
        # NOTE(review): `min_child_samples` is a LightGBM parameter name;
        # XGBoost's closest setting is `min_child_weight`, so this value is
        # probably not applied. Left unchanged because switching would alter
        # the fitted model -- confirm the intent.
        model = xgb.XGBRegressor(
                                max_depth=5 , learning_rate=0.05, n_estimators=2000,
                                objective='reg:gamma', tree_method = 'hist',subsample=0.9,
                                colsample_bytree=0.7, min_child_samples=5,eval_metric = 'rmse'
                                )
        model.fit(train_x, train_y,
              eval_set=eval_set,
              early_stopping_rounds=100, verbose=100)
    return model

In [6]:
def get_train_model(df_, m, m_type='lgb'):
    """Fit, validate and refit a model, then forecast month index ``m``.

    Steps:
      1. Train on ``mt`` in [13, m-5] with ``mt == m-4`` as the
         early-stopping validation month, and print the validation score.
      2. Refit on ``mt`` in [13, m-1] with ``best iteration + 100`` trees
         and predict every row.
      3. Return the clipped, rounded forecast for ``mt == m``.

    Reads the module-level ``features`` (model input columns) and
    ``cate_feat`` (categorical columns for LightGBM); both must be set
    before calling.

    Args:
        df_: feature frame with ``mt``, ``id``, ``label`` and all columns in
            ``features``. It is not modified.
        m: month index (``mt`` value) to forecast.
        m_type: ``'lgb'`` or ``'xgb'``; forwarded to ``get_model_type``.

    Returns:
        ``(sub, valid_pred)``: ``sub`` has columns ``id`` and
        ``forecastVolum`` (non-negative ints) for the forecast month;
        ``valid_pred`` is the raw first-stage prediction for the validation
        month.
    """
    df = df_.copy()
    # Dataset split
    st = 13
    all_idx   = df['mt'].between(st , m-1)
    train_idx = df['mt'].between(st , m-5)
    valid_idx = df['mt'].between(m-4, m-4)
    test_idx  = df['mt'].between(m  , m  )
    print('all_idx  :',st ,m-1)
    print('train_idx:',st ,m-5)
    print('valid_idx:',m-4,m-4)
    print('test_idx :',m  ,m  )
    # Final train / validation matrices
    train_x = df.loc[train_idx, features]
    train_y = df.loc[train_idx, 'label']
    valid_x = df.loc[valid_idx, features]
    valid_y = df.loc[valid_idx, 'label']
    # get model
    model = get_model_type(train_x,train_y,valid_x,valid_y,m_type)
    # offline: score the validation month (score() prints the result).
    # A copy is passed so scoring can never touch df's raw predictions.
    df['pred_label'] = model.predict(df[features])
    score(df[valid_idx].copy())
    # online: refit on all known months with a slightly larger tree budget
    if m_type == 'lgb':
        model.n_estimators = model.best_iteration_ + 100
        model.fit(df.loc[all_idx, features], df.loc[all_idx, 'label'], categorical_feature=cate_feat)
    elif m_type == 'xgb':
        model.n_estimators = model.best_iteration + 100
        model.fit(df.loc[all_idx, features], df.loc[all_idx, 'label'])
    df['forecastVolum'] = model.predict(df[features])
    print('valid mean:',df[valid_idx]['pred_label'].mean())
    print('true  mean:',df[valid_idx]['label'].mean())
    print('test  mean:',df[test_idx]['forecastVolum'].mean())
    # Stage result: explicit copy so adding a column is not a chained
    # assignment on a slice of df.
    sub = df.loc[test_idx, ['id']].copy()
    sub['forecastVolum'] = df.loc[test_idx, 'forecastVolum'].clip(lower=0).round().astype(int)
    return sub, df.loc[valid_idx, 'pred_label']

In [86]:
# Preview the first rows of the merged `data` frame.
data.head()

Unnamed: 0,adcode,bodyType,forecastVolum,id,model,province,regMonth,regYear,salesVolume,popularity,carCommentVolum,newsReplyVolum,label,mt
0,310000,0,,0,0,上海,1,2016,292.0,1479.0,11.0,106.0,292.0,1
1,530000,0,,0,0,云南,1,2016,466.0,1594.0,11.0,106.0,466.0,1
2,150000,0,,0,0,内蒙古,1,2016,257.0,1479.0,11.0,106.0,257.0,1
3,110000,0,,0,0,北京,1,2016,408.0,2370.0,11.0,106.0,408.0,1
4,510000,0,,0,0,四川,1,2016,610.0,3562.0,11.0,106.0,610.0,1


In [11]:
# Recursive forecast: predict one month at a time and write each forecast
# back into `data` so it can feed the lag features of the following month.
# `features` and `cate_feat` are module-level on purpose: get_train_model and
# get_model_type read them as globals.
m_type = 'lgb'
cate_feat = ['adcode','bodyType','model','regMonth']

for month in [25,26,27,28]:
    # Lag features are rebuilt every round because `data` has just been
    # updated with the previous month's forecast.
    data_df, stat_feat = get_stat_feature(data)

    num_feat = ['regYear'] + stat_feat

    if m_type == 'lgb':
        for col in cate_feat:
            data_df[col] = data_df[col].astype('category')
    elif m_type == 'xgb':
        lbl = LabelEncoder()
        for col in tqdm(cate_feat):
            data_df[col] = lbl.fit_transform(data_df[col].astype(str))

    features = num_feat + cate_feat
    print(len(features), len(set(features)))

    sub,val_pred = get_train_model(data_df, month, m_type)

    # mt 25..28 correspond to regMonth 1..4 of 2018.
    forecast_rows = (data.regMonth==(month-24))&(data.regYear==2018)
    data.loc[forecast_rows, 'salesVolume'] = sub['forecastVolum'].values
    data.loc[forecast_rows, 'label'      ] = sub['forecastVolum'].values

sub = data.loc[(data.regMonth>=1)&(data.regYear==2018), ['id','salesVolume']]
sub.columns = ['id','forecastVolum']
# to_csv does not create missing directories; make sure the target exists
# so the cell also works on a fresh checkout.
os.makedirs('submit', exist_ok=True)
sub[['id','forecastVolum']].round().astype(int).to_csv('submit/yulao.csv', index=False)


100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  2.58it/s]


29 29
all_idx  : 13 24
train_idx: 13 20
valid_idx: 21 21
test_idx : 25 25
Training until validation scores don't improve for 100 rounds.
[100]	training's l2: 8297.45	valid_1's l2: 33865.7
[200]	training's l2: 4330.21	valid_1's l2: 30336
[300]	training's l2: 2917.34	valid_1's l2: 29756.2
[400]	training's l2: 2088.04	valid_1's l2: 29307.2
[500]	training's l2: 1566.78	valid_1's l2: 29098.4
[600]	training's l2: 1210.01	valid_1's l2: 29073.3
[700]	training's l2: 970.877	valid_1's l2: 29042.1
[800]	training's l2: 761.514	valid_1's l2: 28910.1
Early stopping, best iteration is:
[785]	training's l2: 790.4	valid_1's l2: 28896.6
0.7594149220102722
valid mean: 597.6044421089272
true  mean: 649.3121212121212
test  mean: 498.3089027836437


100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  2.52it/s]


29 29
all_idx  : 13 25
train_idx: 13 21
valid_idx: 22 22
test_idx : 26 26
Training until validation scores don't improve for 100 rounds.
[100]	training's l2: 8960.79	valid_1's l2: 45424.7
[200]	training's l2: 4965.39	valid_1's l2: 44915.5
[300]	training's l2: 3333.06	valid_1's l2: 44686.8
[400]	training's l2: 2410.18	valid_1's l2: 44435
[500]	training's l2: 1874.15	valid_1's l2: 44378.2
[600]	training's l2: 1484.16	valid_1's l2: 44420.8
Early stopping, best iteration is:
[525]	training's l2: 1764.58	valid_1's l2: 44347.9
0.7356319961915723
valid mean: 622.2060321741504
true  mean: 616.5537878787878
test  mean: 335.0132116566594


100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  2.58it/s]


29 29
all_idx  : 13 26
train_idx: 13 22
valid_idx: 23 23
test_idx : 27 27
Training until validation scores don't improve for 100 rounds.
[100]	training's l2: 10098.8	valid_1's l2: 35356.1
[200]	training's l2: 5542.35	valid_1's l2: 33904.7
[300]	training's l2: 3793.56	valid_1's l2: 33820.5
Early stopping, best iteration is:
[246]	training's l2: 4631.42	valid_1's l2: 33528.7
0.7824082341041323
valid mean: 634.4264573093336
true  mean: 673.0143939393939
test  mean: 507.24666740850523


100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  2.52it/s]


29 29
all_idx  : 13 27
train_idx: 13 23
valid_idx: 24 24
test_idx : 28 28
Training until validation scores don't improve for 100 rounds.
[100]	training's l2: 10683	valid_1's l2: 311125
[200]	training's l2: 6111.38	valid_1's l2: 302843
Early stopping, best iteration is:
[136]	training's l2: 8351.28	valid_1's l2: 302017
0.6075122323501823
valid mean: 653.0269497681012
true  mean: 899.8204545454546
test  mean: 504.1382169474577


In [68]:
# Variant of the recursive forecast: `regMonth` and the current-month
# `popularity` are fed to the model as plain numeric inputs.
# NOTE(review): `popularity` comes from the train_search join, so it is
# presumably missing for the 2018 forecast rows -- verify before relying
# on this feature.
m_type = 'lgb'
cate_feat = ['adcode', 'bodyType', 'model']

for month in (25, 26, 27, 28):
    data_df, stat_feat = get_stat_feature(data)

    num_feat = ['regYear', 'regMonth', 'popularity'] + stat_feat
    features = num_feat + cate_feat

    if m_type == 'xgb':
        lbl = LabelEncoder()
        for cat_col in tqdm(cate_feat):
            data_df[cat_col] = lbl.fit_transform(data_df[cat_col].astype(str))
    elif m_type == 'lgb':
        data_df = data_df.astype({cat_col: 'category' for cat_col in cate_feat})

    sub, val_pred = get_train_model(data_df, month, m_type)

    # Write this month's forecast back so later months can use it as a lag.
    forecast_rows = (data.regYear == 2018) & (data.regMonth == (month - 24))
    for target_col in ('salesVolume', 'label'):
        data.loc[forecast_rows, target_col] = sub['forecastVolum'].values

# Collect all 2018 forecasts under the submission column name.
# The CSV export for this run is done in a separate cell.
sub = (
    data.loc[(data.regMonth >= 1) & (data.regYear == 2018), ['id', 'salesVolume']]
    .rename(columns={'salesVolume': 'forecastVolum'})
)


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:02<00:00,  1.37it/s]


all_idx  : 13 24
train_idx: 13 20
valid_idx: 21 21
test_idx : 25 25
Training until validation scores don't improve for 100 rounds.
[100]	training's l2: 86584.9	valid_1's l2: 151719
[200]	training's l2: 27784.3	valid_1's l2: 71704.5
[300]	training's l2: 14902.5	valid_1's l2: 57361.3
[400]	training's l2: 10243.5	valid_1's l2: 52224.7
[500]	training's l2: 7869.38	valid_1's l2: 50304.7
[600]	training's l2: 6385.06	valid_1's l2: 49569.7
[700]	training's l2: 5380.65	valid_1's l2: 48911.9
[800]	training's l2: 4634.41	valid_1's l2: 48274
[900]	training's l2: 4071.47	valid_1's l2: 48026.2
[1000]	training's l2: 3625	valid_1's l2: 47641.9
[1100]	training's l2: 3253.2	valid_1's l2: 47425.4
[1200]	training's l2: 2928.61	valid_1's l2: 47143.3
[1300]	training's l2: 2653.04	valid_1's l2: 46870.8
[1400]	training's l2: 2414.07	valid_1's l2: 46669.9
[1500]	training's l2: 2207.68	valid_1's l2: 46487.6
[1600]	training's l2: 2028.06	valid_1's l2: 46336.7
[1700]	training's l2: 1876.96	valid_1's l2: 46208.4
[

100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:02<00:00,  1.41it/s]


all_idx  : 13 25
train_idx: 13 21
valid_idx: 22 22
test_idx : 26 26
Training until validation scores don't improve for 100 rounds.
[100]	training's l2: 90083.5	valid_1's l2: 118412
[200]	training's l2: 28475.8	valid_1's l2: 59906.1
[300]	training's l2: 15309.6	valid_1's l2: 52080.3
[400]	training's l2: 10785.7	valid_1's l2: 50404.3
[500]	training's l2: 8324.54	valid_1's l2: 49632.1
[600]	training's l2: 6828.3	valid_1's l2: 49046.4
[700]	training's l2: 5813.9	valid_1's l2: 48997.8
Early stopping, best iteration is:
[648]	training's l2: 6301.83	valid_1's l2: 48762.4
0.7352932759641437


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:02<00:00,  1.40it/s]


all_idx  : 13 26
train_idx: 13 22
valid_idx: 23 23
test_idx : 27 27
Training until validation scores don't improve for 100 rounds.
[100]	training's l2: 93591.6	valid_1's l2: 135289
[200]	training's l2: 29582.9	valid_1's l2: 52574.8
[300]	training's l2: 15864.8	valid_1's l2: 39181.2
[400]	training's l2: 11154.7	valid_1's l2: 36023.5
[500]	training's l2: 8739.67	valid_1's l2: 35331.5
[600]	training's l2: 7173.39	valid_1's l2: 34526.5
[700]	training's l2: 6126.06	valid_1's l2: 34402.5
[800]	training's l2: 5347.97	valid_1's l2: 34349
[900]	training's l2: 4770.82	valid_1's l2: 34426
Early stopping, best iteration is:
[806]	training's l2: 5308.31	valid_1's l2: 34325.1
0.7811424903161602


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:02<00:00,  1.38it/s]


all_idx  : 13 27
train_idx: 13 23
valid_idx: 24 24
test_idx : 28 28
Training until validation scores don't improve for 100 rounds.
[100]	training's l2: 96023.9	valid_1's l2: 552065
[200]	training's l2: 29709.3	valid_1's l2: 354551
[300]	training's l2: 16067.9	valid_1's l2: 299015
[400]	training's l2: 11490.5	valid_1's l2: 282547
[500]	training's l2: 9067.98	valid_1's l2: 274322
[600]	training's l2: 7522.51	valid_1's l2: 271372
[700]	training's l2: 6477.28	valid_1's l2: 271395
Early stopping, best iteration is:
[658]	training's l2: 6879.88	valid_1's l2: 270441
0.6226761089064834


In [69]:
# Write the forecasts currently held in `sub` (left behind by whichever
# forecasting cell ran last) as integer values.
submission = sub[['id', 'forecastVolum']].round().astype(int)
submission.to_csv('CCF_sales0914.csv', index=False)

In [38]:
# Recursive forecast with lag features plus `regYear`; `regMonth` is treated
# as categorical. Same configuration as the cell that writes
# 'submit/yulao.csv', but without a CSV export.
m_type = 'lgb'
cate_feat = ['adcode', 'bodyType', 'model', 'regMonth']

for month in (25, 26, 27, 28):
    data_df, stat_feat = get_stat_feature(data)

    num_feat = ['regYear'] + stat_feat
    features = num_feat + cate_feat

    if m_type == 'xgb':
        lbl = LabelEncoder()
        for cat_col in tqdm(cate_feat):
            data_df[cat_col] = lbl.fit_transform(data_df[cat_col].astype(str))
    elif m_type == 'lgb':
        data_df = data_df.astype({cat_col: 'category' for cat_col in cate_feat})

    print(len(features), len(set(features)))

    sub, val_pred = get_train_model(data_df, month, m_type)

    # Write this month's forecast back so later months can use it as a lag.
    forecast_rows = (data.regYear == 2018) & (data.regMonth == (month - 24))
    for target_col in ('salesVolume', 'label'):
        data.loc[forecast_rows, target_col] = sub['forecastVolum'].values

# Collect all 2018 forecasts under the submission column name.
sub = (
    data.loc[(data.regMonth >= 1) & (data.regYear == 2018), ['id', 'salesVolume']]
    .rename(columns={'salesVolume': 'forecastVolum'})
)
# CSV export intentionally disabled for this run:
# sub[['id','forecastVolum']].round().astype(int).to_csv('CCF_sales.csv', index=False)


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:05<00:00,  1.38s/it]


53 53
all_idx  : 13 24
train_idx: 13 20
valid_idx: 21 21
test_idx : 25 25
Training until validation scores don't improve for 100 rounds.
[100]	training's l2: 7291.26	valid_1's l2: 37932.3
[200]	training's l2: 3715.93	valid_1's l2: 35716.9
[300]	training's l2: 2515.27	valid_1's l2: 35194.8
[400]	training's l2: 1831.38	valid_1's l2: 34788.8
[500]	training's l2: 1397.64	valid_1's l2: 34608.6
[600]	training's l2: 1065.51	valid_1's l2: 34527.1
[700]	training's l2: 849.095	valid_1's l2: 34415.2
[800]	training's l2: 679.238	valid_1's l2: 34333.1
[900]	training's l2: 545.247	valid_1's l2: 34326.4
Early stopping, best iteration is:
[838]	training's l2: 624.086	valid_1's l2: 34316.6
0.7559303097480222
valid mean: 602.8972278482273
true  mean: 649.3121212121212
test  mean: 493.9801201875369


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:05<00:00,  1.35s/it]


53 53
all_idx  : 13 25
train_idx: 13 21
valid_idx: 22 22
test_idx : 26 26
Training until validation scores don't improve for 100 rounds.
[100]	training's l2: 7936.91	valid_1's l2: 41558.5
Early stopping, best iteration is:
[75]	training's l2: 10761.5	valid_1's l2: 41117.3
0.7414955122487169
valid mean: 623.4500376146684
true  mean: 616.5537878787878
test  mean: 324.6451813135614


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:05<00:00,  1.36s/it]


53 53
all_idx  : 13 26
train_idx: 13 22
valid_idx: 23 23
test_idx : 27 27
Training until validation scores don't improve for 100 rounds.
[100]	training's l2: 8653.84	valid_1's l2: 32560.4
[200]	training's l2: 4627.91	valid_1's l2: 32133.2
Early stopping, best iteration is:
[155]	training's l2: 5757.15	valid_1's l2: 31853.4
0.7794578141087838
valid mean: 639.6227869473659
true  mean: 673.0143939393939
test  mean: 479.55100257280156


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:05<00:00,  1.40s/it]


53 53
all_idx  : 13 27
train_idx: 13 23
valid_idx: 24 24
test_idx : 28 28
Training until validation scores don't improve for 100 rounds.
[100]	training's l2: 9306.24	valid_1's l2: 302435
[200]	training's l2: 5107.21	valid_1's l2: 292129
[300]	training's l2: 3583.18	valid_1's l2: 289373
[400]	training's l2: 2709.06	valid_1's l2: 288389
[500]	training's l2: 2162.77	valid_1's l2: 287983
[600]	training's l2: 1746.75	valid_1's l2: 287504
[700]	training's l2: 1446.22	valid_1's l2: 287029
[800]	training's l2: 1196.94	valid_1's l2: 286429
[900]	training's l2: 1008.26	valid_1's l2: 286030
Early stopping, best iteration is:
[856]	training's l2: 1089.91	valid_1's l2: 286004
0.588139278724323
valid mean: 645.3917897243205
true  mean: 899.8204545454546
test  mean: 468.0356962498267
