In [12]:
import numpy as np
import pandas as pd
from tqdm import tqdm, tqdm_notebook
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import f1_score, roc_auc_score
import datetime
import time
import lightgbm as lgb
import xgboost as xgb
from math import sqrt
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings('ignore')

In [13]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [14]:
def mse(true_label,pre_label):
    squaredError = (true_label - pre_label)*(true_label - pre_label)
    return sum(squaredError) / len(squaredError)

In [15]:
train_sales  = pd.read_csv('train_sales_data.csv')
train_search = pd.read_csv('train_search_data.csv')
train_user   = pd.read_csv('train_user_reply_data.csv')

evaluation_public = pd.read_csv('evaluation_public.csv')
submit_example    = pd.read_csv('submit_example.csv')

In [3]:
train_sales.head(5)

Unnamed: 0,province,adcode,model,bodyType,regYear,regMonth,salesVolume
0,上海,310000,3c974920a76ac9c1,SUV,2016,1,292
1,云南,530000,3c974920a76ac9c1,SUV,2016,1,466
2,内蒙古,150000,3c974920a76ac9c1,SUV,2016,1,257
3,北京,110000,3c974920a76ac9c1,SUV,2016,1,408
4,四川,510000,3c974920a76ac9c1,SUV,2016,1,610


In [4]:
train_search.head(5)

Unnamed: 0,province,adcode,model,regYear,regMonth,popularity
0,河南,410000,17bc272c93f19d56,2016,1,19036
1,河南,410000,17bc272c93f19d56,2016,2,17856
2,河南,410000,17bc272c93f19d56,2016,3,12517
3,河南,410000,17bc272c93f19d56,2016,4,9700
4,河南,410000,17bc272c93f19d56,2016,5,12780


In [5]:
train_user.head(5)

Unnamed: 0,model,regYear,regMonth,carCommentVolum,newsReplyVolum
0,02aab221aabc03b9,2016,1,132,399
1,02aab221aabc03b9,2016,2,160,3043
2,02aab221aabc03b9,2016,3,357,798
3,02aab221aabc03b9,2016,4,243,3821
4,02aab221aabc03b9,2016,5,283,933


In [4]:
evaluation_public.tail(5)

Unnamed: 0,id,province,adcode,model,regYear,regMonth,forecastVolum
5275,5364,福建,350000,a9a43d1a7ecbe75d,2018,4,
5276,5365,辽宁,210000,a9a43d1a7ecbe75d,2018,4,
5277,5366,重庆,500000,a9a43d1a7ecbe75d,2018,4,
5278,5367,陕西,610000,a9a43d1a7ecbe75d,2018,4,
5279,5368,黑龙江,230000,a9a43d1a7ecbe75d,2018,4,


In [7]:
evaluation_public.shape

(5280, 7)

# 2018年1月，提取方式历史月份销量比例，考虑时间衰减，月份越近占比越高

In [16]:
m1_12    = train_sales.loc[(train_sales.regYear==2017)&(train_sales.regMonth==1) , 'salesVolume'].values / train_sales.loc[(train_sales.regYear==2016)&(train_sales.regMonth==12), 'salesVolume'].values
m1_11    = train_sales.loc[(train_sales.regYear==2017)&(train_sales.regMonth==1) , 'salesVolume'].values / train_sales.loc[(train_sales.regYear==2016)&(train_sales.regMonth==11), 'salesVolume'].values
m1_10    = train_sales.loc[(train_sales.regYear==2017)&(train_sales.regMonth==1) , 'salesVolume'].values / train_sales.loc[(train_sales.regYear==2016)&(train_sales.regMonth==10), 'salesVolume'].values
m1_09    = train_sales.loc[(train_sales.regYear==2017)&(train_sales.regMonth==1) , 'salesVolume'].values / train_sales.loc[(train_sales.regYear==2016)&(train_sales.regMonth==9) , 'salesVolume'].values
m1_08    = train_sales.loc[(train_sales.regYear==2017)&(train_sales.regMonth==1) , 'salesVolume'].values / train_sales.loc[(train_sales.regYear==2016)&(train_sales.regMonth==8) , 'salesVolume'].values

m1_12_volum = train_sales.loc[(train_sales.regYear==2017)&(train_sales.regMonth==12), 'salesVolume'].values * m1_12
m1_11_volum = train_sales.loc[(train_sales.regYear==2017)&(train_sales.regMonth==11), 'salesVolume'].values * m1_11
m1_10_volum = train_sales.loc[(train_sales.regYear==2017)&(train_sales.regMonth==10), 'salesVolume'].values * m1_10
m1_09_volum = train_sales.loc[(train_sales.regYear==2017)&(train_sales.regMonth==9) , 'salesVolume'].values * m1_09
m1_08_volum = train_sales.loc[(train_sales.regYear==2017)&(train_sales.regMonth==8) , 'salesVolume'].values * m1_08

evaluation_public.loc[evaluation_public.regMonth==1, 'forecastVolum'] =  m1_12_volum/2 + m1_11_volum/4 + m1_10_volum/8 + m1_09_volum/16 + m1_08_volum/16

In [33]:
evaluation_public.tail(10)

Unnamed: 0,id,province,adcode,model,regYear,regMonth,forecastVolum
5270,5359,河北,130000,a9a43d1a7ecbe75d,2018,4,111.663448
5271,5360,河南,410000,a9a43d1a7ecbe75d,2018,4,128.788946
5272,5361,浙江,330000,a9a43d1a7ecbe75d,2018,4,336.11699
5273,5362,湖北,420000,a9a43d1a7ecbe75d,2018,4,184.907326
5274,5363,湖南,430000,a9a43d1a7ecbe75d,2018,4,324.15175
5275,5364,福建,350000,a9a43d1a7ecbe75d,2018,4,87.129123
5276,5365,辽宁,210000,a9a43d1a7ecbe75d,2018,4,97.56222
5277,5366,重庆,500000,a9a43d1a7ecbe75d,2018,4,155.508097
5278,5367,陕西,610000,a9a43d1a7ecbe75d,2018,4,237.946192
5279,5368,黑龙江,230000,a9a43d1a7ecbe75d,2018,4,61.765991


In [17]:
m16_1_2  = train_sales.loc[(train_sales.regYear==2016)&(train_sales.regMonth==1) , 'salesVolume'].values / train_sales.loc[(train_sales.regYear==2016)&(train_sales.regMonth==2) , 'salesVolume'].values
m16_1_3  = train_sales.loc[(train_sales.regYear==2016)&(train_sales.regMonth==1) , 'salesVolume'].values / train_sales.loc[(train_sales.regYear==2016)&(train_sales.regMonth==3) , 'salesVolume'].values
m16_1_4  = train_sales.loc[(train_sales.regYear==2016)&(train_sales.regMonth==1) , 'salesVolume'].values / train_sales.loc[(train_sales.regYear==2016)&(train_sales.regMonth==4) , 'salesVolume'].values
m16_1_5  = train_sales.loc[(train_sales.regYear==2016)&(train_sales.regMonth==1) , 'salesVolume'].values / train_sales.loc[(train_sales.regYear==2016)&(train_sales.regMonth==5) , 'salesVolume'].values

m16_2_3  = train_sales.loc[(train_sales.regYear==2016)&(train_sales.regMonth==2) , 'salesVolume'].values / train_sales.loc[(train_sales.regYear==2016)&(train_sales.regMonth==3) , 'salesVolume'].values
m16_2_4  = train_sales.loc[(train_sales.regYear==2016)&(train_sales.regMonth==2) , 'salesVolume'].values / train_sales.loc[(train_sales.regYear==2016)&(train_sales.regMonth==4) , 'salesVolume'].values
m16_2_5  = train_sales.loc[(train_sales.regYear==2016)&(train_sales.regMonth==2) , 'salesVolume'].values / train_sales.loc[(train_sales.regYear==2016)&(train_sales.regMonth==5) , 'salesVolume'].values
m16_2_6  = train_sales.loc[(train_sales.regYear==2016)&(train_sales.regMonth==2) , 'salesVolume'].values / train_sales.loc[(train_sales.regYear==2016)&(train_sales.regMonth==6) , 'salesVolume'].values

m16_3_4  = train_sales.loc[(train_sales.regYear==2016)&(train_sales.regMonth==3) , 'salesVolume'].values / train_sales.loc[(train_sales.regYear==2016)&(train_sales.regMonth==4) , 'salesVolume'].values
m16_3_5  = train_sales.loc[(train_sales.regYear==2016)&(train_sales.regMonth==3) , 'salesVolume'].values / train_sales.loc[(train_sales.regYear==2016)&(train_sales.regMonth==5) , 'salesVolume'].values
m16_3_6  = train_sales.loc[(train_sales.regYear==2016)&(train_sales.regMonth==3) , 'salesVolume'].values / train_sales.loc[(train_sales.regYear==2016)&(train_sales.regMonth==6) , 'salesVolume'].values
m16_3_7  = train_sales.loc[(train_sales.regYear==2016)&(train_sales.regMonth==3) , 'salesVolume'].values / train_sales.loc[(train_sales.regYear==2016)&(train_sales.regMonth==7) , 'salesVolume'].values

m17_1_2  = train_sales.loc[(train_sales.regYear==2017)&(train_sales.regMonth==1) , 'salesVolume'].values / train_sales.loc[(train_sales.regYear==2017)&(train_sales.regMonth==2) , 'salesVolume'].values
m17_1_3  = train_sales.loc[(train_sales.regYear==2017)&(train_sales.regMonth==1) , 'salesVolume'].values / train_sales.loc[(train_sales.regYear==2017)&(train_sales.regMonth==3) , 'salesVolume'].values
m17_1_4  = train_sales.loc[(train_sales.regYear==2017)&(train_sales.regMonth==1) , 'salesVolume'].values / train_sales.loc[(train_sales.regYear==2017)&(train_sales.regMonth==4) , 'salesVolume'].values
m17_1_5  = train_sales.loc[(train_sales.regYear==2017)&(train_sales.regMonth==1) , 'salesVolume'].values / train_sales.loc[(train_sales.regYear==2017)&(train_sales.regMonth==5) , 'salesVolume'].values


m17_2_3  = train_sales.loc[(train_sales.regYear==2017)&(train_sales.regMonth==2) , 'salesVolume'].values / train_sales.loc[(train_sales.regYear==2017)&(train_sales.regMonth==3) , 'salesVolume'].values
m17_2_4  = train_sales.loc[(train_sales.regYear==2017)&(train_sales.regMonth==2) , 'salesVolume'].values / train_sales.loc[(train_sales.regYear==2017)&(train_sales.regMonth==4) , 'salesVolume'].values
m17_2_5  = train_sales.loc[(train_sales.regYear==2017)&(train_sales.regMonth==2) , 'salesVolume'].values / train_sales.loc[(train_sales.regYear==2017)&(train_sales.regMonth==5) , 'salesVolume'].values
m17_2_6  = train_sales.loc[(train_sales.regYear==2017)&(train_sales.regMonth==2) , 'salesVolume'].values / train_sales.loc[(train_sales.regYear==2017)&(train_sales.regMonth==6) , 'salesVolume'].values

m17_3_4  = train_sales.loc[(train_sales.regYear==2017)&(train_sales.regMonth==3) , 'salesVolume'].values / train_sales.loc[(train_sales.regYear==2017)&(train_sales.regMonth==4) , 'salesVolume'].values
m17_3_5  = train_sales.loc[(train_sales.regYear==2017)&(train_sales.regMonth==3) , 'salesVolume'].values / train_sales.loc[(train_sales.regYear==2017)&(train_sales.regMonth==5) , 'salesVolume'].values
m17_3_6  = train_sales.loc[(train_sales.regYear==2017)&(train_sales.regMonth==3) , 'salesVolume'].values / train_sales.loc[(train_sales.regYear==2017)&(train_sales.regMonth==6) , 'salesVolume'].values
m17_3_7  = train_sales.loc[(train_sales.regYear==2017)&(train_sales.regMonth==3) , 'salesVolume'].values / train_sales.loc[(train_sales.regYear==2017)&(train_sales.regMonth==7) , 'salesVolume'].values

m16_1 = m16_1_2/2 + m16_1_3/4 + m16_1_4/8 + m16_1_5/8
m16_2 = m16_2_3/2 + m16_2_4/4 + m16_2_5/8 + m16_2_6/8
m16_3 = m16_3_4/2 + m16_3_5/4 + m16_3_6/8 + m16_3_7/8

m17_1 = m17_1_2/2 + m17_1_3/4 + m17_1_4/8 + m17_1_5/8
m17_2 = m17_2_3/2 + m17_2_4/4 + m17_2_5/8 + m17_2_6/8
m17_3 = m17_3_4/2 + m17_3_5/4 + m17_3_6/8 + m17_3_7/8

m1 = m16_1 * 0.4 +  m17_1 * 0.6
m2 = m16_2 * 0.4 +  m17_2 * 0.6
m3 = m16_3 * 0.4 +  m17_3 * 0.6

evaluation_public.loc[evaluation_public.regMonth==2, 'forecastVolum'] = train_sales.loc[(train_sales.regYear==2017)&(train_sales.regMonth==1) , 'salesVolume'].values / m1
evaluation_public.loc[evaluation_public.regMonth==3, 'forecastVolum'] = train_sales.loc[(train_sales.regYear==2017)&(train_sales.regMonth==2) , 'salesVolume'].values / m2
evaluation_public.loc[evaluation_public.regMonth==4, 'forecastVolum'] = train_sales.loc[(train_sales.regYear==2017)&(train_sales.regMonth==3) , 'salesVolume'].values / m3

In [11]:
evaluation_public.head(10)

Unnamed: 0,id,province,adcode,model,regYear,regMonth,forecastVolum
0,1,上海,310000,3c974920a76ac9c1,2018,1,288.68744
1,2,云南,530000,3c974920a76ac9c1,2018,1,373.407562
2,3,内蒙古,150000,3c974920a76ac9c1,2018,1,152.85144
3,4,北京,110000,3c974920a76ac9c1,2018,1,288.068719
4,5,四川,510000,3c974920a76ac9c1,2018,1,398.131997
5,6,安徽,340000,3c974920a76ac9c1,2018,1,192.017912
6,7,山东,370000,3c974920a76ac9c1,2018,1,437.258605
7,8,山西,140000,3c974920a76ac9c1,2018,1,206.935851
8,9,广东,440000,3c974920a76ac9c1,2018,1,2371.091268
9,10,广西,450000,3c974920a76ac9c1,2018,1,346.799422


In [12]:
print(train_sales.loc[(train_sales.regMonth<=4) ,'salesVolume'].mean())
print(evaluation_public['forecastVolum'].mean())
evaluation_public[['id','forecastVolum']].round().astype(int).to_csv('ccf_car_sales.csv', index=False)

524.1121212121212
477.4628274411719


In [18]:
data = pd.concat([train_sales, evaluation_public], ignore_index=True)
data = data.merge(train_search, 'left', on=['province', 'adcode', 'model', 'regYear', 'regMonth'])
data = data.merge(train_user, 'left', on=['model', 'regYear', 'regMonth'])
data['label'] = data['salesVolume']
data['id'] = data['id'].fillna(0).astype(int)
del data['salesVolume'], data['forecastVolum']

num_feat = ['adcode', 'regMonth', 'regYear', 'popularity', 'carCommentVolum', 'newsReplyVolum']
cate_feat = ['bodyType', 'model', 'province']

for i in cate_feat:
    data[i] = data[i].astype('category')
features = num_feat + cate_feat

In [19]:
data.head(5)

Unnamed: 0,adcode,bodyType,id,model,province,regMonth,regYear,popularity,carCommentVolum,newsReplyVolum,label
0,310000,SUV,0,3c974920a76ac9c1,上海,1,2016,1479.0,11.0,106.0,292.0
1,530000,SUV,0,3c974920a76ac9c1,云南,1,2016,1594.0,11.0,106.0,466.0
2,150000,SUV,0,3c974920a76ac9c1,内蒙古,1,2016,1479.0,11.0,106.0,257.0
3,110000,SUV,0,3c974920a76ac9c1,北京,1,2016,2370.0,11.0,106.0,408.0
4,510000,SUV,0,3c974920a76ac9c1,四川,1,2016,3562.0,11.0,106.0,610.0


In [32]:
data =data.drop(['province'],axis=1)
data.head(5)


Unnamed: 0,adcode,bodyType,id,model,regMonth,regYear,popularity,carCommentVolum,newsReplyVolum,label
0,310000,SUV,0,3c974920a76ac9c1,1,2016,1479.0,11.0,106.0,292.0
1,530000,SUV,0,3c974920a76ac9c1,1,2016,1594.0,11.0,106.0,466.0
2,150000,SUV,0,3c974920a76ac9c1,1,2016,1479.0,11.0,106.0,257.0
3,110000,SUV,0,3c974920a76ac9c1,1,2016,2370.0,11.0,106.0,408.0
4,510000,SUV,0,3c974920a76ac9c1,1,2016,3562.0,11.0,106.0,610.0


In [33]:
#data[predict_label] = 0
test_index = (data['label'].isnull()) | (data['label'] == -1)
train_data = data[~test_index].reset_index(drop=True)
test_data = data[test_index]
train_data.head(5)



        

Unnamed: 0,adcode,bodyType,id,model,regMonth,regYear,popularity,carCommentVolum,newsReplyVolum,label
0,310000,SUV,0,3c974920a76ac9c1,1,2016,1479.0,11.0,106.0,292.0
1,530000,SUV,0,3c974920a76ac9c1,1,2016,1594.0,11.0,106.0,466.0
2,150000,SUV,0,3c974920a76ac9c1,1,2016,1479.0,11.0,106.0,257.0
3,110000,SUV,0,3c974920a76ac9c1,1,2016,2370.0,11.0,106.0,408.0
4,510000,SUV,0,3c974920a76ac9c1,1,2016,3562.0,11.0,106.0,610.0


In [34]:
test_data.shape
test_data.tail(5)

Unnamed: 0,adcode,bodyType,id,model,regMonth,regYear,popularity,carCommentVolum,newsReplyVolum,label
36955,350000,,5364,a9a43d1a7ecbe75d,4,2018,,,,
36956,210000,,5365,a9a43d1a7ecbe75d,4,2018,,,,
36957,500000,,5366,a9a43d1a7ecbe75d,4,2018,,,,
36958,610000,,5367,a9a43d1a7ecbe75d,4,2018,,,,
36959,230000,,5368,a9a43d1a7ecbe75d,4,2018,,,,


In [23]:
test_data.shape

(5280, 11)

In [5]:
def get_predict_w(model, data, label='label', feature=[], cate_feature=[], random_state=2018, n_splits=5,
                  model_type='lgb'):
    if 'sample_weight' not in data.keys():
        data['sample_weight'] = 1
    model.random_state = random_state
    predict_label = 'predict_' + label
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    data[predict_label] = 0
    test_index = (data[label].isnull()) | (data[label] == -1)
    train_data = data[~test_index].reset_index(drop=True)
    test_data = data[test_index]

    for train_idx, val_idx in kfold.split(train_data):
        model.random_state = model.random_state + 1

        train_x = train_data.loc[train_idx][feature]
        train_y = train_data.loc[train_idx][label]

        test_x = train_data.loc[val_idx][feature]
        test_y = train_data.loc[val_idx][label]
        if model_type == 'lgb':
            try:
                model.fit(train_x, train_y, eval_set=[(test_x, test_y)], early_stopping_rounds=100,
                          eval_metric='mae',
                          # callbacks=[lgb.reset_parameter(learning_rate=lambda iter: max(0.005, 0.5 * (0.99 ** iter)))],
                          categorical_feature=cate_feature,
                          sample_weight=train_data.loc[train_idx]['sample_weight'],
                          verbose=100)
            except:
                model.fit(train_x, train_y, eval_set=[(test_x, test_y)], early_stopping_rounds=100,
                          eval_metric='mae',
                          # callbacks=[lgb.reset_parameter(learning_rate=lambda iter: max(0.005, 0.5 * (0.99 ** iter)))],
                          # categorical_feature=cate_feature,
                          sample_weight=train_data.loc[train_idx]['sample_weight'],
                          verbose=100)
        elif model_type == 'ctb':
            model.fit(train_x, train_y, eval_set=[(test_x, test_y)], early_stopping_rounds=100,
                      # eval_metric='mae',
                      # callbacks=[lgb.reset_parameter(learning_rate=lambda iter: max(0.005, 0.5 * (0.99 ** iter)))],
                      cat_features=cate_feature,
                      sample_weight=train_data.loc[train_idx]['sample_weight'],
                      verbose=100)
        train_data.loc[val_idx, predict_label] = model.predict(test_x)
        if len(test_data) != 0:
            test_data[predict_label] = test_data[predict_label] + model.predict(test_data[feature])
    test_data[predict_label] = test_data[predict_label] / n_splits
    #print(mse(train_data[label], train_data[predict_label]))
    print(mse(train_data[label], train_data[predict_label]) * 5, train_data[predict_label].mean(),
          test_data[predict_label].mean())

    return pd.concat([train_data, test_data], sort=True, ignore_index=True), predict_label

In [35]:
parameters = {'num_leaves':[32,64],
              'reg_alpha':[0,0.1,0.5],
              'reg_lambda':[0,0.01,0.1],
              'min_child_samples':[10,20],
              'max_depth': [-1,0,5,15,10,20],
              'learning_rate': [0.01, 0.02, 0.05, 0.1, 0.15],
              'feature_fraction': [0.6, 0.7, 0.8, 0.9, 0.95],
  
}


In [36]:
y = train_data.pop('label').values 
col = train_data.columns   
x = train_data[col].values  # 剩下的列作为训练数据
train_x, valid_x, train_y, valid_y = train_test_split(x, y, test_size=0.333, random_state=0)   # 分训练集和验证集
train = lgb.Dataset(train_x, train_y)
valid = lgb.Dataset(valid_x, valid_y, reference=train)



In [37]:
print(train_x)

[[450000 'Sedan' 0 ... 657.0 220.0 2523.0]
 [110000 'Sedan' 0 ... 2386.0 2834.0 527.0]
 [450000 'Sedan' 0 ... 1253.0 94.0 2615.0]
 ...
 [360000 'Sedan' 0 ... 3166.0 4.0 18988.0]
 [500000 'Sedan' 0 ... 537.0 103.0 3466.0]
 [510000 'MPV' 0 ... 7095.0 0.0 0.0]]


In [38]:
'''
gbm = lgb.LGBMClassifier(boosting_type='gbdt',
                         objective = 'binary',
                         metric = 'auc',
                         verbose = 0,
                         learning_rate = 0.01,
                         num_leaves = 35,
                         feature_fraction=0.8,
                         bagging_fraction= 0.9,
                         bagging_freq= 8,
                         lambda_l1= 0.6,
                         lambda_l2= 0)
'''
lgb_model = lgb.LGBMRegressor(
    num_leaves=32, reg_alpha=0., reg_lambda=0.01, objective='mse', metric='mae',
    max_depth=-1, learning_rate=0.05, min_child_samples=20,
    n_estimators=1000, subsample=0.7, colsample_bytree=0.7, subsample_freq=1,
)
# 有了gridsearch我们便不需要fit函数
gsearch = GridSearchCV(lgb_model, param_grid=parameters, scoring='neg_median_absolute_error', cv=3)
gsearch.fit(train_x, train_y)

print("Best score: %0.3f" % gsearch.best_score_)
print("Best parameters set:")
best_parameters = gsearch.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

ValueError: could not convert string to float: 'Sedan'

In [None]:
#yu
lgb_model = lgb.LGBMRegressor(
    num_leaves=32, reg_alpha=0., reg_lambda=0.01, objective='mse', metric='mae',
    max_depth=-1, learning_rate=0.05, min_child_samples=20,
    n_estimators=1000, subsample=0.7, colsample_bytree=0.7, subsample_freq=1,
)
data, predict_label = get_predict_w(lgb_model, data, label='label',
                                    feature=features, cate_feature=cate_feat,
                                    random_state=2019, n_splits=5)

data['lgb'] = data[predict_label]



In [None]:
#保存结果
data['forecastVolum'] = data['lgb'].apply(lambda x: 0 if x < 0 else x)
data[data.label.isnull()][['id', 'forecastVolum']].round().astype(int).to_csv('ccf_car_sales_lgb.csv', index=False)