# 另一种二分类方式，把之前的one to mul转成 mul to mul的形式

In [1]:
import os
import gc
import ast
import math
import time
import json 
import numpy as np
import pandas as pd
import lightgbm as lgb
import seaborn as sns
from itertools import product
from datetime import datetime
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

from tqdm import tqdm
from sklearn.cluster import KMeans
from sklearn.metrics import f1_score, accuracy_score,recall_score,precision_score
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import MinMaxScaler,StandardScaler,LabelEncoder

In [2]:
city = 'shenzhen'
if city == 'beijing':
    data_path = '../data/base_feat/base_feat_beijing_06-09_16_15.csv'
elif city == 'shanghai':
    data_path = '../data/base_feat/base_feat_shanghai_06-09_23_23.parquet'
elif city =='shenzhen':
    data_path = 'E:/Machine-learning/kdd_cup_2019/phase2/version3_lgb/shenzhen_version3_2019-06-11_16_20_all_features.csv'
data = pd.read_csv(data_path)

In [3]:
data=data.iloc[:4000]

In [4]:
# ==========================对plans进行拆分展开成二分类==================================
print('3 拆分plans构建二分类数据============================================')
data['plans_json'] = data['plans'].fillna('[]').apply(lambda x: json.loads(x))
data = data.drop('plans', axis=1)

plans = np.dstack((np.repeat(data['sid'].values,list(map(len,data['plans_json'].values))),np.concatenate(data['plans_json'].values)))
plans = pd.DataFrame(plans[0],columns=['sid','plans'])
plans = pd.concat([plans, pd.DataFrame.from_dict(plans['plans'].tolist())], axis=1)

plans.drop_duplicates(subset = ['sid','transport_mode'], inplace=True)
# plans['price'] = plans[['price','transport_mode']].apply(fillna_price, axis=1)

print('3.1 构建二分类plans的特征=============================================')
plans['price'] = plans['price'].apply(lambda x:0 if x == '' else int(x))
plans['price_yuan'] = plans['price']//100
plans['distance_eta'] = plans['distance']/plans['eta']

plans['distance_price'] = plans['distance']*plans['price'] 
plans['eta_price'] = plans['eta'] * plans['price']

plans['self_rank'] = plans.groupby(['sid']).cumcount()
plans['price_rank'] = plans.groupby(['sid'])['price'].rank()
plans['eta_rank'] = plans.groupby(['sid'])['eta'].rank()
plans['distance_rank'] = plans.groupby(['sid'])['distance'].rank()
plans['distance_eta_rank'] = plans.groupby(['sid'])['distance_eta'].rank()
plans['distance_price_price'] = plans.groupby(['sid'])['distance_price'].rank()
plans['eta_price_rank'] = plans.groupby(['sid'])['eta_price'].rank()

gc.collect()



56

In [5]:
plans

Unnamed: 0,sid,plans,distance,eta,price,transport_mode,price_yuan,distance_eta,distance_price,eta_price,self_rank,price_rank,eta_rank,distance_rank,distance_eta_rank,distance_price_price,eta_price_rank
0,1972109,"{'distance': 6238, 'price': 300, 'eta': 1713, ...",6238,1713,300,9,3,3.641565,1871400,513900,0,5.5,4.0,7.0,5.0,6.0,5.0
1,1972109,"{'distance': 5864, 'price': '', 'eta': 862, 't...",5864,862,0,3,0,6.802784,0,0,1,2.0,1.0,4.5,7.0,2.0,2.0
2,1972109,"{'distance': 5864, 'price': 2200, 'eta': 1102,...",5864,1102,2200,4,22,5.321234,12900800,2424400,2,7.0,2.0,4.5,6.0,7.0,7.0
3,1972109,"{'distance': 4323, 'price': '', 'eta': 1304, '...",4323,1304,0,6,0,3.315184,0,0,3,2.0,3.0,2.0,4.0,2.0,2.0
4,1972109,"{'distance': 5711, 'price': 200, 'eta': 1883, ...",5711,1883,200,1,2,3.032926,1142200,376600,4,4.0,5.0,3.0,2.0,4.0,4.0
5,1972109,"{'distance': 6237, 'price': 300, 'eta': 1914, ...",6237,1914,300,2,3,3.258621,1871100,574200,5,5.5,6.0,6.0,3.0,5.0,6.0
6,1972109,"{'distance': 4186, 'price': '', 'eta': 3725, '...",4186,3725,0,5,0,1.123758,0,0,6,2.0,7.0,1.0,1.0,2.0,2.0
7,1684471,"{'distance': 527, 'price': '', 'eta': 450, 'tr...",527,450,0,5,0,1.171111,0,0,0,2.0,3.0,1.0,1.0,2.0,2.0
8,1684471,"{'distance': 872, 'price': '', 'eta': 261, 'tr...",872,261,0,6,0,3.340996,0,0,1,2.0,1.0,2.0,2.0,2.0,2.0
9,1684471,"{'distance': 1214, 'price': '', 'eta': 310, 't...",1214,310,0,3,0,3.916129,0,0,2,2.0,2.0,3.0,3.0,2.0,2.0


In [6]:
profiles = pd.read_csv('../data/data_set_phase2/profiles.csv') 
profiles
import gc 
gc.collect()
data  = data.merge(profiles, 'left', ['pid'])

In [7]:
data.columns.values

array(['click_mode', 'd', 'o', 'pid', 'plan_time', 'req_time', 'sid',
       'o_lng', 'o_lat', 'd_lng', 'd_lat', 'city_flag_o',
       'od_manhattan_distance', 'od_manhattan_distance_detail',
       'o_main_centroid_mean_dis', 'd_main_centroid_mean_dis',
       'o_main_centroid_mode_dis', 'd_main_centroid_mode_dis',
       "('o_cluster', 'd_cluster')", 'o_cluster', 'd_cluster',
       'manhattan', 'euclidean', 'delta_longitude', 'delta_latitude',
       'pickup_x', 'pickup_y', 'pickup_z', 'dropoff_x', 'dropoff_y',
       'dropoff_z', 'direction', 'req_time_hour', 'req_time_weekday',
       'req_time_minute', 'req_time_date_d', 'time_diff', 'diff_6_cloc',
       'diff_12_clock', 'diff_18_clock', 'diff_24_clock', 'svd_fea_0',
       'svd_fea_1', 'svd_fea_2', 'svd_fea_3', 'svd_fea_4', 'svd_fea_5',
       'svd_fea_6', 'svd_fea_7', 'svd_fea_8', 'svd_fea_9', 'plans_json',
       'mode_feas_0', 'mode_feas_1', 'mode_feas_2', 'mode_feas_3',
       'mode_feas_4', 'mode_feas_5', 'mode_feas_6', 'm

In [9]:
print('特征加载')
######################################   非特征    ######################################
or_feature  = ['req_time','click_mode','sid']

######################################   原始特征    ######################################
cate_feature = ['pid'] 
profile_feature = ['p' + str(i) for i in range(66)]

origin_num_feature = ['o_lng', 'o_lat', 'd_lng', 'd_lat'] + profile_feature + cate_feature
      
########################################  经纬度特征  ##########################################
od_feature = ['od_manhattan_distance', 'od_manhattan_distance_detail', 'o_main_centroid_mean_dis',
              'd_main_centroid_mean_dis', 'o_main_centroid_mode_dis', 'd_main_centroid_mode_dis', 
              'o_cluster', 'd_cluster']

########################################  距离特征  ##########################################
distance_feature = ['manhattan','euclidean', 'od_manhattan_distance_detail', 'delta_longitude','delta_latitude','pickup_x','pickup_y','pickup_z','dropoff_x','dropoff_y','dropoff_z','direction']

################################  plan_time & req_time特征  ###################################
#所有时间特征
all_time_feature = ['req_time_hour', 'req_time_weekday', 'req_time_minute', 'req_time_date_d', 'time_diff', 
                'diff_6_cloc', 'diff_12_clock', 'diff_18_clock', 'diff_24_clock']
#选择的时间特征
time_feature=['req_time_hour', 'time_diff', 'diff_6_cloc', 'diff_12_clock', 'diff_18_clock', 'diff_24_clock']

################################  对profile的降维提取特征  ###################################
#所有特征
all_profiles_svd_feature = ['svd_fea_{}'.format(i) for i in range(10)]

##############################  plans百度地图推荐交通方式特征  #################################
plan_feature = ['mode_feas_0', 'mode_feas_1', 'mode_feas_2', 'mode_feas_3', 'mode_feas_4', 'mode_feas_5', 
                'mode_feas_6', 'mode_feas_7', 'mode_feas_8', 'mode_feas_9', 'mode_feas_10', 'mode_feas_11', 
               'max_dist', 'min_dist', 'mean_dist', 'std_dist', 'max_price', 'min_price', 'mean_price', 
                'std_price', 'max_eta', 'min_eta', 'mean_eta', 'std_eta', 'max_dist_mode', 'min_dist_mode',
                'max_price_mode', 'min_price_mode', 'max_eta_mode', 'min_eta_mode', 'first_mode',
                'svd_mode_0', 'svd_mode_1', 'svd_mode_2', 'svd_mode_3', 'svd_mode_4', 'svd_mode_5', 
                'svd_mode_6', 'svd_mode_7', 'svd_mode_8', 'svd_mode_9']

##############################  位置点和od对出现次数及排序特征  #################################
#所有特征
all_od_appear_feature = ['o_appear_count','d_appear_count','o_appear_count_rank','d_appear_count_rank',
                         'o_appear_count_rank_buguiyi','d_appear_count_rank_buguiyi',
                         'od_couple','od_couple_count','od_couple_rank','od_couple_rank_buguiyi']
#选择特征
od_appear_feature = ['o_appear_count','d_appear_count','o_appear_count_rank','d_appear_count_rank',
                     'o_appear_count_rank_buguiyi','d_appear_count_rank_buguiyi','od_couple_count']

##########################################  协同特征  #############################################
xietong_data = pd.read_csv('./xietong_divide_shenzhen.csv', index_col=False)

xietong_feature = xietong_data.columns.to_list()

binary_feat = [feat for feat in plans.columns.tolist() if feat not in ['sid','plans',]]
print('特征加载完成')
#保存选择的特征
Select_features = or_feature + origin_num_feature + od_appear_feature + plan_feature + od_feature +\
                  distance_feature + all_time_feature +  all_profiles_svd_feature + plan_feature +\
                  od_appear_feature + xietong_feature
#使用的特征

feature = [col for col in Select_features if col not in ['req_time','click_mode','sid', 'pid','dewPoint','humidity','humidity','windBearing','windSpeed']]

特征加载
特征加载完成


In [None]:
data_binary = plans.merge(data[Select_features], on='sid', how='left')
# data_binary['dist_diff'] = data_binary['distance'] - data_binary['od_manhattan_distance_detail']
data_binary['label'] = data_binary[['transport_mode','click_mode']].apply(lambda row:1 if row['transport_mode']==row['click_mode'] else 0,axis=1)
gc.collect()
feature += binary_feat

In [6]:
def split(data):
    train_index = (data.req_time < '2018-11-23') 
    train_x     = data[train_index][feature].reset_index(drop=True)
    train_y     = data[train_index].label.reset_index(drop=True)

    valid_index = (data.req_time > '2018-11-23') & (data.req_time < '2018-12-01')
    valid_x     = data[valid_index][feature].reset_index(drop=True)
    valid_y     = data[valid_index].label.reset_index(drop=True)

    test_index = (data.req_time > '2018-12-01')
    test_x     = data[test_index][feature].reset_index(drop=True)
    gc.collect()
    return train_x, train_y, valid_x, valid_y

train_x, train_y, valid_x, valid_y = split(data_binary)

In [7]:
lgb_binary_para = {'boosting_type': 'gbdt',
                 'objective': 'binary',
                 'num_leaves': 31,
                 'learning_rate': 0.05,
                 'feature_fraction': 0.9,
                 'bagging_fraction': 0.9,
                 'bagging_seed': 0,
                 'bagging_freq': 1,
                 'verbose': 1,
                 'seed': 42,
                 'reg_alpha':7.5,
                 'reg_lambda':2}

lgb_model = lgb.LGBMClassifier(**lgb_binary_para, n_estimators=4000, n_jobs=8)
eval_set = [(train_x, train_y),(valid_x, valid_y)]
lgb_model.fit(train_x, train_y, eval_set=eval_set, eval_metric=['auc'], verbose=10, early_stopping_rounds=100)
SCORE = lgb_model.best_score_['valid_1']['auc']

Training until validation scores don't improve for 100 rounds.
[10]	training's binary_logloss: 0.375388	training's auc: 0.903256	valid_1's binary_logloss: 0.372745	valid_1's auc: 0.911127
[20]	training's binary_logloss: 0.325655	training's auc: 0.906787	valid_1's binary_logloss: 0.320315	valid_1's auc: 0.914384
[30]	training's binary_logloss: 0.303032	training's auc: 0.908765	valid_1's binary_logloss: 0.296136	valid_1's auc: 0.916137
[40]	training's binary_logloss: 0.292174	training's auc: 0.90963	valid_1's binary_logloss: 0.284261	valid_1's auc: 0.916987
[50]	training's binary_logloss: 0.286569	training's auc: 0.91078	valid_1's binary_logloss: 0.278024	valid_1's auc: 0.918111
[60]	training's binary_logloss: 0.283469	training's auc: 0.911635	valid_1's binary_logloss: 0.274567	valid_1's auc: 0.918932
[70]	training's binary_logloss: 0.281445	training's auc: 0.912432	valid_1's binary_logloss: 0.272292	valid_1's auc: 0.919678
[80]	training's binary_logloss: 0.280126	training's auc: 0.91312

[660]	training's binary_logloss: 0.267969	training's auc: 0.921453	valid_1's binary_logloss: 0.262607	valid_1's auc: 0.925145
[670]	training's binary_logloss: 0.267879	training's auc: 0.921519	valid_1's binary_logloss: 0.262578	valid_1's auc: 0.925162
[680]	training's binary_logloss: 0.267803	training's auc: 0.921577	valid_1's binary_logloss: 0.262558	valid_1's auc: 0.925173
[690]	training's binary_logloss: 0.267723	training's auc: 0.921638	valid_1's binary_logloss: 0.262535	valid_1's auc: 0.925189
[700]	training's binary_logloss: 0.267649	training's auc: 0.921695	valid_1's binary_logloss: 0.262522	valid_1's auc: 0.925196
[710]	training's binary_logloss: 0.267579	training's auc: 0.921745	valid_1's binary_logloss: 0.262513	valid_1's auc: 0.925203
[720]	training's binary_logloss: 0.267504	training's auc: 0.921801	valid_1's binary_logloss: 0.262493	valid_1's auc: 0.925221
[730]	training's binary_logloss: 0.267424	training's auc: 0.921859	valid_1's binary_logloss: 0.262476	valid_1's auc: 0

[1310]	training's binary_logloss: 0.263472	training's auc: 0.924886	valid_1's binary_logloss: 0.26182	valid_1's auc: 0.925674
[1320]	training's binary_logloss: 0.263409	training's auc: 0.924935	valid_1's binary_logloss: 0.261811	valid_1's auc: 0.925681
[1330]	training's binary_logloss: 0.263347	training's auc: 0.924978	valid_1's binary_logloss: 0.261796	valid_1's auc: 0.925684
[1340]	training's binary_logloss: 0.263281	training's auc: 0.925029	valid_1's binary_logloss: 0.261793	valid_1's auc: 0.925685
[1350]	training's binary_logloss: 0.263207	training's auc: 0.925082	valid_1's binary_logloss: 0.26179	valid_1's auc: 0.92569
[1360]	training's binary_logloss: 0.263154	training's auc: 0.925121	valid_1's binary_logloss: 0.261784	valid_1's auc: 0.925693
[1370]	training's binary_logloss: 0.263091	training's auc: 0.925172	valid_1's binary_logloss: 0.261774	valid_1's auc: 0.9257
[1380]	training's binary_logloss: 0.263026	training's auc: 0.92522	valid_1's binary_logloss: 0.261772	valid_1's auc:

[1960]	training's binary_logloss: 0.259685	training's auc: 0.927703	valid_1's binary_logloss: 0.261549	valid_1's auc: 0.925819
[1970]	training's binary_logloss: 0.259629	training's auc: 0.927745	valid_1's binary_logloss: 0.26154	valid_1's auc: 0.925827
[1980]	training's binary_logloss: 0.259566	training's auc: 0.927788	valid_1's binary_logloss: 0.261536	valid_1's auc: 0.925831
[1990]	training's binary_logloss: 0.259511	training's auc: 0.927831	valid_1's binary_logloss: 0.261533	valid_1's auc: 0.925831
[2000]	training's binary_logloss: 0.259449	training's auc: 0.927881	valid_1's binary_logloss: 0.26153	valid_1's auc: 0.925831
Did not meet early stopping. Best iteration is:
[2000]	training's binary_logloss: 0.259449	training's auc: 0.927881	valid_1's binary_logloss: 0.26153	valid_1's auc: 0.925831


In [10]:
def get_result(result, data_X):
    result['proba'] = lgb_model.predict_proba(data_X)[:,1]
    result = (result.sort_values(GR_COLS + ["proba"], ascending=[True, True, True, False]))
    result = result.groupby(GR_COLS)['impresssions'].first().rename('recommend_mode')
    return result

valid_binary_df = pd.DataFrame()
valid_index = (data_binary.req_time > '2018-11-23') & (data_binary.req_time < '2018-12-01')
valid_binary_df['sid'] = data_binary[valid_index]['sid'].values
valid_binary_df['click_mode'] = data_binary[valid_index]['click_mode'].values
valid_binary_df['transport_mode'] = data_binary[valid_index]['transport_mode'].values
valid_binary_df['proba'] = lgb_model.predict_proba(valid_X)[:,1]

valid_binary_df = (valid_binary_df.sort_values(["sid","proba"], ascending=[True,False]))
valid_binary_df = valid_binary_df.groupby(['sid'])[['click_mode','transport_mode']].first()
f1_score(y_true=valid_binary_df['click_mode'], y_pred=valid_binary_df['transport_mode'], average='weighted')

valid_binary_df['proba'] = lgb_model.predict_proba(valid_x)[:,1]

valid_binary_df = (valid_binary_df.sort_values(["sid","proba"], ascending=[True,False]))
valid_binary_df = valid_binary_df.groupby(['sid'])[['click_mode','transport_mode']].first()
f1_score(y_true=valid_binary_df['click_mode'], y_pred=valid_binary_df['transport_mode'], average='weighted')

0.6774892959919135

In [None]:

valid_proba_df['click_mode'] = valid_y
dic_ = valid_proba_df['click_mode'].value_counts(normalize = True)
valid_proba_df['recommend_mode'] = proba_process(valid_proba)
get_weighted_fscore(y_true = valid_proba_df['click_mode'] , y_pred = valid_proba_df['recommend_mode'])