In [32]:
import gc
import json
import math
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score,accuracy_score,recall_score,precision_score
from time import gmtime, strftime
import lightgbm as lgb
import warnings
warnings.filterwarnings("ignore")

In [2]:
def read_profile_data():
    profile_data = pd.read_csv(dir_path + 'profiles.csv')
    profile_na = np.zeros(67)-1
    profile_na = pd.DataFrame(profile_na.reshape(1, -1))
    profile_na.columns = profile_data.columns
    profile_data = profile_data.append(profile_na)
    return profile_data


def merge_raw_data():
    te_plans = pd.read_csv(dir_path + 'test_plans.csv')
    te_queries = pd.read_csv(dir_path + 'test_queries.csv')

    tr_click = pd.read_csv(dir_path + 'train_clicks.csv')
    tr_plans = pd.read_csv(dir_path + 'train_plans.csv')
    tr_queries = pd.read_csv(dir_path + 'train_queries.csv')

    tr_data = tr_queries.merge(tr_click, on='sid', how='left')
    tr_data = tr_data.merge(tr_plans, on='sid', how='left')
    tr_data = tr_data.drop(['click_time'], axis=1)
    tr_data['click_mode'] = tr_data['click_mode'].fillna(0)

    te_data = te_queries.merge(te_plans, on='sid', how='left')
    te_data['click_mode'] = -1

    data = pd.concat([tr_data, te_data], axis=0)
#     data = data.drop(['plan_time'], axis=1)
    data = data.reset_index(drop=True)
    
    print('total data size: {}'.format(data.shape))
    print('raw data columns: {}'.format(', '.join(data.columns)))
    return data


def gen_od_feas(data):
    def GetDistance(lng1, lat1, lng2, lat2):
        EARTH_RADIUS = 6378.137

        lng1 = lng1*math.pi / 180.0
        lng2 = lng2*math.pi / 180.0
        lat1 = lat1*math.pi / 180.0
        lat2 = lat2*math.pi / 180.0

        dis1 = lat1-lat2
        dis2 = lng1-lng2

        s = 2*math.asin( ((math.sin(dis1/2))**2 + math.cos(lat1)*math.cos(lat2)*(math.sin(dis2/2))**2)**0.5 )
        s = s * EARTH_RADIUS  * 1000
        return s
    
#     subway = pd.read_csv('../data/other_data/beijing_subway.csv')
#     bus_station = pd.read_csv('../data/other_data/beijing_bus_station.csv')
    
    data['o1'] = data['o'].apply(lambda x: float(x.split(',')[0]))
    data['o2'] = data['o'].apply(lambda x: float(x.split(',')[1]))
    data['d1'] = data['d'].apply(lambda x: float(x.split(',')[0]))
    data['d2'] = data['d'].apply(lambda x: float(x.split(',')[1]))
    data['dist'] = data.apply(lambda line:((line['o1']-line['d1'])**2 + (line['o2']-line['d2'])**2)**(0.5),axis=1)
    data['real_dis'] = data.apply(lambda row:GetDistance(row['o1'], row['o2'], row['d1'], row['d2']), axis=1)
    data['real_dis_60000'] = data['real_dis'].apply(lambda x:1 if x>=60000 else 0)
    data['real_dis_10000'] = data['real_dis'].apply(lambda x:1 if x<=10000 else 0)
    data['real_dis_7500'] = data['real_dis'].apply(lambda x:1 if x<=7500 else 0)
    
    data['real_dis_2500'] = data['real_dis'].apply(lambda x:1 if x<=2500 else 0)
    data['od_manhattan_distance'] = abs(data['o1']-data['d1'])+abs(data['o2']-data['d2'])
    
#     data['o_nearest_sub'] = data.apply(lambda row:(abs(subway['station_longitude']-row['o1'])
#                                       +abs(subway['station_latitude']-row['o2'])).min(), axis=1)

#     data['d_nearest_sub'] = data.apply(lambda row:(abs(subway['station_longitude']-row['d1'])
#                                       +abs(subway['station_latitude']-row['d2'])).min(), axis=1)
    
#     data['d_nearest_bus'] = data.apply(lambda row:(abs(bus_station['lng']-row['o1'])
#                                       +abs(bus_station['lat']-row['o2'])).min(), axis=1)
#     data['d_nearest_bus'] = data.apply(lambda row:(abs(bus_station['lng']-row['d1'])
#                                       +abs(bus_station['lat']-row['d2'])).min(), axis=1)

    data = data.drop(['o', 'd'], axis=1)
    return data


def gen_profile_feas(data):
    profile_data = read_profile_data()
    p_feat = ['p'+str(num) for num in range(66)]
    profile_data[p_feat] = profile_data[p_feat].astype(int).astype(str)
#     data['pid'] = data['pid'].fillna(-1)
    profile_data = pd.get_dummies(profile_data)
    data = data.merge(profile_data, on='pid', how='left')
    
#     x = profile_data.drop(['pid'], axis=1).values
#     svd = TruncatedSVD(n_components=20, n_iter=20, random_state=2019)
#     svd_x = svd.fit_transform(x)
#     svd_feas = pd.DataFrame(svd_x)
#     svd_feas.columns = ['svd_fea_{}'.format(i) for i in range(20)]
#     svd_feas['pid'] = profile_data['pid'].values
#     data = data.merge(svd_feas, on='pid', how='left')
    
    return data


def gen_time_feas(data):
    data['req_time'] = pd.to_datetime(data['req_time'])
    data['plan_time'] = pd.to_datetime(data['plan_time'])
    data['time_diff'] = data['plan_time'].astype(int) - data['req_time'].astype(int)
    
    data['weekday'] = data['req_time'].dt.dayofweek
    data['IsWeek'] = data['weekday'].apply(lambda x:1 if x > 5 else 0).astype(str)
    data['hour'] = data['req_time'].dt.hour
    data['minute'] = data['req_time'].dt.minute
    
    data['hour_3_23'] = data['hour'].apply(lambda x:1 if x>=3 & x <= 23 else 0)
    data['hour_5_23'] = data['hour'].apply(lambda x:1 if x>=5 & x <= 23 else 0)
    data['hour_minute'] = data['hour']*60 + data['minute']
    
#     data['weekday'] = data['req_time'].astype(str)
#     data['hour'] = data['hour'].astype(str)
#     data = pd.concat([data,pd.get_dummies(data[['IsWeek','weekday','hour']])], axis=1)
    data = pd.concat([data,pd.get_dummies(data[['IsWeek']])], axis=1)
    data = data.drop(['plan_time'], axis=1)
    
    return data


def gen_plan_feas(data):
    def num_plans(plan):
        try:
            cur_plan_list = json.loads(plan)
        except:
            cur_plan_list = []
        return len(cur_plan_list)

    n = data.shape[0]
    mode_list_feas = np.zeros((n, 12))
    max_dist, min_dist, mean_dist, std_dist = \
        np.zeros((n,)), np.zeros((n,)), np.zeros((n,)), np.zeros((n,))

    max_price, min_price, mean_price, std_price = \
        np.zeros((n,)), np.zeros((n,)), np.zeros((n,)), np.zeros((n,))

    max_eta, min_eta, mean_eta, std_eta = \
        np.zeros((n,)), np.zeros((n,)), np.zeros((n,)), np.zeros((n,))
    
    max_price_eta, min_price_eta, mean_price_eta, std_price_eta = \
        np.zeros((n,)), np.zeros((n,)), np.zeros((n,)), np.zeros((n,))
    
    min_dist_mode, max_dist_mode, min_price_mode, max_price_mode = \
        np.zeros((n,)), np.zeros((n,)), np.zeros((n,)), np.zeros((n,))
        
    min_eta_mode, max_eta_mode, min_price_eta_mode, max_price_eta_mode, first_mode = \
        np.zeros((n,)),np.zeros((n,)), np.zeros((n,)), np.zeros((n,)), np.zeros((n,))
    
    mode_texts = []

    for i, plan in tqdm(enumerate(data['plans'].values)):
        try:
            cur_plan_list = json.loads(plan)
        except:
            cur_plan_list = []

        if len(cur_plan_list) == 0:
            mode_list_feas[i, 0] = 1
            first_mode[i] = 0

            max_dist[i] = -1
            min_dist[i] = -1
            mean_dist[i] = -1
            std_dist[i] = -1

            max_price[i] = -1
            min_price[i] = -1
            mean_price[i] = -1
            std_price[i] = -1

            max_eta[i] = -1
            min_eta[i] = -1
            mean_eta[i] = -1
            std_eta[i] = -1

            min_dist_mode[i] = -1
            max_dist_mode[i] = -1
            min_price_mode[i] = -1
            max_price_mode[i] = -1
            min_eta_mode[i] = -1
            max_eta_mode[i] = -1

            mode_texts.append('word_null')

        else:
            distance_list = []
            price_list = []
            eta_list = []
            price_eta_list = []
            mode_list = []
            
            for tmp_dit in cur_plan_list:
                distance_list.append(int(tmp_dit['distance']))
                if tmp_dit['price'] == '':
                    if tmp_dit['transport_mode'] == 3:
                        temp_price = 10000
                    else:
                        temp_price = 0
                else:
                    temp_price = int(tmp_dit['price'])
                    
                price_list.append(temp_price)
                price_eta_list.append(int(tmp_dit['eta']) * int(temp_price))
                eta_list.append(int(tmp_dit['eta']))
                mode_list.append(int(tmp_dit['transport_mode']))

            mode_texts.append(' '.join(['word_{}'.format(mode) for mode in mode_list]))
                        
            # 保存成array
            distance_list = np.array(distance_list)
            price_list = np.array(price_list)
            eta_list = np.array(eta_list)
            price_eta_list = np.array(price_eta_list)
            mode_list = np.array(mode_list, dtype='int')
            
            mode_list_feas[i, mode_list] = 1
            
            # 对array进行排序然后得到其index
            distance_sort_idx = np.argsort(distance_list)
            price_sort_idx = np.argsort(price_list)
            eta_sort_idx = np.argsort(eta_list)
            price_eta_sort_idx = np.argsort(price_eta_list)
            
            # 求dist的最大、小、均值，标准差
            max_dist[i] = distance_list[distance_sort_idx[-1]]
            min_dist[i] = distance_list[distance_sort_idx[0]]
            mean_dist[i] = np.mean(distance_list)
            std_dist[i] = np.std(distance_list)
                
            # 求price的最大、小、均值，标准差
            max_price[i] = price_list[price_sort_idx[-1]]
            min_price[i] = price_list[price_sort_idx[0]]
            mean_price[i] = np.mean(price_list)
            std_price[i] = np.std(price_list)
            
            # 求eta的最大、小、均值，标准差
            max_eta[i] = eta_list[eta_sort_idx[-1]]
            min_eta[i] = eta_list[eta_sort_idx[0]]
            mean_eta[i] = np.mean(eta_list)
            std_eta[i] = np.std(eta_list)
            
            # 求price*eta的最大、小、均值，标准差
            max_price_eta[i] = price_eta_list[price_eta_sort_idx[-1]]
            min_price_eta[i] = price_eta_list[price_eta_sort_idx[0]]
            mean_price_eta[i] = np.mean(price_eta_list)
            std_price_eta[i] = np.std(price_eta_list)
            
            # 求第一个mode
            first_mode[i] = mode_list[0]
            
            # 求dist最大最小时的mode
            max_dist_mode[i] = mode_list[distance_sort_idx[-1]]
            min_dist_mode[i] = mode_list[distance_sort_idx[0]]
            
            # 求price最大最小时的mode
            max_price_mode[i] = mode_list[price_sort_idx[-1]]
            min_price_mode[i] = mode_list[price_sort_idx[0]]
            
            # 求eta最大最小时的mode
            max_eta_mode[i] = mode_list[eta_sort_idx[-1]]
            min_eta_mode[i] = mode_list[eta_sort_idx[0]]
            
            # 求price×eta最大最小时的mode
            max_price_eta_mode[i] = mode_list[price_eta_sort_idx[-1]]
            min_price_eta_mode[i] = mode_list[price_eta_sort_idx[0]]

    feature_data = pd.DataFrame(mode_list_feas)
    feature_data.columns = ['mode_feas_{}'.format(i) for i in range(12)]
    feature_data['max_dist'] = max_dist
    feature_data['min_dist'] = min_dist
    feature_data['mean_dist'] = mean_dist
    feature_data['std_dist'] = std_dist

    feature_data['max_price'] = max_price
    feature_data['min_price'] = min_price
    feature_data['mean_price'] = mean_price
    feature_data['std_price'] = std_price

    feature_data['max_eta'] = max_eta
    feature_data['min_eta'] = min_eta
    feature_data['mean_eta'] = mean_eta
    feature_data['std_eta'] = std_eta
    
    feature_data['max_price_eta'] = max_price_eta
    feature_data['min_price_eta'] = min_price_eta
    feature_data['mean_price_eta'] = mean_price_eta
    feature_data['std_price_eta'] = std_price_eta

    feature_data['max_dist_mode'] = max_dist_mode
    feature_data['min_dist_mode'] = min_dist_mode
    
    feature_data['max_price_mode'] = max_price_mode
    feature_data['min_price_mode'] = min_price_mode
    
    feature_data['max_eta_mode'] = max_eta_mode
    feature_data['min_eta_mode'] = min_eta_mode
    
    feature_data['max_price_eta_mode'] = max_price_eta_mode
    feature_data['min_price_eta_mode'] = min_price_eta_mode
    feature_data['first_mode'] = first_mode

    print('mode tfidf...')
    tfidf_enc = TfidfVectorizer(ngram_range=(1, 2))
    tfidf_vec = tfidf_enc.fit_transform(mode_texts)
    svd_enc = TruncatedSVD(n_components=10, n_iter=20, random_state=2019)
    mode_svd = svd_enc.fit_transform(tfidf_vec)
    mode_svd = pd.DataFrame(mode_svd)
    mode_svd.columns = ['svd_mode_{}'.format(i) for i in range(10)]
    data = pd.concat([data, feature_data, mode_svd], axis=1)

    data['nums_plans'] = data['plans'].apply(num_plans)
    data = data.drop(['plans'], axis=1)
    return data

In [3]:
dir_path = '../data/data_set_phase1/'
data = merge_raw_data()
data = gen_od_feas(data)
data = gen_time_feas(data)

data = gen_plan_feas(data)
data = gen_profile_feas(data)
bus_subway = pd.read_csv('../data/other_data/feature_subway_rela.csv')
data = data.merge(bus_subway, on='sid',how='left')

total data size: (594358, 8)
raw data columns: click_mode, d, o, pid, plan_time, plans, req_time, sid


594358it [02:10, 4545.40it/s]


mode tfidf...


In [None]:
data['']

In [5]:
pid_feat = ['p'+str(num) for num in range(66)]

time_feature_cate = ['day','weekday','IsWeek','hour','hour_3_23','hour_5_23']
time_feature_num = ['minute','time_diff','hour_minute']

od_feat_num = ['o1','o2','d1','d2','dist','real_dis','od_manhattan_distance']
od_feat_cate = ['o','d','real_dis_60000','real_dis_10000','real_dis_7500']

plan_feat_cate = ['min_dist_mode', 'max_dist_mode', 'min_price_mode', 'max_price_mode',
                  'min_eta_mode', 'max_eta_mode', 'first_mode'] + ['mode_feas_{}'.format(i) for i in range(12)]

plan_feat_num = ['max_dist', 'min_dist', 'mean_dist', 'std_dist',
                 'max_price', 'min_price', 'mean_price', 'std_price',
                 'max_eta', 'min_eta', 'mean_eta', 'std_eta','nums_plans','nums_plans']

plan_feat_svd = ['svd_mode_{}'.format(i) for i in range(10)]

num_feat = time_feature_num + od_feat_num + plan_feat_num
cate_feat = time_feature_cate + plan_feat_cate + ['pid'] + ['sid']

feature = [col for col in data.columns if col not in ['click_mode','req_time','sid','IsWeek']]

# minmax-scaler
scaler = MinMaxScaler()
data[num_feat] = scaler.fit_transform(data[num_feat])

# Z-scaler
# scaler = StandardScaler()
# data[num_feat] = scaler.fit_transform(data[num_feat])

# from sklearn.preprocessing import OneHotEncoder
# ohEncoder = OneHotEncoder()
# ohEncoder.fit_transform(data[['hour','IsWeek']])

In [6]:
%%time
train_index = (data.req_time < '2018-11-23')
train_x     = data[train_index][feature].reset_index(drop=True)
train_y     = data[train_index].click_mode.reset_index(drop=True)

valid_index = (data.req_time > '2018-11-23') & (data.req_time < '2018-12-01')
valid_x     = data[valid_index][feature].reset_index(drop=True)
valid_y     = data[valid_index].click_mode.reset_index(drop=True)

output = pd.DataFrame()
output['sid']   = data[valid_index]['sid']
output['label'] = valid_y.values
dic_ = output['label'].value_counts(normalize = True)

test_index = (data.req_time > '2018-12-01')
test_x     = data[test_index][feature].reset_index(drop=True)
gc.collect()

CPU times: user 1.48 s, sys: 2.22 s, total: 3.7 s
Wall time: 3.7 s


In [1]:
def f1_weighted(labels,preds):
    print(preds)
    preds = np.argmax(preds, axis=0)
    score = f1_score(y_true=labels, y_pred=preds)
    return 'f1_weighted', score, True


def lgb_train_binary(train_x, train_y, valid_x, valid_y,f_score):

    lgb_model = lgb.LGBMClassifier(boosting_type="gbdt", num_leaves=120, reg_alpha=0, reg_lambda=0,
                                   max_depth=-1, n_estimators=2000, objective='binary',
                                   learning_rate=0.03, random_state=123, metric="binary_logloss",n_jobs=-1,
                                   subsample=0.8, colsample_bytree=0.8, subsample_freq=1, min_child_samples = 50,)

    eval_set = [(train_x, train_y),(valid_x, valid_y)]
    lgb_model.fit(train_x, train_y, eval_set=eval_set, eval_metric='logloss', 
                  verbose=50, early_stopping_rounds=200)
    
    valid_prob_y = lgb_model.predict_proba(valid_x)[:,1]
    
    output[str(mode)] = valid_prob_y
    
    best_f1_ = 0
    best_thresh = 0
    
    for thresh in range(3,9):
        thresh *= 0.1
        valid_pred_y = valid_prob_y >= thresh
        f1_ = f1_score(y_true=valid_y, y_pred=valid_pred_y)
        if f1_ > best_f1_:
            best_f1_ = f1_
            best_thresh = thresh
            
    valid_pred_y = valid_prob_y >= best_thresh
    weighted_f1 = dic_[mode] * f1_
    f_score += weighted_f1
    precision_ = precision_score(y_true=valid_y, y_pred=valid_pred_y)
    recall_ = recall_score(y_true=valid_y, y_pred=valid_pred_y)

    print(f'mode:{mode},  F1:{best_f1_:.6f}, weighted_f1:{weighted_f1:.6f}, ratio:{dic_[mode]:.6f}, Precision:{precision_:.6f}, Recall:{recall_:.6f}')

In [3]:
f_score = 0 
for mode in range(12):
    temp_train_y = train_y == mode
    temp_valid_y = valid_y == mode
    lgb_train_binary(train_x, temp_train_y, valid_x, temp_valid_y,f_score)

NameError: name 'train_y' is not defined

In [None]:
mode:0,  F1:0.347218, weighted_f1:0.030352, ratio:0.087414, Precision:0.899623, Recall:0.215124
mode:1,  F1:0.685645, weighted_f1:0.099156, ratio:0.144617, Precision:0.615552, Recall:0.773754
mode:2,  F1:0.901384, weighted_f1:0.282426, ratio:0.313324, Precision:0.850406, Recall:0.958864
mode:3,  F1:0.103310, weighted_f1:0.004607, ratio:0.044598, Precision:0.411330, Recall:0.059073
mode:4,  F1:0.015228, weighted_f1:0.000372, ratio:0.024453, Precision:0.461538, Recall:0.007742
mode:5,  F1:0.841895, weighted_f1:0.082147, ratio:0.097574, Precision:0.774548, Recall:0.922070
mode:6,  F1:0.174896, weighted_f1:0.003479, ratio:0.019893, Precision:0.350000, Recall:0.116574
mode:7,  F1:0.789026, weighted_f1:0.140384, ratio:0.177920, Precision:0.704656, Recall:0.896347
mode:8,  F1:0.263048, weighted_f1:0.001199, ratio:0.004559, Precision:0.331579, Recall:0.217993
mode:9,  F1:0.517820, weighted_f1:0.025855, ratio:0.049931, Precision:0.579132, Recall:0.468246
mode:10,  F1:0.556323, weighted_f1:0.015842, ratio:0.028475, Precision:0.487886, Recall:0.647091
mode:11,  F1:0.468368, weighted_f1:0.003392, ratio:0.007241, Precision:0.477376, Recall:0.459695
0.6892103959970385