In [2]:
import datetime
import gc
import os
import pdb
import pickle
import time

import jieba
import lightgbm as lgb
import numpy as np
import pandas as pd
from gensim.corpora.dictionary import Dictionary
from gensim.models import word2vec
from keras.preprocessing import sequence
from lightgbm import LGBMClassifier
from scipy import stats
from sklearn import metrics
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import train_test_split


# Point jieba at the dictionary file shipped under jieba_dict/.
jieba.set_dictionary('jieba_dict/dict.txt.big')

# Read the stopword file line by line and collect the entries
# (trailing newline removed) into a set.
stopword_set = set()
with open('jieba_dict/stopwords.txt', 'r', encoding='utf-8') as stopwords:
    stopword_set.update(line.strip('\n') for line in stopwords)

# Load the saved word2vec model from disk.
model = word2vec.Word2Vec.load("word2vec2.model")


def create_dictionaries(p_model):
    """Build word->index and word->vector lookup tables from a word2vec model.

    Args:
        p_model: A gensim ``Word2Vec`` model. Only its ``wv`` keyed vectors
            are read.

    Returns:
        tuple: ``(w2indx, w2vec)``. ``w2indx`` maps every vocabulary word to
        an integer index starting at 1; ``w2vec`` maps each of those words to
        its embedding vector.
    """
    keyed_vectors = p_model.wv
    # gensim >= 4 exposes the vocabulary as ``key_to_index``; older releases
    # expose it as ``vocab``. Accept either so the helper works on both.
    vocab = getattr(keyed_vectors, 'key_to_index', None)
    if vocab is None:
        vocab = keyed_vectors.vocab

    gensim_dict = Dictionary()
    gensim_dict.doc2bow(vocab.keys(), allow_update=True)
    # Word indices are numbered from 1.
    w2indx = {word: idx + 1 for idx, word in gensim_dict.items()}
    # Look the vectors up on the model that was passed in; the previous code
    # read the module-level ``model`` and silently ignored ``p_model``.
    w2vec = {word: keyed_vectors[word] for word in w2indx}
    return w2indx, w2vec

def Convert_orderid(x):
    """Return ``x`` as a string with leading/trailing newline characters removed."""
    as_text = str(x)
    return as_text.strip('\n')

# Month abbreviation -> zero-padded month number used by the date helpers.
# Kept private to these helpers so they no longer depend on a global that is
# only defined in a later cell.
_MONTH_NUMBERS = {
    'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04',
    'May': '05', 'Jun': '06', 'Jul': '07', 'Aug': '08',
    'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dec': '12',
}

def _split_date(x):
    """Slice a date string into ``(year, month, day)`` string parts.

    The last two characters are taken as the two-digit year (prefixed with
    ``'20'``), characters ``[-6:-3]`` as the month abbreviation and everything
    before position ``-7`` as the day, e.g. ``'1-Jan-17'``. The separator
    characters themselves are not checked.

    Raises:
        KeyError: If the month abbreviation is not one of the twelve
            English three-letter abbreviations.
    """
    return '20' + x[-2:], _MONTH_NUMBERS[x[-6:-3]], x[:-7]

def Convert_Date(x):
    """Parse a date string such as ``'1-Jan-17'`` into a pandas timestamp.

    Args:
        x: Date string laid out as described in ``_split_date``.

    Returns:
        The value produced by ``pd.to_datetime`` for ``'YYYY-MM-D'``.
    """
    year, month_number, day = _split_date(x)
    return pd.to_datetime(year + '-' + month_number + '-' + day)

def Date2Ticks(x):
    """Convert a date string such as ``'1-Jan-17'`` to seconds since the epoch.

    Args:
        x: Date string laid out as described in ``_split_date``.

    Returns:
        float: ``time.mktime`` of midnight on that date, i.e. the value is
        interpreted in the local time zone of the machine running the code.
    """
    year, month_number, day = _split_date(x)
    parsed = datetime.datetime.strptime(year + '/' + month_number + '/' + day, "%Y/%m/%d")
    return time.mktime(parsed.timetuple())

# Build both lookup tables from the loaded model and persist them, one after
# the other, into a single pickle file. The context manager guarantees the
# file is closed even if one of the dumps raises.
index_dict, word_vectors = create_dictionaries(model)
with open("wordwmbedding.pkl", 'wb') as output:
    pickle.dump(index_dict, output)  # word -> index dictionary
    pickle.dump(word_vectors, output)  # word -> vector dictionary





In [3]:
# Read every input table used by the notebook.
df_order = pd.read_csv("dataset/order.csv")
df_group = pd.read_csv("dataset/group.csv")
df_airline = pd.read_csv("dataset/airline2.csv")
df_day_schedule = pd.read_csv("day_schedule_processed.txt")
df_train = pd.read_csv("training-set.csv")
df_result = pd.read_csv("testing-set.csv")

# Lookup table: three-letter month abbreviation -> two-digit month number.
month = {
    abbreviation: '%02d' % number
    for number, abbreviation in enumerate(
        ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
         'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'],
        start=1,
    )
}

# Group table: derive parsed dates and numeric codes from the raw columns.
df_group['Begin_Date'] = df_group['begin_date'].apply(Convert_Date)
df_group['Begin_Tick'] = df_group['begin_date'].apply(Date2Ticks)
# The integer code is whatever follows the first 14 (sub_line) / 11 (area) characters.
df_group['SubLine'] = df_group['sub_line'].apply(lambda code: int(code[14:]))
df_group['Area'] = df_group['area'].apply(lambda code: int(code[11:]))
# NOTE(review): 'name' is the length of the raw `area` string -- confirm that
# `area` (rather than some name column) is the intended source.
df_group['name'] = df_group['area'].apply(lambda text: len(text))

# Normalise the join keys of every table to newline-stripped strings.
for frame in (df_group, df_airline, df_order, df_day_schedule):
    frame['group_id'] = frame['group_id'].apply(Convert_orderid)
for frame in (df_train, df_result):
    frame['order_id'] = frame['order_id'].apply(Convert_orderid)

# Keep only the first airline row per group.
df_airline = df_airline.drop_duplicates(subset='group_id')


# Attach the airline columns to the selected group columns, then attach the
# result to every order of that group (inner joins on group_id).
group_used_cols = [
    'group_id', 'Begin_Date', 'Begin_Tick', 'days',
    'Area', 'SubLine', 'price', 'name',
]
df_group_0 = df_group[group_used_cols].merge(df_airline, on='group_id')
df_order_1 = df_order.merge(df_group_0, on='group_id')

# Order table: parse the order date and the numeric codes.
df_order_1['Order_Date'] = df_order_1['order_date'].apply(Convert_Date)
df_order_1['Order_Tick'] = df_order_1['order_date'].apply(Date2Ticks)
df_order_1['order_id'] = df_order_1['order_id'].apply(Convert_orderid)
# For these three columns the integer code follows the first 11 characters.
for target_col, raw_col in (('Source_1', 'source_1'),
                            ('Source_2', 'source_2'),
                            ('Unit', 'unit')):
    df_order_1[target_col] = df_order_1[raw_col].apply(lambda code: int(code[11:]))

for date_col in ('Begin_Date', 'Order_Date'):
    df_order_1[date_col] = pd.to_datetime(df_order_1[date_col])

# Calendar features: lead time in days and weekdays of order / departure / return.
begin_weekday = df_order_1['Begin_Date'].dt.dayofweek
df_order_1['PreDays'] = (df_order_1['Begin_Date'] - df_order_1['Order_Date']).dt.days
df_order_1['Begin_Date_Weekday'] = begin_weekday
df_order_1['Order_Date_Weekday'] = df_order_1['Order_Date'].dt.dayofweek
df_order_1['Return_Date_Weekday'] = (begin_weekday + df_order_1['days']) % 7
# Rescale the tick difference (by 10000) and the price (by 1000).
df_order_1['tick_diff'] = (df_order_1['Begin_Tick'] - df_order_1['Order_Tick']) / 10000
df_order_1['price'] = df_order_1['price'] / 1000

# Columns carried forward into the modelling table.
order_used_columns = [
    'order_id', 'group_id', 'tick_diff', 'Source_1', 'Source_2', 'Unit',
    'people_amount', 'days', 'Area', 'SubLine', 'price', 'PreDays',
    'Begin_Date_Weekday', 'Order_Date_Weekday', 'Return_Date_Weekday',
    'fly_h', 'fly_tick',
    "src_airport", "arrive_h", "arrive_tick", "dst_airport", 'name',
]

# Attach the schedule title of each group (inner join on group_id).
schedule_titles = df_day_schedule[['group_id', 'title']]
df_order_2 = df_order_1[order_used_columns].merge(schedule_titles, on='group_id')

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# Report row counts before and after joining the train / test order ids
# with the order feature table (inner join on order_id).
count_report = "Got %d, %d training, testing data"
print(count_report % (len(df_train), len(df_result)))
df_train_1 = df_train.merge(df_order_2, on='order_id')
df_result_1 = df_result.merge(df_order_2, on='order_id')
print(count_report % (len(df_train_1), len(df_result_1)))


Got 297020, 99895 training, testing data
Got 295997, 99752 training, testing data


In [5]:
# Re-create the joined frames so this cell starts from unmodified copies.
df_train_1 = df_train.merge(df_order_2, on='order_id')
df_result_1 = df_result.merge(df_order_2, on='order_id')

# Training side: pull the label, the title text and the order id out of the
# frame, drop the join key, and keep the remaining columns as features.
Y = df_train_1.pop('deal_or_not').values.tolist()
swX_tmp = df_train_1.pop('title').values.tolist()
Xid = df_train_1.pop('order_id').values.tolist()
del df_train_1['group_id']
X = df_train_1.values.tolist()

# Test side: same treatment; the 'deal_or_not' column is discarded.
rid = df_result_1.pop('order_id').values.tolist()
swrx = df_result_1.pop('title').values.tolist()
for unused_col in ('deal_or_not', 'group_id'):
    del df_result_1[unused_col]

rx = df_result_1.values.tolist()


sX, sY, Xid = np.asarray(X), np.asarray(Y), np.asarray(Xid)
rx, rid = np.asarray(rx), np.asarray(rid)

# Every row is kept. A filter on the order id
# (`int(Xid[i]) <= 204000`) used to sit here and is currently disabled.
row_positions = range(len(sY))
X = np.array([sX[row, :] for row in row_positions])
Y = np.array([sY[row] for row in row_positions])
swX = [swX_tmp[row] for row in row_positions]


def text_to_index_array(p_new_dic, p_sen):
    """Convert texts into sequences of vocabulary indices.

    Every item of ``p_sen`` is converted with ``str()`` and walked one
    character at a time; each character is looked up in ``p_new_dic`` and
    characters without an entry are encoded as 0.

    NOTE(review): because ``str(sen)`` is iterated directly, lookups are per
    character, not per segmented word -- confirm this matches the vocabulary
    the index dictionary was built from.

    Args:
        p_new_dic: Mapping from token to integer index.
        p_sen: Iterable of texts (anything accepted by ``str()``).

    Returns:
        numpy.ndarray: A regular 2-D array when all encoded sequences have the
        same length (or when there are none); otherwise a 1-D ``object`` array
        whose elements are the per-text index lists.
    """
    new_sentences = []
    for sen in p_sen:
        new_sen = []
        for word in str(sen):
            try:
                new_sen.append(p_new_dic[word])  # token -> index
            except KeyError:
                # Only a missing key means "unknown token"; any other error
                # is a real problem and is allowed to propagate.
                new_sen.append(0)
        new_sentences.append(new_sen)

    if len({len(seq) for seq in new_sentences}) <= 1:
        return np.array(new_sentences)

    # Sequences of different lengths: build the object array explicitly,
    # because recent NumPy versions refuse to create ragged arrays implicitly.
    ragged = np.empty(len(new_sentences), dtype=object)
    for position, seq in enumerate(new_sentences):
        ragged[position] = seq
    return ragged


# Encode the train / test titles as index sequences and bring every sequence
# to a fixed length of 200.
wX, wrx = (
    sequence.pad_sequences(text_to_index_array(index_dict, titles), maxlen=200)
    for titles in (swX, swrx)
)


# Append the title indices as extra feature columns.
X = np.concatenate((X, wX), axis=1)
rx = np.concatenate((rx, wrx), axis=1)


print(X.shape)

# np.save("data.npy", [X,Y,rx])
# [X,Y,rx] = np.load("data.npy")

(295997, 220)


In [18]:

# Hand-picked LightGBM settings for a DART booster.
params = dict(
    nthread=32,
    boosting_type='dart',
    objective='regression',
    metric='auc',
    learning_rate=0.01,
    num_leaves=100,
    max_depth=12,
    subsample=0.8,
    feature_fraction=0.8,
    min_split_gain=0.09,
    min_child_weight=9.5,
    min_data_in_leaf=80,
    # DART-specific options
    drop_rate=0.5,
    skip_drop=0.5,
    max_drop=4,
    uniform_drop=False,
    xgboost_dart_mode=True,
    drop_seed=5,
)

# Candidate values per hyper-parameter for the parameter search.
parameters = dict(
    max_depth=[5, 6, 7, 8, 9, 10, 12, 15, 20, 30],
    learning_rate=[0.005, 0.01, 0.02, 0.05],
    num_leaves=[50, 70, 90, 110, 150, 200, 250, 300],
    min_child_weight=[0, 2, 5, 10, 15, 20],
    subsample=[0.1, 0.3, 0.5, 0.7, 0.95, 1],
    feature_fraction=[0.1, 0.3, 0.5, 0.7, 0.9, 0.95, 1],
    reg_alpha=[0, 0.25, 0.5, 0.75, 1],
    reg_lambda=[0, 0.2, 0.4, 0.6, 0.8, 1],
    min_data_in_leaf=[5, 10, 30, 50, 80, 100, 120, 140],
    boosting_type=['dart', 'rf', 'goss', 'gbdt'],
)

import random as rn

# Shuffle the row positions, then hold out the first 1000 shuffled rows for
# validation and train on the rest.
len1 = len(Y)
# np.arange replaces the element-by-element fill and the `np.int` alias,
# which recent NumPy releases no longer provide.
tind = np.arange(len1)
# NOTE: the shuffle is not seeded, so the split differs between runs.
rn.shuffle(tind)

train_x, train_y = X[tind[1000:], :], Y[tind[1000:]]
valid_x, valid_y = X[tind[:1000], :], Y[tind[:1000]]

dtrain = lgb.Dataset(train_x, label=train_y)
dval = lgb.Dataset(valid_x, label=valid_y, reference=dtrain)

# Binary target scored with ROC AUC, so the classifier wrapper is used: the
# 'roc_auc' scorer needs class probabilities / decision values.
gbm = LGBMClassifier(
    objective='binary',
    metric='auc',
    min_child_weight=9.5,
    verbose=0,
    uniform_drop=False,
    drop_seed=5,
    learning_rate=0.01,
    nthread=8,
    xgboost_dart_mode=True,
)

# The candidate lists in `parameters` span roughly 77 million combinations,
# so an exhaustive grid search (x3 folds) cannot finish. Sample a fixed number
# of combinations instead. Combinations that LightGBM rejects are scored as
# NaN rather than aborting the whole search.
# The search object fits the estimator itself, so gbm.fit() is not called directly.
gsearch = RandomizedSearchCV(
    gbm,
    param_distributions=parameters,
    n_iter=50,
    scoring='roc_auc',
    cv=3,
    random_state=0,
    error_score=np.nan,
)
gsearch.fit(train_x, train_y)

print("Best score: %0.3f" % gsearch.best_score_)
print("Best parameters set:")
best_parameters = gsearch.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

# Train the final booster with the best parameters found by the search.
# `get_params()` also returns scikit-learn-wrapper settings; `n_estimators` in
# particular is an alias of the boosting-round count and would override
# `num_boost_round`, so the wrapper-only keys are removed first.
train_params = dict(best_parameters)
for wrapper_only_key in ('n_estimators', 'silent', 'importance_type', 'class_weight'):
    train_params.pop(wrapper_only_key, None)

# Early stopping and periodic logging are passed as callbacks, which works on
# LightGBM releases with and without the `early_stopping_rounds` /
# `verbose_eval` keyword arguments. `log_evaluation` is the newer name of
# `print_evaluation`.
log_every = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation
bst = lgb.train(
    train_params,
    dtrain,
    num_boost_round=5000,
    valid_sets=[dval],
    callbacks=[lgb.early_stopping(2500), log_every(500)],
)

# Predict on the hold-out rows and on the test rows, flattened to 1-D.
tmp_valid = bst.predict(valid_x, num_iteration=bst.best_iteration)
res1 = bst.predict(rx, num_iteration=bst.best_iteration)
tmp_valid = np.reshape(tmp_valid, [-1])
res1 = np.reshape(res1, [-1])

print('AUC : %.6f' % (roc_auc_score(valid_y, tmp_valid)))


def nanMapping(x):
    """Return 0 when ``x`` is NaN, otherwise return ``x`` unchanged."""
    return 0 if np.isnan(x) else x

# Build the submission: one row per order id of the test file, with the
# predicted score where available and 0 otherwise.
app_test = pd.read_csv('testing-set.csv', usecols=['order_id'])
# `rid` holds order ids normalised by Convert_orderid (strings); normalise the
# freshly read ids the same way so both merge keys have the same type.
app_test['order_id'] = app_test['order_id'].apply(Convert_orderid)
# The test-set predictions of this cell are in `res1` (the previously
# referenced `sub_preds` is not defined anywhere in this notebook).
predF = pd.DataFrame({"order_id": rid, "deal_or_not": res1})
preds = pd.DataFrame({"order_id": app_test['order_id']})
preds = preds.merge(predF, on="order_id", how="outer")
# Orders without a prediction come out of the merge as NaN; map them to 0.
preds['deal_or_not'] = preds['deal_or_not'].apply(nanMapping)
# create output sub-folder
os.makedirs("output", exist_ok=True)
preds.to_csv("output/LGBM_%f.csv" % (roc_auc_score(valid_y, tmp_valid)), index=False)


KeyboardInterrupt: 

In [11]:
import sklearn

# List the names accepted by the `scoring=` argument. Newer scikit-learn
# releases provide `get_scorer_names()` and no longer expose the `SCORERS`
# dictionary, so use whichever is available.
_list_scorer_names = getattr(sklearn.metrics, 'get_scorer_names', None)
sorted(_list_scorer_names() if _list_scorer_names is not None else sklearn.metrics.SCORERS.keys())

['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'brier_score_loss',
 'completeness_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'mutual_info_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'normalized_mutual_info_score',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',
 'roc_auc',
 'v_measure_score']