In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import mean_absolute_error as MAE
from scipy import stats
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn import metrics
import numpy as np
from gensim.models import word2vec
from gensim.corpora.dictionary import Dictionary
import pickle
from keras.preprocessing import sequence

import time, datetime
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score, roc_curve
import gc
from sklearn.model_selection import GridSearchCV
import jieba, pdb
from sklearn.decomposition import PCA


jieba.set_dictionary('jieba_dict/dict.txt.big')
# load stopwords set
stopword_set = set()
with open('jieba_dict/stopwords.txt','r', encoding='utf-8') as stopwords:
    for stopword in stopwords:
        stopword_set.add(stopword.strip('\n'))

model = word2vec.Word2Vec.load("word2vec2.model")


def create_dictionaries(p_model):
    gensim_dict = Dictionary()
    gensim_dict.doc2bow(p_model.wv.vocab.keys(), allow_update=True)
    w2indx = {v: k + 1 for k, v in gensim_dict.items()}  # 词语的索引，从1开始编号
    w2vec = {word: model[word] for word in w2indx.keys()}  # 词语的词向量
    return w2indx, w2vec

def Convert_orderid(x):
    return str(x).strip('\n')

def Convert_Date(x):
    Year='20'+x[-2:]
    Month=month[x[-6:-3]]
    Day=x[:-7]
    date1 = pd.to_datetime(Year+'-'+Month+'-'+Day)
    return date1

def Date2Ticks(x):
    Year='20'+x[-2:]
    Month=month[x[-6:-3]]
    Day=x[:-7]
    date1 = str(Year+'/'+Month+'/'+Day)
    return time.mktime(datetime.datetime.strptime(date1, "%Y/%m/%d").timetuple())

index_dict, word_vectors= create_dictionaries(model)
output = open("wordwmbedding.pkl", 'wb')
pickle.dump(index_dict, output)  # 索引字典
pickle.dump(word_vectors, output)  # 词向量字典
output.close()





In [4]:
# load dataset
df_order = pd.read_csv("dataset/order.csv")
df_group = pd.read_csv("dataset/group.csv")
df_airline = pd.read_csv("dataset/airline2.csv")
df_day_schedule = pd.read_csv("day_schedule_processed.txt")
df_train = pd.read_csv("training-set.csv")
df_result = pd.read_csv("testing-set.csv")
# date Conversion

month = {'Jan': '01', 'Feb': '02' , 'Mar':'03' ,'Apr': '04', 
'May': '05', 'Jun': '06' , 'Jul': '07' , 'Aug':'08', 
'Sep':'09', 'Oct':'10' , 'Nov':'11', 'Dec':'12' }

# group data
df_group['Begin_Date']=df_group.begin_date.apply(lambda x: Convert_Date(x))
df_group['Begin_Tick']=df_group.begin_date.apply(lambda x: Date2Ticks(x))
df_group['SubLine']= df_group.sub_line.apply(lambda x: int(x[14:]))
df_group['Area']= df_group.area.apply(lambda x: int(x[11:]))
df_group['name']= df_group.area.apply(lambda x: len(x))
df_group['group_id']=df_group.group_id.apply(lambda x: Convert_orderid(x))
df_airline['group_id']=df_airline.group_id.apply(lambda x: Convert_orderid(x))
df_order['group_id']=df_order.group_id.apply(lambda x: Convert_orderid(x))
df_day_schedule['group_id']=df_day_schedule.group_id.apply(lambda x: Convert_orderid(x))
df_train['order_id']=df_train.order_id.apply(lambda x: Convert_orderid(x))
df_result['order_id']=df_result.order_id.apply(lambda x: Convert_orderid(x))
df_airline = df_airline.drop_duplicates(subset='group_id', keep='first', inplace=False)


group_used_cols=['group_id','Begin_Date','Begin_Tick','days','Area','SubLine','price', 'product_name']
df_group_0 = df_group[group_used_cols].merge(df_airline, on='group_id')
df_order_1 = df_order.merge(df_group_0, on='group_id')

# for order data
df_order_1['Order_Date']=df_order_1.order_date.apply(lambda x: Convert_Date(x))
df_order_1['Order_Tick']=df_order_1.order_date.apply(lambda x: Date2Ticks(x))
df_order_1['order_id']=df_order_1.order_id.apply(lambda x: Convert_orderid(x))
df_order_1['Source_1']= df_order_1.source_1.apply(lambda x: int(x[11:]))
df_order_1['Source_2']= df_order_1.source_2.apply(lambda x: int(x[11:]))
df_order_1['Unit']= df_order_1.unit.apply(lambda x: int(x[11:]))
df_order_1['Begin_Date']=pd.to_datetime(df_order_1['Begin_Date'])
df_order_1['Order_Date']=pd.to_datetime(df_order_1['Order_Date'])
df_order_1['PreDays']=(df_order_1['Begin_Date']-df_order_1['Order_Date']).dt.days
df_order_1['Begin_Date_Weekday']= df_order_1['Begin_Date'].dt.dayofweek
df_order_1['Order_Date_Weekday']= df_order_1['Order_Date'].dt.dayofweek
df_order_1['Return_Date_Weekday']= (df_order_1['Begin_Date'].dt.dayofweek+df_order_1['days'])%7
df_order_1['tick_diff'] = (df_order_1['Begin_Tick'] - df_order_1['Order_Tick'])/10000
df_order_1['price'] = df_order_1['price']/1000

order_used_columns=['order_id', 'group_id','tick_diff', 'Source_1', 'Source_2', 'Unit',
'people_amount', 'Begin_Tick','days', 'Order_Tick', 'Area', 'SubLine', 'price','PreDays','Begin_Date_Weekday', 
'Order_Date_Weekday', 'Return_Date_Weekday', 'fly_t', 'fly_date',
"src_airport", "arrive_t", "arrive_date", "dst_airport", 'product_name']

# df_order_2=df_order_1[order_used_columns].merge(df_day_schedule[['group_id','title']], on='group_id')
df_order_2=df_order_1[order_used_columns]

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
# train/test data
print("Got %d, %d training, testing data" % (len(df_train), len(df_result)))
df_train_1=df_train.merge(df_order_2,on='order_id')
df_result_1=df_result.merge(df_order_2,on='order_id')
print("Got %d, %d training, testing data" % (len(df_train_1), len(df_result_1)))


Got 297020, 99895 training, testing data
Got 296237, 99736 training, testing data


In [6]:
# train/test data
df_train_1=df_train.merge(df_order_2,on='order_id')
df_result_1=df_result.merge(df_order_2,on='order_id')

Y=df_train_1['deal_or_not'].values.tolist()
swX_tmp = (df_train_1['product_name']).values.tolist()
Xid = df_train_1['order_id'].values.tolist()
del df_train_1['deal_or_not'] 
del df_train_1['product_name']
del df_train_1['group_id'] 
del df_train_1['order_id']
X = df_train_1.values.tolist()

rid = df_result_1['order_id'].values.tolist()
swrx = (df_result_1['product_name']).values.tolist()
del df_result_1['product_name']
del df_result_1['deal_or_not']
del df_result_1['order_id']
del df_result_1['group_id']

rx = df_result_1.values.tolist()


sX, sY, Xid =np.asarray(X), np.asarray(Y), np.asarray(Xid)
rx,rid = np.asarray(rx), np.asarray(rid)
X,Y, swX=[],[], []
for i in range(len(sY)):
   # if (int(Xid[i])<=204000):
        X.append(sX[i,:])
        Y.append(sY[i])
        swX.append(swX_tmp[i])
X, Y = np.array(X), np.array(Y)


def text_to_index_array(p_new_dic, p_sen):  # 文本转为索引数字模式
    new_sentences = []
    for sen in p_sen:
        new_sen = []
        for word in str(sen):
            try:
                new_sen.append(p_new_dic[word])  # 单词转索引数字
            except:
                new_sen.append(0)  # 索引字典里没有的词转为数字0
        new_sentences.append(new_sen)

    return np.array(new_sentences)


wX = text_to_index_array(index_dict, swX)
wrx = text_to_index_array(index_dict, swrx)
wX = sequence.pad_sequences(wX, maxlen=60)
wrx = sequence.pad_sequences(wrx, maxlen=60)


# X=np.concatenate([X, wX], axis=1)
# rx=np.concatenate([rx, wrx], axis=1)
# xlen=len(X)
# from sklearn.preprocessing import normalize
# Xtmp=normalize(np.concatenate([X, rx], axis=0),norm='max', axis=0)
# X=Xtmp[:xlen]
# rx=Xtmp[xlen:]

print(X.shape)

# np.save("data.npy", [X,Y,rx])
# [X,Y,rx] = np.load("data.npy")

(296237, 21)


In [8]:
from sklearn.linear_model import Lasso, Ridge, RidgeCV, ElasticNet
import xgboost as xgb
import lightgbm as lgb
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from catboost import Pool, CatBoostRegressor, cv
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor



len1 = len(Y)
tind = np.zeros(len1, np.int)
for i in range(len1):
    tind[i]=i
import random as rn
rn.Random(4).shuffle(tind)

train_x, train_y = X[tind[1000:],:], Y[tind[1000:]]
valid_x, valid_y = X[tind[:1000],:], Y[tind[:1000]]
print(train_x.shape)


model_ridge = Ridge(alpha = 0.1)
model_ridge.fit(train_x, train_y)
print('Ridge model done')

model_lasso = Lasso(alpha = 0.005)
model_lasso.fit(train_x, train_y)
print('Lasso model done')

model_en = ElasticNet(alpha = 0.005)
model_en.fit(train_x, train_y)
print('ElasticNet model done')

model_gbr = GradientBoostingRegressor(n_estimators=100, 
                                      learning_rate=0.03,
                                      max_depth=3, 
                                      max_features='sqrt',
                                      min_samples_leaf=50, 
                                      min_samples_split=10, 
                                      loss='huber',
                                      random_state=5)
model_gbr.fit(train_x, train_y)
print('GBTree model done')

model_xgb = xgb.XGBRegressor(colsample_bytree=0.2,
                             learning_rate=0.03,
                             max_depth=4,verbose=200,
                             n_estimators=100)
model_xgb.fit(train_x, train_y)
print('XGBTree model done')

model_cat = CatBoostRegressor(iterations=500,custom_metric='AUC',
                              learning_rate=0.05,
                              depth=3,
                              l2_leaf_reg=20,
                              border_count=15,
                              loss_function='RMSE',
                              verbose=200)
model_cat.fit(train_x, train_y)
print('CatBoosting model done')

model_rf = RandomForestRegressor(max_depth=4, bootstrap = True, n_estimators=50, max_features="auto", n_jobs=4)
model_rf.fit(train_x, train_y)
print('Random Forest model done')

params = {
'nthread': 8, 'boosting_type': 'dart','objective': 'regression', 'metric': 'auc', 
'learning_rate': 0.01, 'num_leaves': 40,
'max_depth': 3, 'subsample': 0.5, 'feature_fraction': 0.5, 
'min_split_gain': 0.09, 'min_child_weight': 9.5,
'drop_rate':0.5, 'skip_drop':0.5, 'max_drop':5, 'uniform_drop':False, 
'xgboost_dart_mode':True, 'drop_seed':5 }

dtrain = lgb.Dataset(train_x, label=train_y)
dval = lgb.Dataset(valid_x, label=valid_y, reference=dtrain)
bst = lgb.train(params, dtrain, num_boost_round=2500, valid_sets=[dval], early_stopping_rounds=1000, 
                verbose_eval=500)
print('Light GBM model done')

(295237, 21)
Ridge model done


Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number1.060688e-21
  overwrite_a=True).T


Lasso model done




ElasticNet model done
GBTree model done
XGBTree model done
0:	learn: 0.4391621	total: 102ms	remaining: 50.7s
200:	learn: 0.3779972	total: 9.67s	remaining: 14.4s
400:	learn: 0.3765536	total: 19.2s	remaining: 4.75s
499:	learn: 0.3760102	total: 24s	remaining: 0us
CatBoosting model done
Random Forest model done
Training until validation scores don't improve for 1000 rounds.
[500]	valid_0's auc: 0.624298
[1000]	valid_0's auc: 0.627953
[1500]	valid_0's auc: 0.634957
[2000]	valid_0's auc: 0.639883
[2500]	valid_0's auc: 0.642465
Did not meet early stopping. Best iteration is:
[2487]	valid_0's auc: 0.642477
Light GBM model done


In [9]:
preds = []
preds.append(model_ridge.predict(X))
preds.append(model_lasso.predict(X))
preds.append(model_en.predict(X))
preds.append(model_gbr.predict(X))
preds.append(model_xgb.predict(X))
preds.append(model_rf.predict(X))
preds.append(model_cat.predict(X))
preds.append(bst.predict(X, num_iteration=bst.best_iteration))

# train/test data
print("Got %d, %d training, testing data" % (len(df_train), len(df_result)))
df_train_1=df_train.merge(df_order_2,on='order_id')
df_result_1=df_result.merge(df_order_2,on='order_id')
print("Got %d, %d training, testing data" % (len(df_train_1), len(df_result_1)))
Y=df_train_1['deal_or_not'].values.tolist()

df_train_1['deal_or_not'] =Y

for i in range(8):
    df_train_1['pred%d'%(i)] = preds[i]

pred_scores = np.zeros((8))
for i in range(8):
    pred_scores[i] =roc_auc_score(Y, preds[i])

print('AUC Ridge : %.6f' % (roc_auc_score(Y, preds[0])))
print('AUC Lasso: %.6f' % (roc_auc_score(Y, preds[1])))
print('AUC ENet: %.6f' % (roc_auc_score(Y, preds[2])))
print('AUC GBR: %.6f' % (roc_auc_score(Y, preds[3])))
print('AUC XGBR: %.6f' % (roc_auc_score(Y, preds[4])))
print('AUC RandomForest: %.6f' % (roc_auc_score(Y, preds[5])))
print('AUC CATBoost: %.6f' % (roc_auc_score(Y, preds[6])))
print('AUC LightGBM: %.6f' % (roc_auc_score(Y, preds[7])))

idx1 = np.array(pred_scores).argsort()[::-1]
final_val=preds[idx1[0]]
print(pred_scores[idx1])
for i in range(1,3):
    final_val += preds[idx1[i]]
final_val /= 3.0
print('AUC KNN: %.6f' % (roc_auc_score(Y, final_val)))


sub_preds=[]
sub_preds.append(model_ridge.predict(rx))
sub_preds.append(model_lasso.predict(rx))
sub_preds.append(model_en.predict(rx))
sub_preds.append(model_gbr.predict(rx))
sub_preds.append(model_xgb.predict(rx))
sub_preds.append(model_rf.predict(rx))
sub_preds.append(model_cat.predict(rx))
sub_preds.append(bst.predict(rx, num_iteration=bst.best_iteration))

for i in range(8):
    df_result_1['pred%d'%(i)] = sub_preds[i]
    
df_train_1.to_csv('myTrain3.csv', index=False)
df_result_1.to_csv('myTest3.csv', index=False)

idx1 = np.array(pred_scores).argsort()
final_res=sub_preds[idx1[0]]
print(final_res.shape)
for i in range(1,3):
    final_res += sub_preds[idx1[i]]
final_res /= 3.0

Got 297020, 99895 training, testing data
Got 296237, 99736 training, testing data
AUC Ridge : 0.580105
AUC Lasso: 0.579050
AUC ENet: 0.579585
AUC GBR: 0.637086
AUC XGBR: 0.648177
AUC RandomForest: 0.619821
AUC CATBoost: 0.669745
AUC LightGBM: 0.673441
[0.67344079 0.66974472 0.64817692 0.63708568 0.61982148 0.58010452
 0.57958532 0.57904961]
AUC KNN: 0.668756
(99736,)


In [10]:
def nanMapping(x):
    if np.isnan(x):
        return 0
    else:
        return x

app_test = pd.read_csv('testing-set.csv', usecols=['order_id'])
predF=pd.DataFrame({"order_id":rid, "deal_or_not":final_res})
predR = pd.DataFrame({"order_id":app_test['order_id']})
predR=predR.merge(predF, on="order_id",how="outer")
predR['deal_or_not'] = predR.deal_or_not.apply(lambda x: nanMapping(x))
predR.to_csv("output/lgb_dart_" + str(roc_auc_score(valid_y, final_val)) + ".csv", index=False)

ValueError: Found input variables with inconsistent numbers of samples: [1000, 296237]

In [None]:
df_train_1