In [43]:
import os
import pickle
import random as rn
import time, datetime

import jieba, pdb
import numpy as np
import pandas as pd
import tensorflow as tf
from gensim.corpora.dictionary import Dictionary
from gensim.models import word2vec
from keras.layers import concatenate
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.models import Model
from keras.models import Sequential
from keras.models import load_model
from keras.preprocessing import sequence
from keras.utils import plot_model
from scipy import stats
from sklearn import metrics
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.metrics import roc_curve
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import train_test_split

# Tell jieba which dictionary file to use for word segmentation.
jieba.set_dictionary('jieba_dict/dict.txt.big')

# Read the stop-word file (one entry per line) into a set; only the
# newline characters are stripped from each line.
with open('jieba_dict/stopwords.txt', 'r', encoding='utf-8') as stopwords:
    stopword_set = {stopword.strip('\n') for stopword in stopwords}

# Load the saved gensim Word2Vec model. At this point `word2vec` still refers
# to the gensim module; the `word2vec` function defined later in this cell
# rebinds that name afterwards.
model = word2vec.Word2Vec.load("word2vec2.model")
def auc_roc(y_true, y_pred):
    """Streaming ROC-AUC metric, passed to ``model.compile(metrics=[...])``.

    Wraps ``tf.contrib.metrics.streaming_auc`` and makes the metric's local
    variables part of the GLOBAL_VARIABLES collection so that they get
    initialised together with the rest of the graph.

    Args:
        y_true: tensor of ground-truth labels.
        y_pred: tensor of predicted scores.

    Returns:
        A tensor holding the AUC value; evaluating it first runs the metric's
        update op.
    """
    # streaming_auc hands back the value tensor and the op that updates it.
    value, update_op = tf.contrib.metrics.streaming_auc(y_pred, y_true)

    # Find the variables created for this metric: those whose second
    # name-scope component mentions 'auc_roc'. Variables that live outside any
    # name scope have no second component; skip them instead of raising
    # IndexError.
    metric_vars = []
    for var in tf.local_variables():
        scopes = var.name.split('/')
        if len(scopes) > 1 and 'auc_roc' in scopes[1]:
            metric_vars.append(var)

    # Add metric variables to the GLOBAL_VARIABLES collection so that they are
    # initialised for a new session.
    for var in metric_vars:
        tf.add_to_collection(tf.GraphKeys.GLOBAL_VARIABLES, var)

    # Force the update op to run before the value is read.
    with tf.control_dependencies([update_op]):
        return tf.identity(value)
    
def create_dictionaries(p_model):
    """Build word->index and word->vector lookup tables from a Word2Vec model.

    Args:
        p_model: a gensim Word2Vec model exposing ``wv.vocab`` and ``wv[word]``.

    Returns:
        ``(w2indx, w2vec)`` where ``w2indx`` maps every vocabulary word to a
        1-based integer index and ``w2vec`` maps the same words to their
        embedding vectors.
    """
    gensim_dict = Dictionary()
    gensim_dict.doc2bow(p_model.wv.vocab.keys(), allow_update=True)
    # Word indices are numbered from 1, which leaves 0 free
    # (text_to_index_array uses 0 for tokens missing from the dictionary).
    w2indx = {word: idx + 1 for idx, word in gensim_dict.items()}
    # Read the vectors from the model that was passed in, not from the
    # notebook-level `model` global.
    w2vec = {word: p_model.wv[word] for word in w2indx}
    return w2indx, w2vec


def word2vec(x):
    """Turn a piece of text into one 100-dimensional vector.

    The text is segmented with jieba; every token that is not a stop word,
    is not a single space and is present in the Word2Vec vocabulary
    contributes its vector to a running sum.

    Note that the divisor starts at 1, so the result is
    ``sum / (number_of_matched_tokens + 1)``; text without any matching token
    yields the zero vector.

    Args:
        x: the text (anything accepted by ``str``).

    Returns:
        numpy array of shape (100,).
    """
    total = np.zeros(100)
    divisor = 1

    for token in jieba.cut(str(x), cut_all=False):
        # Skip stop words, bare spaces and out-of-vocabulary tokens.
        if token in stopword_set or token == ' ' or token not in model.wv.vocab:
            continue
        total += model[token]
        divisor += 1

    total /= divisor
    return total

def Convert_orderid(x):
    """Return ``x`` as a string with leading/trailing newline characters removed."""
    as_text = str(x)
    return as_text.strip('\n')

def cv(x):
    """Parse one space-separated text line into a vector of 64 floats.

    Newlines are removed, the line is split on single spaces, the first token
    is skipped and the following 64 tokens are converted to floats.

    Args:
        x: the text line.

    Returns:
        numpy float array of shape (64,).

    Raises:
        IndexError: if the line has fewer than 65 tokens.
        ValueError: if one of the 64 value tokens is not a valid float.
    """
    tokens = x.replace('\n', '').split(' ')
    if len(tokens) < 65:
        raise IndexError(
            'expected at least 65 space-separated tokens, got %d' % len(tokens)
        )
    # tokens[0] is skipped; tokens[1..64] are the values. No per-value debug
    # printing: this function is meant to be called once per row.
    return np.array([float(token) for token in tokens[1:65]])

def Convert_Date(x):
    """Convert a date string ending in ``<Mon>?<YY>`` into a pandas Timestamp.

    The string is sliced from the right: the last two characters are the
    two-digit year (prefixed with '20'), characters ``[-6:-3]`` are the month
    abbreviation (translated through the notebook-level ``month`` dict) and
    everything before the seventh character from the end is the day.
    Presumably the input looks like ``1-Jan-17`` - confirm against the data.

    Args:
        x: the date string.

    Returns:
        The result of ``pd.to_datetime`` on ``'<YYYY>-<MM>-<D>'``.
    """
    day, month_abbr, short_year = x[:-7], x[-6:-3], x[-2:]
    iso_like = '-'.join(['20' + short_year, month[month_abbr], day])
    return pd.to_datetime(iso_like)

def Date2Ticks(x):
    """Convert a date string ending in ``<Mon>?<YY>`` into a timestamp in seconds.

    Uses the same right-anchored slicing as ``Convert_Date`` (two-digit year,
    month abbreviation looked up in the notebook-level ``month`` dict, day),
    parses the result with ``strptime`` and feeds it to ``time.mktime``.

    Args:
        x: the date string.

    Returns:
        float returned by ``time.mktime`` for midnight of that date.
    """
    day, month_abbr, short_year = x[:-7], x[-6:-3], x[-2:]
    stamp = '/'.join(('20' + short_year, month[month_abbr], day))
    parsed = datetime.datetime.strptime(stamp, "%Y/%m/%d")
    return time.mktime(parsed.timetuple())
def text_to_index_array(p_new_dic, p_sen):
    """Convert sentences into sequences of dictionary indices.

    Args:
        p_new_dic: mapping from token to integer index.
        p_sen: iterable of sentences. Each sentence is iterated directly, so a
            list yields its elements and a plain string yields its characters.

    Returns:
        numpy array with one entry per sentence. When all sentences have the
        same length this is a regular 2-D array; otherwise it is a 1-D object
        array whose elements are the per-sentence index lists.
    """
    new_sentences = []
    for sen in p_sen:
        new_sen = []
        for word in sen:
            try:
                new_sen.append(p_new_dic[word])  # token -> index
            except (KeyError, TypeError):
                # Tokens missing from the dictionary (or unusable as a key)
                # become 0. Only lookup failures are caught, so Ctrl-C still
                # interrupts the loop.
                new_sen.append(0)
        new_sentences.append(new_sen)

    if len({len(sen) for sen in new_sentences}) <= 1:
        return np.array(new_sentences)

    # Sentences of different lengths: build the object array explicitly,
    # because np.array() rejects ragged nested lists on recent NumPy versions.
    ragged = np.empty(len(new_sentences), dtype=object)
    for pos, sen in enumerate(new_sentences):
        ragged[pos] = sen
    return ragged
from keras.callbacks import EarlyStopping,ReduceLROnPlateau


def train_lstm(p_n_symbols, p_embedding_weights, p_X_train, p_y_train, p_X_test, p_y_test):
    """Build, compile and fit the Embedding -> LSTM -> dense binary classifier.

    Args:
        p_n_symbols: passed to the Embedding layer as ``input_dim``.
        p_embedding_weights: accepted but not used by this function; the
            Embedding layer is created without initial weights.
        p_X_train: training inputs passed to ``fit``.
        p_y_train: training labels passed to ``fit``.
        p_X_test: validation inputs (``validation_data``).
        p_y_test: validation labels (``validation_data``).

    Returns:
        The fitted ``Sequential`` model.
    """
    print('Creating the multi-model LSTM model...')

    # Use the callback class imported by name in this cell; the bare name
    # `keras` is never imported in this notebook, so `keras.callbacks.*`
    # raises NameError on a fresh kernel.
    # No EarlyStopping callback is registered: training always runs the full
    # number of epochs.
    lr = ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.1,
        patience=10,
        verbose=0,
        mode='auto',
        epsilon=0.0001,
        cooldown=0,
        min_lr=0,
    )

    # Named `net` so it does not shadow the notebook-level Word2Vec `model`.
    net = Sequential()

    # NOTE(review): the caller passes rows that are 154 columns wide and the
    # whole row goes through the Embedding layer - confirm that every column
    # really is an index in [0, p_n_symbols).
    net.add(Embedding(output_dim=140+14, input_dim=p_n_symbols, input_length=140+14))
    net.add(LSTM(200, activation='sigmoid', name='lstm1'))
    # Dense head 64 -> 32 -> 16 with ReLU, then one sigmoid output unit.
    for units, layer_name in ((64, 'dense-1'), (32, 'dense0'), (16, 'dense1')):
        net.add(Dense(units, name=layer_name))
        net.add(Activation('relu'))
    net.add(Dense(1, name='dense2'))
    net.add(Activation('sigmoid'))

    print("Start to train a model...")
    net.compile(loss="binary_crossentropy", optimizer="adam", metrics=[auc_roc, 'accuracy'])
    # `epochs` is the Keras 2 spelling of the deprecated `nb_epoch`.
    net.fit(p_X_train, p_y_train, batch_size=256, epochs=30, callbacks=[lr],
            shuffle=True, validation_data=(p_X_test, p_y_test))
    return net

#     score, acc = model.evaluate(p_X_test, p_y_test, batch_size=batch_size)
# Build the word->index and word->vector tables from the loaded Word2Vec model.
index_dict, word_vectors = create_dictionaries(model)

# Persist both tables back-to-back in one pickle file. The `with` block closes
# the file even if one of the dumps raises.
with open("wordwmbedding.pkl", 'wb') as output:
    pickle.dump(index_dict, output)    # word -> index dictionary
    pickle.dump(word_vectors, output)  # word -> vector dictionary


# Number of distinct index values: one per word plus index 0, which is used
# for tokens that are not in the dictionary.
n_symbols = len(index_dict) + 1
# n_symbols x 115 zero matrix: columns 15 onwards receive the word vector,
# the first 15 columns stay zero.
embedding_weights = np.zeros((n_symbols, 100+15))
# Fill rows starting from index 1; row 0 stays all-zero because no word has
# index 0.
for w, index in index_dict.items():
    embedding_weights[index, 15:] = word_vectors[w]




In [6]:
# ---- Load the raw tables ----------------------------------------------------
df_order = pd.read_csv("dataset/order.csv")
df_group = pd.read_csv("dataset/group.csv")
df_airline = pd.read_csv("dataset/airline.csv")
df_day_schedule = pd.read_csv("day_schedule_processed.txt")
df_train = pd.read_csv("training-set.csv")
df_test = pd.read_csv("val.csv")
df_result = pd.read_csv("testing-set.csv")

# ---- Date conversion table --------------------------------------------------
# Month abbreviation -> two-digit month number; read by Convert_Date and
# Date2Ticks, so it must exist before either of them is called.
month = {
    abbr: '%02d' % number
    for number, abbr in enumerate(
        ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
         'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'],
        start=1,
    )
}

# ---- Group-level features ---------------------------------------------------
df_group['Begin_Date'] = df_group['begin_date'].apply(Convert_Date)
df_group['Begin_Tick'] = df_group['begin_date'].apply(Date2Ticks)
# The numeric id is whatever follows a fixed-length prefix in the raw string.
df_group['SubLine'] = df_group['sub_line'].apply(lambda raw: int(raw[14:]))
df_group['Area'] = df_group['area'].apply(lambda raw: int(raw[11:]))
# NOTE(review): 'name' is the length of the `area` string, not of a name
# column - confirm this is what was intended.
df_group['name'] = df_group['area'].apply(lambda raw: len(raw))

# Normalise the join keys to newline-free strings in every table.
for frame in (df_group, df_airline, df_order, df_day_schedule):
    frame['group_id'] = frame['group_id'].apply(Convert_orderid)

group_used_cols = ['group_id', 'Begin_Date', 'Begin_Tick', 'days',
                   'Area', 'SubLine', 'price', 'name']
for frame in (df_train, df_result):
    frame['order_id'] = frame['order_id'].apply(Convert_orderid)

# ---- Order-level features ---------------------------------------------------
df_order_1 = df_order.merge(df_group[group_used_cols], on='group_id')

df_order_1['Order_Date'] = df_order_1['order_date'].apply(Convert_Date)
df_order_1['Order_Tick'] = df_order_1['order_date'].apply(Date2Ticks)
df_order_1['order_id'] = df_order_1['order_id'].apply(Convert_orderid)
for new_col, raw_col in (('Source_1', 'source_1'),
                         ('Source_2', 'source_2'),
                         ('Unit', 'unit')):
    df_order_1[new_col] = df_order_1[raw_col].apply(lambda raw: int(raw[11:]))

for date_col in ('Begin_Date', 'Order_Date'):
    df_order_1[date_col] = pd.to_datetime(df_order_1[date_col])

# Days between placing the order and the departure date.
df_order_1['PreDays'] = (df_order_1['Begin_Date'] - df_order_1['Order_Date']).dt.days
df_order_1['Begin_Date_Weekday'] = df_order_1['Begin_Date'].dt.dayofweek
df_order_1['Order_Date_Weekday'] = df_order_1['Order_Date'].dt.dayofweek
df_order_1['Return_Date_Weekday'] = (df_order_1['Begin_Date'].dt.dayofweek + df_order_1['days']) % 7
# Same gap expressed in timestamp seconds, scaled down by 10000.
df_order_1['tick_diff'] = (df_order_1['Begin_Tick'] - df_order_1['Order_Tick']) / 10000
df_order_1['price'] = df_order_1['price'] / 1000

order_used_columns = [
    'order_id', 'group_id', 'tick_diff', 'Source_1', 'Source_2', 'Unit',
    'people_amount', 'days', 'Area', 'SubLine', 'price',
    'PreDays', 'Begin_Date_Weekday', 'Order_Date_Weekday', 'Return_Date_Weekday', 'name',
]

# Attach the schedule title of each group to every order row.
df_order_2 = df_order_1[order_used_columns].merge(
    df_day_schedule[['group_id', 'title']], on='group_id'
)



  interactivity=interactivity, compiler=compiler, result=result)


In [7]:
# Join the labelled orders and the submission orders with the order features.
df_train_1 = df_train.merge(df_order_2, on='order_id')
df_result_1 = df_result.merge(df_order_2, on='order_id')

# Training rows whose order_id is above this value are dropped.
MAX_TRAIN_ORDER_ID = 204000

# Pull out labels, titles and ids, then drop every non-feature column so that
# what is left of df_train_1 is the numeric feature matrix.
Y = df_train_1['deal_or_not'].values.tolist()
swX_tmp = df_train_1['title'].values.tolist()
Xid = df_train_1['order_id'].values.tolist()
for column in ('deal_or_not', 'title', 'group_id', 'order_id'):
    del df_train_1[column]
X = df_train_1.values.tolist()

# Same treatment for the submission rows.
rid = df_result_1['order_id'].values.tolist()
swrx = df_result_1['title'].values.tolist()
for column in ('deal_or_not', 'title', 'order_id', 'group_id'):
    del df_result_1[column]

rx = df_result_1.values.tolist()


sX, sY, Xid = np.asarray(X), np.asarray(Y), np.asarray(Xid)
rx, rid = np.asarray(rx), np.asarray(rid)

# One boolean per training row: keep it when its order_id is within range.
keep = np.fromiter(
    (int(order_id) <= MAX_TRAIN_ORDER_ID for order_id in Xid),
    dtype=bool,
    count=len(Xid),
)
X, Y = sX[keep], sY[keep]
swX = [title for title, kept in zip(swX_tmp, keep) if kept]
# Filter the ids with the same mask so that Xid[i] keeps describing X[i];
# later cells index Xid with row positions of X.
Xid = Xid[keep]


def text_to_index_array(p_new_dic, p_sen):
    """Convert texts into sequences of dictionary indices, character by character.

    Args:
        p_new_dic: mapping from token to integer index.
        p_sen: iterable of texts. Every entry is passed through ``str`` first,
            so non-string values (e.g. NaN) are handled, and the resulting
            string is looked up one character at a time.

    Returns:
        numpy array with one entry per text. When all texts have the same
        length this is a regular 2-D array; otherwise it is a 1-D object array
        whose elements are the per-text index lists.
    """
    # NOTE(review): lookups are per character, while the dictionary passed in
    # by this notebook is built from the Word2Vec vocabulary - confirm that
    # character-level lookup is intended.
    new_sentences = []
    for sen in p_sen:
        new_sen = []
        for word in str(sen):
            try:
                new_sen.append(p_new_dic[word])  # character -> index
            except (KeyError, TypeError):
                # Characters missing from the dictionary become 0. Only lookup
                # failures are caught, so Ctrl-C still interrupts the loop.
                new_sen.append(0)
        new_sentences.append(new_sen)

    if len({len(sen) for sen in new_sentences}) <= 1:
        return np.array(new_sentences)

    # Texts of different lengths: build the object array explicitly, because
    # np.array() rejects ragged nested lists on recent NumPy versions.
    ragged = np.empty(len(new_sentences), dtype=object)
    for pos, sen in enumerate(new_sentences):
        ragged[pos] = sen
    return ragged


# Encode the training and submission titles as index sequences and let
# pad_sequences bring every sequence to length 140.
wX, wrx = (
    sequence.pad_sequences(text_to_index_array(index_dict, titles), maxlen=140)
    for titles in (swX, swrx)
)


# Append the 140 title-index columns to the right of the feature columns.
X = np.concatenate((X, wX), axis=1)
rx = np.concatenate((rx, wrx), axis=1)


print(X.shape)

# np.save("data.npy", [X,Y,rx])
# [X,Y,rx] = np.load("data.npy")

(201634, 154)


In [37]:
# 10-fold stratified cross-validation. The fixed random_state makes the fold
# assignment repeatable from run to run.
folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Out-of-fold predictions for the training rows, and the fold-averaged
# predictions for the submission rows.
oof_preds = np.zeros(X.shape[0])
sub_preds = np.zeros(rx.shape[0])

feature_importance_df = pd.DataFrame()  # not filled in by this cell


for n_fold, (train_idx, valid_idx) in enumerate(folds.split(X, Y)):

    train_x, train_y = X[train_idx, :], Y[train_idx]
    valid_x, valid_y = X[valid_idx, :], Y[valid_idx]

    print("Train Index:", train_idx, ",Val Index:", valid_idx)

    lstmmodel = train_lstm(n_symbols, embedding_weights, train_x, train_y, valid_x, valid_y)
    lstmmodel.save('LSTM_fold_%d.h5' % (n_fold))

    # Predictions come back as a column; flatten before storing.
    tmp_valid = np.reshape(lstmmodel.predict(valid_x), [-1])
    oof_preds[valid_idx] = tmp_valid
    res1 = np.reshape(lstmmodel.predict(rx), [-1])
    # Each fold contributes 1/n_splits of the final submission score.
    sub_preds += res1 / folds.n_splits

    print('Fold %2d AUC-LSTM : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))

    # Release the fold slices before the next iteration allocates new ones.
    del train_x, train_y, valid_x, valid_y

# NOTE(review): sub_preds follows the row order of the merged submission frame
# (`rx`), while order_id is re-read from the CSV - confirm both have the same
# rows in the same order.
app_test = pd.read_csv('testing-set.csv', usecols=['order_id'])
preds = pd.DataFrame({"order_id": app_test["order_id"], "deal_or_not": sub_preds})
# Create the output sub-folder if needed, so the results of the whole training
# run are not lost to a missing directory.
os.makedirs("output", exist_ok=True)
preds.to_csv("output/LSTM_" + str(roc_auc_score(Y, oof_preds)) + ".csv", index=False)

Train Index: [     0      2      3 ... 201630 201631 201633] ,Val Index: [     1     16     29 ... 201603 201620 201632]
Creating the multi-model LSTM model...




Start to train a model...




Train on 181470 samples, validate on 20164 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50


Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Fold  1 AUC-LSTM : 0.683352
Train Index: [     0      1      2 ... 201630 201631 201632] ,Val Index: [    21     55     60 ... 201609 201622 201633]
Creating the multi-model LSTM model...




Start to train a model...




Train on 181470 samples, validate on 20164 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50


Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Fold  2 AUC-LSTM : 0.671048
Train Index: [     0      1      2 ... 201631 201632 201633] ,Val Index: [     4     17     22 ... 201569 201624 201630]
Creating the multi-model LSTM model...




Start to train a model...




Train on 181470 samples, validate on 20164 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50


Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Fold  3 AUC-LSTM : 0.677931
Train Index: [     0      1      2 ... 201631 201632 201633] ,Val Index: [     7      9     13 ... 201591 201606 201621]
Creating the multi-model LSTM model...




Start to train a model...




Train on 181470 samples, validate on 20164 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50


Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Fold  4 AUC-LSTM : 0.672268
Train Index: [     0      1      2 ... 201631 201632 201633] ,Val Index: [     6     15     18 ... 201615 201626 201627]
Creating the multi-model LSTM model...




Start to train a model...




Train on 181470 samples, validate on 20164 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50


Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Fold  5 AUC-LSTM : 0.675073
Train Index: [     0      1      2 ... 201631 201632 201633] ,Val Index: [    14     19     23 ... 201598 201602 201607]
Creating the multi-model LSTM model...




Start to train a model...




Train on 181470 samples, validate on 20164 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50


Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Fold  6 AUC-LSTM : 0.676805
Train Index: [     1      2      3 ... 201631 201632 201633] ,Val Index: [     0     27     33 ... 201588 201618 201628]
Creating the multi-model LSTM model...




Start to train a model...




Train on 181470 samples, validate on 20164 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50


Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Fold  7 AUC-LSTM : 0.673232
Train Index: [     0      1      2 ... 201630 201632 201633] ,Val Index: [    10     12     39 ... 201619 201625 201631]
Creating the multi-model LSTM model...




Start to train a model...




Train on 181472 samples, validate on 20162 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50


Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Fold  8 AUC-LSTM : 0.682941
Train Index: [     0      1      3 ... 201631 201632 201633] ,Val Index: [     2      8     11 ... 201593 201610 201614]
Creating the multi-model LSTM model...




Start to train a model...




Train on 181472 samples, validate on 20162 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50


Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Fold  9 AUC-LSTM : 0.679253
Train Index: [     0      1      2 ... 201631 201632 201633] ,Val Index: [     3      5     32 ... 201604 201623 201629]
Creating the multi-model LSTM model...




Start to train a model...




Train on 181472 samples, validate on 20162 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50


Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Fold 10 AUC-LSTM : 0.682130


In [44]:

# Train one model on all rows except a random hold-out that is only used as
# Keras validation data.
N_HOLDOUT = 1000

len1 = len(Y)
# Row positions 0..len1-1 as an integer array (np.arange replaces the manual
# fill of an array typed with the `np.int` alias, which newer NumPy no longer
# provides), shuffled in place.
tind = np.arange(len1)
rn.shuffle(tind)

train_x, train_y = X[tind[N_HOLDOUT:], :], Y[tind[N_HOLDOUT:]]
valid_x, valid_y = X[tind[:N_HOLDOUT], :], Y[tind[:N_HOLDOUT]]

print("Train Index:", tind[N_HOLDOUT:], ",Val Index:", tind[:N_HOLDOUT])

lstmmodel = train_lstm(n_symbols, embedding_weights, train_x, train_y, valid_x, valid_y)
# Fixed file name: this cell must not depend on the `n_fold` loop variable
# left behind by the cross-validation cell.
lstmmodel.save('LSTM_all.h5')


# Predictions come back as a column; flatten them.
tmp_valid = np.reshape(lstmmodel.predict(valid_x), [-1])
res1 = np.reshape(lstmmodel.predict(rx), [-1])


# NOTE(review): res1 follows the row order of `rx`, while order_id is re-read
# from the CSV - confirm both have the same rows in the same order.
app_test = pd.read_csv('testing-set.csv', usecols=['order_id'])
preds = pd.DataFrame({"order_id": app_test["order_id"], "deal_or_not": res1})
# Create the output sub-folder if needed before writing the submission.
os.makedirs("output", exist_ok=True)
preds.to_csv("output/LSTM_all.csv", index=False)



Train Index: [ 91034 102071   8807 ... 197629  74153 200479] ,Val Index: [177901  30518 191480  67706 164426 153939 127370  86998  85444 172402
 190974 187102 149877 141819  85437  46263 127475  42836 179290 105981
  64053 119425 191635 145877  44326 164821  44417 128515  11976 136094
  33691  78842 197582 181768   2594  49228  74431 182739  34700 156744
  50498  77965 160363  20750 133106  32178 158197  82506  18264  41936
  17047 179908 127485 197630 135225 188643 200639  89466 109988   9723
  64309  28588  30484  92552 164072 170478  29047 193281 147501 200953
  46090 139683 137867 192115 169139 188532 171403  32914  19593 116001
  65011  18261  30457 169033  75054 102900 128878 142955  95581  37707
 141259  30334 169351 155088 190494  91669  35179 137010  30760  76351
  98719  25699  25615  64852 122899 165487 185003 142770 110000  66221
  43858  13427 172058 152716   5239  21114 116878  79194  29430 103613
  53104  84285  39126  20641  18260  16907  91806 141025 120375  42847
 115



Train on 200634 samples, validate on 1000 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [None]:
# Write the final model's submission-set predictions (`res1`) to CSV.
app_test = pd.read_csv('testing-set.csv', usecols=['order_id'])
preds = pd.DataFrame({"order_id": app_test["order_id"], "deal_or_not": res1})
# Create the output sub-folder if needed before writing the submission.
os.makedirs("output", exist_ok=True)
preds.to_csv("output/LSTM_all.csv", index=False)
         