In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import mean_absolute_error as MAE
from scipy import stats
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn import metrics
import numpy as np
import time, datetime
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
import jieba, pdb
from gensim.models import word2vec
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score, roc_curve
from gensim.models import word2vec
from gensim.corpora.dictionary import Dictionary

# Tell jieba which dictionary file to use for segmentation.
jieba.set_dictionary('jieba_dict/dict.txt.big')

# Build the stop-word set: one entry per line of the file, with only the
# newline characters stripped from the ends (other whitespace is kept).
with open('jieba_dict/stopwords.txt', 'r', encoding='utf-8') as stopwords:
    stopword_set = {stopword.strip('\n') for stopword in stopwords}

# Load the gensim Word2Vec model stored in "word2vec2.model".
model = word2vec.Word2Vec.load("word2vec2.model")


def create_dictionaries(p_model):
    """Build word -> index and word -> vector lookups from a Word2Vec model.

    Parameters
    ----------
    p_model : gensim Word2Vec model
        Must expose ``wv.vocab`` (its keys are the vocabulary words) and
        support vector lookup through ``wv[word]``.

    Returns
    -------
    w2indx : dict
        Maps every vocabulary word to an integer index. Indices are the
        gensim ``Dictionary`` ids shifted by one, so numbering starts at 1.
    w2vec : dict
        Maps every vocabulary word to its vector, read from ``p_model``.
    """
    gensim_dict = Dictionary()
    gensim_dict.doc2bow(p_model.wv.vocab.keys(), allow_update=True)
    # Word indices are numbered from 1 (dictionary id + 1).
    w2indx = {v: k + 1 for k, v in gensim_dict.items()}
    # Look the vectors up on the model that was passed in; the previous code
    # silently used the module-level `model` and ignored `p_model`.
    w2vec = {word: p_model.wv[word] for word in w2indx.keys()}
    return w2indx, w2vec

def word2vec(x):
    """Return a single averaged word vector for the text ``x``.

    ``str(x)`` is segmented with jieba (accurate mode). Every token that is
    not a stop word, is not a single space, and is present in the model
    vocabulary contributes its vector to a running sum.

    Parameters
    ----------
    x : object
        Text to embed; converted with ``str`` before segmentation.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``model.vector_size``: the vector sum divided by
        (number of matched tokens + 1).

    Notes
    -----
    Relies on the module-level ``model`` and ``stopword_set``.
    Defining this function rebinds the name ``word2vec`` that the import
    section binds to the ``gensim.models.word2vec`` module.
    """
    words = jieba.cut(str(x), cut_all=False)
    # Size the accumulator from the model instead of a hard-coded 4, so the
    # in-place addition below works for whatever dimensionality was trained.
    vec = np.zeros(model.vector_size)
    # The counter starts at 1, so the divisor is never zero even when no token
    # matches (the result is then the zero vector). Kept as in the original.
    cnt = 1

    for word in words:
        if (word not in stopword_set) and (word != ' ') and (word in model.wv.vocab):
            vec += model.wv[word]
            cnt += 1
    vec /= cnt
    return vec

def cv(x, n=64):
    """Parse one space-separated text line into a vector of ``n`` floats.

    Newline characters are removed, the line is split on single spaces, the
    first token is skipped, and the following ``n`` tokens are converted to
    floats.

    Parameters
    ----------
    x : str
        The text line to parse.
    n : int, optional
        Number of values to read after the first token (default 64, the
        value that used to be hard-coded).

    Returns
    -------
    numpy.ndarray
        1-D float array of length ``n``.

    Raises
    ------
    IndexError
        If the line holds fewer than ``n + 1`` tokens.
    ValueError
        If one of the tokens cannot be converted with ``float``.
    """
    tokens = x.replace('\n', '').split(' ')
    if len(tokens) < n + 1:
        raise IndexError(
            "expected at least %d space-separated tokens, got %d" % (n + 1, len(tokens))
        )
    # The per-element debug print was removed: it wrote one line of output
    # for every value of every parsed row.
    return np.array([float(tok) for tok in tokens[1:n + 1]], dtype=float)

def Convert_Date(x):
    """Turn a date string into a pandas timestamp.

    The string is sliced by position: the last two characters are a
    two-digit year (prefixed with '20'), characters ``x[-6:-3]`` are a month
    abbreviation looked up in the module-level ``month`` table, and
    everything before the last seven characters is the day.
    Presumably the input looks like '5-Jan-17' -- confirm against the data.
    """
    day_part = x[:-7]
    month_part = month[x[-6:-3]]
    year_part = '20' + x[-2:]
    return pd.to_datetime('-'.join((year_part, month_part, day_part)))

def Convert_orderid(x):
    """Return ``x`` as a string with newline characters trimmed from both ends."""
    as_text = str(x)
    return as_text.strip('\n')

def Date2Ticks(x):
    """Convert a date string into a timestamp in seconds (float).

    The string is sliced by position into day, month abbreviation and
    two-digit year (prefixed with '20'); the abbreviation is translated
    through the module-level ``month`` table. The resulting 'YYYY/MM/DD'
    text is parsed with ``strptime`` and handed to ``time.mktime``, which
    interprets the time tuple as local time.
    """
    year_part = '20' + x[-2:]
    month_part = month[x[-6:-3]]
    day_part = x[:-7]
    stamp = '/'.join([year_part, month_part, day_part])
    parsed = datetime.datetime.strptime(stamp, "%Y/%m/%d")
    return time.mktime(parsed.timetuple())



In [2]:
# load dataset
df_order = pd.read_csv("dataset/order.csv")
df_group = pd.read_csv("dataset/group.csv")
df_airline = pd.read_csv("dataset/airline.csv")
df_day_schedule = pd.read_csv("day_schedule_processed.txt")
df_train = pd.read_csv("training-set.csv")
df_test = pd.read_csv("val.csv")
df_result = pd.read_csv("testing-set.csv")

# date conversion table: three-letter month abbreviation -> zero-padded
# two-digit month number ('Jan' -> '01', ..., 'Dec' -> '12').
# Convert_Date and Date2Ticks read this table when they are called.
month = {
    abbr: '%02d' % number
    for number, abbr in enumerate(
        ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
         'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'],
        1,
    )
}

# group data: parse the departure date two ways (timestamp and epoch
# seconds) and turn the sub_line / area labels into integers by dropping a
# fixed-length prefix (14 and 11 characters respectively).
df_group['Begin_Date'] = df_group['begin_date'].apply(Convert_Date)
df_group['Begin_Tick'] = df_group['begin_date'].apply(Date2Ticks)
df_group['SubLine'] = df_group['sub_line'].apply(lambda label: int(label[14:]))
df_group['Area'] = df_group['area'].apply(lambda label: int(label[11:]))
# NOTE(review): 'name' holds the length of the *area* string, not of a name
# column -- confirm this is intended.
df_group['name'] = df_group['area'].apply(lambda label: len(label))

# Normalise the join key to a newline-free string in every table that has it.
for frame in (df_group, df_airline, df_order, df_day_schedule):
    frame['group_id'] = frame['group_id'].apply(Convert_orderid)

# Columns of the group table carried over into the order table.
group_used_cols = ['group_id', 'Begin_Date', 'Begin_Tick', 'days',
                   'Area', 'SubLine', 'price', 'name']

# Same normalisation for the order ids of the training and testing lists.
for frame in (df_train, df_result):
    frame['order_id'] = frame['order_id'].apply(Convert_orderid)

# Attach the selected group columns to every order.
df_order_1 = df_order.merge(df_group[group_used_cols], on='group_id')

# for order data: parse the order date and normalise the order id
df_order_1['Order_Date'] = df_order_1['order_date'].apply(Convert_Date)
df_order_1['Order_Tick'] = df_order_1['order_date'].apply(Date2Ticks)
df_order_1['order_id'] = df_order_1['order_id'].apply(Convert_orderid)

# The source_1 / source_2 / unit labels become integers after dropping an
# 11-character prefix.
for new_col, raw_col in (('Source_1', 'source_1'),
                         ('Source_2', 'source_2'),
                         ('Unit', 'unit')):
    df_order_1[new_col] = df_order_1[raw_col].apply(lambda label: int(label[11:]))

# Make sure both date columns are datetime typed before using the .dt accessor.
for date_col in ('Begin_Date', 'Order_Date'):
    df_order_1[date_col] = pd.to_datetime(df_order_1[date_col])

begin_dt = df_order_1['Begin_Date']
order_dt = df_order_1['Order_Date']

# Lead time in whole days between ordering and departure.
df_order_1['PreDays'] = (begin_dt - order_dt).dt.days
df_order_1['Begin_Date_Weekday'] = begin_dt.dt.dayofweek
df_order_1['Order_Date_Weekday'] = order_dt.dt.dayofweek
# Departure weekday advanced by `days`, wrapped to 0-6.
df_order_1['Return_Date_Weekday'] = (begin_dt.dt.dayofweek + df_order_1['days']) % 7
# Same lead time taken from the epoch-second columns, divided by 10000.
df_order_1['tick_diff'] = (df_order_1['Begin_Tick'] - df_order_1['Order_Tick']) / 10000
# Price divided by 1000.
df_order_1['price'] = df_order_1['price'] / 1000

# Feature columns (plus the two id columns) kept for modelling.
order_used_columns = [
    'order_id', 'group_id', 'tick_diff', 'Source_1', 'Source_2', 'Unit',
    'people_amount', 'days', 'Area', 'SubLine', 'price',
    'PreDays', 'Begin_Date_Weekday', 'Order_Date_Weekday',
    'Return_Date_Weekday', 'name',
]

# Add the schedule title of each group to its orders.
schedule_titles = df_day_schedule[['group_id', 'title']]
df_order_2 = df_order_1[order_used_columns].merge(schedule_titles, on='group_id')



  interactivity=interactivity, compiler=compiler, result=result)


In [7]:
import pickle

index_dict, word_vectors = create_dictionaries(model)

# Persist both lookups back-to-back in one pickle file. The context manager
# closes the file even if one of the dumps raises.
with open("wordwmbedding.pkl", 'wb') as output:
    pickle.dump(index_dict, output)    # word -> index dictionary
    pickle.dump(word_vectors, output)  # word -> vector dictionary

# One row per word index plus one extra row, because index 0 is used for
# words that are not in the dictionary.
n_symbols = len(index_dict) + 1
embedding_weights = np.zeros((n_symbols, 100))  # n_symbols x 100 matrix of zeros
# Fill the matrix starting from index 1; row 0 is never written and stays a
# zero vector (no word has index 0).
for w, index in index_dict.items():
    embedding_weights[index, :] = word_vectors[w]

# train/test data: attach the order features to the labelled / unlabelled ids
df_train_1 = df_train.merge(df_order_2, on='order_id')
df_result_1 = df_result.merge(df_order_2, on='order_id')

# Pull out the label, the title text and the ids, then drop every
# non-feature column so that the remaining columns form the feature matrix.
Y = df_train_1['deal_or_not'].values.tolist()
swX_tmp = df_train_1['title'].values.tolist()
Xid = df_train_1['order_id'].values.tolist()
del df_train_1['deal_or_not']
del df_train_1['title']
del df_train_1['group_id']
del df_train_1['order_id']
X = df_train_1.values.tolist()

rid = df_result_1['order_id'].values.tolist()
swrx = df_result_1['title'].values.tolist()
del df_result_1['deal_or_not']
del df_result_1['title']
del df_result_1['order_id']
del df_result_1['group_id']

rx = df_result_1.values.tolist()

sX, sY, Xid = np.asarray(X), np.asarray(Y), np.asarray(Xid)
rx, rid = np.asarray(rx), np.asarray(rid)

# Keep only the training rows whose numeric order id does not exceed this
# threshold.
MAX_TRAIN_ORDER_ID = 204000
keep_mask = np.array(
    [int(order_id) <= MAX_TRAIN_ORDER_ID for order_id in Xid], dtype=bool
)
X, Y = sX[keep_mask], sY[keep_mask]
swX = [title for title, keep in zip(swX_tmp, keep_mask) if keep]
# Filter the ids with the same mask. Previously Xid kept every row, so
# indexing it with row positions of the filtered X returned the wrong ids.
Xid = Xid[keep_mask]
assert len(X) == len(Y) == len(swX) == len(Xid)


def text_to_index_array(p_new_dic, p_sen):
    """Convert texts into sequences of dictionary indices.

    Each element of ``p_sen`` is converted with ``str`` and walked character
    by character; every character is replaced by ``p_new_dic[char]``, or by
    0 when the character is not a key of the dictionary.
    NOTE(review): because the lookup is per character, dictionary keys that
    are longer than one character can never match -- confirm that
    character-level lookup (rather than jieba tokens) is intended.

    Parameters
    ----------
    p_new_dic : dict
        Maps a token to its integer index.
    p_sen : iterable
        The texts to convert.

    Returns
    -------
    numpy.ndarray
        A 2-D integer array when every sequence has the same length (or the
        input is empty); otherwise a 1-D object array holding one Python
        list of indices per text.
    """
    new_sentences = []
    for sen in p_sen:
        # dict.get replaces the former bare `except:`, which mapped any
        # error at all (not only a missing key) to index 0.
        new_sentences.append([p_new_dic.get(word, 0) for word in str(sen)])

    if len({len(new_sen) for new_sen in new_sentences}) <= 1:
        return np.array(new_sentences)

    # Sequences of different lengths: build the object array explicitly.
    # Recent NumPy versions raise on np.array(<ragged nested list>).
    ragged = np.empty(len(new_sentences), dtype=object)
    for position, new_sen in enumerate(new_sentences):
        ragged[position] = new_sen
    return ragged
from keras.preprocessing import sequence


# Encode the training and testing titles as index sequences, then bring
# every sequence to a length of 140 with keras' pad_sequences.
wX = sequence.pad_sequences(text_to_index_array(index_dict, swX), maxlen=140)
wrx = sequence.pad_sequences(text_to_index_array(index_dict, swrx), maxlen=140)


# Disabled: appending the title indices to the tabular features.
# X=np.concatenate([X, wX], axis=1)
# rx=np.concatenate([rx, wrx], axis=1)


print(X.shape)

# Disabled: caching the prepared arrays on disk.
# np.save("data.npy", [X,Y,rx])
# [X,Y,rx] = np.load("data.npy")



(201634, 14)


In [12]:

import os

# Fixed seed so that the fold assignment and the forests are the same on
# every run of this cell.
SEED = 42

folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=SEED)

# Out-of-fold predictions for the training rows and the fold-averaged
# predictions for the testing rows.
oof_preds = np.zeros(X.shape[0])
sub_preds = np.zeros(rx.shape[0])
feature_importance_df = pd.DataFrame()


for n_fold, (train_idx, valid_idx) in enumerate(folds.split(X, Y)):

    train_x, train_y, train_id = X[train_idx, :], Y[train_idx], Xid[train_idx]
    valid_x, valid_y = X[valid_idx, :], Y[valid_idx]
    valid_id = Xid[valid_idx]

    print("Train Index:", train_idx, ",Val Index:", valid_idx)

    clf = RandomForestRegressor(
        n_estimators=100, max_features="log2", n_jobs=4, random_state=SEED
    )
    clf.fit(train_x, train_y)
    # The prediction on the training rows was computed but never used, so
    # it was dropped together with the always-true `n_fold >= 0` guard.

    # Clamp the regressor output to [0, 1] before storing it.
    oof_preds[valid_idx] = np.clip(clf.predict(valid_x), 0, 1)

    # Each fold contributes 1 / n_splits of its prediction on the testing rows.
    sub_preds += clf.predict(rx) / folds.n_splits
    np.clip(sub_preds, 0, 1, out=sub_preds)

    print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))

    del train_x, train_y, valid_x, valid_y

# `rid` holds the order ids of exactly the rows in `rx`, so it always lines
# up with `sub_preds`; re-reading the ids from the CSV only lines up when the
# earlier merge kept every row in its original order.
preds = pd.DataFrame({"order_id": rid, "deal_or_not": sub_preds})
# create output sub-folder (to_csv does not create missing directories)
os.makedirs("output", exist_ok=True)
preds.to_csv("output/RF_" + str(roc_auc_score(Y, oof_preds)) + ".csv", index=False)


Train Index: [     0      1      2 ..., 201631 201632 201633] ,Val Index: [     3      9     14 ..., 201616 201621 201628]
Fold  1 AUC : 0.655304
Train Index: [     0      1      2 ..., 201631 201632 201633] ,Val Index: [    29     30     34 ..., 201615 201617 201619]
Fold  2 AUC : 0.653729
Train Index: [     2      3      5 ..., 201630 201631 201632] ,Val Index: [     0      1      4 ..., 201622 201629 201633]
Fold  3 AUC : 0.656581
Train Index: [     0      1      2 ..., 201631 201632 201633] ,Val Index: [    10     18     25 ..., 201600 201626 201627]
Fold  4 AUC : 0.651918
Train Index: [     0      1      2 ..., 201631 201632 201633] ,Val Index: [    22     32     33 ..., 201598 201602 201618]
Fold  5 AUC : 0.657529
Train Index: [     0      1      3 ..., 201631 201632 201633] ,Val Index: [     2      5      7 ..., 201611 201620 201630]
Fold  6 AUC : 0.660874
Train Index: [     0      1      2 ..., 201631 201632 201633] ,Val Index: [    20     24     39 ..., 201588 201606 201624]
F