In [1]:
import pandas as pd
import numpy as np
import keras
import jieba
import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout
from keras.preprocessing.sequence import pad_sequences

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


### 斷詞

- 使用繁體jieba對各個feature斷詞
- 移除某些停用字

In [2]:
# Load the labelled training sheet and the unlabelled test sheet. `names`
# replaces the test sheet's header so both frames share the same feature names.
train = pd.read_excel('train.xlsx')
test = pd.read_excel('test.xlsx', names=['user_id','company','product','depart','position','description','fix'])

# Targets: 'a08a01' feeds the industry model, 'a08a02' the occupation model.
train_Y_indus = train['a08a01']
train_Y_occup = train['a08a02']

feature_cols = ['company','product','depart','position','description','fix']
# .copy() makes train_X an independent frame, so renaming its columns does not
# operate on (or warn about) a slice of `train`.
train_X = train[['k_a08a_1','k_a08a_2','k_a08a_3','k_a08a_4','k_a08a_5','fix']].copy()
train_X.columns = feature_cols

# Stack the training rows first and the test rows after them; later cells
# split the two groups again by row position.
all_x = pd.concat((train_X, test[feature_cols]), axis=0)
# reset_index() without drop=True renumbers the rows 0..n-1 and keeps the old
# row labels in an extra 'index' column.
all_x = all_x.reset_index()

# Domain words that jieba should keep as single tokens.
user_words = ["化粧品", "買賣", "卷釘", "影印機", "耗財", "製造", "國中小", "製作", "紙箱", "褓姆", "放存款", "汽車", "快炒", "進出口",
           "手機", "電腦", "生產", "塑膠", "飛機", "麵包", "被動", "元件", "半導體", "機械", "便當", "維修", "冷氣", "托育", "金屬",
           "才藝", "軟體", "產品", "排汗", "月子", "美髮", "連接器", "鋼鐵", "蓮藕", "產品", "晶圓", "代工", "五穀", "敎学", "賣麵",
           "不知道", "打掃", "魚塭", "餐飲", "批發", "販售", "捲餅", "國中", "國小", " 高中職", "傢俱", "通訊", "教育", "後勤",
           "隨身碟", "鮪魚", "不鏽鋼", "鐵窗", "敎育", "髮型", "连花", "網路","7-11","一般","統一","第一","中西式","成會",'塩酥雞'
                    ,'噴藥','研究','輕鋼架']
# jieba.load_userdict is documented for a dictionary file path or file-like
# object; register the in-memory words through the documented add_word API
# instead. strip() mirrors the per-line stripping a dictionary file would get.
for word in user_words:
    jieba.add_word(word.strip())


def _segment(text):
    """Cut `text` with jieba and return the tokens joined by '/'."""
    return "/".join(jieba.cut(text))


# One segmented '<column> seq' column per text feature (column-wise apply
# instead of writing cell by cell inside an iterrows loop).
for col in ['description', 'product', 'company', 'position', 'depart']:
    all_x[col + ' seq'] = all_x[col].apply(_segment)

# Combined '/'-separated feature strings for the two models.
# NOTE(review): the industry string appends the raw 'position' text and the
# occupation string starts with the raw 'depart' text (not the segmented
# versions); 'depart seq' is computed but unused. Kept as-is so the model
# inputs do not change -- confirm whether this is intentional.
all_x['indus seq']=all_x['company seq']+'/'+all_x['product seq']+'/'+all_x['description seq']+'/'+all_x['position']
all_x['occup seq']=all_x['depart']+'/'+all_x['position seq']+'/'+all_x['company seq']+'/'+all_x['description seq']

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/n7/kd4twb9x5_z9c2vf17wfmgvm0000gn/T/jieba.cache
Loading model cost 0.608 seconds.
Prefix dict has been built succesfully.


In [3]:
def filter_stopword(str_val):
    """Remove stop words from a '/'-separated token string.

    Each token is passed through ``str.capitalize()`` first. A token is then
    dropped when it is a listed stop word, a newline, a single space, the
    empty string, or consists only of digits. The surviving tokens are
    re-joined with '/'.

    Args:
        str_val: tokens joined by '/'.

    Returns:
        The filtered tokens joined by '/'.
    """
    dropped = {
        '丶', '(', ')', '.', '，', '*', '的', '等', '及', 'XX', '。', '大', '小',
        '與', '、', '和', '做', '在', '之', "跟", "給", "Xx", '-', "並", "其", '一',
        '上', '是', '等等', '為', '但', '&&',
        # whitespace / empty tokens are discarded as well
        '\n', " ", '',
    }
    kept = []
    for raw_token in str_val.split('/'):
        token = raw_token.capitalize()
        if token in dropped or token.isdigit():
            continue
        kept.append(token)
    return "/".join(kept)

# Strip stop words from both combined feature strings, overwriting them in place.
for seq_col in ('indus seq', 'occup seq'):
    all_x[seq_col] = all_x[seq_col].apply(filter_stopword)

### 產業

- 將斷詞結果轉換成tfidf
- 將y轉成one hot encoding形式
- 使用nn模型進行訓練

In [4]:
num_labels_indus = 64    # width of the softmax output layer
vocab_size_indus = 7000  # Tokenizer num_words == width of the tf-idf matrix
batch_size_indus = 128

# The training rows are the leading rows of all_x; take their count from the
# labels rather than hard-coding it, so features and targets always line up.
n_train_indus = len(train_Y_indus)

# The vocabulary is fitted on every row of all_x (train and test texts);
# only the training rows are turned into the tf-idf training matrix.
tokenizer = Tokenizer(num_words=vocab_size_indus)
tokenizer.fit_on_texts(all_x['indus seq'])
x_train_indus = tokenizer.texts_to_matrix(all_x['indus seq'].iloc[:n_train_indus], mode='tfidf')

# One-hot encode the industry labels.
encoder_indus = LabelBinarizer()
encoder_indus.fit(train_Y_indus)
y_train_indus = encoder_indus.transform(train_Y_indus)
# Fail early with a readable message if the label count drifts from the constant.
assert y_train_indus.shape[1] == num_labels_indus, (
    "num_labels_indus does not match the number of encoded industry classes")

# Two hidden layers of 512 ReLU units, each followed by dropout, then softmax.
model1 = Sequential()
model1.add(Dense(512, input_shape=(vocab_size_indus,)))
model1.add(Activation('relu'))
model1.add(Dropout(0.7))
model1.add(Dense(512))
model1.add(Activation('relu'))
model1.add(Dropout(0.7))
model1.add(Dense(num_labels_indus))
model1.add(Activation('softmax'))
model1.summary()

model1.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# Trains on all training rows; no validation split is passed.
model1.fit(x_train_indus,
           y_train_indus,
           batch_size=batch_size_indus,
           epochs=20,
           verbose=1)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 512)               3584512   
_________________________________________________________________
activation_1 (Activation)    (None, 512)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 512)               262656    
_________________________________________________________________
activation_2 (Activation)    (None, 512)               0         
_________________________________________________________________
dropout_2 (Dropout)  

<keras.callbacks.History at 0x1a2b496cc0>

### 職業

In [5]:
num_labels_occup = 119   # width of the softmax output layer
vocab_size_occup = 6000  # Tokenizer num_words == width of the tf-idf matrix
batch_size_occup = 128

# The training rows are the leading rows of all_x; take their count from the
# labels rather than hard-coding it, so features and targets always line up.
n_train_occup = len(train_Y_occup)

# The vocabulary is fitted on every row of all_x (train and test texts);
# only the training rows are turned into the tf-idf training matrix.
tokenizer = Tokenizer(num_words=vocab_size_occup)
tokenizer.fit_on_texts(all_x['occup seq'])
x_train_occup = tokenizer.texts_to_matrix(all_x['occup seq'].iloc[:n_train_occup], mode='tfidf')

# One-hot encode the occupation labels.
encoder_occup = LabelBinarizer()
encoder_occup.fit(train_Y_occup)
y_train_occup = encoder_occup.transform(train_Y_occup)
# Fail early with a readable message if the label count drifts from the constant.
assert y_train_occup.shape[1] == num_labels_occup, (
    "num_labels_occup does not match the number of encoded occupation classes")

# Two hidden layers of 512 ReLU units, each followed by dropout, then softmax.
model2 = Sequential()
model2.add(Dense(512, input_shape=(vocab_size_occup,)))
model2.add(Activation('relu'))
model2.add(Dropout(0.6))
model2.add(Dense(512))
model2.add(Activation('relu'))
model2.add(Dropout(0.6))
model2.add(Dense(num_labels_occup))
model2.add(Activation('softmax'))
model2.summary()

model2.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# Trains on all training rows; no validation split is passed.
model2.fit(x_train_occup,
           y_train_occup,
           batch_size=batch_size_occup,
           epochs=15,
           verbose=1)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_4 (Dense)              (None, 512)               3072512   
_________________________________________________________________
activation_4 (Activation)    (None, 512)               0         
_________________________________________________________________
dropout_3 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 512)               262656    
_________________________________________________________________
activation_5 (Activation)    (None, 512)               0         
_________________________________________________________________
dropout_4 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 119)               61047     
__________

<keras.callbacks.History at 0x1a2b667f98>

### 預測

In [6]:
# Rows of all_x after the training rows are the test rows; derive the split
# point from the training frame instead of hard-coding the row count.
n_train = len(train_X)

# --- industry ---
# NOTE(review): the tokenizer is re-fitted here on the same texts with the same
# num_words as at training time, which is expected to reproduce the same
# vocabulary; keeping the fitted training tokenizer around would be safer.
tokenizer = Tokenizer(num_words=vocab_size_indus)
tokenizer.fit_on_texts(all_x['indus seq'])
x_test_indus = tokenizer.texts_to_matrix(all_x['indus seq'].iloc[n_train:], mode='tfidf')

labels = encoder_indus.classes_
y_pred = model1.predict(x_test_indus)
# Map each row's highest-probability column back to its original label.
ypredclf = [labels[k] for k in np.argmax(y_pred, axis=1)]

# --- occupation ---
tokenizer = Tokenizer(num_words=vocab_size_occup)
tokenizer.fit_on_texts(all_x['occup seq'])
x_test_occup = tokenizer.texts_to_matrix(all_x['occup seq'].iloc[n_train:], mode='tfidf')

labels = encoder_occup.classes_
y_pred2 = model2.predict(x_test_occup)
ypredclfp = [labels[k] for k in np.argmax(y_pred2, axis=1)]

In [10]:
# Write the submission file: one "<user_id>_<target>,<label>" line per test
# row for the industry target, followed by the same for the occupation target.
a08a01 = pd.concat([test['user_id'], pd.Series(ypredclf)], axis=1)
a08a01['res'] = a08a01['user_id'].astype(str) + "_a08a01," + a08a01[0].astype(str)

a08a02 = pd.concat([test['user_id'], pd.Series(ypredclfp)], axis=1)
a08a02['res'] = a08a02['user_id'].astype(str) + "_a08a02," + a08a02[0].astype(str)

# The file name carries the current month/day/hour/minute.
out_name = 'result_' + datetime.datetime.now().strftime('%m%d%H%M') + '.csv'
with open(out_name, 'w') as myfile:
    myfile.write('x01,prediction\n')
    stacked = pd.concat((a08a01, a08a02), axis=0)
    myfile.writelines(line + '\n' for line in stacked['res'])

### voting

- 嘗試不同的dropout分別是0.5 0.6 0.7 並將結果進行voting

In [7]:
# Load the three earlier submission files that take part in the vote.
result1, result2, result3 = (
    pd.read_csv(csv_path)
    for csv_path in (
        'result_06182248_0.69986.csv',
        'result_06241351_0.69851.csv',
        'result_06262031.csv',
    )
)

In [8]:
# Join the three submissions on the row key 'x01' (default inner join).
# The first merge suffixes the clashing columns as prediction_x / prediction_y;
# the third frame's 'prediction' then joins without a name clash.
merged = pd.merge(result1, result2, on='x01')
merged = pd.merge(merged, result3, on='x01')

In [9]:
# Majority vote over the three submissions, computed column-wise instead of
# calling value_counts() twice per row inside an iterrows loop.
first = merged['prediction_x']
second = merged['prediction_y']
third = merged['prediction']

# With three voters the rule reduces to:
#   - the first prediction wins if at least one other submission agrees with it;
#   - otherwise the second wins if it agrees with the third;
#   - otherwise all three differ and the first prediction is used as fallback.
first_has_support = (first == second) | (first == third)
merged['vote'] = first.where(first_has_support, second.where(second == third, first))
merged['vote'] = merged['vote'].astype(int)

# Positional rename: the three inputs become pred1..pred3 and the vote becomes
# the final 'prediction'. This expects exactly these five columns, so the cell
# cannot be re-run on an already-renamed frame.
merged.columns=['x01','pred1','pred2','pred3','prediction']

In [102]:
# Save only the row key and the voted label; the row index is not written.
submission = merged[['x01', 'prediction']]
submission.to_csv('end2.csv', index=None)