In [None]:
# Kaggle score: 0.84462

In [1]:
# Reference: https://www.kaggle.com/xhlulu/disaster-nlp-keras-bert-using-tfhub

In [2]:
import tensorflow as tf
import tensorflow_hub as hub
import data.tokenization
import pandas as pd # 引用套件並縮寫為 pd  
import numpy as np
from tensorflow.keras.optimizers import Adam

In [3]:
# Notebook-wide settings.
# maxlen: number of token ids per encoded tweet; it fixes the shape of the
# model's Input layers and is passed as max_seq_len when encoding the text.
maxlen = 60
# num_top_word: not referenced by any other cell visible in this notebook.
num_top_word = 1000

In [4]:
# Build the classifier: a TF-Hub BERT encoder followed by one sigmoid unit.
# The three inputs are fixed-length int32 sequences of `maxlen` entries each.
input_word_ids = tf.keras.layers.Input(shape=(maxlen,), dtype=tf.int32, name="input_word_ids")
input_mask = tf.keras.layers.Input(shape=(maxlen,), dtype=tf.int32, name="input_mask")
segment_ids = tf.keras.layers.Input(shape=(maxlen,), dtype=tf.int32, name="segment_ids")

# trainable=True: the hub layer's weights are updated during model.fit as well.
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/2", trainable=True)
pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])

# Take the encoder output at sequence position 0 (the '[CLS]' slot of every
# encoded tweet) and feed it to a single sigmoid unit.
clf_output = sequence_output[:, 0, :]
out = tf.keras.layers.Dense(1, activation='sigmoid')(clf_output)

model = tf.keras.models.Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
# `learning_rate` is the supported keyword; the legacy `lr` alias is deprecated
# and is rejected by newer Keras optimizers.
model.compile(Adam(learning_rate=1e-5), loss='binary_crossentropy', metrics=['accuracy'])

In [5]:
# Print the layer table (output shapes, parameter counts, connections) of the model built above.
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 60)]         0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 60)]         0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 60)]         0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        [(None, 1024), (None 335141889   input_word_ids[0][0]             
                                                                 input_mask[0][0]             

In [6]:
# Read the vocabulary file path and the lower-casing flag stored on the loaded
# hub layer, then build the tokenizer from exactly those two values.
_bert_assets = bert_layer.resolved_object
vocab_file = _bert_assets.vocab_file.asset_path.numpy()
do_lower_case = _bert_assets.do_lower_case.numpy()
tokenizer = data.tokenization.FullTokenizer(vocab_file, do_lower_case)

In [7]:
# Load the training and test tables from the local data/ folder.
data_train, data_test = pd.read_csv('data/train.csv'), pd.read_csv('data/test.csv')

In [8]:
# Keep only the tweet text and its label. The explicit .copy() makes the
# selection an independent frame, so the column assignment below does not
# trigger pandas' SettingWithCopyWarning.
data_train = data_train[['text', 'target']].copy()
# Lower-case the text before it is tokenized.
data_train['text'] = data_train['text'].str.lower()
data_train

Unnamed: 0,text,target
0,our deeds are the reason of this #earthquake m...,1
1,forest fire near la ronge sask. canada,1
2,all residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,just got sent this photo from ruby #alaska as ...,1
...,...,...
7608,two giant cranes holding a bridge collapse int...,1
7609,@aria_ahrary @thetawniest the out of control w...,1
7610,m1.94 [01:04 utc]?5km s of volcano hawaii. htt...,1
7611,police investigating after an e-bike collided ...,1


In [9]:
# Keep only the tweet text. The explicit .copy() makes the selection an
# independent frame, so the column assignment below does not trigger pandas'
# SettingWithCopyWarning.
data_test = data_test[['text']].copy()
# Lower-case the text before it is tokenized (same treatment as the training set).
data_test['text'] = data_test['text'].str.lower()
data_test

Unnamed: 0,text
0,just happened a terrible car crash
1,"heard about #earthquake is different cities, s..."
2,"there is a forest fire at spot pond, geese are..."
3,apocalypse lighting. #spokane #wildfires
4,typhoon soudelor kills 28 in china and taiwan
...,...
3258,earthquake safety los angeles ûò safety faste...
3259,storm in ri worse than last hurricane. my city...
3260,green line derailment in chicago http://t.co/u...
3261,meg issues hazardous weather outlook (hwo) htt...


In [10]:
# Tokenize every tweet: one list of token strings per row, for both splits.
data_train_seq = [tokenizer.tokenize(text) for text in data_train['text']]
data_test_seq = [tokenizer.tokenize(text) for text in data_test['text']]

In [11]:
# The token lists have different lengths, so ask for an object array explicitly.
# Letting NumPy infer it from a ragged nested sequence emits a
# VisibleDeprecationWarning on older releases and raises ValueError on newer ones.
data_train_seq = np.array(data_train_seq, dtype=object)
data_test_seq = np.array(data_test_seq, dtype=object)

  """Entry point for launching an IPython kernel.
  


In [12]:
# Inspect the tokenized training tweets (an array holding one token list per tweet).
data_train_seq

array([list(['our', 'deeds', 'are', 'the', 'reason', 'of', 'this', '#', 'earthquake', 'may', 'allah', 'forgive', 'us', 'all']),
       list(['forest', 'fire', 'near', 'la', 'ron', '##ge', 'sas', '##k', '.', 'canada']),
       list(['all', 'residents', 'asked', 'to', "'", 'shelter', 'in', 'place', "'", 'are', 'being', 'notified', 'by', 'officers', '.', 'no', 'other', 'evacuation', 'or', 'shelter', 'in', 'place', 'orders', 'are', 'expected']),
       ...,
       list(['m1', '.', '94', '[', '01', ':', '04', 'utc', ']', '?', '5', '##km', 's', 'of', 'volcano', 'hawaii', '.', 'http', ':', '/', '/', 't', '.', 'co', '/', 'z', '##dt', '##oy', '##d', '##8', '##eb', '##j']),
       list(['police', 'investigating', 'after', 'an', 'e', '-', 'bike', 'collided', 'with', 'a', 'car', 'in', 'little', 'portugal', '.', 'e', '-', 'bike', 'rider', 'suffered', 'serious', 'non', '-', 'life', 'threatening', 'injuries', '.']),
       list(['the', 'latest', ':', 'more', 'homes', 'ra', '##zed', 'by', 'northern', 

In [13]:
#t=tokenizer.convert_tokens_to_ids(data_train_seq[0])
#print(t)

In [14]:
#t2=tokenizer.convert_ids_to_tokens(t)
#print(t2)

In [15]:
#convert_sentence_to_features(data_train['text'][0],tokenizer,maxlen)

In [16]:
def convert_sentence_to_features(sentence, tokenizer, max_seq_len):
    """Encode one sentence as fixed-length model inputs.

    The sentence is tokenized with ``tokenizer.tokenize``, prefixed with
    ``'[CLS]'``, cut to at most ``max_seq_len - 1`` tokens and terminated with
    ``'[SEP]'``. Tokens are mapped to ids with
    ``tokenizer.convert_tokens_to_ids`` and all three outputs are right-padded
    with zeros up to ``max_seq_len``.

    Args:
        sentence: Text to encode.
        tokenizer: Object providing ``tokenize`` and ``convert_tokens_to_ids``.
        max_seq_len: Target length of every returned list.

    Returns:
        Tuple ``(input_ids, input_mask, segment_ids)`` of lists: token ids,
        a mask holding 1 for real tokens and 0 for padding, and segment ids
        (all zeros).
    """
    # Slicing is a no-op for short inputs, so no length check is needed; the
    # slice leaves room for the closing '[SEP]'.
    wordpieces = ['[CLS]', *tokenizer.tokenize(sentence)][:max_seq_len - 1]
    wordpieces.append('[SEP]')

    input_ids = tokenizer.convert_tokens_to_ids(wordpieces)
    input_mask = [1] * len(input_ids)
    segment_ids = [0] * len(wordpieces)

    # A non-positive pad count yields an empty list, i.e. no padding.
    padding = [0] * (max_seq_len - len(wordpieces))

    return input_ids + padding, input_mask + padding, segment_ids + padding

def convert_sentences_to_features(sentences, tokenizer, max_seq_len=20):
    """Encode an iterable of sentences with ``convert_sentence_to_features``.

    Args:
        sentences: Iterable of texts to encode.
        tokenizer: Tokenizer forwarded to ``convert_sentence_to_features``.
        max_seq_len: Length of every encoded sequence (defaults to 20).

    Returns:
        Tuple ``(all_input_ids, all_input_mask, all_segment_ids)``; each is a
        list with one entry per sentence, in input order.
    """
    encoded = [
        convert_sentence_to_features(sentence, tokenizer, max_seq_len)
        for sentence in sentences
    ]

    # Split the per-sentence triples into three parallel lists.
    all_input_ids = [triple[0] for triple in encoded]
    all_input_mask = [triple[1] for triple in encoded]
    all_segment_ids = [triple[2] for triple in encoded]

    return all_input_ids, all_input_mask, all_segment_ids

In [17]:
# Encode every training tweet into token ids, input mask and segment ids of length maxlen.
train_input_ids, train_mask, train_segment_ids = convert_sentences_to_features(
    data_train['text'], tokenizer, max_seq_len=maxlen
)

In [18]:
# Encode every test tweet into token ids, input mask and segment ids of length maxlen.
test_input_ids, test_mask, test_segment_ids = convert_sentences_to_features(
    data_test['text'], tokenizer, max_seq_len=maxlen
)

In [19]:
# Pack the three encoded training lists as a tuple of arrays, in the order the model's inputs were declared.
train_input = tuple(
    np.array(part) for part in (train_input_ids, train_mask, train_segment_ids)
)

In [20]:
# Pack the three encoded test lists as a tuple of arrays, in the order the model's inputs were declared.
test_input = tuple(
    np.array(part) for part in (test_input_ids, test_mask, test_segment_ids)
)

In [21]:
# Labels for training: the 'target' column of the training frame (passed to model.fit as y).
train_label=data_train['target']
# Show the underlying label array.
train_label.values

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

In [22]:
# Count encoded training sequences whose last id is non-zero, i.e. sequences
# that received no zero padding because they fill all maxlen positions.
count = sum(1 for ids in train_input_ids if ids[-1] != 0)
print(count)

101


In [24]:
from tensorflow.keras.callbacks import ModelCheckpoint

# Write the model to disk only when the validation loss improves, so the file
# ends up holding the epoch with the lowest val_loss.
checkpoint = ModelCheckpoint(
    filepath='model_V6_10epochs.h5',
    monitor='val_loss',
    save_best_only=True,
)

# Train for 10 epochs in batches of 8, holding out 20% of the data for validation.
history = model.fit(
    x=train_input,
    y=train_label,
    batch_size=8,
    epochs=10,
    validation_split=0.2,
    callbacks=[checkpoint],
)

Train on 6090 samples, validate on 1523 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [25]:
# Restore the weights saved by the checkpoint callback, then score the test tweets.
model.load_weights(filepath='model_V6_10epochs.h5')
test_pred = model.predict(x=test_input)

# Round the sigmoid outputs to 0/1 labels, place them in the sample
# submission's 'target' column and write the result without the index.
submission = pd.read_csv("data/sample_submission.csv")
submission = submission.assign(target=test_pred.round().astype(int))
submission.to_csv('Kaggle-6.csv', index=False)