In [4]:
## Check GPU recognized
from tensorflow.python.client import device_lib
device_lib.list_local_devices()

[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 12062205500539301742,
 name: "/device:GPU:0"
 device_type: "GPU"
 memory_limit: 39395347712
 locality {
   bus_id: 1
   links {
   }
 }
 incarnation: 8100219883508106116
 physical_device_desc: "device: 0, name: A100-PCIE-40GB, pci bus id: 0000:c1:00.0, compute capability: 8.0"]

In [3]:
!pip install -U pandas==1.1.5
!pip install -U tensorflow_hub==0.12.0
!pip install -U bert-tensorflow==1.0.1

import numpy as np
import pandas as pd
import re
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
import tensorflow_hub as hub
from bert import tokenization

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [5]:
train =pd.read_csv("./input/train.csv")
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [6]:
test =pd.read_csv("./input/test.csv")
test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [7]:
%%time
module_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1"
bert_layer = hub.KerasLayer(module_url, trainable=True)

CPU times: user 9.33 s, sys: 5.07 s, total: 14.4 s
Wall time: 24 s


In [8]:
tf.gfile = tf.io.gfile
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

In [9]:
text = "This is a Goat, and I am riding a Boat...."
tokenize_ = tokenizer.tokenize(text)
print("Text after tokenization: ")
print(tokenize_)
max_len = 25
text = tokenize_[:max_len-2]
input_sequence = ["[CLS]"] + text + ["[SEP]"]
pad_len = max_len - len(input_sequence)
print("After adding [CLS] and [SEP]: ")
print(input_sequence)
tokens = tokenizer.convert_tokens_to_ids(input_sequence)
print("After converting Tokens to Id: ")
print(tokens)
tokens += [0] * pad_len
print("tokens: ")
print(tokens)
pad_masks = [1] * len(input_sequence) + [0] * pad_len
print("Pad Masking: ")
print(pad_masks)
segment_ids = [0] * max_len
print("Segment Ids: ")
print(segment_ids)

Text after tokenization: 
['this', 'is', 'a', 'goat', ',', 'and', 'i', 'am', 'riding', 'a', 'boat', '.', '.', '.', '.']
After adding [CLS] and [SEP]: 
['[CLS]', 'this', 'is', 'a', 'goat', ',', 'and', 'i', 'am', 'riding', 'a', 'boat', '.', '.', '.', '.', '[SEP]']
After converting Tokens to Id: 
[101, 2023, 2003, 1037, 13555, 1010, 1998, 1045, 2572, 5559, 1037, 4049, 1012, 1012, 1012, 1012, 102]
tokens: 
[101, 2023, 2003, 1037, 13555, 1010, 1998, 1045, 2572, 5559, 1037, 4049, 1012, 1012, 1012, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0]
Pad Masking: 
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]
Segment Ids: 
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [15]:
def pre_Process_data(documents, tokenizer, max_len=512):
    '''
    For preprocessing we have regularized, transformed each upper case into lower case, tokenized,
    Normalized and remove stopwords. For normalization, we have used PorterStemmer. Porter stemmer transforms 
    a sentence from this "love loving loved" to this "love love love"
    
    '''
    all_tokens = []
    all_masks = []
    all_segments = []
    print("Pre-Processing the Data.........\n")
    for data in documents:
        review = re.sub('[^a-zA-Z]', ' ', data)
        url = re.compile(r'https?://\S+|www\.\S+')
        review = url.sub(r'',review)
        html=re.compile(r'<.*?>')
        review = html.sub(r'',review)
        emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
        review = emoji_pattern.sub(r'',review)
        text = tokenizer.tokenize(review)
        text = text[:max_len-2]
        input_sequence = ["[CLS]"] + text + ["[SEP]"]
        pad_len = max_len - len(input_sequence)
        
        tokens = tokenizer.convert_tokens_to_ids(input_sequence)
        tokens += [0] * pad_len
        pad_masks = [1] * len(input_sequence) + [0] * pad_len
        segment_ids = [0] * max_len
        
        all_tokens.append(tokens)
        all_masks.append(pad_masks)
        all_segments.append(segment_ids)
    return np.array(all_tokens), np.array(all_masks), np.array(all_segments)

In [16]:
input_word_id = Input(shape=(max_len,),dtype=tf.int32, name="input_word_ids")
input_mask = Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
segment_id = Input(shape=(max_len,), dtype=tf.int32, name = "segment_id")

_, sequence_output = bert_layer([input_word_id, input_mask, segment_id])
clf_output = sequence_output[:, 0, :]
model = Model(inputs=[input_word_id, input_mask, segment_id],outputs=clf_output)
model.compile(Adam(lr=2e-5), loss='binary_crossentropy', metrics=['accuracy'])
model.summary()
print("shape of _ layer of BERT: "+str(_.shape))
print("shape of last layer of BERT: "+str(sequence_output.shape))

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 25)]         0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 25)]         0                                            
__________________________________________________________________________________________________
segment_id (InputLayer)         [(None, 25)]         0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        [(None, 768), (None, 109482241   input_word_ids[0][0]             
                                                                 input_mask[0][0]             

In [17]:
def build_model(bert_layer, max_len=512):
    input_word_id = Input(shape=(max_len,),dtype=tf.int32, name="input_word_ids")
    input_mask = Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_id = Input(shape=(max_len,), dtype=tf.int32, name = "segment_id")
    
    _, sequence_output = bert_layer([input_word_id, input_mask, segment_id])
    clf_output = sequence_output[:, 0, :]
    dense_layer1 = Dense(units=256,activation='relu')(clf_output)
    dense_layer1 = Dropout(0.4)(dense_layer1)
    dense_layer2 = Dense(units=128, activation='relu')(dense_layer1)
    dense_layer2 = Dropout(0.4)(dense_layer2)
    out = Dense(1, activation='sigmoid')(dense_layer2)
    
    model = Model(inputs=[input_word_id, input_mask, segment_id],outputs=out)
    model.compile(Adam(lr=2e-5), loss='binary_crossentropy', metrics=['accuracy'])
    
    return model

In [18]:
train_input = pre_Process_data(train.text.values, tokenizer, max_len=260)
test_input = pre_Process_data(test.text.values, tokenizer, max_len=260)
train_labels = train.target.values

Pre-Processing the Data.........

Pre-Processing the Data.........



In [19]:
model = build_model(bert_layer, max_len=260)
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 260)]        0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 260)]        0                                            
__________________________________________________________________________________________________
segment_id (InputLayer)         [(None, 260)]        0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        [(None, 768), (None, 109482241   input_word_ids[0][0]             
                                                                 input_mask[0][0]           

In [21]:
checkpoint = ModelCheckpoint('model.h5', monitor='val_loss', save_best_only=True)
train_history = model.fit(
    train_input, train_labels,
    validation_split=0.2,
    epochs=10,
    callbacks=[checkpoint],
    batch_size=32
    # batch_size=2
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [23]:
submission = pd.read_csv("./input/sample_submission.csv")
submission.head()

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,0
3,9,0
4,11,0


In [24]:
model.load_weights('model.h5')
test_pred = model.predict(test_input)
test_pred

array([[0.95661265],
       [0.9531481 ],
       [0.9847703 ],
       ...,
       [0.98830086],
       [0.88970786],
       [0.76841307]], dtype=float32)