# 1. Exploring Data

In [2]:
!pip install tensorflow
!pip install --upgrade tensorflow-hub
!pip install bert-for-tf2
!pip install sentencepiece

Requirement already up-to-date: tensorflow-hub in c:\dev\anaconda\lib\site-packages (0.8.0)


In [3]:
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
import tensorflow_hub as hub
import bert

In [4]:
train=pd.read_csv('./DATA/train.csv', index_col='id')
test=pd.read_csv('./DATA/test.csv', index_col='id') # final test dataset for the kaggle competition
submission = pd.read_csv("./DATA/sample_submission.csv")

In [5]:
train

Unnamed: 0_level_0,keyword,location,text,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,,,Our Deeds are the Reason of this #earthquake M...,1
4,,,Forest fire near La Ronge Sask. Canada,1
5,,,All residents asked to 'shelter in place' are ...,1
6,,,"13,000 people receive #wildfires evacuation or...",1
7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...
10869,,,Two giant cranes holding a bridge collapse int...,1
10870,,,@aria_ahrary @TheTawniest The out of control w...,1
10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
10872,,,Police investigating after an e-bike collided ...,1


## NaN Values

In [6]:
training_has_keyword = train["keyword"].notna().value_counts(normalize=True)
training_has_location = train["location"].notna().value_counts(normalize=True)
try:
    print(f'{training_has_keyword[True]*100:.1f}', "% of training data have keyword,", f'{(training_has_keyword[False])*100:.1f}', "% of values are missing")
    print(f'{training_has_keyword[True]*100:.1f}', "% of training data have location,", f'{(training_has_keyword[False])*100:.1f}', "% of values are missing")

    test_has_keyword = test["keyword"].notna().value_counts(normalize=True)
    test_has_location = test["location"].notna().value_counts(normalize=True)
    print(f'{test_has_keyword[True]*100:.1f}', "% of test data have keyword,", f'{(test_has_keyword[False])*100:.1f}', "% of values are missing")
    print(f'{test_has_location[True]*100:.1f}', "% of test data have location,", f'{(test_has_location[False])*100:.1f}', "% of values are missing")
except KeyError:
    print("NaN values already replaced")

99.2 % of training data have keyword, 0.8 % of values are missing
99.2 % of training data have location, 0.8 % of values are missing
99.2 % of test data have keyword, 0.8 % of values are missing
66.1 % of test data have location, 33.9 % of values are missing


In [7]:
print("Replacing NaN with missing_keyword and missing_location")
train["keyword"] = train["keyword"].fillna('missing_keyword')
train["location"] = train["location"].fillna('missing_location')
test["keyword"] = test["keyword"].fillna('missing_keyword')
test["location"] = test["location"].fillna('missing_location')

Replacing NaN with missing_keyword and missing_location


## Duplicate Values

In [8]:
duplicates = train.groupby(['text']).count().sort_values(by='target', ascending=False)
duplicates = duplicates[duplicates['target']>1]
duplicates.shape

(69, 3)

## Exploration

In [9]:
print("We have", len(train["keyword"]), "tweets with", train["keyword"].nunique(), "unique keywords.")
print("We have", len(train["location"]), "tweets with", train["location"].nunique(), "unique locations.")

We have 7613 tweets with 222 unique keywords.
We have 7613 tweets with 3342 unique locations.


In [10]:
grouped = train.groupby(train["keyword"])
grouped.size().sort_values()

grouped = train.groupby(train["location"])
grouped.target.sum().sort_values(ascending=False).head(10)

location
missing_location    1075
USA                   67
United States         27
Nigeria               22
India                 20
Mumbai                19
UK                    16
New York              16
London                16
Washington, DC        15
Name: target, dtype: int64

# Training

In [11]:
training_parameters = {
    "learning_rate": 0.0001,
    "epochs": 10,
    "batch_size": 32
}

In [12]:
def bert_encode(tweet_texts, tokenizer, max_len=512):
    all_tokens = []
    all_masks = []
    all_segments = []

    for text in tweet_texts:
        text = tokenizer.tokenize(text)
            
        text = text[:max_len-2]
        input_sequence = ["[CLS]"] + text + ["[SEP]"]
        pad_len = max_len - len(input_sequence)
        
        tokens = tokenizer.convert_tokens_to_ids(input_sequence)
        tokens += [0] * pad_len
        pad_masks = [1] * len(input_sequence) + [0] * pad_len
        segment_ids = [0] * max_len
        
        all_tokens.append(tokens)
        all_masks.append(pad_masks)
        all_segments.append(segment_ids)
    
    return np.array(all_tokens), np.array(all_masks), np.array(all_segments)

In [13]:
def build_model(bert_layer, max_len=512):
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    clf_output = sequence_output[:, 0, :]
    out = Dense(1, activation='sigmoid')(clf_output)
    
    model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    model.compile(Adam(lr=1e-5), loss='binary_crossentropy', metrics=['accuracy'])
    
    return model

In [14]:
%%time
module_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1"
bert_layer = hub.KerasLayer(module_url, trainable=True)

Wall time: 13.5 s


In [15]:
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
FullTokenizer=bert.bert_tokenization.FullTokenizer
tokenizer = FullTokenizer(vocab_file, do_lower_case)

In [16]:
train_input = bert_encode(train.text.values, tokenizer, max_len=160)
test_input = bert_encode(test.text.values, tokenizer, max_len=160)
train_labels = train.target.values

In [1]:
model = build_model(bert_layer, max_len=160)
model.summary()

checkpoint = ModelCheckpoint('model.h5', monitor='val_loss', save_best_only=True)

NameError: name 'build_model' is not defined

In [None]:
train_history = model.fit(
    train_input, train_labels,
    validation_split=0.2,
    epochs=3,
    callbacks=[checkpoint],
    batch_size=16
)

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 160)]        0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 160)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 160)]        0                                            
__________________________________________________________________________________________________
keras_layer_2 (KerasLayer)      [(None, 1024), (None 335141889   input_word_ids[0][0]             
                                                                 input_mask[0][0]             

In [None]:
model.load_weights('model.h5')
test_pred = model.predict(test_input)

submission['target'] = test_pred.round().astype(int)
submission.to_csv('submission.csv', index=False)

In [None]:
def predict_single_tweet(tweet)
    return 0

In [None]:
def train_disaster_tweets(train):
    train_input = bert_encode(train.text.values, tokenizer, max_len=160)
    return

In [None]:
train_input = bert_encode(train.text.values, tokenizer, max_len=160)