In [None]:
!pip install bert-for-tf2

In [34]:
%matplotlib inline
%config IPCompleter.greedy=True

import string
import numpy as np
import pandas as pd
import re

import matplotlib.pyplot as plt

from tqdm import tqdm

import tensorflow as tf
import tensorflow_hub as hub
from tensorflow import keras

from tensorflow.keras.layers import Dense, Input, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam,Nadam,SGD
from tensorflow.keras.models import Model
from tensorflow.keras import losses
from tensorflow.keras.callbacks import ModelCheckpoint

from keras.constraints import unit_norm

import bert
import bert_tokenizer as tokenizer

import nltk
import unicodedata
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import defaultdict

import warnings
warnings.filterwarnings('ignore')

In [35]:
nltk.download('stopwords')
stop_words=set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/davidkolb/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [36]:
test = pd.read_csv('../../data/nlpnovice/test.csv')
train = pd.read_csv('../../data/nlpnovice/train.csv')


In [37]:
train.describe, test.describe

(<bound method NDFrame.describe of          id keyword location  \
 0         1     NaN      NaN   
 1         4     NaN      NaN   
 2         5     NaN      NaN   
 3         6     NaN      NaN   
 4         7     NaN      NaN   
 ...     ...     ...      ...   
 7608  10869     NaN      NaN   
 7609  10870     NaN      NaN   
 7610  10871     NaN      NaN   
 7611  10872     NaN      NaN   
 7612  10873     NaN      NaN   
 
                                                    text  target  
 0     Our Deeds are the Reason of this #earthquake M...       1  
 1                Forest fire near La Ronge Sask. Canada       1  
 2     All residents asked to 'shelter in place' are ...       1  
 3     13,000 people receive #wildfires evacuation or...       1  
 4     Just got sent this photo from Ruby #Alaska as ...       1  
 ...                                                 ...     ...  
 7608  Two giant cranes holding a bridge collapse int...       1  
 7609  @aria_ahrary @TheTawniest

# Cleaing the Data

In [38]:
def remove_whitespace(data):
    return data.strip()

In [39]:
def remove_URL(data):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'',data)

In [40]:
def remove_html(data):
    html=re.compile(r'<.*?>')
    return html.sub(r'',data)

In [41]:
def remove_emoji(data):
    emoji_pattern = re.compile('['
                           u'\U0001F600-\U0001F64F'  # emoticons
                           u'\U0001F300-\U0001F5FF'  # symbols & pictographs
                           u'\U0001F680-\U0001F6FF'  # transport & map symbols
                           u'\U0001F1E0-\U0001F1FF'  # flags (iOS)
                           u'\U00002702-\U000027B0'
                           u'\U000024C2-\U0001F251'
                           ']+', flags=re.UNICODE)
    return emoji_pattern.sub(r'', data)

In [42]:
def remove_accented_chars(data):
    return unicodedata.normalize('NFKD', data).encode('ascii', 'ignore').decode('utf-8', 'ignore')

In [43]:
def remove_punctuation(data):
    table=str.maketrans('','',string.punctuation)
    return data.translate(table)

In [44]:
def single_char(data):
    new_data=''
    for w in data:
        print(w)
        if len(w) > 1:
            print(w)
            new_data = new_data +  ' ' + w
    
    return new_data

In [45]:
def remove_special_characters(data, remove_digits=False):
    pattern = r'[^a-zA-z0-9\s]' if not remove_digits else r'[^a-zA-z\s]'
    data = re.sub(pattern, '', data)
    return data

In [46]:
def convert_lower_case(data):
    return data.lower()

In [47]:
def tokenise(data):
    data = word_tokenize(data)
    return data

In [48]:
def remove_stop_words(data):
    filtered_sentence = [w for w in data if not w in stop_words] 
    return filtered_sentence

In [49]:
def stemming(data):
    data.apply(lambda x: [stemmer.stem(e) for e in x])
    return data

In [50]:
def lemmatise(data):
    lemmatizer = WordNetLemmatizer()
    data.apply(lambda x: [lemmatizer.lemmatize(e) for e in x])
    return data 

In [51]:
def nlp_clean(data):
    data = remove_URL(data)
    data = remove_html(data)
    data = remove_emoji(data)
    data = remove_whitespace(data)    
    data = remove_accented_chars(data) 
    data = remove_special_characters(data)
    data = remove_punctuation(data)
    data = convert_lower_case(data)
    return data
    
def nlp_tokenise(data):
    stop_words = set(stopwords.words('english'))
    data = tokenise(data)
    data = remove_stop_words(data)
    return data
     
def nlp_normalise(data):
    stemmer = SnowballStemmer('english')
    data = stemming(data)
    data = lemmatise(data)
    return data

In [52]:
# Run preprossing steps to clean data
train['text']=train['text'].apply(lambda x : nlp_clean(x))
test['text']=test['text'].apply(lambda x : nlp_clean(x))

In [53]:
# Run steps to remove stop words
train['text']=train['text'].apply(lambda x : nlp_tokenise(x))
test['text']=test['text'].apply(lambda x : nlp_tokenise(x))

In [54]:
# rejoin Data after tokenisation 
def combine_text(list_of_text):
    combined_text = ''
    for word in list_of_text:
        combined_text = combined_text + ' ' + word
    return combined_text

In [55]:
train['text'] = train['text'].apply(lambda x : combine_text(x))
test['text'] = test['text'].apply(lambda x : combine_text(x))

In [56]:
train['text'].head(30)

0          deeds reason earthquake may allah forgive us
1                 forest fire near la ronge sask canada
2      residents asked shelter place notified office...
3      13000 people receive wildfires evacuation ord...
4      got sent photo ruby alaska smoke wildfires po...
5      rockyfire update california hwy 20 closed dir...
6      flood disaster heavy rain causes flash floodi...
7                            im top hill see fire woods
8      theres emergency evacuation happening buildin...
9                         im afraid tornado coming area
10                      three people died heat wave far
11     haha south tampa getting flooded hah wait sec...
12     raining flooding florida tampabay tampa 18 19...
13                      flood bago myanmar arrived bago
14        damage school bus 80 multi car crash breaking
15                                            whats man
16                                          love fruits
17                                        summer

In [57]:
test['text'].head(30)

0                           happened terrible car crash
1      heard earthquake different cities stay safe e...
2      forest fire spot pond geese fleeing across st...
3                 apocalypse lighting spokane wildfires
4                typhoon soudelor kills 28 china taiwan
5                                 shakingits earthquake
6      theyd probably still show life arsenal yester...
7                                                   hey
8                                              nice hat
9                                                  fuck
10                                       dont like cold
11                                      nooooooooo dont
12                                            dont tell
13                                                     
14                                              awesome
15     birmingham wholesale market ablaze bbc news f...
16               sunkxssedharry wear shorts race ablaze
17     previouslyondoyintv toke makinwauas marri

In [58]:
# train.to_csv('../../Data/NLPNovice/DSKtrain.csv', index=False)
# test.to_csv('../../Data/NLPNovice/DSKtest.csv', index=False)

# Load pre cleaned text files

In [59]:
# Load pre cleaned text files
# train = pd.read_csv('../../Data/NLPNovice/DSKtrain.csv')
# test = pd.read_csv('../../Data/NLPNovice/DSKtest.csv')

# Convert text column from object to string
train['text'] = train['text'].apply(lambda x : str(x))
test['text'] = test['text'].apply(lambda x : str(x))

#Shuffle Data 
train = train.sample(frac=1).reset_index(drop=True)
test = test.sample(frac=1).reset_index(drop=True)

# Prepare Tensor Flow Data

In [60]:
%%time
module_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1'
bert_layer = hub.KerasLayer(module_url, trainable=True)

CPU times: user 23.2 s, sys: 679 ms, total: 23.9 s
Wall time: 23.9 s


In [61]:
# See BERT paper: https://arxiv.org/pdf/1810.04805.pdf
# And BERT implementation convert_single_example() at 

def bert_encode(texts, tokenizer, max_len):
    all_tokens = []
    all_masks = []
    all_segments = []
    
    for text in texts:
        text = tokenizer.tokenize(text)
            
        text = text[:max_len-2]
        input_sequence = ['[CLS]'] + text + ['[SEP]']
        pad_len = max_len - len(input_sequence)
        
        tokens = tokenizer.convert_tokens_to_ids(input_sequence)
        tokens = tokens + [0] * pad_len

        pad_masks = [1] * len(input_sequence) + [0] * pad_len
        
        segment_ids = [0] * max_len
        
        all_tokens.append(tokens)
        all_masks.append(pad_masks)
        all_segments.append(segment_ids)
    
    return np.array(all_tokens), np.array(all_masks), np.array(all_segments)

In [62]:
# build the architecture of the model.
# maxlenght BERT has a constraint on the maximum length of a sequence after tokenizing.
# input token ids (tokenizer converts tokens using vocab file)
# Input masks (1 for useful tokens, 0 for padding)
# segment ids (for 2 text training: 0 for the first one, 1 for the second one)
# pooled_output of shape [batch_size, 768] with representations for the entire input sequences
# sequence_output of shape [batch_size, max_seq_length, 768] with representations for each input token

maxlength=130 # Length of longest train tweet

input_word_ids = Input(shape=(maxlength,), dtype=tf.int32, name='input_word_ids')
input_mask = Input(shape=(maxlength,), dtype=tf.int32, name='input_mask')
segment_ids = Input(shape=(maxlength,), dtype=tf.int32, name='segment_ids')

In [63]:
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = bert.tokenization.bert_tokenization.FullTokenizer(vocab_file, do_lower_case)

AttributeError: 'Tensor' object has no attribute 'numpy'

In [33]:
train_input = bert_encode(train.text.values, tokenizer, max_len=maxlength)

NameError: name 'tokenizer' is not defined

In [316]:
test_input = bert_encode(test.text.values, tokenizer, max_len=maxlength)
train_labels = train.target.values

# Define Tensor Flow Model

In [None]:
_, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])

clf_output = sequence_output[:, 0, :]


out = Dense(1, activation='sigmoid')(clf_output)

model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)



In [317]:
model.compile(loss='binary_crossentropy',  optimizer=Adam(lr=1e-4),metrics=['accuracy'])

In [318]:
model.summary()

Model: "model_24"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 130)]        0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 130)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 130)]        0                                            
__________________________________________________________________________________________________
keras_layer_6 (KerasLayer)      [(None, 768), (None, 109482241   input_word_ids[0][0]             
                                                                 input_mask[0][0]          

In [None]:
# Model Parameters with learning rate scheduler
batchsize = 32
num_classes = 10
epochs = 40
stepsperepoch = train_input.shape[0]// batchsize
annealer = LearningRateScheduler(lambda x: 1e-3 * 0.9 ** x)

In [320]:
history = model.fit(
        train_input, train_labels,
        callbacks=[annealer]],
        epochs=epoch,
        batch_size=batchsize)    

Train on 6090 samples, validate on 1523 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
 480/6090 [=>............................] - ETA: 1:22 - loss: 0.0519 - accuracy: 0.9821

KeyboardInterrupt: 

In [None]:
loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print('Training Accuracy: {:.4f}'.format(accuracy))
loss, accuracy = model.evaluate(X_val, y_val, verbose=False)
print('Testing Accuracy:  {:.4f}'.format(accuracy))
history_dict = history.history

In [None]:
# Plot Loss
loss = history_dict['loss']
val_loss = history_dict['val_loss']

epochs = range(1, len(loss) + 1)
plt.plot(epochs, loss, color='b', label='Training loss')
plt.plot(epochs, val_loss, color='r', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.show()

In [None]:
# Plot Accuracy
acc = history_dict['accuracy']
val_acc = history_dict['val_accuracy']

plt.clf() 
plt.plot(epochs, acc, color='b', label='Training acc')
plt.plot(epochs, val_acc, color='r', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()

plt.show()

In [None]:
# Error analysis
y_hat = pipeline_grid.predict_proba(X_train2)[:,1]
checker = train.loc[:,['text','keyword','location','target']]
checker['pred_prob'] = y_hat
checker['error'] = np.abs(checker['target'] - checker['pred_prob'])

# Top 50 mispredicted tweets
error50 = checker.sort_values('error', ascending=False).head(50)
error50 = error50.rename_axis('id').reset_index()
error50.target.value_counts()

# Predict and Submit Final Model

In [None]:
predict=model.predict(test_input)
print(predict)

In [None]:
predict=np.round(predict).astype(int).reshape(3263)
sub=pd.DataFrame({'id':test['id'].values.tolist(),'target':predict})

In [None]:
sub.to_csv('DSKsubmission.csv', index=False)

In [None]:
!kaggle competitions submit -c nlp-getting-started -f DSKsubmission.csv -m 'DSK NLP with Keras'

In [0]:
import pandas as pd
sample_submission = pd.read_csv('../input/nlp-getting-started/sample_submission.csv')
