In [372]:
import pandas as pd
import numpy as np

import tensorflow as tf 
from tensorflow.keras.layers import SimpleRNN, GRU, LSTM, Embedding, Dense
from keras.callbacks import EarlyStopping

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report

import spacy
from spacy.lang.en.stop_words import STOP_WORDS

import seaborn as sns # check if still needed

# I - EDA & Data Cleaning

In [333]:
df = pd.read_csv('train.csv')
df.head(15)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
5,8,,,#RockyFire Update => California Hwy. 20 closed...,1
6,10,,,#flood #disaster Heavy rain causes flash flood...,1
7,13,,,I'm on top of the hill and I can see a fire in...,1
8,14,,,There's an emergency evacuation happening now ...,1
9,15,,,I'm afraid that the tornado is coming to our a...,1


In [334]:
df.describe(include='all')

Unnamed: 0,id,keyword,location,text,target
count,7613.0,7552,5080,7613,7613.0
unique,,221,3341,7503,
top,,fatalities,USA,11-Year-Old Boy Charged With Manslaughter of T...,
freq,,45,104,10,
mean,5441.934848,,,,0.42966
std,3137.11609,,,,0.49506
min,1.0,,,,0.0
25%,2734.0,,,,0.0
50%,5408.0,,,,0.0
75%,8146.0,,,,1.0


Column 'id' is not going to help us with the prediction, we can drop it straight away.
Furthermore, we will look straight away into getting the prediction purely from the tweet content as we have a lot of missing value, so we'll drop the keyword & location column.

It seems like we have some duplicate within the text column too, we'll look into that. Do they have the same target? If they do we'll remove them to have just one of each text, if they don't we need to look into why they don't.

In [335]:
# Dropping aforementionned columns
df.drop(['id','keyword','location'],axis=1,inplace=True)

# Keep = False is to avoid that it keeps the first instance of each duplicate which would result in duplicates remaining
#Looking at duplicated text
df[df['text'].duplicated(keep=False)].sort_values('text')

Unnamed: 0,text,target
4290,#Allah describes piling up #wealth thinking it...,0
4299,#Allah describes piling up #wealth thinking it...,0
4312,#Allah describes piling up #wealth thinking it...,1
6363,#Bestnaijamade: 16yr old PKK suicide bomber wh...,1
6373,#Bestnaijamade: 16yr old PKK suicide bomber wh...,1
...,...,...
6091,that horrible sinking feeling when youÛªve be...,1
6103,that horrible sinking feeling when youÛªve be...,0
6094,that horrible sinking feeling when youÛªve be...,0
5641,wowo--=== 12000 Nigerian refugees repatriated ...,0


Looks like some have the same target, some don't. If they have the same target, we'll clean up such that we have just one of each, if they don't we'll remove them altogether

In [336]:
df['index'] = df.index # Will help to make list of originals index to pick up the data after

aggs = {'target': 'mean',
         'index':lambda x: list(x)} # Keep track of the index 

grouped_df = df[df['text'].duplicated(keep=False)].groupby('text').agg(aggs) 

# if target mean is not equal to 0 or 1 then the target is not consistent throughout the observations, let's see which one they are
dif_targets = grouped_df[(grouped_df.target!=1) & (grouped_df.target!=0)].reset_index()
dif_targets

Unnamed: 0,text,target,index
0,#Allah describes piling up #wealth thinking it...,0.333333,"[4290, 4299, 4312]"
1,#foodscare #offers2go #NestleIndia slips into ...,0.666667,"[4221, 4239, 4244]"
2,.POTUS #StrategicPatience is a strategy for #G...,0.75,"[2830, 2831, 2832, 2833]"
3,CLEARED:incident with injury:I-495 inner loop...,0.666667,"[4597, 4605, 4618]"
4,Caution: breathing may be hazardous to your he...,0.5,"[4232, 4235]"
5,He came to a land which was engulfed in tribal...,0.333333,"[3240, 3243, 3248, 3251, 3261, 3266]"
6,Hellfire is surrounded by desires so be carefu...,0.333333,"[4285, 4305, 4313]"
7,Hellfire! We donÛªt even want to think about ...,0.5,"[4306, 4320]"
8,I Pledge Allegiance To The P.O.P.E. And The Bu...,0.5,"[1214, 1365]"
9,In #islam saving a person is equal in reward t...,0.5,"[6614, 6616]"


We could take fix the target manually but given there is only 18 of them and some are ambiguous, we will proceed with removing them

In [337]:
# We'll save down the column of index as a list for us to drop them
to_drop_dif_targets = dif_targets['index'].tolist()
to_drop_dif_targets = [item for sublist in to_drop_dif_targets for item in sublist] #flatten nested list

# Dropping duplicates with  target
shape = df.shape[0]
df.drop(to_drop_dif_targets,inplace=True)
dropped =  shape - df.shape[0]

print(f"We dropped {dropped} observations")


# Cleaning up
df.drop('index',axis=1,inplace=True) # no longer need that column


We dropped 55 observations


In [338]:
# Let's now look at the one that have the same target
# For those we'll keep only one of each to avoid any bias
same_targets = grouped_df[(grouped_df.target==1) | (grouped_df.target==0)].reset_index()

print(same_targets.shape)
same_targets.head(5)

(51, 3)


Unnamed: 0,text,target,index
0,#Bestnaijamade: 16yr old PKK suicide bomber wh...,1.0,"[6363, 6366, 6373, 6377, 6378, 6392]"
1,#KCA #VoteJKT48ID 12News: UPDATE: A family of ...,1.0,"[2822, 2828]"
2,#Myanmar Displaced #Rohingya at #Sittwe point...,1.0,"[2816, 2841]"
3,#Newswatch: 2 vehicles collided at Lock and La...,1.0,"[1704, 1725]"
4,#SigAlert: North &amp; Southbound 133 closed b...,1.0,"[3790, 3795]"


In [339]:
# Dropping duplicates above keeping only one instance of each tweet

shape = df.shape[0]

df.drop_duplicates(keep='first',inplace=True)

dropped =  shape - df.shape[0]
print("We dropped {} observations".format(dropped))

We dropped 73 observations


Let's have a final look at the dataset before we proceed to verify everything seems cleaned up

In [340]:
df.describe(include='all').iloc[0:2]

Unnamed: 0,text,target
count,7485,7485.0
unique,7485,


In [341]:
df['target'].value_counts(normalize=True) # Checking for balanced classes

0    0.574081
1    0.425919
Name: target, dtype: float64

In [342]:
df.reset_index(inplace=True,drop=True) # deals with some missing index


for i in range(20):
    print(df['text'][i]) # allows to see the complete text

Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all
Forest fire near La Ronge Sask. Canada
All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected
13,000 people receive #wildfires evacuation orders in California 
Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school 
#RockyFire Update => California Hwy. 20 closed in both directions due to Lake County fire - #CAfire #wildfires
#flood #disaster Heavy rain causes flash flooding of streets in Manitou, Colorado Springs areas
I'm on top of the hill and I can see a fire in the woods...
There's an emergency evacuation happening now in the building across the street
I'm afraid that the tornado is coming to our area...
Three people died from the heat wave so far
Haha South Tampa is getting flooded hah- WAIT A SECOND I LIVE IN SOUTH TAMPA WHAT AM I GONNA DO WHAT AM I GONNA DO FVCK #flooding
#raining #flooding #Florida 

This already looks better, no more duplicated values. There is no missing values so we won't need to remove any more observations. 

The dataset is fairly balanced - about 42% of the tweets indicate a disaster.

Let's clean up the texts a bit. There's a lot of links, but those will get out by themselves once we limit the vocabulary size as they are all unique.  

Furthermore, we'll remove ponctuation, clean up if there's any double space, put everything in lowercase and remove the stop words, along with lemmatising the words.

In [343]:
nlp = spacy.load("en_core_web_sm") # Loading english language elements from spacy

df['text_clean'] = df["text"].apply(lambda x:''.join(ch for ch in x if ch.isalnum() or ch==" "))
df['text_clean'] = df["text_clean"].apply(lambda x: x.replace(" +"," ").lower().strip())
df['text_clean'] = df["text_clean"].apply(lambda x: " ".join([token.lemma_ for token in nlp(x) if (token.lemma_ not in STOP_WORDS) and (token.text not in STOP_WORDS)]))

df.head(20)


Unnamed: 0,text,target,text_clean
0,Our Deeds are the Reason of this #earthquake M...,1,deed reason earthquake allah forgive
1,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada
2,All residents asked to 'shelter in place' are ...,1,resident ask shelter place notify officer evac...
3,"13,000 people receive #wildfires evacuation or...",1,13000 people receive wildfire evacuation order...
4,Just got sent this photo from Ruby #Alaska as ...,1,got send photo ruby alaska smoke wildfires pou...
5,#RockyFire Update => California Hwy. 20 closed...,1,rockyfire update california hwy 20 close dir...
6,#flood #disaster Heavy rain causes flash flood...,1,flood disaster heavy rain cause flash flooding...
7,I'm on top of the hill and I can see a fire in...,1,m hill fire wood
8,There's an emergency evacuation happening now ...,1,s emergency evacuation happen building street
9,I'm afraid that the tornado is coming to our a...,1,m afraid tornado come area


### Splitting data & tokenizing the texts

Even though the dataset is already split into train / test, the test one is only to submit to Kaggle and it does not have a target, making it complicated to evalue our model. 

We should thus train/test the train dataset to choose the best model. We'll also keep 10% for final testing to have an idea of how we expect our model to perform. 
The proportion are thus as follow:

    * Train: 75%

    * Validation: 15%
    
    * Test: 10%

We'll also both set a random state, to ensure we are always working with the same split, and stratify with respect to the target as we are working with a classification problem. 

However, before submitting to Kaggle, we'll retrain the model on the entire dataset as to not 'waste' any data.


In [450]:
X_train, X_test, Y_train, Y_test = train_test_split(df.text_clean, df.target ,test_size=0.25,random_state = 0, stratify = df.target)
X_test, X_valid, Y_test, Y_valid = train_test_split(X_test, Y_test ,test_size=0.6,random_state = 0, stratify = Y_test) # 0.6 * 0.25 = 0.15
 
print('Training:')
print(X_train.shape,Y_train.shape)
print('Validation:')
print(X_valid.shape, Y_valid.shape)
print('Testing:')
print(X_test.shape,Y_test.shape)


Training:
(5613,) (5613,)
Validation:
(1124,) (1124,)
Testing:
(748,) (748,)


In [451]:
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=1000, oov_token="<OOV>") # instanciate the tokenizer

tokenizer.fit_on_texts(X_train) # fitting tokenizer only on train dataset to prevent any data leakage

X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)
X_valid = tokenizer.texts_to_sequences(X_valid)



# Dropping empty observations which occur when no word in the tweet are in the dictionnary --
#  think no longer needed now that oov_token
#X_valid = [sublist for sublist in X_valid if len(sublist)!=0]
#X_train = [sublist for sublist in X_train if len(sublist)!=0]
#X_test = [sublist for sublist in X_test if len(sublist)!=0]




In [452]:
# Finding out the max length out of all the texts
MAX_LENGTH = len(max(X_train + X_test + X_valid, key =  lambda i: len(i) ))
print("The max length is ",MAX_LENGTH)

# Padding the sequences 
X_valid = tf.keras.preprocessing.sequence.pad_sequences(X_valid, padding="post", maxlen= MAX_LENGTH)
X_train = tf.keras.preprocessing.sequence.pad_sequences(X_train, padding="post", maxlen= MAX_LENGTH)
X_test = tf.keras.preprocessing.sequence.pad_sequences(X_test, padding="post", maxlen= MAX_LENGTH)

The max length is  25


In [453]:
BATCH_SIZE = 124

train = tf.data.Dataset.from_tensor_slices((X_train,Y_train)).batch(BATCH_SIZE)
test = tf.data.Dataset.from_tensor_slices((X_test,Y_test.values)).batch(BATCH_SIZE)
valid = tf.data.Dataset.from_tensor_slices((X_test,Y_test.values)).batch(BATCH_SIZE)

for text, target in train.take(1):
  print(text, target)

tf.Tensor(
[[777   1  14 ...   0   0   0]
 [247  17 134 ...   0   0   0]
 [778  92   5 ...   0   0   0]
 ...
 [  1   1 910 ...   0   0   0]
 [ 51 310   1 ...   0   0   0]
 [104   1 566 ...   0   0   0]], shape=(124, 25), dtype=int32) tf.Tensor(
[1 1 1 1 0 1 0 0 1 0 0 0 1 0 1 0 1 1 1 1 0 1 1 0 1 1 1 0 1 0 0 0 0 0 1 1 0
 0 1 0 1 1 1 1 0 1 0 0 1 1 1 0 0 0 0 0 1 1 1 0 0 1 1 1 1 1 1 0 1 1 1 1 1 1
 0 0 1 1 1 0 0 1 0 1 0 0 0 1 0 0 0 0 0 0 1 1 1 0 0 1 0 0 1 0 0 1 1 1 1 0 0
 0 1 0 1 1 0 0 0 0 0 0 0 0], shape=(124,), dtype=int64)


## Normal all the way below --- TO ignore

tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=1000) # instanciate the tokenizer
tokenizer.fit_on_texts(df.text_clean)
df["text_encoded"] = tokenizer.texts_to_sequences(df.text_clean)

# Let's remove observations which are empty once encoded with the chosen number of words
df["len_text"] = df["text_encoded"].apply(lambda x: len(x))
df = df[df["len_text"]!=0]
df.head(10)

# Padding sequences such that each has the same length
text_pad = tf.keras.preprocessing.sequence.pad_sequences(df.text_encoded, padding="post")


Even though the dataset is already split into train / test, the test one is only to submit to Kaggle and it does not have a target, making it complicated to evalue our model. 

We should thus train/test the train dataset to choose the best model. We'll also keep 10% for final testing to have an idea of how we expect our model to perform. 
The proportion are thus as follow:

    * Train: 75%

    * Validation: 15%
    
    * Test: 10%

We'll also both set a random state, to ensure we are always working with the same split, and stratify with respect to the target as we are working with a classification problem. 

However, before submitting to Kaggle, we'll retrain the model on the entire dataset as to not 'waste' any data.



X_train, X_test, Y_train, Y_test = train_test_split(text_pad, df.target ,test_size=0.25,random_state = 0, stratify = df.target)
X_test, X_valid, Y_test, Y_valid = train_test_split(X_test, Y_test ,test_size=0.6,random_state = 0, stratify = Y_test) # 0.6 * 0.25 = 0.15
 
print('Training:')
print(X_train.shape,Y_train.shape)
print('Validation:')
print(X_valid.shape, Y_valid.shape)
print('Testing:')
print(X_test.shape,Y_test.shape)

BATCH_SIZE = 124

train = tf.data.Dataset.from_tensor_slices((X_train,Y_train)).batch(BATCH_SIZE)
test = tf.data.Dataset.from_tensor_slices((X_test,Y_test.values)).batch(BATCH_SIZE)


for text, target in train.take(1):
  print(text, target)

Now that the dataset is all pre-processed, we can move on to the model.

# Recurrent Neural Network model

In [454]:
vocab_size = len(tokenizer.word_index)

model = tf.keras.Sequential([
                  # Word Embedding layer           
                  Embedding(vocab_size, 64,input_shape=[MAX_LENGTH,]),
                  # Gobal average pooling
                  SimpleRNN(units=64, return_sequences=True), # maintains the sequential nature
                  SimpleRNN(units=32, return_sequences=False), # returns the last output
                  # Dense layers once the data is flat
                  Dense(16, activation='relu'),
                  Dense(8, activation='relu'),

                  # output layer with as many neurons as the number of classes
                  # for the target variable and softmax activation
                  Dense(1, activation="sigmoid")
])

In [455]:
model.summary()

Model: "sequential_25"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_10 (Embedding)    (None, 25, 64)            1051008   
                                                                 
 simple_rnn_20 (SimpleRNN)   (None, 25, 64)            8256      
                                                                 
 simple_rnn_21 (SimpleRNN)   (None, 32)                3104      
                                                                 
 dense_66 (Dense)            (None, 16)                528       
                                                                 
 dense_67 (Dense)            (None, 8)                 136       
                                                                 
 dense_68 (Dense)            (None, 1)                 9         
                                                                 
Total params: 1,063,041
Trainable params: 1,063,041
N

In [456]:
optimizer= tf.keras.optimizers.Adam()

callbacks = EarlyStopping(monitor ="val_loss", 
                                        mode ="min", patience = 5, 
                                        restore_best_weights = True)

model.compile(optimizer=optimizer, 
              loss = tf.keras.losses.BinaryCrossentropy(), # bc binary case here
              metrics=[tf.keras.metrics.BinaryAccuracy()])

In [457]:
model.fit(train,
          epochs=25, 
          validation_data=valid,
          callbacks=callbacks)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25


<keras.callbacks.History at 0x7fea0d6e3df0>

In [458]:
model.save("models/model_RNN.h5") # model_simpleRNN_1 was done on batch size = 64


# GRU Model

In [459]:
model_gru = tf.keras.Sequential([
                  Embedding(vocab_size+1, 64, input_shape=[MAX_LENGTH,],name="embedding"),
                  GRU(units=64, return_sequences=True), # maintains the sequential nature
                  GRU(units=32, return_sequences=False), # returns the last output
                  Dense(16, activation='relu'),
                  Dense(8, activation='relu'),

                  Dense(1, activation="sigmoid")
])

In [460]:
model_gru.summary()

Model: "sequential_26"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 25, 64)            1051072   
                                                                 
 gru_10 (GRU)                (None, 25, 64)            24960     
                                                                 
 gru_11 (GRU)                (None, 32)                9408      
                                                                 
 dense_69 (Dense)            (None, 16)                528       
                                                                 
 dense_70 (Dense)            (None, 8)                 136       
                                                                 
 dense_71 (Dense)            (None, 1)                 9         
                                                                 
Total params: 1,086,113
Trainable params: 1,086,113
N

In [461]:
optimizer= tf.keras.optimizers.Adam()


model_gru.compile(optimizer=optimizer, 
              loss = tf.keras.losses.BinaryCrossentropy(), # bc binary case here
              metrics=[tf.keras.metrics.BinaryAccuracy()])


callbacks = EarlyStopping(monitor ="val_loss", 
                                        mode ="min", patience = 5, 
                                        restore_best_weights = True)

In [462]:
model_gru.fit(train,
              epochs=20, 
              validation_data=valid,
              callbacks=callbacks
              )

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20


<keras.callbacks.History at 0x7fea0c5062e0>

In [463]:
model_gru.save('models/model_GRU.h5')

## Long Short Term Memory Model

In [464]:
model_lstm = tf.keras.Sequential([
                  Embedding(vocab_size+1, 64, input_shape=[MAX_LENGTH],name="embedding"),
                  LSTM(units=64, return_sequences=True), # maintains the sequential nature
                  LSTM(units=32, return_sequences=False), # returns the last output
                  Dense(16, activation='relu'),
                  Dense(8, activation='relu'),
                  Dense(1, activation="sigmoid")
])

In [465]:
model_lstm.summary()

Model: "sequential_27"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 25, 64)            1051072   
                                                                 
 lstm_20 (LSTM)              (None, 25, 64)            33024     
                                                                 
 lstm_21 (LSTM)              (None, 32)                12416     
                                                                 
 dense_72 (Dense)            (None, 16)                528       
                                                                 
 dense_73 (Dense)            (None, 8)                 136       
                                                                 
 dense_74 (Dense)            (None, 1)                 9         
                                                                 
Total params: 1,097,185
Trainable params: 1,097,185
N

In [466]:
optimizer= tf.keras.optimizers.Adam()

model_lstm.compile(optimizer=optimizer, 
              loss = tf.keras.losses.BinaryCrossentropy(), # bc binary case here
              metrics=[tf.keras.metrics.BinaryAccuracy()])

callbacks = EarlyStopping(monitor ="val_loss", 
                                        mode ="min", patience = 10, 
                                        restore_best_weights = True)

In [467]:
model_lstm.fit(train,
                epochs = 30,
                validation_data = valid,
                callbacks=callbacks)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30


<keras.callbacks.History at 0x7fea07b0cc70>

In [468]:
model_lstm.save('models/model_lstm.h5')

# BERT Model

Source used: https://swatimeena989.medium.com/bert-text-classification-using-keras-903671e0207d 

In [234]:
from transformers import BertTokenizer, TFBertForSequenceClassification
# from transformers import TFBertModel, BertConfig # Think those two are not needed but to double check if all works this way
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased',num_labels=2)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
test_encode = df['text_clean'][1] # copy text from first few tweets and check how it tokenizes

tokenized_sequence = bert_tokenizer.encode_plus(test_encode,
                                                add_special_tokens = True,
                                                max_length =18, # Size of the longest tweet
                                                pad_to_max_length = True,
                                                truncation = True,
                                                return_attention_mask = True,
                                                )


print('Tokenized text:')
print(tokenized_sequence)

print('Decoded text:')

print(bert_tokenizer.decode(tokenized_sequence['input_ids']))

Tokenized text:
{'input_ids': [101, 3224, 2543, 2379, 2474, 6902, 3351, 21871, 2243, 2710, 102, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]}
Decoded text:
[CLS] forest fire near la ronge sask canada [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]


In [None]:
# BERT has its own tokenizer hence we need to redo it 
# Tokenize all the dataset

input_ids=[]
attention_masks=[]

for tweet in df['text_clean']:
    bert_inp=bert_tokenizer.encode_plus(tweet,
                                        add_special_tokens = True,
                                        max_length =MAX_LENGTH,
                                        pad_to_max_length = True,
                                        return_attention_mask = True
                                        )
    input_ids.append(bert_inp['input_ids'])
    attention_masks.append(bert_inp['attention_mask'])
    
input_ids=np.asarray(input_ids)
attention_masks=np.array(attention_masks)
labels=np.array(df['target'])




In [None]:
X_train, X_test, Y_train, Y_test, Mask_train, Mask_test = train_test_split(
                                                            input_ids,
                                                            labels,
                                                            attention_masks,
                                                            test_size=0.25,
                                                            random_state = 0, 
                                                            stratify = labels
                                                            )

X_test, X_valid, Y_test, Y_valid, Mask_test, Mask_valid = train_test_split(
                                                            X_test,
                                                            Y_test,
                                                            Mask_test,
                                                            test_size=0.6,
                                                            random_state = 0,
                                                            stratify = Y_test # 0.6 * 0.25 = 0.15
                                                            ) 

## Creating the model


In [None]:
log_dir='tensorboard_data/tb_bert'
model_save_path='./bert_model.h5'

### see if can change the below to other callbacks
#callbacks = [tf.keras.callbacks.ModelCheckpoint(filepath=model_save_path,
#save_weights_only=True,monitor='val_loss',mode='min',save_best_only=True),tf.keras.callbacks.TensorBoard(log_dir=log_dir)]

callbacks = EarlyStopping(monitor ="val_loss", 
                                        mode ="min", patience = 5, 
                                        restore_best_weights = True)

for layer in bert_model.layers[:-1]:
    layer.trainable = False  # Only change the last dense layer, we are just finetuning this model not training it from scratch

print('\nBert Model',bert_model.summary())

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
 dropout_37 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 109,483,778
Trainable params: 1,538
Non-trainable params: 109,482,240
_________________________________________________________________

Bert Model None


In [None]:
optimizer = tf.keras.optimizers.Adam()

bert_model.compile(optimizer=optimizer,
              loss=tf.keras.losses.SparseCategoricalCrossentropy(),
              metrics=[tf.keras.metrics.SparseCategoricalAccuracy()])

In [None]:
bert_model.fit([X_train,Mask_train],Y_train,batch_size=128,epochs=3,validation_data=([X_valid,Mask_valid],Y_valid),callbacks=callbacks)


Epoch 1/10

KeyboardInterrupt: 

# Model Comparison

In [477]:
# Load the different models
RNN_model = tf.keras.models.load_model('models/model_RNN.h5')
GRU_model = tf.keras.models.load_model('models/model_GRU.h5')
LSTM_model = tf.keras.models.load_model('models/model_lstm.h5')
bert_model = tf.keras.models.load_model('bert_model.h5')  ## change path after

## Setting up 
actual_values = Y_test 
target_names = ['No disaster','Disaster']
threshold = 0.5

# Get the predictions
pred_RNN  = RNN_model.predict(X_test)
pred_GRU  = GRU_model.predict(X_test)
pred_LSTM = LSTM_model.predict(X_test)

pred_RNN = np.where(pred_RNN > threshold, 1, 0)
pred_GRU = np.where(pred_GRU > threshold, 1, 0)
pred_LSTM = np.where(pred_LSTM > threshold, 1, 0)

# Compare F1 scores
print('RNN F1 score:')
print(f1_score(actual_values,pred_RNN))

print('GRU F1 score:')
print(f1_score(actual_values,pred_GRU))

print('LSTM F1 score:')
print(f1_score(actual_values,pred_LSTM))


RNN F1 score:
0.6741154562383612
GRU F1 score:
0.7440273037542663
LSTM F1 score:
0.7504132231404959


RNN F1 score:
0.6958105646630236
GRU F1 score:
0.7416107382550335
LSTM F1 score:
0.7204116638078902

In [400]:
bert_model = tf.keras.models.load_model("model.GRU_1.h5")
bert_model.layers


[<keras.layers.embeddings.Embedding at 0x7fea1d3dad30>,
 <keras.layers.recurrent.SimpleRNN at 0x7fea3549da60>,
 <keras.layers.recurrent.SimpleRNN at 0x7fea12f8fd00>,
 <keras.layers.core.dense.Dense at 0x7fea247cba60>,
 <keras.layers.core.dense.Dense at 0x7fea1d6150a0>,
 <keras.layers.core.dense.Dense at 0x7fea1d49da90>]

In [401]:
model_save_path='./bert_model.h5'

trained_model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased',num_labels=2)
trained_model.compile(loss=loss,optimizer=optimizer, metrics=[metric])
trained_model.load_weights(model_save_path)

preds = trained_model.predict([val_inp,val_mask],batch_size=32)
#pred_labels = preds.argmax(axis=1)

target_names = ['Disaster','No disaster']
pred_labels = preds['logits'].argmax(axis=1)
f1 = f1_score(val_label,pred_labels)
print('F1 score',f1)
print('Classification Report')
print(classification_report(val_label,pred_labels,target_names=target_names))
 

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


F1 score 0.7196029776674938
Classification Report
              precision    recall  f1-score   support

    Disaster       0.78      0.84      0.81       849
 No disaster       0.76      0.69      0.72       635

    accuracy                           0.77      1484
   macro avg       0.77      0.76      0.76      1484
weighted avg       0.77      0.77      0.77      1484



In [None]:
preds['logits'].shape

(1484, 2)

In [None]:
tf.keras.models.load_model("model.GRU_1.h5")