<a href="https://colab.research.google.com/github/juliuswiscmsba/NLP-Disaster-Tweets/blob/main/Disaster_Tweets_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.22.2-py3-none-any.whl (4.9 MB)
[K     |████████████████████████████████| 4.9 MB 38.9 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 55.5 MB/s 
Collecting huggingface-hub<1.0,>=0.9.0
  Downloading huggingface_hub-0.10.0-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 70.2 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.10.0 tokenizers-0.12.1 transformers-4.22.2


In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import keras

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model, Sequential
from keras.layers import GRU, Input, Dense, Activation, RepeatVector, Bidirectional, LSTM, Dropout, Embedding
from tensorflow.python.keras.layers.embeddings import Embedding 
from keras.losses import sparse_categorical_crossentropy
from keras.preprocessing import sequence
from keras.callbacks import EarlyStopping
from tensorflow.python.client import device_lib

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import confusion_matrix, classification_report, f1_score, accuracy_score

from transformers import BertTokenizer, TFBertModel, BertConfig, TFBertForSequenceClassification

In [2]:
from google.colab import files
train_clean = files.upload()
test_clean = files.upload()

Saving train_clean.csv to train_clean.csv


Saving test_clean.csv to test_clean.csv


In [4]:
train = pd.read_csv("train_clean.csv")
test = pd.read_csv("test_clean.csv")

In [19]:
#Load BERT model
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = TFBertForSequenceClassification.from_pretrained(
    'bert-base-uncased', 
    hidden_dropout_prob = 0.15,
    attention_probs_dropout_prob = 0.15,
    hidden_act = "relu",
    num_labels = 2
)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
#BERT tolenizer
input_ids=[]
attention_masks=[]

sentences=train['text']
labels=train['target']
for sent in sentences:
    bert_inp=bert_tokenizer.encode_plus(sent, add_special_tokens = True, max_length =64, padding='max_length', return_attention_mask = True)
    input_ids.append(bert_inp['input_ids'])
    attention_masks.append(bert_inp['attention_mask'])

input_ids=np.asarray(input_ids)
attention_masks=np.array(attention_masks)
labels=np.array(labels)

In [21]:
#Split the train and test data
train_inp,val_inp,train_label,val_label,train_mask,val_mask=train_test_split(input_ids, labels, attention_masks, test_size=0.2)

In [22]:
#BERT model
model_save_path='bert_model.h5'

callbacks = [tf.keras.callbacks.ModelCheckpoint(filepath=model_save_path, save_weights_only=True, monitor='val_sparse_categorical_accuracy', mode='max', patience=3, save_best_only=True)]

optimizer = tf.keras.optimizers.Adam(
    learning_rate = 1e-5, 
    epsilon = 1e-07,
    clipvalue = 10
)
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)

bert_model.compile(loss=loss,optimizer=optimizer,metrics=tf.keras.metrics.SparseCategoricalAccuracy())
bert_model.trainable = True
bert_model.summary()

Model: "tf_bert_for_sequence_classification_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
 dropout_149 (Dropout)       multiple                  0         
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 109,483,778
Trainable params: 109,483,778
Non-trainable params: 0
_________________________________________________________________


In [23]:
history=bert_model.fit([train_inp,train_mask],train_label,batch_size=32,epochs=7,validation_data=([val_inp,val_mask],val_label), callbacks=callbacks)

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


In [26]:
#Validation prediction
preds = bert_model.predict([val_inp,val_mask],batch_size=32)
pred_labels = np.argmax(preds.logits, axis=1)
f_score = f1_score(val_label,pred_labels)
a_score = accuracy_score(val_label,pred_labels)
print('F1 Score: ', f_score)
print('Accuracy Score: ', a_score)
print('Classification Report:')
print(classification_report(val_label,pred_labels))

print('Training and saving built model.....')   

F1 Score:  0.7864150943396226
Accuracy Score:  0.814182534471438
Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.83      0.84       864
           1       0.78      0.79      0.79       659

    accuracy                           0.81      1523
   macro avg       0.81      0.81      0.81      1523
weighted avg       0.81      0.81      0.81      1523

Training and saving built model.....


In [27]:
#Tokenize test data
input_ids_test=[]
attention_masks_test=[]

sentences=test['text']
for sent in sentences:
    bert_inp_test=bert_tokenizer.encode_plus(sent, add_special_tokens = True,max_length =64, padding='max_length', return_attention_mask = True)
    input_ids_test.append(bert_inp_test['input_ids'])
    attention_masks_test.append(bert_inp_test['attention_mask'])

input_ids_test=np.asarray(input_ids_test)
attention_masks_test=np.array(attention_masks_test)

In [28]:
preds_test = bert_model.predict([input_ids_test,attention_masks_test],batch_size=32)
test['target']  = np.argmax(preds_test.logits, axis=1)
sub_bert = test[['id','target']].copy()
sub_bert.to_csv('submission_bert.csv', index = False)
files.download('submission_bert.csv')
#Result: 0.82899

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>