Dataset used: [Coronavirus tweets NLP](https://www.kaggle.com/datatattle/covid-19-nlp-text-classification)



In [None]:
!pip install bert-for-tf2

Collecting bert-for-tf2
[?25l  Downloading https://files.pythonhosted.org/packages/af/c1/015648a2186b25c6de79d15bec40d3d946fcf1dd5067d1c1b28009506486/bert-for-tf2-0.14.6.tar.gz (40kB)
[K     |████████                        | 10kB 24.8MB/s eta 0:00:01[K     |████████████████                | 20kB 2.8MB/s eta 0:00:01[K     |████████████████████████▏       | 30kB 3.7MB/s eta 0:00:01[K     |████████████████████████████████| 40kB 2.7MB/s 
[?25hCollecting py-params>=0.9.6
  Downloading https://files.pythonhosted.org/packages/a4/bf/c1c70d5315a8677310ea10a41cfc41c5970d9b37c31f9c90d4ab98021fd1/py-params-0.9.7.tar.gz
Collecting params-flow>=0.8.0
  Downloading https://files.pythonhosted.org/packages/a9/95/ff49f5ebd501f142a6f0aaf42bcfd1c192dc54909d1d9eb84ab031d46056/params-flow-0.8.2.tar.gz
Building wheels for collected packages: bert-for-tf2, py-params, params-flow
  Building wheel for bert-for-tf2 (setup.py) ... [?25l[?25hdone
  Created wheel for bert-for-tf2: filename=bert_for_tf2

In [None]:
import pandas as pd
import numpy as np
import bert
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard
from tqdm import tqdm
import matplotlib.pyplot as plt
import re

In [None]:
# Path of the training CSV (despite the name, `train_dir` points at a file).
train_dir = '/content/Corona_NLP_train.csv'
#test_dir = 'C:/Users/Dejan/Downloads/Corona_NLP_test.csv'

# The file is read as latin-1 encoded text.
train_data = pd.read_csv(
    train_dir,
    encoding='latin-1',
)
#test_data = pd.read_csv(test_dir, encoding='latin-1')

In [None]:
def clean_text(text):
    '''Normalise a raw tweet into lowercase, letters-only text.

    Steps, in order: lowercase; drop text in square brackets; drop links
    (http(s)://... and www....); drop <...> tags; turn newlines into spaces;
    drop every word that contains a digit; drop @mentions; finally replace
    every remaining non-letter character (including '#') with a space.

    Runs of spaces produced by the last step are NOT collapsed and the result
    is not stripped.

    Args:
        text: the raw tweet as a str.

    Returns:
        The cleaned str.
    '''
    text = text.lower()
    # Raw strings: '\[', '\S', '\w', '\d' are invalid escapes in normal string literals.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    # Replace (not delete) newlines so the words on either side are not glued together.
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\w*\d\w*', '', text)
    # Drop @mentions; split()/join also normalises whitespace at this point.
    text = " ".join(token for token in text.split() if not token.startswith("@"))
    text = re.sub(r'([#])|([^a-zA-Z])', ' ', text)
    return text

# Clean the tweet text of the training frame.
# `test_data` is never created in this notebook (its read_csv call is commented out),
# so cleaning it here raised a NameError; re-enable the line below together with the
# test CSV load if a separate test file is used.
train_data['OriginalTweet'] = train_data['OriginalTweet'].apply(clean_text)
#test_data['OriginalTweet'] = test_data['OriginalTweet'].apply(clean_text)

NameError: ignored

In [None]:
# Character-length statistics of the tweets (lengths computed once and reused).
tweet_lengths = train_data['OriginalTweet'].map(len)
print("Max tweet length:", tweet_lengths.max())
print("Min tweet length:", tweet_lengths.min())
print("Average tweet length:", tweet_lengths.mean())

# Character vocabulary. Joining all tweets first is required: set(series) would
# collect distinct *tweets*, not distinct characters.
chars = sorted(set(''.join(train_data['OriginalTweet'])))
print('Total characters:', len(chars))
# Bidirectional lookup tables between a character and its index in `chars`.
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = {i: c for i, c in enumerate(chars)}

In [None]:
# Functions for constructing BERT Embeddings: input_ids, input_masks, input_segments and Inputs
# MAX_SEQ_LEN is the fixed length every ids/masks/segments list is zero-padded to, and the
# width of the three Keras Input layers built in nlp_model. It counts the [CLS] and [SEP]
# tokens, so at most MAX_SEQ_LEN - 2 word pieces of a sentence are kept.
MAX_SEQ_LEN = 500  # max sequence length


def get_masks(tokens, max_len=None):
    """Masks: 1 for real tokens and 0 for paddings

    Args:
        tokens: sequence of tokens (only its length is used).
        max_len: total length to pad to; defaults to the module-level MAX_SEQ_LEN.

    Returns:
        A list of len(tokens) ones followed by zeros up to max_len. If tokens is
        already longer than max_len no padding is added (and nothing is truncated).
    """
    if max_len is None:
        # Resolved at call time so the module constant can still be changed after definition.
        max_len = MAX_SEQ_LEN
    n_real = len(tokens)
    return [1] * n_real + [0] * (max_len - n_real)


def get_segments(tokens, max_len=None):
    """Segments: 0 for the first sequence, 1 for the second

    Args:
        tokens: sequence of token strings, with '[SEP]' marking the end of the first sequence.
        max_len: total length to pad to; defaults to the module-level MAX_SEQ_LEN.

    Returns:
        A list with one segment id per token, zero-padded up to max_len. The first
        '[SEP]' itself still belongs to segment 0; every token after it gets 1.
    """
    if max_len is None:
        # Resolved at call time so the module constant can still be changed after definition.
        max_len = MAX_SEQ_LEN
    segments = []
    current_segment_id = 0
    for token in tokens:
        segments.append(current_segment_id)
        # Switch only after recording the separator, so '[SEP]' keeps the id of its own segment.
        if token == '[SEP]':
            current_segment_id = 1
    return segments + [0] * (max_len - len(tokens))


def get_ids(tokens, tokenizer, max_len=None):
    """Token ids from Tokenizer vocab

    Args:
        tokens: sequence of token strings.
        tokenizer: object exposing convert_tokens_to_ids(tokens) -> list of ids.
        max_len: total length to pad to; defaults to the module-level MAX_SEQ_LEN.

    Returns:
        The ids returned by the tokenizer, right-padded with 0 up to max_len.
    """
    if max_len is None:
        # Resolved at call time so the module constant can still be changed after definition.
        max_len = MAX_SEQ_LEN
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    return token_ids + [0] * (max_len - len(token_ids))


def create_single_input(sentence, tokenizer, max_len):
    """Turn one sentence into its (ids, masks, segments) triple.

    The sentence is tokenized, cut to at most max_len tokens, and wrapped in
    "[CLS]" ... "[SEP]" before the three feature lists are built.
    """
    word_pieces = tokenizer.tokenize(sentence)[:max_len]
    framed = ["[CLS]", *word_pieces, "[SEP]"]
    return (
        get_ids(framed, tokenizer),
        get_masks(framed),
        get_segments(framed),
    )


def convert_sentences_to_features(sentences, tokenizer):
    """Build the three int32 feature arrays (ids, masks, segments) for a batch of sentences.

    Returns a list of three numpy arrays, one row per sentence.
    """
    all_ids, all_masks, all_segments = [], [], []

    for text in tqdm(sentences, position=0, leave=True):
        # Two positions are reserved for the [CLS] and [SEP] tokens added per sentence.
        ids, masks, segments = create_single_input(text, tokenizer, MAX_SEQ_LEN - 2)
        for feature in (ids, masks, segments):
            assert len(feature) == MAX_SEQ_LEN
        all_ids.append(ids)
        all_masks.append(masks)
        all_segments.append(segments)

    return [
        np.asarray(rows, dtype=np.int32)
        for rows in (all_ids, all_masks, all_segments)
    ]


def create_tonkenizer(bert_layer):
    """Build a FullTokenizer from the vocab file and casing flag stored on the given hub layer."""
    resolved = bert_layer.resolved_object
    vocab_path = resolved.vocab_file.asset_path.numpy()
    lower_case = resolved.do_lower_case.numpy()
    return bert.bert_tokenization.FullTokenizer(vocab_path, lower_case)

def nlp_model(callable_object):
    """Build a 5-way softmax classifier on top of a trainable BERT hub layer.

    callable_object is passed as the `handle` of hub.KerasLayer. The returned
    (uncompiled) Keras Model takes three int32 inputs of width MAX_SEQ_LEN:
    input_ids, input_masks and segment_ids.
    """
    # Pre-trained BERT layer, fine-tuned together with the head (trainable=True)
    encoder = hub.KerasLayer(handle=callable_object, trainable=True)

    def _int_input(name):
        # One fixed-width int32 input tensor per BERT feature
        return Input(shape=(MAX_SEQ_LEN, ), dtype=tf.int32, name=name)

    inputs = [
        _int_input("input_ids"),
        _int_input("input_masks"),
        _int_input('segment_ids'),
    ]

    # Only the first (pooled) output feeds the head; the per-token output is unused
    pooled_output, _ = encoder(inputs)

    hidden = Dense(768, activation='relu')(pooled_output)
    hidden = Dropout(0.1)(hidden)
    outputs = Dense(5, activation='softmax')(hidden)

    return Model(inputs=inputs, outputs=outputs)

# Instantiate the classifier from the TF Hub BERT handle and print its layer summary.
model = nlp_model(
    "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1"
)
model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 500)]        0                                            
__________________________________________________________________________________________________
input_masks (InputLayer)        [(None, 500)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 500)]        0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        [(None, 768), (None, 109482241   input_ids[0][0]                  
                                                                 input_masks[0][0]     

In [None]:
# Create examples for training and testing
# Shuffle the dataset; the seed makes the train/held-out split reproducible across runs.
train_data = train_data.sample(frac=1, random_state=42)
# layers[0..2] are the three Input layers, so layers[3] is the hub BERT layer.
tokenizer = create_tonkenizer(model.layers[3])

# The first n_train shuffled rows are used for training, the remainder is held out.
n_train = 37000
X_train = convert_sentences_to_features(train_data['OriginalTweet'][:n_train], tokenizer)
X_test = convert_sentences_to_features(train_data['OriginalTweet'][n_train:], tokenizer)

# Encode the five sentiment labels as 0..4. The result is assigned back to the column:
# calling .replace(..., inplace=True) on train_data['Sentiment'] acts on a temporary
# object and does not update the frame under pandas Copy-on-Write.
sentiment_codes = {
    'Extremely Negative': 0.,
    'Negative': 1.,
    'Neutral': 2.,
    'Positive': 3.,
    'Extremely Positive': 4.,
}
train_data['Sentiment'] = train_data['Sentiment'].replace(sentiment_codes)

100%|██████████| 37000/37000 [00:18<00:00, 2043.59it/s]
100%|██████████| 4157/4157 [00:01<00:00, 2102.54it/s]


In [None]:
# One-hot encode the labels. num_classes is pinned to 5 to match the model's Dense(5)
# output layer; otherwise to_categorical infers the width from the largest label present.
one_hot_encoded = to_categorical(train_data['Sentiment'].values, num_classes=5)
# Same 37000-row split point as the feature arrays.
y_train = one_hot_encoded[:37000]
y_test = one_hot_encoded[37000:]

batch_size = 4
opt = Adam(learning_rate=2e-5)
model.compile(optimizer=opt,
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Fine-tune for one epoch, evaluating on the held-out slice after the epoch.
history = model.fit(X_train, y_train,
                    validation_data=(X_test, y_test),
                    epochs=1,
                    batch_size=batch_size)

