<a href="https://colab.research.google.com/github/hkolgur/UOH/blob/main/BERT_tweet_Sentiment_Bert_As_Embedding_only.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [1]:
import numpy as np
import pandas as pd
import math
import re 
from bs4 import BeautifulSoup
import random
from google.colab import drive


### Imports related to BERT

In [2]:
!pip install bert-for-tf2  #tensorflow2 
!pip install sentencepiece #bert-for-tf2 need for decode

Collecting bert-for-tf2
[?25l  Downloading https://files.pythonhosted.org/packages/a5/a1/acb891630749c56901e770a34d6bac8a509a367dd74a05daf7306952e910/bert-for-tf2-0.14.9.tar.gz (41kB)
[K     |████████                        | 10kB 15.9MB/s eta 0:00:01[K     |████████████████                | 20kB 19.9MB/s eta 0:00:01[K     |███████████████████████▉        | 30kB 23.9MB/s eta 0:00:01[K     |███████████████████████████████▉| 40kB 21.1MB/s eta 0:00:01[K     |████████████████████████████████| 51kB 5.8MB/s 
[?25hCollecting py-params>=0.9.6
  Downloading https://files.pythonhosted.org/packages/aa/e0/4f663d8abf83c8084b75b995bd2ab3a9512ebc5b97206fde38cef906ab07/py-params-0.10.2.tar.gz
Collecting params-flow>=0.8.0
  Downloading https://files.pythonhosted.org/packages/a9/95/ff49f5ebd501f142a6f0aaf42bcfd1c192dc54909d1d9eb84ab031d46056/params-flow-0.8.2.tar.gz
Building wheels for collected packages: bert-for-tf2, py-params, params-flow
  Building wheel for bert-for-tf2 (setup.py) ... 

In [3]:
import tensorflow as tf #tensor flow version is 2.x + 
import tensorflow_hub as hub
from tensorflow.keras import layers
import bert

# Data pre-processing

In [4]:
drive.mount("/content/drive")

Mounted at /content/drive


In [5]:
cols=['sentiment','id','date','query','user','text']


df=pd.read_csv("/content/drive/MyDrive/UOH/stanford-twitter/t55.csv",
               names=cols,
               header=None,
               engine="python",
              # sep='delimiter',
               encoding="latin1")


#Keep label and text . Drop other columns
df.drop(['id','date','query','user'],axis=1,inplace=True)

In [6]:
df['sentiment'].value_counts()

0    4893
4    3623
Name: sentiment, dtype: int64

# Cleaning

In [20]:
df['text'][1]

"is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!"

In [8]:
def clean_tweet(tweet):
  tweet=BeautifulSoup(tweet,"lxml").get_text() #get english format text form the input that is in lxml format 
  tweet=re.sub(r'@[A-za-z0-9]+','',tweet) # Remove @name
  tweet=re.sub(r'https?://[A-Za-z0-9./]+',' ',tweet) #remove http or https links 
  tweet=re.sub(r'[^a-zA-Z0-9.!?\']',' ',tweet) # Only keep alpha,num,punctuations 
  tweet=re.sub(r' +',' ',tweet)  #replace more than one space with single space
  return tweet


In [9]:
#Sample test to check how clean_tweet is working 
print("Before Cleaning:\n",df['text'][0])
t=clean_tweet(df['text'][0])
print("After Cleaning:\n",t)

Before Cleaning:
 @switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D
After Cleaning:
  Awww that's a bummer. You shoulda got David Carr of Third Day to do it. D


In [10]:
#clean text for the whole data frame 
data_clean=[clean_tweet(x)for x in df['text']]

In [11]:
#separate the label colum form data frame
data_label=df.sentiment


In [12]:
#change the data label 4 to 1 to mark as positve or 0 for negative review
data_label[data_label==4]=1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [71]:
data_label.value_counts()

0    4893
1    3623
Name: sentiment, dtype: int64

# Tokenization

##### Use tokenization tool from BERT  


In [13]:
FullTokenizer=bert.bert_tokenization.FullTokenizer


#Create bert layer because there is information about tokenizer in it .
#pre-trained models are stored in tensor flow hub. From there we try to get the weights
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/1",
                            trainable=False) # Because we just want to use it for tokenization


In [14]:
#Get vocab file for the tokenizer 
vocab_file=bert_layer.resolved_object.vocab_file.asset_path.numpy()  
#lowercasing the text or not 
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()

tokenizer=FullTokenizer(vocab_file,do_lower_case)


In [15]:
#sample tokenizer of a sentence 
print(tokenizer.tokenize("I love cherries"))
# Get the token ids for each of the token 
print(tokenizer.convert_tokens_to_ids(tokenizer.tokenize("I love cherries")))
#convert id  16138 to token
print(tokenizer.convert_ids_to_tokens([16138]))

['I', 'love', 'che', '##rries']
[146, 16138, 10262, 107788]
['love']


In [16]:
# #Apply tokenizer to each sentence of data_clean
# def encode_sentence(sentence):
#   return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentence))

In [17]:
#New encode sentence format to suit it as input to BERT layer
def encode_sentence(sentence):
    return ["[CLS]"] + tokenizer.tokenize(sentence) + ["[SEP]"]  

In [18]:
data_inputs=[encode_sentence(sent) for sent in data_clean]

In [19]:
# Here is how sample data_input looks like with tokens 
data_inputs[1]

['[CLS]',
 'is',
 'upset',
 'that',
 'he',
 'can',
 "'",
 't',
 'update',
 'his',
 'Facebook',
 'by',
 'text',
 '##ing',
 'it',
 '.',
 '.',
 '.',
 'and',
 'might',
 'c',
 '##ry',
 'as',
 'a',
 'result',
 'School',
 'today',
 'also',
 '.',
 'B',
 '##lah',
 '!',
 '[SEP]']

# Dataset Creation

### we need to create 3 differnt inputs for each sentence (tokenized sentence with cls and sep added , list of mask (where aer padding values) , segment input ( seq of 1 /0 - 0 indicate we are in first sentence and 1 indiciate we are in second sentence.


In [25]:
# GET WORD VECTOR FROM A LIST OF TOKENS
#sample input: ['[CLS]', 'That', '##s', 'messe', '##d', 'up', '[SEP]']
def get_ids(tokens):
  return tokenizer.convert_tokens_to_ids(tokens)

# Check if  TOKENS HAVE [PAD] PADDING OR NOT
# NOTE: In this case it is not important but we will use it to maintain general norm
#sample input: ['[CLS]', 'That', '##s', 'messe', '##d', 'up', '[SEP]']
def get_mask(tokens):
  return np.char.not_equal(tokens, "[PAD]").astype(int)

#Get ID of Segments 
#sample input: ['[CLS]', 'That', '##s', 'messe', '##d', 'up', '[SEP]']
def get_segments(tokens):
    seg_ids = []
    current_seg_id = 0
    for tok in tokens:
        seg_ids.append(current_seg_id)
        if tok == "[SEP]":
            current_seg_id = 1-current_seg_id # turns 1 into 0 and vice versa
    return seg_ids
  

### We will create padded batches (so we pad sentences for each batch independently), this way we add the minimum of padding tokens possible. For that, we sort sentences by length, apply padded_batches and then shuffle.

In [26]:
#we need to pad the sentences to be of same length while training. To do this we can use below techinique
#we can train in batches and sentences in each batch has to be of same length (apply padding).But all the sentences in 
#all the batches need not be of same lenght. 

#We can split the sentences from input based on their lenghts so that we need not apply much padding when we group them into 
#batches.
data_with_len=[[sent_token,data_label[i],len(sent_token)] for i,sent_token in enumerate(data_inputs)]

#now in our original data we had all labels of 0 together and 1 at end. so lets shuffle data to get a mix
random.shuffle(data_with_len)

#Sort data based on len(sent_token)
data_with_len.sort(key=lambda x:x[2]) # x will have sent_token,data_label[i],len(sent_token) 
# Data with len is list of lists like:
#[[['[CLS]', 'That', '##s', 'messe', '##d', 'up', '[SEP]'], 0, 7],
# [['[CLS]', 'Thank', '##s', 'for', 'that', '.', '[SEP]'], 1, 7]]
# Above each list item has the first element as a list with cls token sep , second item as label third item as length




In [27]:
#Next drop the len(sent_token) and also keep only the sentence tokens that have a lenght of >7 . This is to make sure we have
#longer senternces to convery meaning. If sentence length was <7 then it may not convey much meaning. 
#7 is arbitary choose your own lenght you think is most precise

sorted_all=[(
    [get_ids(x[0]),
    get_mask(x[0]),
    get_segments(x[0])],
    x[1]) for x in data_with_len if x[2]>7] #storing as a tuple

#[(ids[],maks[],segmentids[])],[labels]


In [28]:
#Usually we use the  from tensor methods/tensor slices etc to create a dataset .Here we cannot use that because all 
#sentences are of not same lenght.
#So we need to call the from_generator . Creates a Dataset whose elements are generated by generator.
#List is a generator so we can use above list  to create a dataset form generator 
 
all_dataset=tf.data.Dataset.from_generator(lambda : sorted_all,output_types=(tf.int32, tf.int32))


In [29]:
#check element of all_dataset
next(iter(all_dataset))

(<tf.Tensor: shape=(3, 8), dtype=int32, numpy=
 array([[  101, 11084, 10174, 13028, 10142, 10105, 62975,   102],
        [    1,     1,     1,     1,     1,     1,     1,     1],
        [    0,     0,     0,     0,     0,     0,     0,     0]],
       dtype=int32)>, <tf.Tensor: shape=(), dtype=int32, numpy=1>)

In [53]:
BATCH_SIZE=32
#Padded_batch takes size of the batch and the padded_shapes
#Dimensions  used for padding are indicated with None,dim corresponding to real value if inputs
# (first ele of tuple corresponds to the input) 
#labels will be batched according to the batch size of input if we leave it as blank () to indicate zero dim tensor
all_batched = all_dataset.padded_batch(BATCH_SIZE, padded_shapes=((3,None, ), ()),padding_values=(0, 0))


### Create a testing set

In [54]:
# divide size of data /batch_size to get number of batches
NB_BATCHES = math.ceil(len(sorted_all) / BATCH_SIZE)

# Form the number of batches take 10% as Test batches
NB_BATCHES_TEST = NB_BATCHES // 10 

#Shuffle all_batced data because we have shortest sentences at the beginning and longest at the end 
#If we dont shuffle ,all small sizes batches data will be alloted to test longer sentences end up in trainng 
#shuffle takes buffer size as input . IF data set is not too big give buffer size = batch_size

all_batched.shuffle(NB_BATCHES) 

#take method which allows to take first n number 
# take 10% of batches to test 
test_dataset = all_batched.take(NB_BATCHES_TEST) 
# take 90% of data to train data set
#skip skips n number
train_dataset = all_batched.skip(NB_BATCHES_TEST) 

In [55]:
print("Total Batches:",NB_BATCHES)
print("Total Test Batches:",NB_BATCHES_TEST)
print("Total Train Batches:",NB_BATCHES-NB_BATCHES_TEST)


Total Batches: 247
Total Test Batches: 24
Total Train Batches: 223


# Model Building 

In [56]:
# Test code to check what we  get when we call a  Bert Layer
my_sent=["[CLS]"] + tokenizer.tokenize("This is good to see") + ["[SEP]"] # this is the format of sentence A

#create 3 differnt types of tokens(tensors and simulate like a batch) of inputs then call bert layer

# first arg is to simulate a batch with input as tensor,cast it to int and  simulate batch along first dim 0
bert_layer([tf.expand_dims(tf.cast(get_ids(my_sent),tf.int32),0), 
            tf.expand_dims(tf.cast(get_mask(my_sent),tf.int32),0),
            tf.expand_dims(tf.cast(get_segments(my_sent),tf.int32),0)])
            
#output:
#The input is made of 2 elemetns the first one is a tensor of shape 1,768 - 1- for simulated batch 768 for hidden dims
#second element is tensor of shape (1,7,768) -1-for simulated batch ,7-for tokens in inputs(cls,sent,sep) and 768 for hidden dim 

#so each time we have these 2 outputs if cls task use firt output , else to go to  token level of specification ,
# which we want to do now in this task to use bert as embedder. for each word/token we get a vector 


[<tf.Tensor: shape=(1, 768), dtype=float32, numpy=
 array([[ 0.32685918, -0.0878264 ,  0.38913628, -0.22595926, -0.14458975,
          0.545284  ,  0.30522987,  0.24586181, -0.50160766,  0.36522722,
          0.00836129, -0.2676508 , -0.22702737, -0.1438212 ,  0.14432485,
         -0.13008605,  0.7506511 ,  0.12150548,  0.19158159, -0.3550185 ,
         -0.99991184, -0.15698163, -0.4492925 , -0.19456938, -0.35273194,
          0.13880508, -0.22038977,  0.26147228,  0.2627607 , -0.16515788,
          0.22825648, -0.99991405,  0.6173775 ,  0.762883  ,  0.29918417,
         -0.24160737,  0.30260637,  0.26335764,  0.33180252, -0.29963297,
         -0.05847782,  0.05625503, -0.10038682,  0.04993977, -0.0599305 ,
         -0.35278335, -0.17938833,  0.23497139, -0.39601618,  0.12681796,
         -0.00395847,  0.1588027 ,  0.63745534,  0.23118608,  0.316901  ,
          0.20020404,  0.17543592,  0.25016633,  0.34923348, -0.25404814,
         -0.0087584 ,  0.3622237 ,  0.18640594, -0.1710084 , 

##### Bert Layer as Embedding Model: 
1. Have 3 differnt cnn filter of size 2,3,4
2. Take max concatnate all
3. Use BERT Embedding as  Layer after the last dense layer 


In [57]:
#Lets create a class for our model and it inherits from tf.keras.Model
class DCNNBertEmbedding(tf.keras.Model):
    
    def __init__(self,
               #  vocab_size,  not needed
               #  emb_dim=128, not needed
                 nb_filters=50,
                 FFN_units=512,
                 nb_classes=2,
                 dropout_rate=0.1,
               #  training=False, not needed
                 name="dcnnembed"):
        super(DCNNBertEmbedding, self).__init__(name=name)

#insted of creating tf.keras.layers.embedding we call keras layer form hub for the url - to use bert layer (base version)
#trainable=False becasue we dont want to fine tune bert and use bert layer in frozen way .
        self.bert_layer = hub.KerasLayer(
            "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
            trainable=False)
        
        # self.embedding = layers.Embedding(vocab_size,    not needed
        #                                   emb_dim)       not needed
        self.bigram = layers.Conv1D(filters=nb_filters,
                                    kernel_size=2,
                                    padding="valid",
                                    activation="relu")
        self.trigram = layers.Conv1D(filters=nb_filters,
                                     kernel_size=3,
                                     padding="valid",
                                     activation="relu")
        self.fourgram = layers.Conv1D(filters=nb_filters,
                                      kernel_size=4,
                                      padding="valid",
                                      activation="relu")
        self.pool = layers.GlobalMaxPool1D()
        
        self.dense_1 = layers.Dense(units=FFN_units, activation="relu")
        self.dropout = layers.Dropout(rate=dropout_rate)
        if nb_classes == 2:
            self.last_dense = layers.Dense(units=1,
                                           activation="sigmoid")
        else:
            self.last_dense = layers.Dense(units=nb_classes,
                                           activation="softmax")
#funciton to call embedder using bert . Input contain 3 different types of tokens. 
#we need to access them  using all batches 
#(all_tokens[:] ,
#alltokens[]:, 0)- ids)
#alltokens[]:, 1)- mask)
#alltokens[]:, 2)- segments)
#alltokens[:,x, :)- all values everything else)
#Refer to cell below modeling example we ran .------->
#Return: -, -one vector used to represent whole sentence ( used for classification tasks)
#      : embs - representation of words/tokens individually - which we need only second part 
    def embed_with_bert(self, all_tokens):
      _, embs = self.bert_layer([all_tokens[:, 0, :],
                                all_tokens[:, 1, :],
                                all_tokens[:, 2, :]])
      return embs
    
    def call(self, inputs, training):
        x = self.embed_with_bert(inputs)
        print(x.shape)
        x_1 = self.bigram(x) # batch_size, nb_filters, seq_len-1)
        x_1 = self.pool(x_1) # (batch_size, nb_filters)
        x_2 = self.trigram(x) # batch_size, nb_filters, seq_len-2)
        x_2 = self.pool(x_2) # (batch_size, nb_filters)
        x_3 = self.fourgram(x) # batch_size, nb_filters, seq_len-3)
        x_3 = self.pool(x_3) # (batch_size, nb_filters)
        
        merged = tf.concat([x_1, x_2, x_3], axis=-1) # (batch_size, 3 * nb_filters)
        merged = self.dense_1(merged)
        merged = self.dropout(merged, training)
        output = self.last_dense(merged)
        
        return output

# Training 

In [58]:
VOCAB_SIZE = len(tokenizer.vocab)
EMB_DIM = 200
NB_FILTERS = 100
FFN_UNITS = 256
NB_CLASSES = 2

DROPOUT_RATE = 0.2

NB_EPOCHS = 5

In [59]:
# Dcnn = DCNN(vocab_size=VOCAB_SIZE,
#             emb_dim=EMB_DIM,
#             nb_filters=NB_FILTERS,
#             FFN_units=FFN_UNITS,
#             nb_classes=NB_CLASSES,
#             dropout_rate=DROPOUT_RATE)
Dcnn = DCNNBertEmbedding(nb_filters=NB_FILTERS,
                         FFN_units=FFN_UNITS,
                         nb_classes=NB_CLASSES,
                         dropout_rate=DROPOUT_RATE)

In [60]:
if NB_CLASSES == 2:
    Dcnn.compile(loss="binary_crossentropy",
                 optimizer="adam",
                 metrics=["accuracy"])
else:
    Dcnn.compile(loss="sparse_categorical_crossentropy",
                 optimizer="adam",
                 metrics=["sparse_categorical_accuracy"])

In [61]:
checkpoint_path = "./drive/MyDrive/UOH/stanford-twitter/"

ckpt = tf.train.Checkpoint(Dcnn=Dcnn)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=1)

if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print("Latest checkpoint restored!!")

In [62]:
class MyCustomCallback(tf.keras.callbacks.Callback):

    def on_epoch_end(self, epoch, logs=None):
        ckpt_manager.save()
        print("Checkpoint saved at {}.".format(checkpoint_path))

In [63]:
Dcnn.fit(train_dataset,epochs=NB_EPOCHS,callbacks=[MyCustomCallback()])

Epoch 1/5
(None, None, 768)
(None, None, 768)
Checkpoint saved at ./drive/MyDrive/UOH/stanford-twitter/.
Epoch 2/5
Checkpoint saved at ./drive/MyDrive/UOH/stanford-twitter/.
Epoch 3/5
Checkpoint saved at ./drive/MyDrive/UOH/stanford-twitter/.
Epoch 4/5
Checkpoint saved at ./drive/MyDrive/UOH/stanford-twitter/.
Epoch 5/5
Checkpoint saved at ./drive/MyDrive/UOH/stanford-twitter/.


<tensorflow.python.keras.callbacks.History at 0x7f1289fbf390>

# Evaluation

In [64]:
results = Dcnn.evaluate(test_dataset)
print(results)

(None, None, 768)
[0.853786289691925, 0.6276041865348816]


In [65]:
def get_prediction(sentence):
    tokens = encode_sentence(sentence)

    input_ids = get_ids(tokens)
    input_mask = get_mask(tokens)
    segment_ids = get_segments(tokens)

    inputs = tf.stack(
        [tf.cast(input_ids, dtype=tf.int32),
         tf.cast(input_mask, dtype=tf.int32),
         tf.cast(segment_ids, dtype=tf.int32)],
         axis=0)
    
    inputs = tf.expand_dims(inputs, 0) # simulates a batch

    output = Dcnn(inputs, training=False)

    sentiment = math.floor(output*2)

    if sentiment == 0:
        print("Ouput of the model: {}\nPredicted sentiment: negative.".format(output))
    elif sentiment == 1:
        print("Ouput of the model: {}\nPredicted sentiment: positive.".format(output))

In [66]:
get_prediction("Yes it took a long time but I did it")
get_prediction("Finally I was able to do it myself")
get_prediction("Yes I was able to do it with little help")
get_prediction("Do you really think that I did this mistake")

(1, 12, 768)
Ouput of the model: [[0.05795545]]
Predicted sentiment: negative.
(1, 11, 768)
Ouput of the model: [[0.03170111]]
Predicted sentiment: negative.
(1, 12, 768)
Ouput of the model: [[0.02811372]]
Predicted sentiment: negative.
(1, 12, 768)
Ouput of the model: [[0.03635205]]
Predicted sentiment: negative.


In [69]:
get_prediction("I'm sad to see these results")

(1, 10, 768)
Ouput of the model: [[0.06537941]]
Predicted sentiment: negative.


In [70]:
get_prediction("This actor is a deception.")





(1, 9, 768)
Ouput of the model: [[0.56446403]]
Predicted sentiment: positive.


In [72]:
get_prediction("so disappointed to see the model fails")

(1, 12, 768)
Ouput of the model: [[0.39834484]]
Predicted sentiment: negative.


In [73]:
get_prediction("so Happy to see the model works as expected")

(1, 11, 768)
Ouput of the model: [[0.6160313]]
Predicted sentiment: positive.
