<a href="https://colab.research.google.com/github/hkolgur/UOH/blob/main/CNN_tweet_Sentiment_Bert_for_Tokenization_only.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [64]:
import numpy as np
import pandas as pd
import math
import re 
from bs4 import BeautifulSoup
import random
from google.colab import drive


### Imports related to BERT

In [65]:
!pip install bert-for-tf2  #tensorflow2 
!pip install sentencepiece #bert-for-tf2 need for decode



In [66]:
import tensorflow as tf #tensor flow version is 2.x + 
import tensorflow_hub as hub
from tensorflow.keras import layers
import bert

# Data pre-processing

In [67]:
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [68]:
cols=['sentiment','id','date','query','user','text']


df=pd.read_csv("/content/drive/MyDrive/UOH/stanford-twitter/t55.csv",
               names=cols,
               header=None,
               engine="python",
              # sep='delimiter',
               encoding="latin1")


#Keep label and text . Drop other columns
df.drop(['id','date','query','user'],axis=1,inplace=True)

In [69]:
df['sentiment'].value_counts()

0    4893
4    3623
Name: sentiment, dtype: int64

# Cleaning

In [70]:
df['text']

0       @switchfoot http://twitpic.com/2y1zl - Awww, t...
1       is upset that he can't update his Facebook by ...
2       @Kenichan I dived many times for the ball. Man...
3         my whole body feels itchy and like its on fire 
4       @nationwideclass no, it's not behaving at all....
                              ...                        
8511             My GrandMa is making Dinenr with my Mum 
8512    Mid-morning snack time... A bowl of cheese noo...
8513    @ShaDeLa same here  say it like from the Termi...
8514               @DestinyHope92 im great thaanks  wbuu?
8515                 cant wait til her date this weekend 
Name: text, Length: 8516, dtype: object

In [71]:
def clean_tweet(tweet):
  tweet=BeautifulSoup(tweet,"lxml").get_text() #get english format text form the input that is in lxml format 
  tweet=re.sub(r'@[A-za-z0-9]+','',tweet) # Remove @name
  tweet=re.sub(r'https?://[A-Za-z0-9./]+',' ',tweet) #remove http or https links 
  tweet=re.sub(r'[^a-zA-Z0-9.!?\']',' ',tweet) # Only keep alpha,num,punctuations 
  tweet=re.sub(r' +',' ',tweet)  #replace more than one space with single space
  return tweet


In [72]:
#Sample test to check how clean_tweet is working 
print("Before Cleaning:\n",df['text'][0])
t=clean_tweet(df['text'][0])
print("After Cleaning:\n",t)

Before Cleaning:
 @switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D
After Cleaning:
  Awww that's a bummer. You shoulda got David Carr of Third Day to do it. D


In [73]:
#clean text for the whole data frame 
data_clean=[clean_tweet(x)for x in df['text']]

In [74]:
#separate the label colum form data frame
data_label=df.sentiment


In [75]:
#change the data label 4 to 1 to mark as positve or 0 for negative review
data_label[data_label==4]=1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


# Tokenization

##### Use tokenization tool from BERT  


In [76]:
FullTokenizer=bert.bert_tokenization.FullTokenizer


#Create bert layer because there is information about tokenizer in it .
#pre-trained models are stored in tensor flow hub. From there we try to get the weights
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/1",
                            trainable=False) # Because we just want to use it for tokenization


In [77]:
#Get vocab file for the tokenizer 
vocab_file=bert_layer.resolved_object.vocab_file.asset_path.numpy()  
#lowercasing the text or not 
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()

tokenizer=FullTokenizer(vocab_file,do_lower_case)


In [78]:
#sample tokenizer of a sentence 
print(tokenizer.tokenize("I love cherries"))
# Get the token ids for each of the token 
print(tokenizer.convert_tokens_to_ids(tokenizer.tokenize("I love cherries")))
#convert id  16138 to token
print(tokenizer.convert_ids_to_tokens([16138]))

['I', 'love', 'che', '##rries']
[146, 16138, 10262, 107788]
['love']


In [79]:
#Apply tokenizer to each sentence of data_clean
def encode_sentence(sentence):
  return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentence))

In [80]:
data_inputs=[encode_sentence(sent) for sent in data_clean]

In [81]:
# Here is how sample data_input looks like with tokens 
data_inputs[1]

[10124,
 96213,
 10189,
 10261,
 10944,
 112,
 188,
 35896,
 10226,
 22329,
 10155,
 15541,
 10230,
 10271,
 119,
 119,
 119,
 10111,
 20970,
 171,
 10908,
 10146,
 169,
 14493,
 10984,
 18745,
 10379,
 119,
 139,
 16254,
 106]

# Dataset Creation

In [82]:
#we need to pad the sentences to be of same length while training. To do this we can use below techinique
#we can train in batches and sentences in each batch has to be of same length (apply padding).But all the sentences in 
#all the batches need not be of same lenght. 

#We can split the sentences from input based on their lenghts so that we need not apply much padding when we group them into 
#batches.
data_with_len=[[sent_token,data_label[i],len(sent_token)] for i,sent_token in enumerate(data_inputs)]

#now in our original data we had all labels of 0 together and 1 at end. so lets shuffle data to get a mix
random.shuffle(data_with_len)

#Sort data based on len(sent_token)
data_with_len.sort(key=lambda x:x[2]) # x will have sent_token,data_label[i],len(sent_token) 




In [83]:
#Next drop the len(sent_token) and also keep only the sentence tokens that have a lenght of >7 . This is to make sure we have
#longer senternces to convery meaning. If sentence length was <7 then it may not convey much meaning. 
#7 is arbitary choose your own lenght you think is most precise

sorted_all=[(x[0],x[1]) for x in data_with_len if x[2]>7] #storing as a tuple


In [84]:
#Usually we use the  from tensor methods/tensor slices etc to create a dataset .Here we cannot use that because all 
#sentences are of not same lenght.
#So we need to call the from_generator . Creates a Dataset whose elements are generated by generator.
#List is a generator so we can use above list  to create a dataset form generator 
 
all_dataset=tf.data.Dataset.from_generator(lambda : sorted_all,output_types=(tf.int32, tf.int32))


In [85]:
#check element of all_dataset
next(iter(all_dataset))

(<tf.Tensor: shape=(8,), dtype=int32, numpy=
 array([10796, 56011, 10908,   146,   112, 10323, 20775, 13028],
       dtype=int32)>, <tf.Tensor: shape=(), dtype=int32, numpy=0>)

In [86]:
BATCH_SIZE=32
#Padded_batch takes size of the batch and the padded_shapes
#Dimensions  used for padding are indicated with None,dim corresponding to real value if inputs
# (first ele of tuple corresponds to the input) 
#labels will be batched according to the batch size of input if we leave it as blank () to indicate zero dim tensor
all_batched = all_dataset.padded_batch(BATCH_SIZE, padded_shapes=((None, ), ()))


In [87]:
#next iter gives the batch with size 32 as below 
next(iter(all_batched))

(<tf.Tensor: shape=(32, 8), dtype=int32, numpy=
 array([[ 10796,  56011,  10908,    146,    112,  10323,  20775,  13028],
        [ 83413,  10376,  10106,  32992,  11369,  66014,  58521,  13596],
        [ 10192,  11355,  10689,  34420,    112,    188,  10529,  10271],
        [ 10160,  10105,  18141,  34875,  15127,  20104,  14499,  10106],
        [ 10944,    112,    188,  10741,  64312,  15127,  24109,  10350],
        [   146,  69977,  10142,    169,  14772,  32288,  11940,    119],
        [   146,  36216,  25157,  10708,  10189,  10142,  13028,    119],
        [ 12689,  84874,  10105,  50939,    119,    119,    119,    119],
        [ 10192,  10798,  50939,  19353,  11161,  19986,  10107,    119],
        [ 12718,  52070,  24140,  10155,  10105,  14424,  10188,  11619],
        [ 31861,  10230,  31237,  11661,  18234,  37627,  10124,  12935],
        [ 47336,  38078,  21852,  10940,  15127,  78680,  13446,    136],
        [ 33200,    146,  36216,  13086,  11257,  60582,  10216,

### Create a testing set

In [88]:
# divide size of data /batch_size to get number of batches
NB_BATCHES = math.ceil(len(sorted_all) / BATCH_SIZE)

# Form the number of batches take 10% as Test batches
NB_BATCHES_TEST = NB_BATCHES // 10 

#Shuffle all_batced data because we have shortest sentences at the beginning and longest at the end 
#If we dont shuffle ,all small sizes batches data will be alloted to test longer sentences end up in trainng 
#shuffle takes buffer size as input . IF data set is not too big give buffer size = batch_size

all_batched.shuffle(NB_BATCHES) 

#take method which allows to take first n number 
# take 10% of batches to test 
test_dataset = all_batched.take(NB_BATCHES_TEST) 
# take 90% of data to train data set
#skip skips n number
train_dataset = all_batched.skip(NB_BATCHES_TEST) 

In [89]:
print("Total Batches:",NB_BATCHES)
print("Total Test Batches:",NB_BATCHES_TEST)
print("Total Train Batches:",NB_BATCHES-NB_BATCHES_TEST)


Total Batches: 229
Total Test Batches: 22
Total Train Batches: 207


# Model Building 

##### CNN Model: 
1. Have 3 differnt cnn filter of size 2,3,4
2. Take max concatnate all
3. Use dense layer to get the classification done


In [90]:
#Lets create a class for our model and it inherits from tf.keras.Model
class DCNN(tf.keras.Model):
    
    def __init__(self,
                 vocab_size,
                 emb_dim=128,
                 nb_filters=50,
                 FFN_units=512,
                 nb_classes=2,
                 dropout_rate=0.1,
                 training=False,
                 name="dcnn"):
        super(DCNN, self).__init__(name=name)
        
        self.embedding = layers.Embedding(vocab_size,
                                          emb_dim)
        self.bigram = layers.Conv1D(filters=nb_filters,
                                    kernel_size=2,
                                    padding="valid",
                                    activation="relu")
        self.trigram = layers.Conv1D(filters=nb_filters,
                                     kernel_size=3,
                                     padding="valid",
                                     activation="relu")
        self.fourgram = layers.Conv1D(filters=nb_filters,
                                      kernel_size=4,
                                      padding="valid",
                                      activation="relu")
        self.pool = layers.GlobalMaxPool1D()
        
        self.dense_1 = layers.Dense(units=FFN_units, activation="relu")
        self.dropout = layers.Dropout(rate=dropout_rate)
        if nb_classes == 2:
            self.last_dense = layers.Dense(units=1,
                                           activation="sigmoid")
        else:
            self.last_dense = layers.Dense(units=nb_classes,
                                           activation="softmax")
    
    def call(self, inputs, training):
        x = self.embedding(inputs)
        x_1 = self.bigram(x) # batch_size, nb_filters, seq_len-1)
        x_1 = self.pool(x_1) # (batch_size, nb_filters)
        x_2 = self.trigram(x) # batch_size, nb_filters, seq_len-2)
        x_2 = self.pool(x_2) # (batch_size, nb_filters)
        x_3 = self.fourgram(x) # batch_size, nb_filters, seq_len-3)
        x_3 = self.pool(x_3) # (batch_size, nb_filters)
        
        merged = tf.concat([x_1, x_2, x_3], axis=-1) # (batch_size, 3 * nb_filters)
        merged = self.dense_1(merged)
        merged = self.dropout(merged, training)
        output = self.last_dense(merged)
        
        return output

# Training 

In [91]:
VOCAB_SIZE = len(tokenizer.vocab)
EMB_DIM = 200
NB_FILTERS = 100
FFN_UNITS = 256
NB_CLASSES = 2

DROPOUT_RATE = 0.2

NB_EPOCHS = 5

In [92]:
Dcnn = DCNN(vocab_size=VOCAB_SIZE,
            emb_dim=EMB_DIM,
            nb_filters=NB_FILTERS,
            FFN_units=FFN_UNITS,
            nb_classes=NB_CLASSES,
            dropout_rate=DROPOUT_RATE)

In [93]:
if NB_CLASSES == 2:
    Dcnn.compile(loss="binary_crossentropy",
                 optimizer="adam",
                 metrics=["accuracy"])
else:
    Dcnn.compile(loss="sparse_categorical_crossentropy",
                 optimizer="adam",
                 metrics=["sparse_categorical_accuracy"])

In [94]:
checkpoint_path = "./drive/MyDrive/UOH/stanford-twitter/"

ckpt = tf.train.Checkpoint(Dcnn=Dcnn)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=1)

if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print("Latest checkpoint restored!!")

Latest checkpoint restored!!


In [95]:
class MyCustomCallback(tf.keras.callbacks.Callback):

    def on_epoch_end(self, epoch, logs=None):
        ckpt_manager.save()
        print("Checkpoint saved at {}.".format(checkpoint_path))

In [96]:
Dcnn.fit(train_dataset,
         epochs=NB_EPOCHS,
         callbacks=[MyCustomCallback()])

Epoch 1/5
Checkpoint saved at ./drive/MyDrive/UOH/stanford-twitter/.
Epoch 2/5
Checkpoint saved at ./drive/MyDrive/UOH/stanford-twitter/.
Epoch 3/5
Checkpoint saved at ./drive/MyDrive/UOH/stanford-twitter/.
Epoch 4/5
Checkpoint saved at ./drive/MyDrive/UOH/stanford-twitter/.
Epoch 5/5
Checkpoint saved at ./drive/MyDrive/UOH/stanford-twitter/.


<tensorflow.python.keras.callbacks.History at 0x7fcefccea910>

# Evaluation

In [97]:
results = Dcnn.evaluate(test_dataset)
print(results)

[1.3641886711120605, 0.7684659361839294]


In [103]:
def get_prediction(sentence):
    tokens = encode_sentence(sentence)
    inputs = tf.expand_dims(tokens, 0)

    output = Dcnn(inputs, training=False)

    sentiment = math.floor(output*2)

    if sentiment == 0:
        print("Ouput of the model: {}\nPredicted sentiment: negative.".format(output))
    elif sentiment == 1:
        print("Ouput of the model: {}\nPredicted sentiment: positive.".format(output))

In [112]:
get_prediction("Yes it took a long time but I did it")
get_prediction("Finally I was able to do it myself")
get_prediction("Yes I was able to do it with little help")
get_prediction("Do you really think that I did this mistake")

Ouput of the model: [[0.99832416]]
Predicted sentiment: positive.
Ouput of the model: [[0.00139698]]
Predicted sentiment: negative.
Ouput of the model: [[0.41671166]]
Predicted sentiment: negative.
Ouput of the model: [[0.98017484]]
Predicted sentiment: positive.
