## Mount Google Drive and Import Necessary Libraries


In [1]:
# Colab-only step: mount Google Drive at /content/drive so the notebook can
# read its inputs from, and write its checkpoints to, the Drive folder.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Make the demo folder the working directory: every later relative path
# (tweets.csv, the embedding files, checkpoint_model_*/) resolves against it.
# NOTE(review): this is a user-specific Drive path — adjust it before reuse.
%cd /content/drive/MyDrive/Training/DTS_Tensorflow/demo/

/content/drive/MyDrive/Training/DTS_Tensorflow/demo


In [3]:
# One checkpoint directory per model trained below; these are the paths the
# ModelCheckpoint callbacks write to. `-p` makes the cell safe to re-run.
%mkdir -p checkpoint_model_word2vec
%mkdir -p checkpoint_model_glove
%mkdir -p checkpoint_model_bert

In [4]:
!pip install tensorflow-text

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [5]:
import re
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


import tensorflow_hub as hub
import tensorflow as tf
import tensorflow_text


from sklearn.model_selection import train_test_split

from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.layers import Dense, LSTM, Embedding, Bidirectional
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.metrics import SparseCategoricalAccuracy

### Preprocessing the Text Data

In [6]:
# Load the tweet dataset from the working directory; later cells read its
# 'text' column (model input) and 'target' column (label).
df = pd.read_csv(filepath_or_buffer='tweets.csv')

In [7]:
def remove_URL(text):
    """Replace every http:// or https:// URL in ``text`` with ' httpsmark '.

    A URL is taken to be 'http://' or 'https://' followed by a run of
    non-whitespace characters. The marker token is padded with spaces.
    """
    return re.sub(r'https?://\S+', r' httpsmark ', text)


def remove_html(text):
    """Delete every '<...>' span from ``text`` (shortest match, nothing inserted)."""
    return re.sub(r'<.*?>', r'', text)


def remove_atsymbol(text):
    """Replace each '@' plus the non-whitespace run after it with ' atsymbol '.

    The whole mention (including any trailing punctuation attached to it) is
    replaced by the space-padded marker token.
    """
    return re.sub(r'@\S+', r' atsymbol ', text)


def remove_hashtag(text):
    """Replace every '#' character in ``text`` with the token ' hashtag '.

    Only the '#' itself is replaced; the tag word that follows it is kept.
    """
    return text.replace('#', ' hashtag ')


def remove_exclamation(text):
    """Replace every '!' character in ``text`` with the token ' exclamation '."""
    return text.replace('!', ' exclamation ')


def remove_question(text):
    """Replace every '?' character in ``text`` with the token ' question '.

    Args:
        text: The string to process.

    Returns:
        ``text`` with each question mark replaced by ' question '
        (space-padded so it becomes its own token).
    """
    # '?' is a regex quantifier, so it must be escaped to match a literal
    # question mark; the unescaped pattern r'?' raises re.error when compiled.
    question = re.compile(r'\?')
    return question.sub(r' question ', text)


def remove_punc(text):
    """Delete every character of ``string.punctuation`` from ``text``."""
    # Translation table mapping each punctuation code point to None (= delete).
    strip_table = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(strip_table)


def remove_number(text):
    """Replace each run of consecutive digits in ``text`` with ' number '."""
    return re.sub(r'\d+', r' number ', text)


def remove_emoji(string):
    """Replace each run of characters matched by the class below with ' emoji '.

    Args:
        string: Text to scan. Note that this parameter name shadows the
            imported ``string`` module inside this function (the module is
            not used here).

    Returns:
        The text with every maximal run of matched characters replaced by a
        single space-padded ' emoji ' token.
    """
    # NOTE(review): the class is a union of ranges, and two of them
    # (U+24C2-U+1F251 and U+10000-U+10FFFF) are very wide, so every code point
    # inside them is replaced as well — likely broader than "emoji"; confirm
    # this is intended before reusing on non-English text.
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002500-\U00002BEF"  # U+2500-U+2BEF; originally labelled "chinese char" — looks like symbol/arrow blocks, TODO confirm
                               u"\U00002702-\U000027B0"  # U+2702-U+27B0 (presumably dingbats — confirm)
                               u"\U00002702-\U000027B0"  # duplicate of the previous range (no effect)
                               u"\U000024C2-\U0001F251"  # very wide range, see the note above
                               u"\U0001f926-\U0001f937"
                               u"\U00010000-\U0010ffff"  # every code point above U+FFFF
                               u"\u2640-\u2642"
                               u"\u2600-\u2B55"
                               u"\u200d"
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\ufe0f"  # single code point U+FE0F; originally labelled "dingbats" — presumably a variation selector, confirm
                               u"\u3030"
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r' emoji ', string)

In [8]:

# Normalise the tweet text: lower-case it, then run each cleaner in order.
# Order matters: the cleaners that insert marker tokens for '@', '#' and '!'
# must run before remove_punc, which deletes those symbols.
# NOTE: remove_question is defined above but is not part of this pipeline.
text_cleaners = [
    remove_URL,
    remove_html,
    remove_atsymbol,
    remove_hashtag,
    remove_exclamation,
    remove_punc,
    remove_number,
    remove_emoji,
]

cleaned_text = df['text'].str.lower()
for cleaner in text_cleaners:
    cleaned_text = cleaned_text.apply(cleaner)
df['text'] = cleaned_text

## Using Pretrained fastText Word Vectors (Common Crawl, word2vec-style) as Base Model

In [9]:
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz
!gunzip cc.en.300.vec.gz
!rm -rf gunzip cc.en.300.vec.gz

--2022-07-26 08:46:07--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 104.22.74.142, 104.22.75.142, 172.67.9.4, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|104.22.74.142|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1325960915 (1.2G) [binary/octet-stream]
Saving to: ‘cc.en.300.vec.gz’


2022-07-26 08:46:42 (36.0 MB/s) - ‘cc.en.300.vec.gz’ saved [1325960915/1325960915]



In [10]:
# Word-level tokenizer fitted on the cleaned tweets. filters='' disables the
# tokenizer's own character filtering, since the text was already cleaned above.
tk = Tokenizer(filters='', lower=True)
tk.fit_on_texts(df['text'])

# Fixed sequence length fed to the model. 280 is Twitter's character limit,
# but pad_sequences applies it as a number of token ids, not characters.
max_len = 280
train_tokenized = tk.texts_to_sequences(df['text'])
X = pad_sequences(train_tokenized, maxlen=max_len)

In [11]:
# Hold out 20% of the padded sequences for testing; the fixed seed keeps the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    df['target'],
    test_size=0.2,
    random_state=42,
)

In [12]:
# Width of each pretrained word vector, and the cap on vocabulary rows kept in
# the embedding matrix.
embed_size = 300
max_features = 50000


def get_coefs(word, *arr):
    """Pair a word with its coefficients converted to a float32 array.

    Args:
        word: The token (first field of an embedding-file line).
        *arr: The remaining fields of the line, one value per dimension.

    Returns:
        A ``(word, vector)`` tuple where ``vector`` is a float32 numpy array.
    """
    vector = np.array(arr, dtype='float32')
    return word, vector

# Parse the fastText text file into {word: float32 vector}.
# The file is opened in a `with` block so the handle is always closed, and as
# UTF-8 so decoding does not depend on the platform default.
embedding_index = {}
with open("cc.en.300.vec", encoding="utf-8") as vec_file:
    for line_no, line in enumerate(vec_file):
        fields = line.strip().split(" ")
        # The .vec format starts with a "<vocab size> <dimension>" header line;
        # skip it so it is not stored as a word with a one-element vector.
        if line_no == 0 and len(fields) == 2:
            continue
        word, vector = get_coefs(*fields)
        embedding_index[word] = vector

word_index = tk.word_index
nb_words = min(max_features, len(word_index))
# Row 0 is left for the padding id; rows 1..nb_words hold the vocabulary.
embedding_matrix = np.zeros((nb_words + 1, embed_size))

for word, i in word_index.items():
    # The matrix has rows 0..nb_words inclusive, so only ids beyond nb_words
    # are skipped (the previous `i >= max_features` check left the last row unused).
    if i > nb_words:
        continue
    embedding_vector = embedding_index.get(word)
    # Words without a pretrained vector keep their all-zero row. Vectors of the
    # wrong length (malformed lines) are ignored instead of being broadcast.
    if embedding_vector is not None and embedding_vector.shape == (embed_size,):
        embedding_matrix[i] = embedding_vector

In [13]:
adam = Adam(learning_rate=0.001)

# Bidirectional LSTM classifier over the pretrained vectors. The embedding
# layer starts from embedding_matrix and is fine-tuned (trainable=True).
model_word2vec = Sequential()
model_word2vec.add(tf.keras.layers.Input(shape=(max_len,)))
model_word2vec.add(
    Embedding(nb_words + 1, embed_size, weights=[embedding_matrix], trainable=True)
)
model_word2vec.add(Bidirectional(LSTM(64, recurrent_dropout=0.4)))
model_word2vec.add(Dense(32, activation='relu'))
# Two-way softmax, paired with the sparse (integer-label) loss below.
model_word2vec.add(Dense(2, activation='softmax'))

model_word2vec.compile(
    loss=SparseCategoricalCrossentropy(),
    optimizer=adam,
    metrics=[SparseCategoricalAccuracy()],
)

model_word2vec.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 280, 300)          7230900   
                                                                 
 bidirectional (Bidirectiona  (None, 128)              186880    
 l)                                                              
                                                                 
 dense (Dense)               (None, 32)                4128      
                                                                 
 dense_1 (Dense)             (None, 2)                 66        
                                                                 
Total params: 7,421,974
Trainable params: 7,421,974
Non-trainable params: 0
_________________________________________________________________


In [14]:
# Stop when validation accuracy has not improved by at least 0.01 for 5
# epochs, and roll back to the best weights seen.
early_stopping = EarlyStopping(
    monitor='val_sparse_categorical_accuracy',
    min_delta=0.01,
    patience=5,
    restore_best_weights=True,
)

# Keep only the weights of the epoch with the highest validation accuracy.
best_checkpoint = ModelCheckpoint(
    filepath='checkpoint_model_word2vec/',
    monitor='val_sparse_categorical_accuracy',
    mode='max',
    save_best_only=True,
    save_weights_only=True,
)

callback = [early_stopping, best_checkpoint]

In [15]:
# Train for up to 20 epochs; 20% of the training split is held out for the
# validation metric monitored by the callbacks.
history = model_word2vec.fit(
    x=X_train,
    y=y_train,
    batch_size=512,
    epochs=20,
    validation_split=0.2,
    callbacks=callback,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20


In [16]:
# Restore the best-validation weights written by the ModelCheckpoint callback.
model_word2vec.load_weights('checkpoint_model_word2vec/')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fb7821be510>

In [17]:
# Report [loss, sparse categorical accuracy] on the held-out test split.
model_word2vec.evaluate(x=X_test, y=y_test, verbose=1)



[0.32491618394851685, 0.8878628015518188]

## Using GloVe Pretrained on a Twitter Dataset

In [18]:
# Split the cleaned raw text (not token ids) 80/20; tokenisation for the GloVe
# model happens in the next cell.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['target'],
    test_size=0.2,
    random_state=42,
)

In [19]:
# Tokenize words: the vocabulary is built from the train and test text together.
tokenizer = Tokenizer()
all_texts = pd.concat([X_train, X_test], axis=0)
tokenizer.fit_on_texts(all_texts)

# Convert each split to integer id sequences.
sequences_train = tokenizer.texts_to_sequences(X_train)
sequences_test = tokenizer.texts_to_sequences(X_test)

# Pad/truncate every sequence to 280 ids; over-long ones lose their leading tokens.
pad_options = dict(maxlen=280, truncating='pre')
X_train = pad_sequences(sequences_train, **pad_options)
X_test = pad_sequences(sequences_test, **pad_options)

# +1 because id 0 is not assigned to any word (used for padding).
vocabSize = len(tokenizer.index_word) + 1
print(f"Vocabulary size = {vocabSize}")

Vocabulary size = 24103


In [20]:
# Read GloVe embeddings.
# NOTE(review): no visible cell downloads this file; it is expected to already
# exist in the working directory.
path_to_glove_file = 'glove.twitter.27B.200d.txt'
num_tokens = vocabSize
embedding_dim = 200
hits = 0
misses = 0
embeddings_index = {}

# Read word vectors: each line is "<word> <v1> <v2> ...".
# The encoding is given explicitly so decoding does not depend on the platform default.
with open(path_to_glove_file, encoding='utf-8') as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs
print("Found %s word vectors." % len(embeddings_index))


# Assign word vectors to our dictionary/vocabulary.
# Rows that are never assigned stay all-zeros; this includes the padding row 0
# and every word counted as a miss.
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    # Accept only vectors of the expected length: a malformed line yields a
    # shorter vector that could not be stored in the row, so count it as a miss.
    if embedding_vector is not None and embedding_vector.shape == (embedding_dim,):
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))

Found 1193514 word vectors.
Converted 16715 words (7387 misses)


In [21]:
# Build neural network architecture: frozen GloVe embeddings feeding a
# bidirectional LSTM and a small dense classification head.
adam = Adam(learning_rate=0.001)

model_glove = Sequential()
# trainable=False keeps the pretrained vectors fixed during training.
model_glove.add(
    Embedding(vocabSize, 200, weights=[embedding_matrix], trainable=False, input_length=280)
)
model_glove.add(Bidirectional(LSTM(64, recurrent_dropout=0.4)))
model_glove.add(Dense(32, activation='relu'))
model_glove.add(Dense(2, activation='softmax'))

model_glove.compile(
    loss=SparseCategoricalCrossentropy(),
    optimizer=adam,
    metrics=[SparseCategoricalAccuracy()],
)

model_glove.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 280, 200)          4820600   
                                                                 
 bidirectional_1 (Bidirectio  (None, 128)              135680    
 nal)                                                            
                                                                 
 dense_2 (Dense)             (None, 32)                4128      
                                                                 
 dense_3 (Dense)             (None, 2)                 66        
                                                                 
Total params: 4,960,474
Trainable params: 139,874
Non-trainable params: 4,820,600
_________________________________________________________________


In [22]:
# Callbacks for the GloVe model.
# Stop when validation accuracy has not improved by at least 0.01 for 5
# epochs, and roll back to the best weights seen.
early_stopping = EarlyStopping(
    monitor='val_sparse_categorical_accuracy',
    min_delta=0.01,
    patience=5,
    restore_best_weights=True,
)

# Keep only the weights of the epoch with the highest validation accuracy.
best_checkpoint = ModelCheckpoint(
    filepath='checkpoint_model_glove/',
    monitor='val_sparse_categorical_accuracy',
    mode='max',
    save_best_only=True,
    save_weights_only=True,
)

callback = [early_stopping, best_checkpoint]

In [23]:
# Train for up to 20 epochs; 20% of the training split is held out for the
# validation metric monitored by the callbacks.
history = model_glove.fit(
    x=X_train,
    y=y_train,
    batch_size=512,
    epochs=20,
    validation_split=0.2,
    callbacks=callback,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20


In [24]:
# Restore the best-validation weights written by the ModelCheckpoint callback.
model_glove.load_weights('checkpoint_model_glove/')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fb65a48f810>

In [26]:
# Report [loss, sparse categorical accuracy] on the held-out test split.
model_glove.evaluate(x=X_test, y=y_test, verbose=1)



[0.2689626216888428, 0.8953385949134827]

## Using Pretrained BERT

In [27]:
# Split the cleaned raw text again: the BERT model takes strings as input and
# tokenises them inside the model.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['target'],
    test_size=0.2,
    random_state=42,
)

In [28]:
# TF Hub handles for the uncased English BERT preprocessing model and the
# matching 12-layer / 768-hidden / 12-head encoder.
preprocess_handle = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
encoder_handle = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4"

bert_preprocess = hub.KerasLayer(preprocess_handle)
bert_encoder = hub.KerasLayer(encoder_handle)

In [29]:
# Scalar string input: one raw tweet per example.
text_input = tf.keras.Input(shape=(), dtype=tf.string, name='text')
# The preprocessing layer turns the strings into the tensor dict the encoder consumes.
preprocessed_text = bert_preprocess(text_input)
outputs = bert_encoder(preprocessed_text)

In [30]:
# Classification head on top of BERT's pooled output:
# dense -> dropout -> two-way softmax.
pooled_output = outputs['pooled_output']
layer = Dense(32, activation='relu')(pooled_output)
layer = tf.keras.layers.Dropout(0.5, name="dropout")(layer)
layer = Dense(2, activation='softmax')(layer)

In [31]:
# Assemble the end-to-end model: raw text in, class probabilities out.
model_bert = tf.keras.models.Model(
    inputs=[text_input],
    outputs=[layer],
)
model_bert.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text (InputLayer)              [(None,)]            0           []                               
                                                                                                  
 keras_layer (KerasLayer)       {'input_word_ids':   0           ['text[0][0]']                   
                                (None, 128),                                                      
                                 'input_type_ids':                                                
                                (None, 128),                                                      
                                 'input_mask': (Non                                               
                                e, 128)}                                                      

In [32]:
# Fresh optimizer for the BERT model, with the same loss and metric as the
# other two models so the results are comparable.
adam = Adam(learning_rate=0.001)

model_bert.compile(
    loss=SparseCategoricalCrossentropy(),
    optimizer=adam,
    metrics=[SparseCategoricalAccuracy()],
)

In [33]:
# Stop when validation accuracy has not improved by at least 0.01 for 5
# epochs, and roll back to the best weights seen.
early_stopping = EarlyStopping(
    monitor='val_sparse_categorical_accuracy',
    min_delta=0.01,
    patience=5,
    restore_best_weights=True,
)

# Keep only the weights of the epoch with the highest validation accuracy.
best_checkpoint = ModelCheckpoint(
    filepath='checkpoint_model_bert/',
    monitor='val_sparse_categorical_accuracy',
    mode='max',
    save_best_only=True,
    save_weights_only=True,
)

callback = [early_stopping, best_checkpoint]

In [34]:
# Train for up to 20 epochs; 20% of the training split is held out for the
# validation metric monitored by the callbacks.
history = model_bert.fit(
    x=X_train,
    y=y_train,
    batch_size=512,
    epochs=20,
    validation_split=0.2,
    callbacks=callback,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20


In [35]:
# Restore the best-validation weights written by the ModelCheckpoint callback.
model_bert.load_weights('checkpoint_model_bert/')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fb78348dc10>

In [36]:
# Report [loss, sparse categorical accuracy] on the held-out test split.
model_bert.evaluate(x=X_test, y=y_test, verbose=1)



[0.3956999182701111, 0.8254177570343018]