In [4]:
!pip install tensorflow --quiet
!pip install tensorflow-hub --quiet
!pip install tensorflow-text --quiet
!pip install transformers --quiet

In [5]:
import os, re, random
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

tf.get_logger().setLevel('ERROR')
pd.set_option('display.max_colwidth', None)
os.environ["TFHUB_MODEL_LOAD_FORMAT"]="UNCOMPRESSED"

ModuleNotFoundError: No module named 'tensorflow'

In [4]:
# CHANGED FOR TPU 1VM:
# Detect hardware, return appropriate distribution strategy
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect(tpu="local") # "local" for 1VM TPU
    strategy = tf.distribute.TPUStrategy(tpu)
    print("on TPU")
except tf.errors.NotFoundError:
    print("not on TPU")
    strategy = tf.distribute.MirroredStrategy()
    
print("REPLICAS: ", strategy.num_replicas_in_sync)

on TPU
REPLICAS:  8


## Import Dataset

In [5]:
train = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")
test = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")

In [6]:
train.columns

Index(['id', 'keyword', 'location', 'text', 'target'], dtype='object')

## Exploratory Data Analysis (EDA)

First, let's check if there are any mislabeled tweets inside the dataset. As we cannot manually inspect all tweets to see if the tweets have been correctly classifier, we are going to look for duplicate tweets and check that duplicates have been assigned to the same labels. 

In [7]:
duplicates = train[train.duplicated('text')]
duplicates.text.nunique()

69

There are 69 duplicate tweets inside the training dataset. We are going to iterate through these duplicate tweets to see if these duplicate tweets have unmatching labels. Unmatching labels would indicate that the tweet(s) has been mislabeled. We are going to store the index of these "problematic duplicates" inside a list and use it to iterature through these tweets so that we can re-assigned the correct labels after inspecting them.

In [8]:
problematic_duplicates = []

for i in range(duplicates.text.nunique()):
    duplicate_subset = train[train.text == duplicates.text.unique()[i]]
    if len(duplicate_subset) > 1 and duplicate_subset.target.nunique() == 2:
        problematic_duplicates.append(i)
        
print(problematic_duplicates)

[4, 7, 12, 15, 24, 26, 33, 34, 35, 36, 38, 39, 42, 44, 46, 51, 55, 58]


In [9]:
train[train.text == duplicates.text.unique()[58]]

Unnamed: 0,id,keyword,location,text,target
6614,9470,terrorism,Jeddah_Saudi Arabia.,In #islam saving a person is equal in reward to saving all humans! Islam is the opposite of terrorism!,0
6616,9472,terrorism,Riyadh,In #islam saving a person is equal in reward to saving all humans! Islam is the opposite of terrorism!,1


Above is the 58th duplicate. We see that these tweets have unmatching labels despite their texts being identical. This tweet is not about an actual disaster, so we are going to correctly assign both tweets as not being about an actual disaster. This is going to look like this: 

In [10]:
train.target = np.where(train.text == duplicates.text.unique()[58], 0, train.target)
train[train.text == duplicates.text.unique()[58]]

Unnamed: 0,id,keyword,location,text,target
6614,9470,terrorism,Jeddah_Saudi Arabia.,In #islam saving a person is equal in reward to saving all humans! Islam is the opposite of terrorism!,0
6616,9472,terrorism,Riyadh,In #islam saving a person is equal in reward to saving all humans! Islam is the opposite of terrorism!,0


Let's repeat this task for all problematic duplicates after having identified the correct labels for each and every one of these problematic duplicates. We are going to store the correct labels inside a list and iterate through the problematic duplicates, assigning the correct labels one after the other.

In [11]:
target_list = [0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,1,0,0]

for problematic_index in range(len(problematic_duplicates)): 
    train.target = np.where(train.text == duplicates.text.unique()[problematic_index], 
                            target_list[problematic_index], train.target)

## Preprocessing the Text

Before we use the text as input, we are going to perform some basic pre-processing. To identify the appropriate steps, let's look at some of the tweets.

In [12]:
random.seed(1048596)
sample_train = train.sample(frac = 1).head(5)
sample_train

Unnamed: 0,id,keyword,location,text,target
4277,6075,heat%20wave,liverpool,#greatbritishbakeoff love to know where I was when all this nice weather happened! Did miss the heat wave ?? ??,0
6306,9009,stretcher,,How to Freeze Fruits and Veggies\nhttp://t.co/MET0mtpr3S,0
1075,1554,bomb,,New Documents Found Pointing To Japan's WWII Atomic Bomb Program http://t.co/M9mowCMVNj,1
4011,5695,floods,,Children in Myanmar face a 'double catastrophe' as floods hit the most ... http://t.co/0jFNvAXFph,1
4213,5985,hazardous,"Nashville, Tn",Wholesale #WE Gon Rep That $hit At All Costs- Hazardous #WholeTeam3 #WholesaleEnt https://t.co/JWnXH9Q5ov,0


In the randomly selected tweets above, we see that the tweets contain links (http://...), hashtags (#..), and mentions (@..). We are going to remove links entirely and keep hashtags and mentions in case they signal something.

In [13]:
def clean_text(dataframe):
    dataframe.text = dataframe.text.apply(lambda x: str.lower(x))
    dataframe.text = dataframe.text.apply(lambda x: re.sub(r'http\S+', '', x))
    dataframe.text = dataframe.text.apply(lambda x: re.sub(r'#', '', x))
    dataframe.text = dataframe.text.apply(lambda x: re.sub(r'\W+', ' ', x))
    dataframe.text = dataframe.text.apply(lambda x: re.sub(r'\d+', '', x))
    return(dataframe)

In [14]:
sample_train = clean_text(sample_train)

In [15]:
sample_train

Unnamed: 0,id,keyword,location,text,target
4277,6075,heat%20wave,liverpool,greatbritishbakeoff love to know where i was when all this nice weather happened did miss the heat wave,0
6306,9009,stretcher,,how to freeze fruits and veggies,0
1075,1554,bomb,,new documents found pointing to japan s wwii atomic bomb program,1
4011,5695,floods,,children in myanmar face a double catastrophe as floods hit the most,1
4213,5985,hazardous,"Nashville, Tn",wholesale we gon rep that hit at all costs hazardous wholeteam wholesaleent,0


In [16]:
clean_train = clean_text(train)

In [17]:
train_df, val_df = np.split(clean_train.sample(frac = 1), [int(0.8 * len(clean_train))])

## Define and Train Model - First Model

In [None]:
tfhub_handle_encoder = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3'
tfhub_handle_preprocess = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'

In [None]:
bert_preprocess = hub.KerasLayer(tfhub_handle_preprocess)
bert_encoder = hub.KerasLayer(tfhub_handle_encoder)

In [None]:
text_input = tf.keras.layers.Input(shape = (), dtype = tf.string)
encoder_input = bert_preprocess(text_input)
encoder_output = bert_encoder(encoder_input)

l = tf.keras.layers.Dense(100, activation = 'relu')(encoder_output['pooled_output'])
l = tf.keras.layers.Dropout(0.3)(l)
l = tf.keras.layers.Dense(25, activation = 'relu')(l)
l = tf.keras.layers.Dropout(0.3)(l)
l = tf.keras.layers.Dense(1, activation = 'sigmoid')(l)

model = tf.keras.Model(inputs=[text_input], outputs = [l])

In [None]:
model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate = 0.0005),
              loss = tf.keras.losses.BinaryCrossentropy(),
              metrics = ['accuracy'])

In [22]:
early_stopping = tf.keras.callbacks.EarlyStopping(monitor = 'val_loss', 
                                                  patience = 2)

model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath = 'model/best_performed_model.ckpt',
    save_weights_only = True,
    save_best_only = True,
    monitor = 'val_loss',
    verbose = 1
)

In [None]:
history = model.fit(train_df.text,
                    train_df.target,
                    validation_data = (val_df.text, val_df.target),
                    epochs = 30,
                    callbacks = [early_stopping, model_checkpoint_callback])

In [None]:
import matplotlib.pyplot as plt

plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.legend(['training', 'validation'])
plt.show()

In [None]:
## Define and Train Model - Second Model

In [None]:
tfhub_handle_encoder = 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-512_A-8/1'
tfhub_handle_preprocess = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'

In [None]:
bert_preprocess = hub.KerasLayer(tfhub_handle_preprocess)
bert_encoder = hub.KerasLayer(tfhub_handle_encoder)

In [None]:
text_input = tf.keras.layers.Input(shape = (), dtype = tf.string)
encoder_input = bert_preprocess(text_input)
encoder_output = bert_encoder(encoder_input)

l = tf.keras.layers.Dense(16, activation = 'relu')(encoder_output['pooled_output'])
l = tf.keras.layers.Dropout(0.3)(l)
l = tf.keras.layers.Dense(1, activation = 'sigmoid')(l)

model = tf.keras.Model(inputs=[text_input], outputs = [l])

In [None]:
model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate = 0.0005),
              loss = tf.keras.losses.BinaryCrossentropy(),
              metrics = ['accuracy'])

In [None]:
history = model.fit(train_df.text,
                    train_df.target,
                    validation_data = (val_df.text, val_df.target),
                    epochs = 30,
                    callbacks = [early_stopping, model_checkpoint_callback])

## Define and Train Model - Third Model

In [18]:
tfhub_handle_encoder = 'https://tfhub.dev/google/electra_small/2'
tfhub_handle_preprocess = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'

In [19]:
bert_preprocess = hub.KerasLayer(tfhub_handle_preprocess)
bert_encoder = hub.KerasLayer(tfhub_handle_encoder)

In [27]:
text_input = tf.keras.layers.Input(shape = (), dtype = tf.string)
encoder_input = bert_preprocess(text_input)
encoder_output = bert_encoder(encoder_input)

l = tf.keras.layers.Dense(32, activation = 'relu')(encoder_output['pooled_output'])
l = tf.keras.layers.Dropout(0.3)(l)
l = tf.keras.layers.Dense(16, activation = 'relu')(l)
l = tf.keras.layers.Dropout(0.3)(l)
l = tf.keras.layers.Dense(1, activation = 'sigmoid')(l)

model = tf.keras.Model(inputs=[text_input], outputs = [l])

In [28]:
model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate = 0.001),
              loss = tf.keras.losses.BinaryCrossentropy(),
              metrics = ['accuracy'])

In [29]:
history = model.fit(train_df.text,
                    train_df.target,
                    validation_data = (val_df.text, val_df.target),
                    epochs = 30,
                    callbacks = [early_stopping, model_checkpoint_callback])

Epoch 1/30
Epoch 1: val_loss did not improve from 0.58092
Epoch 2/30
Epoch 2: val_loss did not improve from 0.58092
Epoch 3/30
Epoch 3: val_loss did not improve from 0.58092
Epoch 4/30
Epoch 4: val_loss improved from 0.58092 to 0.58039, saving model to model/best_performed_model.ckpt
Epoch 5/30
Epoch 5: val_loss did not improve from 0.58039
Epoch 6/30
Epoch 6: val_loss improved from 0.58039 to 0.57437, saving model to model/best_performed_model.ckpt
Epoch 7/30
Epoch 7: val_loss did not improve from 0.57437
Epoch 8/30
Epoch 8: val_loss did not improve from 0.57437
