In [1]:
import pandas as pd

import numpy as np

import tensorflow as tf

from tensorflow.keras import models
from tensorflow.keras import layers
from tensorflow.keras import optimizers
from tensorflow.keras import losses
from tensorflow.keras import callbacks
from tensorflow.keras import metrics

import re

from sklearn.model_selection import train_test_split

from sklearn.metrics import f1_score

import tqdm

import matplotlib.pyplot as plt

import collections

from sklearn.neighbors import KNeighborsClassifier

Read train and test.

In [2]:
train = pd.read_csv(r"data/train.csv")
test = pd.read_csv(r"data/test.csv")

In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [4]:
train.location.fillna("unknown", inplace=True)
train.keyword.fillna("unknown", inplace=True)

There are some tweets that appear more than once. For simplicity I will keep only the first instance.

In [5]:
print(f"There are {len(train.text.unique())} unique tweets and the size of the dataset is {len(train)}.")

train = train.groupby("text").head(1)

There are 7503 unique tweets and the size of the dataset is 7613.


In [6]:
y = train.target.values
text = train.text.values

Preprocess the tweets.

In [8]:
# Preprocess.
def preprocess(sentence):
    s = sentence.lower().strip()
    s = re.sub(r'https?://[^\s<>"]+|www\.[^\s<>"]+', r"url", s)  # replace url's sith "url"
    s = re.sub(r"[\w\.-]+@[\w\.-]+", r"email", s)  # replace email's by "email"
    s = re.sub(r"#[a-z]+", "hashtag", s)
    s = re.sub(r"[!]+", r" ! ", s)
    s = re.sub(r"[?]+", r" ? ", s)
    s = re.sub(r"[,]+", r" , ", s)
    s = re.sub(r"[.]+", r" . ", s)
    s = re.sub(r"[`]+", r"", s)
    #s = re.sub(r"([a-z])\1+", r"\1", s)
    s = re.sub(r"[^a-z!?., ]", "", s)
    return s

In [9]:
text_preprocessed = [preprocess(txt) for txt in text]

Tokenize and pad tweets.

In [10]:
vocab_size = 10000
tokenizer = tf.keras.preprocessing.text.Tokenizer(vocab_size, filters="", oov_token="<oov>")
tokenizer.fit_on_texts(text_preprocessed)

In [11]:
X = tokenizer.texts_to_sequences(text_preprocessed)
X = tf.keras.preprocessing.sequence.pad_sequences(X, padding="post")

### GloVe embeddings

Read GloVe word embeddings.

In [7]:
glove = {}
with open(r"data/glove.twitter.27B.25d.txt") as f:
    for line in f.readlines():
        line = line.strip().split()
        word = line[0]
        vector = np.array(line[1:], dtype=np.float64)
        glove[word] = vector

Check GloVe coverage.

In [12]:
vocab = set(tokenizer.word_index.keys())
vocab_glove = set(glove.keys())

perc_word_in_glove = len(vocab.intersection(vocab_glove)) / len(vocab) * 100
print(r"{:.2f} % of vocab is in GloVe.". format(perc_word_in_glove))

# Compute coverage.
all_words = " ".join(text_preprocessed)
all_words = all_words.split()

counter = collections.Counter(all_words)
coverage = [counter[word] for word in vocab.intersection(vocab_glove)]
coverage = sum(coverage) / len(all_words)
print(r"Which represent {:.2f} % of coverage.". format(coverage * 100))

76.15 % of vocab is in GloVe.
Which represent 96.12 % of coverage.


Build the embedding initializer based on the GloVe vectors.

In [13]:
embedding_size = 25
embedding_initializer = np.random.uniform(0, 1, (vocab_size, embedding_size))
for idx, word in enumerate(list(tokenizer.word_index)[:vocab_size]):
    clean_word = re.sub(r"([a-z])\1+", r"\1", word)
    if word in glove:
        embedding_initializer[idx-1] = glove[word]
    elif clean_word in glove:
        embedding_initializer[idx-1] = glove[clean_word]

Split data into train and validation.

In [14]:
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, 
    test_size=0.2, 
    random_state=42, 
    stratify=train.keyword
)

Create a tensorflow dataset.

In [15]:
batch_size = 64
train_set = tf.data.Dataset.from_tensor_slices((X_train, y_train)).shuffle(len(X_train)).batch(batch_size).prefetch(1)
valid_set = tf.data.Dataset.from_tensor_slices((X_valid, y_valid)).batch(batch_size).prefetch(1)

### Pretrain embedding layer

In [16]:
embedding = models.Sequential([
    layers.Embedding(input_dim=vocab_size, output_dim=embedding_size, input_shape=[None], 
                    embeddings_initializer=tf.keras.initializers.Constant(embedding_initializer), 
                    trainable=True),
    layers.Dense(1, activation="sigmoid")
])

embedding.compile(
    optimizer="nadam",
    loss="binary_crossentropy",
    metrics=["accuracy"]
)

embedding_early_stop_cb = callbacks.EarlyStopping(patience=5, restore_best_weights=True)

In [17]:
embedding.fit(
    train_set, 
    epochs=100, 
    validation_data=valid_set,
    callbacks=[embedding_early_stop_cb]
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100


<tensorflow.python.keras.callbacks.History at 0x7f7b8d0e7410>

Save the pretrained word embeddings.

In [18]:
embedding_initializer = embedding.layers[0].get_weights()[0]

## Reference model (without autoencoder pretraining)

In [19]:
model = models.Sequential([
    layers.Embedding(input_dim=vocab_size, output_dim=embedding_size, input_shape=[None], 
                     embeddings_initializer=tf.keras.initializers.Constant(embedding_initializer), 
                     trainable=True),
    layers.Bidirectional(layers.GRU(20, dropout=0.5, recurrent_dropout=0.5, return_sequences=True)),
    layers.Bidirectional(layers.GRU(10, dropout=0.5, recurrent_dropout=0.5)),
    layers.BatchNormalization(),
    layers.Dense(1, activation="sigmoid")
])

model.compile(
    optimizer=optimizers.Nadam(lr=1e-3),
    loss="binary_crossentropy",
    metrics=["accuracy"]
)

The competition is evaluated by the F1 Score. Since tensorflow don't have a built in F1 score metric, we need to build our own.

In [20]:
# Build a custom callback that returns the f1 score at the end of each epoch.
class F1ScoreCallback(callbacks.Callback):
    def __init__(self, X_train, y_train, X_valid, y_valid, **kargs):
        super().__init__(**kargs)
        self.X_train = X_train
        self.y_train = y_train
        self.X_valid = X_valid
        self.y_valid = y_valid
        
    def on_epoch_end(self, epoch, logs=None):
        y_train_pred = self.model.predict(self.X_train)
        y_train_pred = np.round(y_train_pred).reshape(self.y_train.shape)
        
        y_valid_pred = self.model.predict(self.X_valid)
        y_valid_pred = np.round(y_valid_pred).reshape(self.y_valid.shape)
        
        f1_score_train = f1_score(y_train, y_train_pred)
        f1_score_valid = f1_score(y_valid, y_valid_pred)
        
        print(" - f1_score: {:.4f} - f1_score_valid: {:.4f}".format(f1_score_train, f1_score_valid))

In [21]:
f1_score_cb = F1ScoreCallback(X_train, y_train, X_valid, y_valid)


In [22]:
#model.fit(train_set, epochs=10, validation_data=valid_set, callbacks=[f1_score_cb]) # the f1_score_cb slows down training.

model.fit(
    train_set, 
    epochs=100, 
    validation_data=valid_set,
    callbacks=[callbacks.EarlyStopping(patience=3, restore_best_weights=True)]
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100


<tensorflow.python.keras.callbacks.History at 0x7f7b804617d0>

## Unsupervised pretraining with Autoencoder

In [23]:
embedding_layer = layers.Embedding(input_dim=vocab_size, output_dim=embedding_size, input_shape=[None],
                    embeddings_initializer=tf.keras.initializers.Constant(embedding_initializer), trainable=False)

In [24]:
X_train_embed = embedding_layer(X_train)
X_valid_embed = embedding_layer(X_valid)

In [25]:
encoder = models.Sequential([
    layers.Bidirectional(layers.GRU(20, dropout=0.5, recurrent_dropout=0.5,return_sequences=True), input_shape=[None, 25]),
    layers.Bidirectional(layers.GRU(10, dropout=0.5, recurrent_dropout=0.5))
])

decoder = models.Sequential([
    layers.RepeatVector(37, input_shape=[20]),
    layers.Bidirectional(layers.GRU(20, dropout=0.5, recurrent_dropout=0.5,return_sequences=True)),
    layers.TimeDistributed(layers.Dense(25))
])

autoencoder = models.Sequential([encoder, decoder])

In [26]:
autoencoder.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
sequential_2 (Sequential)    (None, 20)                8760      
_________________________________________________________________
sequential_3 (Sequential)    (None, 37, 25)            6065      
Total params: 14,825
Trainable params: 14,825
Non-trainable params: 0
_________________________________________________________________


In [27]:
autoencoder.compile(
    optimizer=optimizers.Nadam(1e-3),
    loss="mse"
)

In [28]:
autoencoder.fit(
    X_train_embed, X_train_embed, 
    epochs=100,
    validation_data=(X_valid_embed, X_valid_embed),
    callbacks=[callbacks.EarlyStopping(patience=3, restore_best_weights=True)]
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100


<tensorflow.python.keras.callbacks.History at 0x7f7b7202b990>

Let's have a look at the reconstructed tweets.

In [29]:
knn = KNeighborsClassifier(n_neighbors=1)

X_knn = embedding_initializer
y_knn = list(tokenizer.word_index.keys())[:vocab_size]

knn.fit(X_knn, y_knn)

KNeighborsClassifier(n_neighbors=1)

In [30]:
tweets = [
    'Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all',
    'Forest fire near La Ronge Sask. Canada'
]

for tweet in tweets:
    preprocessed_tweet = preprocess(tweet)
    sequence = tokenizer.texts_to_sequences([preprocessed_tweet])
    sequnce_embed = embedding_layer(np.array(sequence))
    
    recons_tweet = autoencoder.predict(sequnce_embed)
    recons_tweet = [knn.predict([vector])[0] for vector in recons_tweet[0]]
    recons_tweet = " ".join(recons_tweet[:len(sequence[0])])
    
    print(tweet)
    print(recons_tweet)
    print("-" * 80)

Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all
someone someone someone someone someone former am am am am am am am
--------------------------------------------------------------------------------
Forest fire near La Ronge Sask. Canada
lmfao lmfao lmfao lmfao lmfao lmfao lmfao lmfao
--------------------------------------------------------------------------------


The autoencoder is not really able to reconstruct the tweets (it would need more capacity for that). It seems that it has found some words that reduce the loss. Nevertheless, let's see if it found a representation that can improve the validation accuracy.

### Build a model based on the encoder

Train the model with the encoder weights locked.

In [31]:
encoder_clone = models.clone_model(encoder)

encoder_clone.trainable = False
pretrained_model = models.Sequential([
    layers.Embedding(input_dim=vocab_size, output_dim=embedding_size, input_shape=[None], 
                     embeddings_initializer=tf.keras.initializers.Constant(embedding_initializer), 
                     trainable=True),
    encoder_clone,
    layers.BatchNormalization(),
    layers.Dense(1, activation="sigmoid")
])

pretrained_model.compile(
    loss="binary_crossentropy",
    optimizer=optimizers.Nadam(1e-3),
    metrics=["accuracy"]
)

In [32]:
#pretrained_model.fit(train_set, epochs=10, validation_data=valid_set, callbacks=[f1_score_cb])
pretrained_model.fit(
    train_set, 
    epochs=10, 
    validation_data=valid_set,
    callbacks=[callbacks.EarlyStopping(patience=3, restore_best_weights=True)]
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f7b6014a9d0>

Train the model with the encoder weights unlocked.

In [33]:
pretrained_model.trainable=True

pretrained_model.compile(
    loss="binary_crossentropy",
    optimizer=optimizers.Nadam(1e-3),
    metrics=["accuracy"]
)

pretrained_model.fit(
    train_set, 
    epochs=10, 
    validation_data=valid_set,
    callbacks=[callbacks.EarlyStopping(patience=5, restore_best_weights=True)]
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10


<tensorflow.python.keras.callbacks.History at 0x7f7b4a994dd0>