# Text classification

This notebook covers text classification and some general principles of NLP.

1. How to tokenize a text and obtain different representations of it, ready to be fed into a model.

2. The concept of embeddings and how to build them in pure TensorFlow.

3. A classifier using a classic DNN and an RNN (bidirectional GRU).

We also show how to view the embeddings in a 3D space with TensorBoard.

In [None]:
%tensorflow_version 2.x

%load_ext tensorboard

In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer, one_hot,text_to_word_sequence
from tensorflow.keras.utils import to_categorical

### How to tokenize a text

Get different representations of an input text using the Keras tokenizer.

- as a matrix where entry (i, j) refers to sentence i and token j. Depending on the mode, the cell contains the number of occurrences of the token in the sentence, a binary flag telling whether it is present, or its tf-idf score
- as a one-hot encoding


In [None]:
import spacy

In [None]:
import spacy.cli
spacy.cli.download("en_core_web_md")

In [None]:
# Load the spaCy pipeline (also used as a global by the cells below) and build
# the two lookup tables of a tokenizer by hand for a single sentence.
nlp = spacy.load("en_core_web_md")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion.")
index_word = {}
word_index = {}
# enumerate supplies the token position; a token that occurs twice keeps its
# last position in word_index, while index_word keeps every position.
for i, token in enumerate(doc):
    word_index[token.text] = i
    index_word[i] = token.text
word_index, index_word

In [None]:
class SpacyTokenizer:
    """Word-level tokenizer built on a spaCy pipeline, exposing a small Keras-``Tokenizer``-like API.

    Each kept token gets an integer index in order of first appearance, starting at 0.
    After fitting, the out-of-vocabulary token ``oov_token`` gets the next free index.

    NOTE: index 0 belongs to a real token, so these indices clash with zero padding.
    NOTE: ``num_words`` only selects which words are kept (the most frequent ones);
    indices are not renumbered, so they can be larger than ``num_words``.

    Args:
        num_words: number of most frequent words to keep; ``None`` keeps all of them.
        do_lower: lower-case the token text.
        remove_stop_words: drop stop words.
        replace_nb_with_fix_tok: replace tokens flagged ``is_digit`` by ``'__DIGIT__'``.
        oov_token: text used for words outside the kept vocabulary.
        stop_words: custom stop words; when empty or ``None`` the spaCy ``is_stop`` flag is used.
        nlp: optional callable turning a string into an iterable of tokens exposing
            ``text``, ``is_digit`` and ``is_stop``. When ``None``, the module-level
            ``nlp`` pipeline is used.
    """

    def __init__(self, num_words=None, do_lower=True, remove_stop_words = True, 
                 replace_nb_with_fix_tok=False, oov_token='_OOV_', stop_words=None, nlp=None):
        self.num_words = num_words
        self.do_lower = do_lower
        self.remove_stop_words = remove_stop_words
        self.replace_nb_with_fix_tok = replace_nb_with_fix_tok
        self.index_word = {}
        self.word_index = {}
        self.word_count = {}
        self.oov_token = oov_token
        # Copy the list: a shared mutable default (or the caller's list) must not
        # leak between instances.
        self.stop_words = list(stop_words) if stop_words is not None else []
        self.use_spacy_stop_words = len(self.stop_words)==0
        self.nlp = nlp

    def __pipeline(self):
        """Return the injected pipeline, falling back to the module-level ``nlp``."""
        return self.nlp if self.nlp is not None else nlp

    def __add_word(self, text, i): 
        """Register ``text`` under index ``i`` and increment its occurrence count."""
        self.index_word[i]=text
        self.word_index[text]=i
        self.word_count[text] = self.word_count.get(text,0)+1

    def __process_tok(self, tok):
        """Return the normalized text of ``tok``, or ``None`` when the token is dropped."""
        tok_text = tok.text.lower() if self.do_lower else tok.text

        if self.replace_nb_with_fix_tok and tok.is_digit:
            tok_text = '__DIGIT__'

        if self.remove_stop_words:
            if self.use_spacy_stop_words:
                if tok.is_stop:
                    tok_text = None
            elif tok_text in self.stop_words:
                tok_text = None

        return tok_text

    def fit_on_texts(self, texts):
        """Update the vocabulary, the counts and the set of kept words from ``texts``.

        Can be called several times: new words are numbered after the indices
        already in use, and the OOV token keeps the index of the first fit.
        """
        pipeline = self.__pipeline()
        # Continue after the highest index in use, so a second fit never reuses an index.
        next_idx = max(self.index_word, default=-1) + 1
        for txt in texts:
            for tok in pipeline(txt):
                tok_text = self.__process_tok(tok)
                if tok_text is None:
                    continue
                # A known word keeps its index; only its count is incremented.
                idx = self.word_index.get(tok_text)
                if idx is None:
                    idx = next_idx
                    next_idx += 1
                self.__add_word(tok_text, idx)

        # Keep the num_words most frequent words (all of them when num_words is None).
        # The OOV token is excluded so it does not take a slot after a re-fit.
        ranked = [word for word in sorted(self.word_count, key=self.word_count.get, reverse=True)
                  if word != self.oov_token]
        self.top_words = set(ranked[:self.num_words])

        if self.oov_token not in self.word_index:
            self.__add_word(self.oov_token, next_idx)

    def texts_to_sequences(self, texts):
        """Convert each text into a list of indices; words that were not kept map to the OOV index."""
        pipeline = self.__pipeline()
        seqs = []
        for txt in texts:
            seq = []
            for tok in pipeline(txt):
                tok_text = self.__process_tok(tok)
                if tok_text is None:
                    continue
                if tok_text in self.top_words:
                    seq.append(self.word_index[tok_text])
                else:
                    seq.append(self.word_index[self.oov_token])
            seqs.append(seq)
        return seqs
                


# Smoke test of SpacyTokenizer on two sentences, with a custom stop-word list
# and the 8 most frequent words kept.
texts = [
    "Apple is looking at buying U.K. startup for $1 billion.",
    "One should not eat the apple every day!  this is my 12 billion advice",
]
sp_tok = SpacyTokenizer(
    num_words=8,
    do_lower=True,
    replace_nb_with_fix_tok=True,
    oov_token='__OOV__',
    stop_words=['a', 'the', 'my'],
)
sp_tok.fit_on_texts(texts)
seq = sp_tok.texts_to_sequences(texts)
# To inspect the result: seq, sp_tok.top_words, sp_tok.word_index, sp_tok.index_word, sp_tok.word_count

In [None]:
# Run each word through the pipeline and keep both the Doc and its vector.
queen, king, man, woman = (nlp(word) for word in ('queen', 'king', 'man', 'woman'))
queen_vect, king_vect = queen.vector, king.vector
man_vect, woman_vect = man.vector, woman.vector

# Only the last expression of a cell is displayed; the first two expressions
# below are evaluated and discarded.
king_vect - man_vect + woman_vect - queen_vect
king_vect.argmax(), queen_vect.argmax()
man_vect.argmax(), woman_vect.argmax()


In [None]:
# Print a few lexical attributes of every token, looked up in the vocabulary by text.
doc = nlp("I loved coffee")
for word in doc:
    lexeme = doc.vocab[word.text]
    attributes = (
        lexeme.text, lexeme.orth, lexeme.shape_, lexeme.prefix_, lexeme.suffix_,
        lexeme.is_alpha, lexeme.is_digit, lexeme.is_title, lexeme.lang_, word.lemma_,
    )
    print(*attributes)

In [None]:
vocab_nb = 8
test_t = Tokenizer(num_words=vocab_nb, filters='', lower=True, oov_token='UNK')

# Two toy corpora. Only the one bound to tmp_text is used by the following cells;
# the original first assignment to tmp_text was overwritten straight away.
toy_corpus = ['hello the people of the world .','You are the best in the world !']
news_corpus = ["Apple is looking at buying U.K. startup for $1 billion.", "One should not eat apple every day!  this is my 12 billion advice"]
tmp_text = news_corpus

test_t.fit_on_texts(tmp_text)
# To inspect the result: test_t.index_word, test_t.word_index, test_t.word_counts

In [None]:
# Same corpus, three matrix representations: one row per sentence.
count_mat, bin_mat, tfidf_mat = (
    test_t.texts_to_matrix(tmp_text, mode=mode) for mode in ('count', 'binary', 'tfidf')
)
# tfidf gives a low score to a word that is frequent in every sentence (e.g. "the"):
# it is less meaningful than a word that occurs in fewer sentences.
# The matrix shape takes num_words into account even if the dictionary holds more words.
count_mat, bin_mat, tfidf_mat, count_mat.shape

In [None]:
# Convert the texts into index sequences, then pad them at the end to a fixed length.
seq_len = 20
seq = test_t.texts_to_sequences(tmp_text)
padded_seq = tf.keras.preprocessing.sequence.pad_sequences(
    seq,
    maxlen=seq_len,
    padding='post',
)
seq, padded_seq, type(seq)

We one-hot encode the padded sequences. This creates a tensor of shape
(nb_samples, seq_len, vocab_nb)

In [None]:
# One-hot encode the padded sequences. The depth is the tokenizer's num_words
# (vocab_nb), so the result has shape (nb_samples, seq_len, vocab_nb); the
# tokenizer maps every index >= num_words to the OOV index, so no index is lost.
seq_oh = tf.one_hot(padded_seq, depth=vocab_nb)
seq_oh

## Embeddings concept

Let's create our own embedding layer and compare its results with the Keras one.

The goal is to create a layer that extracts the embeddings of a given sequence from an embedding matrix that contains the embeddings of all the tokens/words.

Here we extract the embeddings from a random uniform matrix. In a learning problem, these embeddings are learned with gradient descent. 

In [None]:
class Embeddings:
    """Minimal embedding lookup implemented as one-hot encoding followed by a matrix product.

    The embedding matrix has shape (vocab_size, embed_size) and is drawn from a
    seeded uniform distribution in [-0.05, 0.05].
    """

    def __init__(self, vocab_size, embed_size):
        self.vocab_size = vocab_size
        self.embed_size = embed_size

        uniform_init = tf.keras.initializers.RandomUniform(minval=-0.05, maxval=0.05, seed=23)
        initial_values = uniform_init((vocab_size, embed_size))
        self.embed_matrix = tf.Variable(initial_values, dtype=tf.float32)

        print("embedding matrix shape:",self.embed_matrix.shape)
        print("embedding matrix", self.embed_matrix)

    def forward(self, X):
        """Return the embeddings of the token ids in ``X`` (also prints the one-hot tensor)."""
        one_hot_tokens = tf.keras.backend.one_hot(X, self.vocab_size)
        print(one_hot_tokens)
        # Multiplying a one-hot row by the matrix selects the matching matrix row.
        return tf.matmul(one_hot_tokens, self.embed_matrix)
        

# Two sequences of 5 token ids, looked up in a 4-word vocabulary with 3-dimensional embeddings.
# NOTE(review): `input` shadows the Python builtin; the name is kept because the
# Keras comparison cell below passes it to its own layer.
input = np.array([[1,2,3,0,0],[0,1,2,0,3]])
my_emb_layer = Embeddings(4,3)
my_emb = my_emb_layer.forward(input)
my_emb

In [None]:
from tensorflow.keras.layers import Embedding

In [None]:
# Same lookup with the Keras layer, built with the same initializer settings and seed.
keras_init = tf.keras.initializers.RandomUniform(minval=-0.05, maxval=0.05, seed=23)
k_emb_layer = Embedding(4, 3, embeddings_initializer=keras_init)
k_emb = k_emb_layer(input)

# Element-wise comparison between the Keras embeddings and the hand-made ones.
tf.math.equal(k_emb, my_emb)


In [None]:
# Element-wise check that both embedding matrices were initialized with the same values.
tf.math.equal(k_emb_layer.embeddings,my_emb_layer.embed_matrix)

## Real case

In [None]:
%%bash
# -p creates the directory only when it is missing and never fails if it exists
mkdir -p /content/.kaggle

In [None]:
# Get the keys from your Kaggle account information:
# my account > API > Create New API Token, then read the content of the downloaded file.
# Prefer exporting KAGGLE_USERNAME / KAGGLE_KEY in the environment so that the
# credentials are never saved inside the notebook; the placeholders are only a fallback.
import json
import os

token = {
    "username": os.environ.get("KAGGLE_USERNAME", "xxx"),
    "key": os.environ.get("KAGGLE_KEY", "xxxxx"),
}
with open('/content/.kaggle/kaggle.json', 'w') as file:
    json.dump(token, file)
# Credentials file: readable and writable by the owner only.
os.chmod('/content/.kaggle/kaggle.json', 0o600)

In [None]:
!ls /content/.kaggle

In [None]:
%%bash

# Download and unzip the dataset only once.
if [ ! -f /content/spam-text-message-classification.zip ]; then
    # -p: do not fail when the directory is left over from a previous run
    mkdir -p /root/.kaggle
    cp /content/.kaggle/kaggle.json /root/.kaggle/
    # Keep the credentials private; the file content is deliberately not printed,
    # since it would store the API key in the notebook output.
    chmod 600 /root/.kaggle/kaggle.json
    # The value is passed without braces: "-v{/content}" would set the literal path "{/content}"
    kaggle config set -n path -v /content

    kaggle datasets download -d team-ai/spam-text-message-classification -p /content

    unzip -o /content/spam-text-message-classification.zip -d /content
fi

In [None]:
%load_ext google.colab.data_table

In [None]:
# Load the messages, shuffle the rows with a fixed seed and renumber the index.
data_df = (
    pd.read_csv('/content/SPAM text message 20170820 - Data.csv')
    .sample(frac=1., random_state=34)
    .reset_index(drop=True)
)
# Integer code of each category, used as the classification target.
data_df['labels'] = data_df['Category'].astype('category').cat.codes

In [None]:
data_df

In [None]:
# Number of rows per label; the share of label 0 is the accuracy reached by
# always predicting label 0.
nb_1 = (data_df.labels == 1).sum()
nb_0 = (data_df.labels == 0).sum()
print(f"CAUTION : majority class baseline will provide {nb_0 / (nb_1+nb_0)} accuracy")

In [None]:
# Split sizes: 20% of the data for test, then 20% of the remaining rows for validation.
data_nb = data_df.labels.count()
# Built-in int replaces np.int, an alias that has been removed from NumPy.
train_nb = int(np.floor(data_nb * 0.8))
test_nb = data_nb - train_nb
xval_nb = int(np.floor(train_nb * 0.2))
train_nb = train_nb - xval_nb
assert data_nb == train_nb + xval_nb + test_nb

In [None]:
labels = data_df.labels.values
text = data_df.Message.values

# Split boundaries: [0, train_end) train, [train_end, xval_end) validation, the rest test.
train_end = train_nb
xval_end = train_nb + xval_nb

train_labels, train_text = labels[:train_end], text[:train_end]
xval_labels, xval_text = labels[train_end:xval_end], text[train_end:xval_end]
# Slicing from xval_end (instead of [-test_nb:]) stays correct even when test_nb is 0.
test_labels, test_text = labels[xval_end:], text[xval_end:]

# The three contiguous slices cover the data exactly once when their sizes match.
# Comparing neighbouring messages instead could fail on duplicated messages.
assert (len(train_text), len(xval_text), len(test_text)) == (train_nb, xval_nb, test_nb)
assert len(train_labels) + len(xval_labels) + len(test_labels) == len(labels)

In [None]:
train_text[:10], train_labels[:10]

In [None]:
# Tokenizer used for the rest of the notebook, always bound to `t`.
# The flag defaults to False because the cells below rely on Keras Tokenizer
# conventions: indices bounded by num_words (the embedding has vocab_size rows)
# and index 0 left free for padding. SpacyTokenizer numbers tokens from 0 and
# does not bound its indices, so enable it only for experiments.
use_spacy_tok = False

vocab_size = 10000
if use_spacy_tok:
    # Same OOV text as the Keras branch, since later cells look up 'UNK'.
    t = SpacyTokenizer(num_words = vocab_size, do_lower=True, replace_nb_with_fix_tok=True, oov_token='UNK')
else:
    t = Tokenizer(num_words=vocab_size,filters='',oov_token='UNK',lower=True)
t.fit_on_texts(train_text)

In [None]:
len(t.index_word),t.index_word[10000],t.word_index['UNK'],t.num_words

### Export words for tensorboard projector

Save the word associated with each embedding in meta.tsv so that the words can be viewed in the projector. 

In [None]:
import io

# The projector pairs line k of meta.tsv with row k of the embedding matrix.
# Assumes `t` follows the Keras Tokenizer convention: word indices start at 1 and
# index 0 is reserved (used for padding here). Row 0 therefore gets a placeholder,
# and the words of indices 1 .. vocab_size-1 follow, one per embedding row.
PAD_LABEL = '<PAD>'

ms = [PAD_LABEL]
for idx in range(1, vocab_size):
    word = t.index_word.get(idx)
    if word is None:
        # The vocabulary holds fewer than vocab_size words.
        break
    # One word per line: a newline inside a word would break the alignment.
    ms.append(word.replace('\n', ' '))

# The context manager closes the file even if a write fails.
with io.open('meta.tsv', 'w', encoding='utf-8') as meta_file:
    for word in ms:
        meta_file.write(word + "\n")


# On Colab, download meta.tsv so it can be uploaded to the TensorBoard projector;
# elsewhere the file simply stays in the working directory.
try:
    from google.colab import files
except ImportError:
    pass
else:
    files.download('meta.tsv')


In [None]:
# Convert every split into lists of word indices with the fitted tokenizer.
train_seq, xval_seq, test_seq = (
    t.texts_to_sequences(split) for split in (train_text, xval_text, test_text)
)
# Evaluated but not displayed: only the last expression of the cell is shown.
len(train_seq)
train_seq[0]

In [None]:
train_seq[1]

In [None]:
t.index_word[10272]

In [None]:
# Pad every split at the end to max_len, then wrap (features, labels) into
# datasets shuffled with a buffer as large as the split.
max_len = 500

train_seq_pad, xval_seq_pad, test_seq_pad = (
    tf.keras.preprocessing.sequence.pad_sequences(split, maxlen=max_len, padding='post')
    for split in (train_seq, xval_seq, test_seq)
)

train_ds, xval_ds, test_ds = (
    tf.data.Dataset.from_tensor_slices((features, targets)).shuffle(features.shape[0])
    for features, targets in (
        (train_seq_pad, train_labels),
        (xval_seq_pad, xval_labels),
        (test_seq_pad, test_labels),
    )
)

In [None]:
"data:",train_seq_pad.shape, xval_seq_pad.shape, test_seq_pad.shape, \
"labels:",train_labels.shape,xval_labels.shape,test_labels.shape, \
vocab_size

In [None]:
tf.keras.backend.clear_session()
embedding_dim = 300

# Embedding -> average over the sequence -> two hidden layers -> sigmoid output.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim))
# Averaging the word embeddings of a sequence makes the classifier independent
# of the sequence length.
# NOTE(review): the embedding above has no mask_zero, so padded positions look
# like they are included in the average - confirm this is intended.
model.add(tf.keras.layers.GlobalAveragePooling1D())
model.add(tf.keras.layers.Dense(64, activation='relu'))
model.add(tf.keras.layers.Dense(32, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model.summary()

In [None]:
import datetime

# The model outputs a sigmoid probability, hence from_logits=False.
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
    metrics=['accuracy'],
)

train_batch_ds, xval_batch_ds = train_ds.batch(64), xval_ds.batch(64)

# One TensorBoard log directory per run, named after the start time.
run_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
log_dir = "logs/dnn-fit/" + run_stamp
tensorboard_cb = tf.keras.callbacks.TensorBoard(
    log_dir=log_dir,
    histogram_freq=1,
    write_graph=True,
    embeddings_freq=1,
)

history = model.fit(
    train_batch_ds,
    epochs=20,
    validation_data=xval_batch_ds,
    callbacks=[tensorboard_cb],
)

In [None]:
%tensorboard --logdir logs

In [None]:
e = model.layers[0]
weights = e.get_weights()[0]
print(weights.shape)

### Apply model on test data

In [None]:
# Loss and accuracy of the DNN on the held-out test split.
test_loss, test_acc = model.evaluate(test_ds.batch(100))

print('Test Loss: {}'.format(test_loss), 'Test Accuracy: {}'.format(test_acc), sep='\n')

In [None]:
def np_sent_to_str(np_sent):
    """Turn one sequence of token indices back into a space-separated string.

    Indices that are not strictly positive (the padding value 0) are skipped;
    the words come from the global tokenizer ``t``.
    """
    words = []
    for tok in np_sent:
        if tok > 0:
            words.append(t.index_word[tok])
    return " ".join(words)

def np_sentence_to_str_sentence(np_sents):
    """Decode every index sequence of ``np_sents`` into a string, returned as a list."""
    return [np_sent_to_str(sent) for sent in np_sents]


def get_test_preds(model, test_ds):
    """Predict the first batch of ``test_ds`` and return sentences, labels and predictions.

    Args:
        model: model whose ``predict`` returns one sigmoid probability per sample.
        test_ds: batched dataset yielding ``(padded_sequences, labels)`` pairs.

    Returns:
        A DataFrame with the columns ``sentence``, ``ground_truth`` and ``pred``.
    """
    # Only the first batch is used.
    batch_features, batch_labels = next(test_ds.as_numpy_iterator())

    # Predict on the features only (not on the (features, labels) pair), and
    # threshold the probability at 0.5 as the removed predict_classes() did
    # for a single sigmoid output.
    probabilities = model.predict(batch_features)
    predicted_labels = (probabilities > 0.5).astype("int32")

    sentences = np_sentence_to_str_sentence(batch_features)

    return pd.DataFrame(
        np.column_stack((sentences, batch_labels, predicted_labels)),
        columns=['sentence','ground_truth','pred'],
    )

get_test_preds(model,test_ds.batch(20))

In [None]:
t.word_index['hello']
t.index_word[7]

In [None]:
def str_sents_to_np(str_sent,max_len=20):
    """Encode one sentence with the global tokenizer ``t`` and pad it at the end to ``max_len``.

    Returns the padded array holding a single row for the sentence.
    """
    encoded = t.texts_to_sequences([str_sent])
    return tf.keras.preprocessing.sequence.pad_sequences(
        encoded,
        maxlen=max_len,
        padding='post',
    )

def pred_sent(model, str_sent, max_len=20):
    """Return ``model.predict`` for a single raw sentence.

    Args:
        model: fitted model taking padded index sequences as input.
        str_sent: the sentence to classify.
        max_len: length the encoded sentence is padded to.
    """
    # The unused local list and the debug print of max_len were removed.
    seq = str_sents_to_np(str_sent,max_len=max_len)
    return model.predict(seq)


str_sent="Now the new samsung s5 for free !"
pred_sent(model,str_sent, max_len=max_len)


## RNN model


In [None]:
# Masked embedding -> two stacked bidirectional GRUs -> dense head with batch
# normalization and dropout -> sigmoid output.
rnn_model = tf.keras.Sequential()
rnn_model.add(tf.keras.layers.Embedding(vocab_size, 64, mask_zero=True))
# The first recurrent layer returns the full sequence so that the second one can consume it.
rnn_model.add(tf.keras.layers.Bidirectional(tf.keras.layers.GRU(64, return_sequences=True)))
rnn_model.add(tf.keras.layers.Bidirectional(tf.keras.layers.GRU(64)))
rnn_model.add(tf.keras.layers.BatchNormalization())
rnn_model.add(tf.keras.layers.Dense(64, activation='relu'))
rnn_model.add(tf.keras.layers.BatchNormalization())
rnn_model.add(tf.keras.layers.Dropout(0.5))
rnn_model.add(tf.keras.layers.Dense(1, activation='sigmoid'))


In [None]:
# The model outputs a sigmoid probability, hence from_logits=False.
rnn_model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-4),
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
    metrics=['accuracy'],
)

In [None]:
train_batch_ds, xval_batch_ds = train_ds.batch(32), xval_ds.batch(32)

# One TensorBoard log directory per run, named after the start time.
run_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
log_dir = "logs/rnn-fit/" + run_stamp
tensorboard_cb = tf.keras.callbacks.TensorBoard(
    log_dir=log_dir,
    histogram_freq=1,
    write_graph=True,
    embeddings_freq=1,
)

# validation_steps=5: only 5 validation batches are evaluated after each epoch.
history = rnn_model.fit(
    train_batch_ds,
    epochs=10,
    validation_data=xval_batch_ds,
    validation_steps=5,
    callbacks=[tensorboard_cb],
)


In [None]:
# Loss and accuracy of the RNN on the held-out test split.
test_loss, test_acc = rnn_model.evaluate(test_ds.batch(100))

print('Test Loss: {}'.format(test_loss), 'Test Accuracy: {}'.format(test_acc), sep='\n')

In [None]:
get_test_preds(rnn_model,test_ds.batch(20))

In [None]:
str_sent="Now get the new samsung s5 for 499 USD !"
pred_sent(rnn_model,str_sent, max_len=max_len)

In [None]:
rnn_model.summary()

In [None]:
%tensorboard --logdir logs