In [41]:
from nltk.corpus import reuters
import nltk
#nltk.download('reuters')
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

## Getting train and test dataset from nltk reuters corpus

In [42]:
train_documents, train_categories = zip(*[(reuters.raw(i), reuters.categories(i)) for i in reuters.fileids() if i.startswith('training/')])
test_documents, test_categories = zip(*[(reuters.raw(i), reuters.categories(i)) for i in reuters.fileids() if i.startswith('test/')])

In [43]:
print("Number of training documents:", len(train_documents))
print("Number of testing documents:", len(test_documents))

Number of training documents: 7769
Number of testing documents: 3019


## Convert the categorical labels to Multi Label Encodings

In [44]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
train_labels = mlb.fit_transform(train_categories)
test_labels = mlb.transform(test_categories)

## Creating Dataframe for applying transformations

In [46]:
trainData = {"content": train_documents}
testData = {"content": test_documents}
trainDf = pd.DataFrame(trainData, columns=["content"])
testDf = pd.DataFrame(testData, columns=["content"])

## Pre-process the input data

In [291]:
wordnet_lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
stopwords = set(w.rstrip() for w in open("../../nltk_data/corpora/reuters/stopwords"))

def tokenize_lemma_stopwords(text):
    text = text.replace("\n", " ")
    tokens = nltk.tokenize.word_tokenize(text.lower()) # split string into words (tokens)
    tokens = [t for t in tokens if t.isalpha()] # keep strings with only alphabets
    tokens = [wordnet_lemmatizer.lemmatize(t) for t in tokens] # put words into base form
    tokens = [stemmer.stem(t) for t in tokens]
    tokens = [t for t in tokens if len(t) > 2] # remove short words, they're probably not useful
    tokens = [t for t in tokens if t not in stopwords] # remove stopwords
    cleanedText = " ".join(tokens)
    return cleanedText

def dataCleaning(df):
    data = df.copy()
    data["content"] = data["content"].apply(tokenize_lemma_stopwords)
    return data

In [292]:
cleanedTrainData = dataCleaning(trainDf)
cleanedTestData = dataCleaning(testDf)

# Create TF-IDF Vectorizer (Bag of Words)

In [293]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics

vectorizer = TfidfVectorizer()
vectorised_train_documents = vectorizer.fit_transform(cleanedTrainData["content"])
vectorised_test_documents = vectorizer.transform(cleanedTestData["content"])

## Train and Evaluate Classifiers

In [294]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, hamming_loss

ModelsPerformance = {}

def metricsReport(modelName, test_labels, predictions):
    accuracy = accuracy_score(test_labels, predictions)

    macro_precision = precision_score(test_labels, predictions, average='macro')
    macro_recall = recall_score(test_labels, predictions, average='macro')
    macro_f1 = f1_score(test_labels, predictions, average='macro')

    micro_precision = precision_score(test_labels, predictions, average='micro')
    micro_recall = recall_score(test_labels, predictions, average='micro')
    micro_f1 = f1_score(test_labels, predictions, average='micro')
    hamLoss = hamming_loss(test_labels, predictions)
    print("------" + modelName + " Model Metrics-----")
    print("Accuracy: {:.4f}\nHamming Loss: {:.4f}\nPrecision:\n  - Macro: {:.4f}\n  - Micro: {:.4f}\nRecall:\n  - Macro: {:.4f}\n  - Micro: {:.4f}\nF1-measure:\n  - Macro: {:.4f}\n  - Micro: {:.4f}"\
          .format(accuracy, hamLoss, macro_precision, micro_precision, macro_recall, micro_recall, macro_f1, micro_f1))
    ModelsPerformance[modelName] = micro_f1

In [295]:
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier

svmClassifier = OneVsRestClassifier(LinearSVC(), n_jobs=-1)
svmClassifier.fit(vectorised_train_documents, train_labels)

svmPreds = svmClassifier.predict(vectorised_test_documents)
metricsReport("SVC Sq. Hinge Loss", test_labels, svmPreds)

------SVC Sq. Hinge Loss Model Metrics-----
Accuracy: 0.8115
Hamming Loss: 0.0034
Precision:
  - Macro: 0.6485
  - Micro: 0.9445
Recall:
  - Macro: 0.3961
  - Micro: 0.8007
F1-measure:
  - Macro: 0.4680
  - Micro: 0.8667


In [296]:
from sklearn.metrics import classification_report
print('\nClassification Report\n')
print(classification_report(test_labels, svmPreds))


Classification Report

              precision    recall  f1-score   support

           0       0.98      0.96      0.97       719
           1       1.00      0.48      0.65        23
           2       1.00      0.64      0.78        14
           3       0.87      0.67      0.75        30
           4       0.88      0.39      0.54        18
           5       0.00      0.00      0.00         1
           6       1.00      0.94      0.97        18
           7       1.00      0.50      0.67         2
           8       0.00      0.00      0.00         3
           9       0.96      0.96      0.96        28
          10       1.00      0.78      0.88        18
          11       0.00      0.00      0.00         1
          12       0.96      0.79      0.86        56
          13       1.00      0.55      0.71        20
          14       0.00      0.00      0.00         2
          15       0.93      0.46      0.62        28
          16       0.00      0.00      0.00         1
   

# Word2Vec

In [297]:
## for word embedding
import gensim
import gensim.downloader as gensim_api
nlp = gensim_api.load("word2vec-google-news-300")

In [298]:
# from https://github.com/giannisnik/mpad/blob/master/mpad/utils.py
def load_embeddings(model, vocab):
    word_vecs = np.zeros((len(vocab)+1, 300))
    unknown_words = set()
    #model = KeyedVectors.load_word2vec_format(fname, binary=True)
    for word in vocab:
        if word in model:
            word_vecs[vocab[word],:] = model[word]
        else:
            unknown_words.add(word)
            word_vecs[vocab[word],:] = np.random.uniform(-0.25, 0.25, 300)
    print("Existing vectors:", len(vocab)-len(unknown_words))
    return word_vecs

def get_vocab(docs):
    vocab = dict()
    
    for index, row in docs.iterrows():
        doc = row['content'].split()
        
        for word in doc:
            if word not in vocab:
                vocab[word] = len(vocab)+1

    print("Vocabulary size: ", len(vocab))
        
    return vocab

In [299]:
vocab = get_vocab(pd.concat([cleanedTrainData, cleanedTestData]))
embeddings = load_embeddings(nlp, vocab)

Vocabulary size:  19906
Existing vectors: 7940


In [300]:
embeddings.shape

(19907, 300)

In [301]:
## for deep learning
from tensorflow.keras import models, layers, preprocessing as kprocessing
from tensorflow.keras import backend as K

In [302]:
def text2seq(docs, vocab, maxlen):
    res = []
    for index, row in docs.iterrows():
        doc = row['content'].split()
        tmp = []
        if len(doc) > maxlen:
            for i in range(maxlen): # for each doc, keep maxlen words
                tmp.append(vocab[doc[i]])
        else:
            for i in range(len(doc)):
                tmp.append(vocab[doc[i]])
            for i in range(maxlen - len(doc)):
                tmp.append(0)
        res.append(tmp)
        
    return np.array(res)


In [308]:
maxlen = 500 # how many words to check at each document
X_train = text2seq(cleanedTrainData, vocab, maxlen)
X_test = text2seq(cleanedTestData, vocab, maxlen)

# ## encode y
# dic_y_mapping = {n:label for n,label in 
#                  enumerate(np.unique(train_labels))}
# inverse_dic = {v:k for k,v in dic_y_mapping.items()}
# y_train = np.array([inverse_dic[y] for y in y_train])


In [309]:
## code attention layer
def attention_layer(inputs, neurons):
    x = layers.Permute((2,1))(inputs)
    x = layers.Dense(neurons, activation="softmax")(x)
    x = layers.Permute((2,1), name="attention")(x)
    x = layers.multiply([inputs, x])
    return x

## input
x_in = layers.Input(shape=(maxlen,))
## embedding
x = layers.Embedding(input_dim=embeddings.shape[0],  
                     output_dim=embeddings.shape[1], 
                     weights=[embeddings],
                     input_length=maxlen, trainable=False)(x_in)
## apply attention
#x = attention_layer(x, neurons=15)
## 2 layers of bidirectional lstm
x = layers.Bidirectional(layers.LSTM(units=maxlen, dropout=0.2, 
                         return_sequences=True))(x) #  return_sequences=True return the whole sequence from LSTM cells
x = layers.Bidirectional(layers.LSTM(units=maxlen, dropout=0.2))(x)
## final dense layers
x = layers.Dense(64, activation='relu')(x)
y_out = layers.Dense(90, activation='sigmoid')(x) # 90 possible categories # multi-label classification, use sigmoid
## compile
model = models.Model(x_in, y_out)
model.compile(loss='binary_crossentropy',
              optimizer='adam', metrics=['accuracy'])

model.summary()

Model: "model_12"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_10 (InputLayer)        [(None, 500)]             0         
_________________________________________________________________
embedding_9 (Embedding)      (None, 500, 300)          5972100   
_________________________________________________________________
bidirectional_18 (Bidirectio (None, 500, 1000)         3204000   
_________________________________________________________________
bidirectional_19 (Bidirectio (None, 1000)              6004000   
_________________________________________________________________
dense_28 (Dense)             (None, 64)                64064     
_________________________________________________________________
dense_29 (Dense)             (None, 90)                5850      
Total params: 15,250,014
Trainable params: 9,277,914
Non-trainable params: 5,972,100
_______________________________________

In [None]:
## train
training = model.fit(x=X_train, y=train_labels, validation_data=(X_test, test_labels), batch_size=256, 
                     epochs=100, shuffle=True, verbose=1) #validation_split=0.3
## plot loss and accuracy
metrics = [k for k in training.history.keys() if ("loss" not in k) and ("val" not in k)]
fig, ax = plt.subplots(nrows=1, ncols=2, sharey=True)
ax[0].set(title="Training")
ax11 = ax[0].twinx()
ax[0].plot(training.history['loss'], color='black')
ax[0].set_xlabel('Epochs')
ax[0].set_ylabel('Loss', color='black')
for metric in metrics:
    ax11.plot(training.history[metric], label=metric)
ax11.set_ylabel("Score", color='steelblue')
ax11.legend()
ax[1].set(title="Validation")
ax22 = ax[1].twinx()
ax[1].plot(training.history['val_loss'], color='black')
ax[1].set_xlabel('Epochs')
ax[1].set_ylabel('Loss', color='black')
for metric in metrics:
     ax22.plot(training.history['val_'+metric], label=metric)
ax22.set_ylabel("Score", color="steelblue")
plt.show()

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100


Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100

In [306]:
## test
th = 0.5
LSTM_preds = model.predict(X_test)
LSTM_preds[LSTM_preds>=th] = 1
LSTM_preds[LSTM_preds<th] = 0

In [307]:
#from sklearn.metrics import classification_report
print('\nClassification Report\n')
print(classification_report(test_labels, LSTM_preds))


Classification Report

              precision    recall  f1-score   support

           0       0.97      0.92      0.95       719
           1       0.71      0.22      0.33        23
           2       0.41      0.50      0.45        14
           3       0.46      0.60      0.52        30
           4       0.64      0.39      0.48        18
           5       0.00      0.00      0.00         1
           6       0.94      0.89      0.91        18
           7       0.00      0.00      0.00         2
           8       0.00      0.00      0.00         3
           9       0.89      0.89      0.89        28
          10       0.73      0.61      0.67        18
          11       0.00      0.00      0.00         1
          12       0.58      0.77      0.66        56
          13       0.65      0.55      0.59        20
          14       0.00      0.00      0.00         2
          15       0.60      0.43      0.50        28
          16       0.00      0.00      0.00         1
   

In [158]:
train_labels.shape

(7769, 90)

# BERT Bidirectional Encoder Representations from Transformers

In [287]:
## for bert language model
import transformers
import re

In [288]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

train_texts = trainDf["content"].to_list()

train_encodings = tokenizer(train_texts, truncation=True, padding=True)
#valid_encodings = tokenizer(valid_texts, truncation=True, padding=True)
#test_encodings = tokenizer(test_texts, truncation=True, padding=True)

In [289]:
## inputs
input_ids = layers.Input((512), dtype="int32", name="input_idx")
token_type_ids = layers.Input((512), dtype="int32", name="input_masks")
attention_mask = layers.Input((512), dtype="int32", name="input_segments")
## pre-trained bert
nlp = transformers.TFBertModel.from_pretrained("bert-base-uncased")
bert_out = nlp(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)[0]
## fine-tuning
x = layers.GlobalAveragePooling1D()(bert_out)
x = layers.Dense(64, activation="relu")(x)
y_out = layers.Dense(90, 
                     activation='sigmoid')(x)
## compile
model = models.Model([input_ids, token_type_ids, attention_mask], y_out)
for layer in model.layers[:4]:
    layer.trainable = False
model.compile(loss='binary_crossentropy', 
              optimizer='adam', metrics=['accuracy'])

model.summary()

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.
The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config o

Model: "model_10"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_idx (InputLayer)          [(None, 512)]        0                                            
__________________________________________________________________________________________________
input_segments (InputLayer)     [(None, 512)]        0                                            
__________________________________________________________________________________________________
input_masks (InputLayer)        [(None, 512)]        0                                            
__________________________________________________________________________________________________
tf_bert_model_4 (TFBertModel)   TFBaseModelOutputWit 109482240   input_idx[0][0]                  
                                                                 input_segments[0][0]      

In [290]:
training = model.fit(x=[np.array(train_encodings['input_ids']), np.array(train_encodings['token_type_ids']), np.array(train_encodings['attention_mask'])], y=train_labels, batch_size=256, 
                     epochs=100, shuffle=True, verbose=1) 

The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`).
The parameter `return_dict` cannot be set in graph mode and will always be set to `True`.


Epoch 1/100



The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`).
The parameter `return_dict` cannot be set in graph mode and will always be set to `True`.


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100

KeyboardInterrupt: 

In [219]:
## pre-trained bert
nlp = transformers.TFBertModel.from_pretrained("bert-base-uncased")
bert_out, _ = nlp([idx, masks, segments])
## fine-tuning
x = layers.GlobalAveragePooling1D()(bert_out)
x = layers.Dense(64, activation="relu")(x)
y_out = layers.Dense(len(np.unique(y_train)), 
                     activation='softmax')(x)
## compile
model = models.Model([idx, masks, segments], y_out)
for layer in model.layers[:4]:
    layer.trainable = False
model.compile(loss='sparse_categorical_crossentropy', 
              optimizer='adam', metrics=['accuracy'])
model.summary()

49

In [214]:
seq.split(" ")

['[CLS]',
 'bahia',
 'cocoa',
 'review',
 'showers',
 'continued',
 'throughout',
 'the',
 'week',
 'in',
 'the',
 'bahia',
 'cocoa',
 'zone',
 'all',
 '##ev',
 '[SEP]',
 '',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]']

In [206]:
seq.split(" ")

['[CLS]',
 'bahia',
 'cocoa',
 'review',
 'showers',
 'continued',
 'throughout',
 'the',
 'week',
 'in',
 'the',
 'bahia',
 'cocoa',
 'zone',
 'all',
 '##ev',
 '[SEP]',
 '',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]']