# **UNIT TESTS FOR ANN**

In [None]:
import re
import numpy as np
import pandas as pd
from nltk import WordNetLemmatizer
import keras
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout, Embedding, Bidirectional, GRU
from nltk.corpus import stopwords
from matplotlib import pyplot as plt
import nltk 
import pickle
from pickle import load
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
from nltk import download
download('wordnet')
download('omw-1.4')
download('stopwords')
import tensorflow
import gensim
import gensim.downloader as api
from sklearn.model_selection import train_test_split
import unittest
from google.colab import files

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
def preprocessing(text):
    """Normalize a headline: lowercase, strip symbols, lemmatize, drop stopwords.

    Every character other than digits, ``a-z``, ``_``, ``+``, ``-`` and ``*``
    is replaced by a space. The remaining whitespace-separated tokens are
    lemmatized with WordNet, and tokens whose lemma is an English stopword
    are discarded.

    Args:
        text: Raw text to clean.

    Returns:
        The surviving lemmas joined by single spaces.
    """
    text_cleaned = re.sub(r'[^0-9a-z_+\-*]', ' ', text.lower()).strip()
    lemm = WordNetLemmatizer()
    # Load the stopword list once per call; a set gives O(1) membership tests
    # instead of re-reading and scanning the list for every token.
    stop_words = set(stopwords.words('english'))
    title = []
    for token in text_cleaned.split():
        token_lemm = lemm.lemmatize(token)
        if token_lemm not in stop_words:
            title.append(token_lemm)
    return ' '.join(title)

In [None]:
# Prompt for a file upload through the Colab widget, then load the workbook
# into a DataFrame. Later cells read its 'headline' and 'labels' columns.
uploaded = files.upload()
data = pd.read_excel('/content/sample_400_Data&Headers.xlsx')

In [None]:
# Shuffle and hold out 25% of the rows for testing; the fixed seed keeps the
# split reproducible between runs.
train_data, test_data, train_label, test_label = train_test_split(
    data['headline'],
    data['labels'],
    test_size=0.25,
    shuffle=True,
    random_state=104,
)

### DATA PREPARATION

In [None]:
# Length every headline sequence is padded/truncated to.
max_token = 20
# The label codes the label tokenizer is fitted on.
labels = ['t', 'e', 'b', 'm']

# Word-level tokenizer fitted on the training headlines only.
data_tokenizer = Tokenizer()
data_tokenizer.fit_on_texts(train_data)

# Separate tokenizer that maps each label code to an integer id.
label_tokenizer = Tokenizer()
label_tokenizer.fit_on_texts(labels)

In [None]:
# Convert headlines and labels to integer sequences with their respective
# tokenizers (train split first, then test split).
train_data_sequences, test_data_sequences = (
    data_tokenizer.texts_to_sequences(split)
    for split in (train_data, test_data)
)
train_label_sequences, test_label_sequences = (
    label_tokenizer.texts_to_sequences(split)
    for split in (train_label, test_label)
)

In [None]:
# Left-pad / left-truncate every headline sequence to exactly max_token ids.
train_data_pad, test_data_pad = (
    pad_sequences(sequences, maxlen=max_token, padding='pre', truncating='pre')
    for sequences in (train_data_sequences, test_data_sequences)
)
# Labels only need converting from nested lists to arrays.
train_label_pad, test_label_pad = (
    np.array(sequences)
    for sequences in (train_label_sequences, test_label_sequences)
)

## WORD EMBEDDING IMPORT
- word2vec: imported from Gensim
- glove: local file 'glove.6B.300d.txt'

In [None]:
# Load the pretrained 'word2vec-google-news-300' vectors through gensim's
# downloader API; this is the default model used by word_emb_w2v.
w2v_model = api.load('word2vec-google-news-300')



In [None]:
# Upload the GloVe vectors file through the Colab upload widget.
# Upload might take a couple of hours to complete
from google.colab import files
uploaded = files.upload()
# Path the GloVe file is read from; used as the default of word_emb_glove.
gloVe_path = '/content/glove.6B.300d.txt'

Saving glove.6B.300d.txt to glove.6B.300d.txt


In [None]:
# Setting variables for the model
# Width of each embedding vector (the pretrained vectors used here are 300-d).
embed_dim = 300
# The Embedding layer's input length must equal the length the sequences were
# padded to, so derive it from max_token instead of repeating the literal.
max_len = max_token
# One row per tokenizer index plus one extra row, so that every index in
# data_tokenizer.word_index is a valid row of the embedding matrix.
vocab_size = len(data_tokenizer.word_index) + 1

In [None]:
# word2vec ---> VOCAB MATRIX
def word_emb_w2v(w2v_model=w2v_model):
    """Build the embedding matrix for the tokenizer vocabulary from word2vec.

    Only words present in ``data_tokenizer.word_index`` are looked up, so the
    full pretrained vocabulary is never copied into a Python dict. Membership
    and lookup go through ``in`` / ``get_vector`` on the vectors object itself
    rather than the deprecated ``.wv`` / ``.vocab`` attributes.

    Args:
        w2v_model: Pretrained word vectors supporting ``word in model`` and
            ``model.get_vector(word)``. Defaults to the module-level model.

    Returns:
        The matrix produced by ``get_embed_matrix`` for the words found.
    """
    word_vec_dict = {
        word: w2v_model.get_vector(word)
        for word in data_tokenizer.word_index
        if word in w2v_model
    }
    return get_embed_matrix(word_vec_dict)

In [None]:
# GloVe ---> VOCAB MATRIX
def word_emb_glove(gloVe_path=gloVe_path):
    """Build the embedding matrix for the tokenizer vocabulary from a GloVe file.

    Each non-empty line of the file is split on whitespace; the first field
    is taken as the word and the remaining fields as its float32 coefficients.

    Args:
        gloVe_path: Path to the UTF-8 GloVe text file. Defaults to the
            module-level path.

    Returns:
        The matrix produced by ``get_embed_matrix`` from the parsed vectors.
    """
    word_vec_dict = {}
    # The context manager closes the file even if a line fails to parse.
    with open(gloVe_path, 'r', encoding='utf-8') as file_reader:
        for vect in file_reader:
            values = vect.split()
            if not values:
                # A blank line (e.g. trailing newline) carries no vector.
                continue
            word_vec_dict[values[0]] = np.asarray(values[1:], dtype='float32')
    return get_embed_matrix(word_vec_dict)

In [None]:
# create embed matrix
def get_embed_matrix(word_vec_dict):
    """Arrange pretrained vectors into a matrix indexed by tokenizer id.

    Args:
        word_vec_dict: Mapping from word to its embedding vector.

    Returns:
        Array of shape ``(vocab_size, embed_dim)``. Row ``i`` holds the vector
        of the word whose index in ``data_tokenizer.word_index`` is ``i``;
        rows for words missing from ``word_vec_dict`` stay all zeros.
    """
    matrix = np.zeros(shape=(vocab_size, embed_dim))
    for token, row in data_tokenizer.word_index.items():
        vector = word_vec_dict.get(token)
        if vector is None:
            continue
        matrix[row] = vector
    return matrix

In [None]:
def build_model(word_emb, rnn_layer, embedding_matrix):
    """Build and compile the bidirectional recurrent classifier.

    Layer stack: Embedding -> Dropout(0.5) -> Bidirectional(LSTM|GRU, 100
    units) -> Dropout(0.5) -> Dense(5).

    Args:
        word_emb: When truthy, initialise the embedding layer from
            ``embedding_matrix``; otherwise use the 'uniform' initializer.
        rnn_layer: ``'lstm'`` selects an LSTM cell; any other value falls
            back to a GRU.
        embedding_matrix: Pretrained weights used when ``word_emb`` is
            truthy; ignored otherwise.

    Returns:
        The compiled ``Sequential`` model.

    Raises:
        ValueError: If ``word_emb`` is truthy but no matrix was supplied.
    """
    if word_emb:
        if embedding_matrix is None:
            raise ValueError(
                "embedding_matrix is required when word_emb is enabled"
            )
        embedding_initializer = tensorflow.constant_initializer(embedding_matrix)
    else:
        # default value
        embedding_initializer = 'uniform'

    # GRU is the default recurrent cell.
    rnn = LSTM(100) if rnn_layer == 'lstm' else GRU(100)

    model = Sequential()
    model.add(Embedding(
        input_dim=vocab_size,
        output_dim=embed_dim,
        input_length=max_len,
        embeddings_initializer=embedding_initializer,
    ))
    model.add(Dropout(0.5))
    model.add(Bidirectional(rnn))
    model.add(Dropout(0.5))
    # NOTE(review): presumably 5 outputs = the four label ids plus the unused
    # id 0 of the label tokenizer - confirm.
    model.add(Dense(5))
    model.compile(
        optimizer=keras.optimizers.Adam(),
        # The Dense layer has no activation, so it emits raw logits; the loss
        # must be told so, otherwise it treats them as probabilities.
        loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=['accuracy'],
    )
    return model

## EXECUTION OF MODELS

In [None]:
# Training hyper-parameters shared by every model.
epochs = 4
batch_size = 32

# Pretrained embedding matrices (GloVe first, then word2vec).
embedding_matrix_glove = word_emb_glove()
embedding_matrix_w2v = word_emb_w2v()

# One compiled model per (embedding source, recurrent cell) combination,
# built in the order listed.
models = {
    name: build_model(use_pretrained, rnn_kind, matrix)
    for name, use_pretrained, rnn_kind, matrix in (
        ("LSTM", False, 'lstm', None),
        ("GRU", False, 'gru', None),
        ("WORD2VEC-LSTM", True, 'lstm', embedding_matrix_w2v),
        ("WORD2VEC-GRU", True, 'gru', embedding_matrix_w2v),
        ("GLOVE-LSTM", True, 'lstm', embedding_matrix_glove),
        ("GLOVE-GRU", True, 'gru', embedding_matrix_glove),
    )
}

  word_vec_dict[word] = w2v_model.wv.get_vector(word)


In [None]:
# Train each model in dictionary order and collect its predictions on the
# test set; output[i] belongs to the i-th entry of `models`.
output = []
for model, network in models.items():
    network.fit(
        train_data_pad,
        train_label_pad,
        epochs=epochs,
        batch_size=batch_size,
    )
    output.append(network.predict(test_data_pad))

## Setting test parameters & executing tests

In [None]:
class Test(unittest.TestCase):
    """Checks on a model's predictions and on the model's layer structure.

    Each instance runs one test method against one object: a prediction
    array for the prediction tests, or a Keras model for the structure test.
    """

    def __init__(self, testName, output, expected_length=100):
        """Bind the object under test to a single test method.

        Args:
            testName: Name of the test method this instance runs.
            output: Object under test (prediction array or model).
            expected_length: Expected number of predictions. The default of
                100 corresponds to 400 headlines * 25% test size.
        """
        super().__init__(testName)
        self.output = output
        self.expected_length = expected_length

    # check if the prediction is an array
    def test_type_prediction(self):
        error_message = "Output of model is not a dinamic array"
        # isinstance-based check: also accepts ndarray subclasses.
        self.assertIsInstance(self.output, np.ndarray, error_message)

    # check if the prediction length is the expected one
    #   -> default: (400 headlines * 25% test size = 100)
    def test_length_predictions(self):
        error_message = "The length of the prediction is not the expected"
        self.assertEqual(len(self.output), self.expected_length, error_message)

    # check if ANN structure has expected layers
    def test_ANN_structure(self):
        layer_structure = [type(layer) for layer in self.output.layers]

        # Public layer names are used instead of private module paths, which
        # move between Keras releases.
        lista_layers = [
            keras.layers.Embedding,
            keras.layers.Dropout,
            keras.layers.Bidirectional,
            keras.layers.Dropout,
            keras.layers.Dense,
        ]
        error_message = "The ANN structure is not the expected"
        self.assertEqual(layer_structure, lista_layers, error_message)

In [None]:
# Assemble three checks per model: two on its predictions and one on its
# layer structure, then run the whole suite verbosely.
suite = unittest.TestSuite()
output_list = output
for i, model in enumerate(models):
    label_prediction = output_list[i]
    suite.addTests((
        Test('test_type_prediction', label_prediction),
        Test('test_length_predictions', label_prediction),
        Test('test_ANN_structure', models[model]),
    ))
unittest.TextTestRunner(verbosity=4).run(suite)

test_type_prediction (__main__.Test) ... ok
test_length_predictions (__main__.Test) ... ok
test_ANN_structure (__main__.Test) ... ok
test_type_prediction (__main__.Test) ... ok
test_length_predictions (__main__.Test) ... ok
test_ANN_structure (__main__.Test) ... ok
test_type_prediction (__main__.Test) ... ok
test_length_predictions (__main__.Test) ... ok
test_ANN_structure (__main__.Test) ... ok
test_type_prediction (__main__.Test) ... ok
test_length_predictions (__main__.Test) ... ok
test_ANN_structure (__main__.Test) ... ok
test_type_prediction (__main__.Test) ... ok
test_length_predictions (__main__.Test) ... ok
test_ANN_structure (__main__.Test) ... ok
test_type_prediction (__main__.Test) ... ok
test_length_predictions (__main__.Test) ... ok
test_ANN_structure (__main__.Test) ... ok

----------------------------------------------------------------------
Ran 18 tests in 0.042s

OK


<unittest.runner.TextTestResult run=18 errors=0 failures=0>