In [None]:
!pip install --upgrade gensim

import nltk
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision

from time import time
from torch import nn, optim
from torch.optim.lr_scheduler import StepLR
from torchvision import datasets, transforms
from sklearn import metrics

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

from keras.preprocessing.text import Tokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

from gensim.test.utils import common_texts
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

from tables.path import keyword
from sklearn.model_selection import train_test_split
from tensorflow import  keras
from keras.preprocessing import sequence

_CAT = 'category'
_MSG = 'message'



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


# Curso de Aprendizaje Automático
# Trabajo Practico 2: Redes Recurrentes y Representaciones Incrustadas

**Escuela de Ingeniería en Computación | Instituto Tecnológico de Costa Rica**

Realizado por:

*   Victoria Orozco
*   Ignacio Barquero
*   Esteban Villalobos

Fecha de entrega:

* 20 de Noviembre de 2022

Tipo de entrega:

* Digital, por medio de la plataforma TEC-digital.

Modo de trabajo

* Grupos de 2/3 personas.


## Data preprocessing
The class Preprocessing loads the specific dataset and makes the data partitions. It also converts the input text to indices, in order to feed the embedding layer.



In [None]:
def _preprocesar_oracion_1(sentence):
	''' Removes capitalization and stopworkds '''

	# Tokenize words while ignoring punctuation
	tokeniser = RegexpTokenizer(r'\w+')
	tokens = tokeniser.tokenize(sentence)
	keywords = [token.lower() for token in tokens
	            if token not in stopwords.words('english')]
	preprocessed_sentence = " ".join(keywords)
	return preprocessed_sentence


def _preprocesar_oracion_2(sentece):
	''' Removes capitalization, stopworkds and lematizes'''

	# Tokenize words while ignoring punctuation
	tokeniser = RegexpTokenizer(r'\w+')
	tokens = tokeniser.tokenize(sentece)

	# Lowercase and remove stopwords
	filtered_tokens = [token.lower() for token in tokens
	                   if token not in stopwords.words('english')]

	# Lowercase and lemmatise
	lemmatiser = WordNetLemmatizer()
	lemmas = [lemmatiser.lemmatize(token, pos='v') for token in filtered_tokens]

	preprocessed_sentence = " ".join(lemmas)
	return preprocessed_sentence


Excercize 1.a.1: Compare preprocessing versions

In [None]:
class Preprocessing:
	''' Usage example
	Input doc:
		['URGENT! Your Mobile number has been awarded a <UKP>2000 prize GUARANTEED. Call 09061790125 from landline. Claim 3030. Valid 12hrs only 150ppm']

	Output:
		preprocesar_documento_1:
			['urgent your mobile number awarded ukp 2000 prize guaranteed call 09061790125 landline claim 3030 valid 12hrs 150ppm']

		Procesar_documento_2:
			['urgent mobile number award ukp 2000 prize guarantee call 09061790125 landline claim 3030 valid 12hrs 150ppm']
	'''

	def __init__(self):
		self.data = 'SMSSpamCollection'
    #maximum length for each sequence, CORRECT
		self.max_len = 200
    #Maximum number of words in the dictionary
		self.max_words = 200
		#percentage of test data
		self.test_size = 0.2

	def load_data(self, drop_na=False):
		"""
		Loads and splits the data
		"""
		# load training and test data
		df = pd.read_csv(self.data, sep="\t", names = [_CAT, _MSG], header = None)
		# print(df)

		# Replace dependent variable to a int value: 0 = not_spam, 1 = spam
		df.category = df.category.map({'ham': 0., 'spam': 1.})
		df.astype({_CAT:'float'})
		print(df)

		# extract input and labels
		X = df['message'].values
		Y = df['category'].values
		# create train/test split using sklearn
		self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(X, Y, test_size=self.test_size)

	def preprocesar_documento_1(self):
		""" Preprocess training corpus version 1 """

		for i in range (0, len(self.x_train)):
			self.x_train[i] = _preprocesar_oracion_1(self.x_train[i])

	def preprocesar_documento_2(self):
		""" Preprocess training corpus version 2 """

		for i in range (0, len(self.x_train)):
			self.x_train[i] = _preprocesar_oracion_2(self.x_train[i])

	def prepare_tokens(self, version=2):
		"""
		Tokenizes the input text
		"""
		#tokenize the input text
		if version == 1:
			self.preprocesar_documento_1()
		else:
			self.preprocesar_documento_2()

		self.tokens = Tokenizer(num_words=self.max_words)
		self.tokens.fit_on_texts(self.x_train)

	def sequence_to_token(self, x):
		"""
		Converts the input sequence of strings to a sequence of integers
		"""
		#transform the token list to a sequence of integers
		sequences = self.tokens.texts_to_sequences(x)
	  #add padding using the maximum length specified
		return keras.utils.pad_sequences(sequences, maxlen=self.max_len)


In [6]:
p = Preprocessing()
print('Data loading')
p.load_data()
print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n')
sent = p.x_train[50]
print('Initial sentence(random):\t', sent)
print('Preprocessed sentence v1:\t', _preprocesar_oracion_1(sent))
print('Preprocessed sentence v2:\t', _preprocesar_oracion_2(sent))

print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n')
print('Example')
sent = "Wow. I never realized that you were so embarassed by your accomodations. " \
       "I thought you liked it, since i was doing the best i could and you always seemed so happy about \"the cave\". "\
       "I'm sorry I didn't and don't have more to give."
print('Initial sentence:\t\t', sent)
print('Preprocessed sentence v1:\t', _preprocesar_oracion_1(sent))
print('Preprocessed sentence v2:\t', _preprocesar_oracion_2(sent))


Data loading
      category                                            message
0          0.0  Go until jurong point, crazy.. Available only ...
1          0.0                      Ok lar... Joking wif u oni...
2          1.0  Free entry in 2 a wkly comp to win FA Cup fina...
3          0.0  U dun say so early hor... U c already then say...
4          0.0  Nah I don't think he goes to usf, he lives aro...
...        ...                                                ...
5567       1.0  This is the 2nd time we have tried 2 contact u...
5568       0.0               Will ü b going to esplanade fr home?
5569       0.0  Pity, * was in mood for that. So...any other s...
5570       0.0  The guy did some bitching but I acted like i'd...
5571       0.0                         Rofl. Its true to its name

[5572 rows x 2 columns]
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

Initial sentence(random):	 No idea, I guess we'll work that out an hour after we're supposed to leave si

## Model
Creates the LSTM model. The hidden state $h$ and cell $c$ are initialized with noise. The LSTM receives the entire sequence of embeddings.
An Embedding layer is trained in order to learn the data representations.
At the top of the model, a fully connected model is defined.

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class LSTM_Classifier(nn.ModuleList):

	def __init__(self, batch_size = 64, hidden_dim = 20, lstm_layers = 2,
	             max_words = 200):
		"""
		param batch_size: batch size for training data
		param hidden_dim: number of hidden units used in the LSTM and the
										  Embedding layer
		param lstm_layers: number of lstm_layers
		param max_words: maximum sentence length
		"""
		super(LSTM_Classifier, self).__init__()
		#batch size during training
		self.batch_size = batch_size
		#number of hidden units in the LSTM layer
		self.hidden_dim = hidden_dim
		#Number of LSTM layers
		self.LSTM_layers = lstm_layers
		self.input_size = max_words # embedding dimension

		self.dropout = nn.Dropout(0.5)
		self.embedding = nn.Embedding(self.input_size, self.hidden_dim, padding_idx=0)
		self.lstm = nn.LSTM(input_size=self.hidden_dim, hidden_size=self.hidden_dim, num_layers=self.LSTM_layers, batch_first=True)
		self.fc1 = nn.Linear(in_features=self.hidden_dim, out_features=257)
		self.fc2 = nn.Linear(257, 1)

	def forward(self, x):
		"""
		Forward pass
		param x: model input
		"""
		#it starts with noisy estimations of h and c
		h = torch.zeros((self.LSTM_layers, x.size(0), self.hidden_dim))
		c = torch.zeros((self.LSTM_layers, x.size(0), self.hidden_dim))
		#Fills the input Tensor with values according to the method described in Understanding the difficulty of training deep feedforward neural networks - Glorot, X. & Bengio, Y. (2010), using a normal distribution.
		#The resulting tensor will have values sampled from \mathcal{N}(0, \text{std}^2)N(0,std)
		torch.nn.init.xavier_normal_(h)
		torch.nn.init.xavier_normal_(c)
		#print("x shape ", x.shape)
		#print("embedding ", self.embedding)
		out = self.embedding(x)
		out, (hidden, cell) = self.lstm(out, (h,c))
		out = self.dropout(out)
		out = torch.relu_(self.fc1(out[:,-1,:]))
		out = self.dropout(out)
	  #sigmoid activation function
		out = torch.sigmoid(self.fc2(out))

		return out

## Data iterator
In order to get ready the training phase, first, we need to prepare the way how the sequences will be fed to the model. For this purpose, PyTorch provides two very useful classes: Dataset and DataLoader. The aim of Dataset class is to provide an easy way to iterate over a dataset by batches.

In [None]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

class DatasetMaper(Dataset):
	'''
	Handles batches of dataset
	'''
	def __init__(self, x, y):
		"""
		Inits the dataset mapper
		"""
		self.x = x
		self.y = y

	def __len__(self):
		"""
		Returns the length of the dataset
		"""
		return len(self.x)

	def __getitem__(self, idx):
		"""
		Fetches a specific item by id
		"""
		return self.x[idx], self.y[idx]




## Load training data

In [None]:
def create_data_loaders(batch_size = 64, preprocess_version=2):
  preprocessor = Preprocessing()
  #load the data
  preprocessor.load_data()

  #tokenize the text
  preprocessor.prepare_tokens(version=preprocess_version)
  raw_x_train = preprocessor.x_train
  raw_x_test = preprocessor.x_test
  y_train = preprocessor.y_train
  y_test = preprocessor.y_test

  #convert sequence of strings to tokens
  x_train = preprocessor.sequence_to_token(raw_x_train)
  x_test = preprocessor.sequence_to_token(raw_x_test)

  #create data loaders
  training_set = DatasetMaper(x_train, y_train)
  test_set = DatasetMaper(x_test, y_test)
  loader_training = DataLoader(training_set, batch_size=batch_size)
  loader_test = DataLoader(test_set)
  return loader_training, loader_test

In [None]:
loader_training_v2, loader_test_v2 = create_data_loaders(preprocess_version=2)
loader_training_v1, loader_test_v1 = create_data_loaders(preprocess_version=1)

      category                                            message
0          0.0  Go until jurong point, crazy.. Available only ...
1          0.0                      Ok lar... Joking wif u oni...
2          1.0  Free entry in 2 a wkly comp to win FA Cup fina...
3          0.0  U dun say so early hor... U c already then say...
4          0.0  Nah I don't think he goes to usf, he lives aro...
...        ...                                                ...
5567       1.0  This is the 2nd time we have tried 2 contact u...
5568       0.0               Will ü b going to esplanade fr home?
5569       0.0  Pity, * was in mood for that. So...any other s...
5570       0.0  The guy did some bitching but I acted like i'd...
5571       0.0                         Rofl. Its true to its name

[5572 rows x 2 columns]
      category                                            message
0          0.0  Go until jurong point, crazy.. Available only ...
1          0.0                      Ok lar... Jokin

## Train the model


Accuracy evaluation functions

In [None]:
from sklearn.metrics import accuracy_score
import numpy as np


def calculate_accuray(y_pred, y_gt):
  return accuracy_score(y_pred, y_gt)


def evaluate_model(model, loader_test):

  predictions = []
  accuracies = []
    # The model is turned in evaluation mode
  model.eval()

      # Skipping gradients update
  with torch.no_grad():

            # Iterate over the DataLoader object
    for x_batch, y_batch in loader_test:
      #print("batch")
      x = x_batch.type(torch.LongTensor)
      y = y_batch.type(torch.FloatTensor)

                  # Feed the model
      y_pred = model(x)
      y_pred = torch.round(y_pred).flatten()
      # print("y_pred: ", y_pred)
      # Save prediction
      predictions += list(y_pred.detach().numpy())
      acc_batch = accuracy_score(y_pred, y)
      accuracies += [acc_batch]
  return np.array(accuracies)

Train the model using the dataset loader for the training partition.

In [None]:
import torch.optim as optim

def train_model(model, loader_training, loader_test, epochs = 10,
                learning_rate = 0.01):

  # Defines a RMSprop optimizer to update the parameters
  optimizer = optim.RMSprop(model.parameters(), lr=learning_rate)


  for epoch in range(epochs):

    predictions = []

    # model in training mode
    model.train()
    loss_dataset = 0
    for x_batch, y_batch in loader_training:
      #print("x_batch \n ", x_batch)
      #print("y batch \n", y_batch)
      x = x_batch.type(torch.LongTensor)
      y = y_batch.type(torch.FloatTensor)
      # Feed the model the entire sequence and get output "y_pred"
      y_pred = model(x).flatten()
      #print("y\n", y)
      #print("y pred ", y_pred)
      # Calculate loss
      loss = F.binary_cross_entropy(y_pred, y)

      # The gradientes are calculated
      # i.e. derivates are calculated
      loss.backward()

      # Each parameter is updated
      # with torch.no_grad():
      #     a -= lr * a.grad
      #     b -= lr * b.grad
      optimizer.step()
      # Take the gradients to zero!
      # a.grad.zero_()
      # b.grad.zero_()
      optimizer.zero_grad()
      loss_dataset += loss
    accuracies = evaluate_model(model, loader_test)
    print("Epoch ", epoch, " Loss training : ", loss_dataset.item(), " Accuracy test: ", accuracies.mean())

Test: D = 20, preprocess version 2, 10 models test

In [None]:
#hyper parameters
learning_rate = 0.01
epochs = 10  # 50

for i in range(10):
  print("Model #", i)
  model = LSTM_Classifier(hidden_dim=20)  # Dimension = 20
  # Train preprocessor 2
  train_model(model, loader_training_v2, loader_test_v2, epochs, learning_rate)
  accuracies = evaluate_model(model, loader_test_v2)
  print(f"average accuracy model #{i}: {accuracies.mean()}")
  print("++++++++++++++++++++++++++++++++++++++++++++++++\n")



Model # 0
Epoch  0  Loss training :  19.57164192199707  Accuracy test:  0.9596412556053812
Epoch  1  Loss training :  6.532021522521973  Accuracy test:  0.9659192825112107
Epoch  2  Loss training :  5.148623466491699  Accuracy test:  0.967713004484305
Epoch  3  Loss training :  4.374024391174316  Accuracy test:  0.9560538116591928
Epoch  4  Loss training :  3.4256131649017334  Accuracy test:  0.9542600896860987
Epoch  5  Loss training :  3.024905204772949  Accuracy test:  0.9659192825112107
Epoch  6  Loss training :  2.7564826011657715  Accuracy test:  0.9641255605381166
Epoch  7  Loss training :  1.9829388856887817  Accuracy test:  0.9605381165919282
Epoch  8  Loss training :  1.7899994850158691  Accuracy test:  0.9542600896860987
Epoch  9  Loss training :  1.202175498008728  Accuracy test:  0.9614349775784753
average accuracy model #0: 0.9614349775784753
++++++++++++++++++++++++++++++++++++++++++++++++

Model # 1
Epoch  0  Loss training :  14.98291301727295  Accuracy test:  0.9587443

Test: D = 100, preprocess version 2, 10 models test

In [None]:
#hyper parameters
learning_rate = 0.01
epochs = 10  # 50

for i in range(10):
  print("Model #", i)
  model_100 = LSTM_Classifier(hidden_dim=100)  # Dimension = 100
  # Train preprocessor 2
  train_model(model_100, loader_training_v2, loader_test_v2, epochs, learning_rate)
  accuracies = evaluate_model(model_100, loader_test_v2)
  print(f"average accuracy model #{i}: {accuracies.mean()}")
  print("++++++++++++++++++++++++++++++++++++++++++++++++\n")

Model # 0
Epoch  0  Loss training :  40.62193298339844  Accuracy test:  0.9713004484304932
Epoch  1  Loss training :  6.095354080200195  Accuracy test:  0.9515695067264573
Epoch  2  Loss training :  4.000911712646484  Accuracy test:  0.9632286995515695
Epoch  3  Loss training :  3.010891914367676  Accuracy test:  0.9560538116591928
Epoch  4  Loss training :  2.389524221420288  Accuracy test:  0.947085201793722
Epoch  5  Loss training :  7.597579479217529  Accuracy test:  0.9327354260089686
Epoch  6  Loss training :  2.5797417163848877  Accuracy test:  0.9641255605381166
Epoch  7  Loss training :  1.0570229291915894  Accuracy test:  0.947085201793722
Epoch  8  Loss training :  1.48758065700531  Accuracy test:  0.9524663677130045
Epoch  9  Loss training :  1.0288139581680298  Accuracy test:  0.9506726457399103
average accuracy model #0: 0.9506726457399103
++++++++++++++++++++++++++++++++++++++++++++++++

Model # 1
Epoch  0  Loss training :  53.991249084472656  Accuracy test:  0.966816143

Test: D = 20, preprocess version 1, 10 models test

In [None]:
#hyper parameters
learning_rate = 0.01
epochs = 10  # 50

for i in range(10):
  print("Model #", i)
  model = LSTM_Classifier(hidden_dim=20)  # Dimension = 20
  # Train preprocessor 2
  train_model(model, loader_training_v1, loader_test_v1, epochs, learning_rate)
  accuracies = evaluate_model(model, loader_test_v1)
  print(f"average accuracy model #{i}: {accuracies.mean()}")
  print("++++++++++++++++++++++++++++++++++++++++++++++++\n")

Model # 0
Epoch  0  Loss training :  14.535436630249023  Accuracy test:  0.9417040358744395
Epoch  1  Loss training :  5.504261016845703  Accuracy test:  0.9390134529147982
Epoch  2  Loss training :  4.109217643737793  Accuracy test:  0.9479820627802691
Epoch  3  Loss training :  3.0082881450653076  Accuracy test:  0.9461883408071748
Epoch  4  Loss training :  2.3335564136505127  Accuracy test:  0.9345291479820628
Epoch  5  Loss training :  2.002533435821533  Accuracy test:  0.9327354260089686
Epoch  6  Loss training :  1.5289751291275024  Accuracy test:  0.9560538116591928
Epoch  7  Loss training :  4.377479553222656  Accuracy test:  0.9399103139013453
Epoch  8  Loss training :  2.274275302886963  Accuracy test:  0.9417040358744395
Epoch  9  Loss training :  1.2916669845581055  Accuracy test:  0.9479820627802691
average accuracy model #0: 0.9488789237668162
++++++++++++++++++++++++++++++++++++++++++++++++

Model # 1
Epoch  0  Loss training :  14.743879318237305  Accuracy test:  0.9452

Test: D = 100, preprocess version 1, 10 models test

In [None]:
#hyper parameters
learning_rate = 0.01
epochs = 10  # 50

for i in range(10):
  print("Model #", i)
  model = LSTM_Classifier(hidden_dim=100)  # Dimension = 100
  # Train preprocessor 2
  train_model(model, loader_training_v1, loader_test_v1, epochs, learning_rate)
  accuracies = evaluate_model(model, loader_test_v1)
  print(f"average accuracy model #{i}: {accuracies.mean()}")
  print("++++++++++++++++++++++++++++++++++++++++++++++++\n")

Model # 0
Epoch  0  Loss training :  721.6635131835938  Accuracy test:  0.8807174887892377
Epoch  1  Loss training :  634.4773559570312  Accuracy test:  0.8807174887892377
Epoch  2  Loss training :  12.37992000579834  Accuracy test:  0.9668161434977578
Epoch  3  Loss training :  5.625229835510254  Accuracy test:  0.9668161434977578
Epoch  4  Loss training :  3.593289852142334  Accuracy test:  0.9605381165919282
Epoch  5  Loss training :  2.9294116497039795  Accuracy test:  0.9650224215246637
Epoch  6  Loss training :  2.4447474479675293  Accuracy test:  0.947085201793722
Epoch  7  Loss training :  2.2420754432678223  Accuracy test:  0.9668161434977578
Epoch  8  Loss training :  17.86869239807129  Accuracy test:  0.9461883408071748
Epoch  9  Loss training :  7.250354766845703  Accuracy test:  0.9668161434977578
average accuracy model #0: 0.9650224215246637
++++++++++++++++++++++++++++++++++++++++++++++++

Model # 1
Epoch  0  Loss training :  49.65599060058594  Accuracy test:  0.90224215

## 2. Multi Layer Perceptron


1. Feature extractor using word2vec and gensim

In [None]:
class Preprocessor2:
  def __init__(self):
    df = pd.read_csv('SMSSpamCollection', sep="\t", names = [_CAT, _MSG],
                     header = None)

    # Replace dependent variable to a int value: 0 = not_spam, 1 = spam
    df.category = df.category.map({'ham': 0., 'spam': 1.})
    df.astype({_CAT:'float'})
    self.df = df.dropna().reset_index(drop=True)

    # extract input and labels
    X = self.df['message'].values
    Y = self.df['category'].values
    # create train/test split using sklearn
    self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(
        X, Y, test_size=0.7)

  def __len__(self):
    return len(self.df[_MSG])

  def __iter__(self):
      for line in self.df[_MSG]:
          # assume there's one document per line, tokens separated by whitespace
          yield self.preprocess(line)

  def preprocess(self, sentence):
    return _preprocesar_oracion_2(sentence)


def generate_model(corpus, num_features):
  # Followed training recomendation from https://www.kaggle.com/code/pierremegret/gensim-word2vec-tutorial/notebook
  model = Word2Vec(vector_size=num_features, min_count=1, window=2,
                  sample=6e-5, alpha=0.03, min_alpha=0.0007, negative=20,
                  workers=4)
  sentences = [s.split() for s in corpus]
  model.build_vocab(sentences, progress_per=10000)
  model.train(sentences, total_examples=len(sentences), epochs=30, report_delay=1)
  return model


def extract_features_dataset(model, preprocessed_dataset, max_length_words = 100,
                             num_features = 20):
  """"
  retorne un tensor features_dataset con N×D dimensiones, donde N es la cantidad de
  observaciones de la muestra, y D la cantidad de dimensiones seleccionadas
  para el vector incrustado (num_features).
  """
  N = len(preprocessed_dataset)

  empty_vector = np.array([0]*num_features, dtype=np.float32)
  features_list = []
  for i in range(len(preprocessed_dataset)):
    feature = []
    sentence = preprocessed_dataset[i]
    # print(sentence)
    words = sentence.split()
    for word in words:
      try:
        feature.append(model.wv.get_vector(word))
      except KeyError:
        feature.append(empty_vector)
        print(f"word not found ${word}, added empty embedding instead")

    # print(feature)  # each word has D dimensions
    features_list.append(feature)

  # normalize sentences to be of max_length_words size
  features_list = keras.utils.pad_sequences(features_list, maxlen=max_length_words,
                                            dtype='float32')
  print(len(features_list))
  return torch.tensor(features_list)

In [None]:
D = 20  # embedding dimensions
corpus = Preprocessor2()
model = generate_model(corpus, D)

sample = [corpus.preprocess(sentence) for sentence in corpus.x_test]

features_dataset_20D = extract_features_dataset(model, sample,
                                                max_length_words=100,
                                                num_features=D)
print(features_dataset_20D)
print(features_dataset_20D.shape)

3901
tensor([[[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         ...,
         [-0.6404,  0.1182, -0.4312,  ..., -0.8848,  0.0256, -0.9930],
         [-0.4023,  0.0924, -0.3467,  ..., -0.5513, -0.0664, -0.6518],
         [-0.4219,  0.0732, -0.4652,  ..., -0.8297, -0.1072, -0.9690]],

        [[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         ...,
         [-0.4999,  0.0966, -0.4862,  ..., -0.6704, -0.1964, -0.8771],
         [-0.6807,  0.0463, -0.6160,  ..., -0.7662, -0.2221, -1.0226],
         [-0.3686,  0.0798, -0.2949,  ..., -0.4510, -0.1333, -0.6066]],

        [[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ..

In [None]:
def create_data_loaders2(model, corpus, max_length_words=100, D=20, batch_size = 64):
  raw_x_train =  [corpus.preprocess(sentence) for sentence in corpus.x_train]
  raw_x_test =  [corpus.preprocess(sentence) for sentence in corpus.x_test]
  y_train = corpus.y_train
  y_test = corpus.y_test

  #convert sequence of strings to tokens
  x_train = extract_features_dataset(model, raw_x_train, num_features=D)
  x_test = extract_features_dataset(model, raw_x_test, num_features=D)

  #create data loaders
  training_set = DatasetMaper(x_train, y_train)
  test_set = DatasetMaper(x_test, y_test)
  loader_training = DataLoader(training_set, batch_size=batch_size)
  loader_test = DataLoader(test_set)
  return loader_training, loader_test


MAX_WORDS_PER_SENTENCE = 100
D = 20  # embedding dimensions
corpus = Preprocessor2()
model20 = generate_model(corpus, D)
loader_training_D20, loader_test_D20 = create_data_loaders2(model20, corpus, MAX_WORDS_PER_SENTENCE, D)

D = 100  # embedding dimensions
model100 = generate_model(corpus, D)
loader_training_D100, loader_test_D100 = create_data_loaders2(model100, corpus, MAX_WORDS_PER_SENTENCE, D)


1671
3901
1671
3901


2. MPL SGD

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def create_MLP_model(input_layer_size=2000, intermediate_layer_size=14, ouput_size=2):
    # Model creation with neural net Sequential model
    model=nn.Sequential(
        nn.Linear(input_layer_size, intermediate_layer_size),   # 1 layer: _INPUT_LAYER features (pixels)
        #nn.ReLU(),                                  # Defining Regular linear unit as activation
        nn.Sigmoid(),
        nn.Linear(intermediate_layer_size, ouput_size),   # Out Layer
        nn.LogSoftmax(dim=1) # Defining the log softmax to find the probablities for the last output unit
    )
    return model


def get_new_model(input_size=2000, intermediate_size=14):

  print("Running on device: ", device)
  mlp_model = create_MLP_model(input_size, intermediate_size)

  #moving models to device
  mlp_model.to(device)
  #create error criterion
  criterion = nn.NLLLoss()
  print("MLP model\n", mlp_model)

  return mlp_model, criterion

In [None]:
def train_model(model, criterion, trainloader, epochs = 15, lr = 0.01):
    time0 = time()
    running_loss_list= []
    epochs_list = []
    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)
    for e in range(epochs):
        running_loss = 0

        #go for every batch
        for features, labels in trainloader:
            #move data to specific device
            features = features.flatten(1, 2)
            features = features.to(device)
            labels = labels.to(device)

            # defining gradient in each epoch as 0
            optimizer.zero_grad()
            # modeling for each image batch
            features = features.float()
            output = model(features)

            labels = labels.long()
            # calculating the loss
            loss = criterion(output, labels)  # tener cuidado que tanto etiquetas como outputs esten en el formato que espera la func de error

            # This is where the model learns by backpropagating
            loss.backward()  # Calculo de los gradientes (Matrioshka mas grande).

            # And optimizes its weights here
            optimizer.step()  # Actualiza pesos en todas las capas

            # calculating the loss
            running_loss += loss.item()
            # print(f'Output {output}, labels {labels}, loss {loss.item()}')
            # print("Epoch {} - Training loss: {}".format(e, running_loss/len(trainloader)))

    print("\nTraining Time (in minutes) =",(time()-time0)/60)
    return model


In [None]:
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score


def test_model_mlp(testloader, model, input_size=2000):
    correct_count, all_count = 0, 0
    all_predicted = []
    all_labels = []

    for features, labels in testloader:
      #move data to specific device
      features = features.flatten(1, 2)
      features = features.to(device)
      features = features.float()
      labels = labels.to(device)
      labels = labels.long()

      for i in range(len(labels)):
        feature = features[i].view(1, input_size)

        #evaluate model with no grad
        with torch.no_grad():
            logps = model(feature)

        ps = torch.exp(logps)
        probab = list(ps.cpu().numpy()[0])

        #get predicted label
        pred_label = probab.index(max(probab))
        true_label = labels.cpu().numpy()[i]

        all_predicted.append(pred_label)
        all_labels.append(true_label)

        if true_label == pred_label:
          correct_count += 1

        all_count += 1

    # accuracy = correct_count/all_count
    accuracy = accuracy_score(all_labels, all_predicted)
    f1 = f1_score(all_labels, all_predicted, average='macro')

    print("Number Of Observations Tested =", all_count)
    print("\nModel Accuracy =", accuracy)
    print("\nModel F1 =", f1)

    return accuracy, f1, all_count

In [None]:
def train_models(input_layer, intermediate_layer, loader_train, loader_test, trials=10):
  accuracies = []
  f1s = []
  alpha = 0.01

  for run in range(trials):
    print(f"Training MLP model #{run}")
    new_model, criterion = get_new_model(input_layer, intermediate_layer)
    mlp_model = train_model(new_model, criterion, loader_train,
                            epochs = 10, lr = alpha)
    print(f"Testing MLP model #{run}")
    accuracy, f1, all_count = test_model_mlp(loader_test, mlp_model, input_layer)
    accuracies.append(accuracy)
    f1s.append(f1)
    print("------------------------------------------------------")

  return accuracies, f1s

In [None]:
print("------------------------------------------------------")
print("D = 20")
input_size = 20 * MAX_WORDS_PER_SENTENCE
accuracies_1, f1s_1 = train_models(input_size, int(input_size*.7),
                                   loader_training_D20, loader_test_D20)

print("------------------------------------------------------")


------------------------------------------------------
D = 20
Training MLP model #0
Running on device:  cpu
MLP model
 Sequential(
  (0): Linear(in_features=2000, out_features=1400, bias=True)
  (1): Sigmoid()
  (2): Linear(in_features=1400, out_features=2, bias=True)
  (3): LogSoftmax(dim=1)
)

Training Time (in minutes) = 0.10327964623769124
Testing MLP model #0
Number Of Observations Tested = 3901

Model Accuracy = 0.9543706741861061

Model F1 = 0.8972132419117996
------------------------------------------------------
Training MLP model #1
Running on device:  cpu
MLP model
 Sequential(
  (0): Linear(in_features=2000, out_features=1400, bias=True)
  (1): Sigmoid()
  (2): Linear(in_features=1400, out_features=2, bias=True)
  (3): LogSoftmax(dim=1)
)

Training Time (in minutes) = 0.11187851428985596
Testing MLP model #1
Number Of Observations Tested = 3901

Model Accuracy = 0.9556523968213279

Model F1 = 0.9003578196550551
------------------------------------------------------
Training

In [None]:
print(f"D = 14 \n Accuracies: {accuracies_1}\nF1 scores: {f1s_1}")
print(f"Accuracy ->\t mean = {np.mean(accuracies_1)}, Std = {np.std(accuracies_1)}")
print(f"F1 ->\t\t mean = {np.mean(f1s_1)}, Std = {np.std(f1s_1)}")
print("------------------------------------------------------")

D = 14 
 Accuracies: [0.9543706741861061, 0.9556523968213279, 0.9502691617533966, 0.9515508843886183, 0.952063573442707, 0.9548833632401948, 0.9543706741861061, 0.9538579851320175, 0.9546270187131505, 0.9543706741861061]
F1 scores: [0.8972132419117996, 0.9003578196550551, 0.8902334240713279, 0.8918835492346318, 0.8937490869468391, 0.8988900788054652, 0.8972132419117996, 0.8969443466009158, 0.8977025797142718, 0.8972132419117996]
Accuracy ->	 mean = 0.9536016406049731, Std = 0.0016253132513865415
F1 ->		 mean = 0.8961400610763907, Std = 0.0030102921288484913
------------------------------------------------------


In [None]:
print("------------------------------------------------------")

print("D = 100")
input_size = 100 * MAX_WORDS_PER_SENTENCE

accuracies_2, f1s_2 = train_models(input_size, int(input_size*.7),
                                   loader_training_D100, loader_test_D100)


------------------------------------------------------
D = 100
Training MLP model #0
Running on device:  cpu
MLP model
 Sequential(
  (0): Linear(in_features=10000, out_features=7000, bias=True)
  (1): Sigmoid()
  (2): Linear(in_features=7000, out_features=2, bias=True)
  (3): LogSoftmax(dim=1)
)

Training Time (in minutes) = 2.515516432126363
Testing MLP model #0
Number Of Observations Tested = 3901

Model Accuracy = 0.8692642912073827

Model F1 = 0.46503017004936914
------------------------------------------------------
Training MLP model #1
Running on device:  cpu
MLP model
 Sequential(
  (0): Linear(in_features=10000, out_features=7000, bias=True)
  (1): Sigmoid()
  (2): Linear(in_features=7000, out_features=2, bias=True)
  (3): LogSoftmax(dim=1)
)

Training Time (in minutes) = 2.660879675547282
Testing MLP model #1
Number Of Observations Tested = 3901

Model Accuracy = 0.8700333247885158

Model F1 = 0.47106951996078367
------------------------------------------------------
Trainin

In [None]:
print("------------------------------------------------------")
print(f"D = 70 \n Accuracies: {accuracies_2}\nF1 scores: {f1s_2}")
print(f"Accuracy ->\t mean = {np.mean(accuracies_2)}, Std = {np.std(accuracies_2)}")
print(f"F1 ->\t\t mean = {np.mean(f1s_2)}, Std = {np.std(f1s_2)}")

------------------------------------------------------
D = 70 
 Accuracies: [0.8692642912073827, 0.8700333247885158, 0.8936170212765957, 0.8692642912073827, 0.9279671879005383, 0.8692642912073827, 0.8746475262753141, 0.8723404255319149, 0.8310689566777749, 0.8692642912073827]
F1 scores: [0.46503017004936914, 0.47106951996078367, 0.628193335836672, 0.46503017004936914, 0.7991763191763191, 0.46503017004936914, 0.5059212862489869, 0.48878520904382977, 0.7279100975561792, 0.46503017004936914]
Accuracy ->	 mean = 0.8746731607280184, Std = 0.02290075034084115
F1 ->		 mean = 0.5481176448020249, Std = 0.11858300819821961
