In [1]:
# import pytorch libraries
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import numpy as np
import random
# Seed the RNGs that influence results so a fresh "Restart & Run All" is
# repeatable. `random` is seeded for anything relying on Python's global RNG
# (presumably torchtext's Dataset.split shuffling -- TODO confirm); torch is
# seeded so that weight initialisation and dropout masks are repeatable too.
# numpy's global RNG is deliberately left unseeded: in this notebook it is only
# used by get_idle_gpu to pick a random GPU among the idle ones.
seed = 0
random.seed(seed)
torch.manual_seed(seed)

In [92]:
"""
To get an ID of an available GPU
"""
import numpy as np
import subprocess as sp

# A GPU counts as usable in get_idle_gpu only if its reported free memory is
# strictly greater than this value. Presumably expressed in MiB, the unit
# nvidia-smi prints for memory.free -- TODO confirm.
ACCEPTABLE_AVAILABLE_MEMORY = 11167

# https://github.com/yselivonchyk/TensorFlow_DCIGN/blob/master/utils.py
def _output_to_list(output):
  return output.decode('ascii').split('\n')[:-1]


def get_idle_gpu(leave_unmasked=1, random=True):
  """Return the index of a GPU with enough free memory, or -1 if none is found.

  Runs ``nvidia-smi`` to read the free memory of every GPU and keeps the
  GPUs reporting strictly more than ``ACCEPTABLE_AVAILABLE_MEMORY``.

  Args:
    leave_unmasked: the number of usable GPUs must exceed this value,
      otherwise -1 is returned.
    random: when true, the usable GPUs are shuffled with ``np.random.shuffle``
      before the first one is taken; otherwise the lowest usable index is
      returned. (The name shadows the stdlib ``random`` module inside this
      function; it is kept for caller compatibility.)

  Returns:
    int: index of the chosen GPU, or -1 when too few GPUs are usable, when
    ``nvidia-smi`` is missing or fails, or when its output cannot be parsed.
  """
  try:
    command = "nvidia-smi --query-gpu=memory.free --format=csv"
    # The first CSV row is skipped (presumably the column header).
    memory_free_info = _output_to_list(sp.check_output(command.split()))[1:]
    # Keep only the leading number of each row, dropping the trailing unit.
    memory_free_values = [int(row.split()[0]) for row in memory_free_info]
  except FileNotFoundError as e:
    print('"nvidia-smi" is probably not installed. GPUs are not masked')
    print(e)
    return -1
  except sp.CalledProcessError as e:
    print("Error on GPU masking:\n", e.output)
    return -1
  except (ValueError, IndexError) as e:
    # A row that is blank or does not start with an integer (for example a
    # "not available" marker) used to escape as an unhandled exception.
    print("Could not parse nvidia-smi output:\n", e)
    return -1

  available_gpus = [i for i, x in enumerate(memory_free_values) if x > ACCEPTABLE_AVAILABLE_MEMORY]

  if len(available_gpus) <= leave_unmasked:
    print('Found only %d usable GPUs in the system' % len(available_gpus))
    return -1

  if random:
    available_gpus = np.asarray(available_gpus)
    np.random.shuffle(available_gpus)

  gpu_to_use = available_gpus[0]
  print("Using GPU: ", gpu_to_use)

  return int(gpu_to_use)

In [264]:
import torchtext

is_character_level = True

# Character level: tweets are capped at 280 tokens and the tokenizer is the
# identity, so the raw string itself is handed on as the token sequence.
# Word level: tweets are capped at 200 tokens and torchtext's 'basic_english'
# tokenizer is used.
tweet_max_len = 280 if is_character_level else 200
tokenizer = (lambda x: x) if is_character_level else torchtext.data.get_tokenizer('basic_english')

In [265]:
filename = 'data/train_conll_spanglish.csv'

# Choose the compute device. get_idle_gpu returns -1 when no GPU qualifies or
# when nvidia-smi cannot be queried; formatting that into "cuda:-1" would make
# torch.device raise, so the CPU is used as the fallback in that case.
gpu_id = 'cpu'
if torch.cuda.is_available():
    idle_gpu = get_idle_gpu(leave_unmasked=0)
    if idle_gpu >= 0:
        gpu_id = "cuda:{}".format(idle_gpu)

device = torch.device(gpu_id)
print(device)

def label2int(label):
    """Map a sentiment label to its integer class id.

    'positive' -> 1, 'negative' -> 0, any other value -> 2.
    """
    return 1 if label == 'positive' else (0 if label == 'negative' else 2)

def label2float(label):
    """Map a sentiment label to its class id as a float.

    'positive' -> 1.0, 'negative' -> 0.0, any other value -> 2.0.
    """
    for known_label, class_value in (('positive', 1.), ('negative', 0.)):
        if label == known_label:
            return class_value
    return 2.

# Text column: a lower-cased token sequence mapped to vocabulary indices.
# include_lengths=True is why the models receive `text` as a pair and read the
# index tensor from text[0]; fix_length presumably pads/truncates every
# example to tweet_max_len tokens -- TODO confirm against torchtext docs.
text_field = torchtext.data.Field(
    sequential=True,
    use_vocab=True,
    tokenize=tokenizer,
    lower=True,
    fix_length=tweet_max_len,
    include_lengths=True,
    batch_first=True,
)

# Label column: a single value per example, converted by label2int into one of
# three class ids (0 = negative, 1 = positive, 2 = anything else).
label_field = torchtext.data.Field(
    sequential=False,
    use_vocab=False,
    is_target=True,
    batch_first=True,
    preprocessing=lambda x: label2int(x),
)

# Column layout of the input file: the id column is ignored (None).
fields = [('id', None), ('text', text_field), ('label', label_field)]

# The file carries a .csv extension but is read as tab-separated values.
dataset = torchtext.data.TabularDataset(path=filename,
                                        format="tsv",
                                        fields=fields)

Using GPU:  1
cuda:1


In [266]:
# Show the tokenised text and numeric label of the first ten examples.
for example in (dataset[idx] for idx in range(10)):
    print(example.text, "---", example.label)


so that means tomorrow cruda segura lol --- 1
tonight peda segura --- 2
eres tan mala vieja bruja interesada#jamming --- 0
yo kiero pretzels lol --- 2
fuck that ni ke el me vaya a mantener toda la vida lol --- 0
i always tell my dad ke me kiero kasar con una vieja rika and me regaña telling me ke no sea interesada ha --- 0
ke me compre un carrito pa irme con mis friends and party lol --- 2
why can i just find a rich bitch ke me mantenga y ya ha --- 2
since i started working ya ni disfruto la vida lol --- 0
my dad me regano cuzs i was telling that to my brother and lo andaba molestando lol --- 0


In [267]:
# Split the dataset into three parts with ratios 0.8 / 0.1 / 0.1.
# NOTE(review): presumably torchtext reads a three-element split_ratio as
# train/test/valid sizes while returning train/valid/test; that is harmless
# here because both held-out ratios are 0.1 -- confirm before changing them.
# NOTE(review): the name `train` is rebound later by the `def train(...)`
# training function, so re-running the iterator cell after that definition
# would pass a function instead of this dataset.
train, val, test = dataset.split(split_ratio=[0.8,0.1,0.1])

In [268]:
# Build the vocabulary of the text field.
# NOTE(review): it is built from the full `dataset`, not only the training
# split, so tokens that occur only in val/test also get their own index.
text_field.build_vocab(dataset)
# text_field.vocab.stoi
# text_field.vocab.itos

In [269]:
# Print the number of entries in the vocabulary.
print(len(text_field.vocab))
# The string literal below is disabled code (it would print the first few
# vocabulary entries); it is never executed.
"""
i = 0
for word in text_field.vocab.itos:
    if i > 10:
        break
    print(word)
    i += 1
"""

555
<unk>
<pad>
 


In [263]:
if not is_character_level:
    from torchnlp.word_to_vector import FastText

    # Word level only: load the aligned FastText vectors of each language and
    # look up one vector per vocabulary entry, in vocabulary order.
    lang1, lang2, lang3 = "en", "es", "hi"
    lang1_vectors, lang2_vectors, lang3_vectors = (
        FastText(language=code, aligned=True, cache="wiki.{}.align.vec".format(code))
        for code in (lang1, lang2, lang3)
    )

    vocab_words = text_field.vocab.itos
    en_embeddings = [lang1_vectors[token] for token in vocab_words]
    es_embeddings = [lang2_vectors[token] for token in vocab_words]
    hi_embeddings = [lang3_vectors[token] for token in vocab_words]
    print(len(en_embeddings), len(es_embeddings), len(hi_embeddings))

36140 36140 36140


In [270]:
# All three iterators share the same settings: batches of 32, examples sorted
# by text length inside each batch, a single pass per iteration (repeat=False),
# and tensors created on `device`.
_iterator_options = dict(
    batch_size=32,
    sort_key=lambda x: len(x.text),
    sort_within_batch=True,
    repeat=False,
    device=device,
)
train_iter, val_iter, test_iter = (
    torchtext.data.BucketIterator(split, **_iterator_options)
    for split in (train, val, test)
)

In [9]:
"""
for i, batch in enumerate(train_iter):
    if i >= 2:
        break
    print(batch.text)
#     print(batch.text[0].shape)
    print(batch.label)
"""

'\nfor i, batch in enumerate(train_iter):\n    if i >= 2:\n        break\n    print(batch.text)\n#     print(batch.text[0].shape)\n    print(batch.label)\n'

In [10]:
"""
Another version of preprocessing data
"""
"""
import sklearn
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
import os

INPUT_PATH = "data/train_conll_spanglish.csv"
MAX_TWEET = 280

char_to_ind = {}
ind_to_char = {}

char_to_ind.update({"UNK":0})
ind_to_char.update({0:"UNK"})

count = 1

with open(INPUT_PATH, 'r') as f:
    for line in f:
        for char in line.split('\t')[1]:
            if char.lower() not in char_to_ind:
                char_to_ind.update({char.lower():count})
                ind_to_char.update({count:char.lower()})
                count += 1

#print(char_to_ind)
#print(ind_to_char)

n_letters = len(char_to_ind)
"""

'\nimport sklearn\nimport pandas as pd\nimport numpy as np\nfrom sklearn.feature_extraction.text import TfidfVectorizer\nfrom sklearn.naive_bayes import MultinomialNB\nfrom sklearn.svm import SVC\nimport os\n\nINPUT_PATH = "data/train_conll_spanglish.csv"\nMAX_TWEET = 280\n\nchar_to_ind = {}\nind_to_char = {}\n\nchar_to_ind.update({"UNK":0})\nind_to_char.update({0:"UNK"})\n\ncount = 1\n\nwith open(INPUT_PATH, \'r\') as f:\n    for line in f:\n        for char in line.split(\'\t\')[1]:\n            if char.lower() not in char_to_ind:\n                char_to_ind.update({char.lower():count})\n                ind_to_char.update({count:char.lower()})\n                count += 1\n\n#print(char_to_ind)\n#print(ind_to_char)\n\nn_letters = len(char_to_ind)\n'

In [11]:
"""
def letterToTensor(letter, n_letters):
    tensor = torch.zeros(1, n_letters)
    tensor[0][char_to_ind[letter]] = 1
    return tensor

def lineToTensor(line, n_letters):
    tensor = torch.zeros(len(line), n_letters)
    for li, letter in enumerate(line):
        tensor[li][char_to_ind[letter]] = 1
    return tensor

def batchToTensor(batch, n_letters):
    tensor = torch.zeros(len(batch),MAX_TWEET,n_letters)
    for sentence, line in enumerate(batch):
        for li, letter in enumerate(line):
            tensor[sentence][li][char_to_ind[letter.lower()]] = 1
    return tensor


#print(letterToTensor('o'))
print(lineToTensor('hello how are tou', n_letters).shape)
print(batchToTensor(['hello friend', 'linear svm is better'], n_letters))
print(batchToTensor(['hello friend', 'linear svm is better'], n_letters).shape)
"""

"\ndef letterToTensor(letter, n_letters):\n    tensor = torch.zeros(1, n_letters)\n    tensor[0][char_to_ind[letter]] = 1\n    return tensor\n\ndef lineToTensor(line, n_letters):\n    tensor = torch.zeros(len(line), n_letters)\n    for li, letter in enumerate(line):\n        tensor[li][char_to_ind[letter]] = 1\n    return tensor\n\ndef batchToTensor(batch, n_letters):\n    tensor = torch.zeros(len(batch),MAX_TWEET,n_letters)\n    for sentence, line in enumerate(batch):\n        for li, letter in enumerate(line):\n            tensor[sentence][li][char_to_ind[letter.lower()]] = 1\n    return tensor\n\n\n#print(letterToTensor('o'))\nprint(lineToTensor('hello how are tou', n_letters).shape)\nprint(batchToTensor(['hello friend', 'linear svm is better'], n_letters))\nprint(batchToTensor(['hello friend', 'linear svm is better'], n_letters).shape)\n"

In [12]:
"""
trainpath = INPUT_PATH
train = pd.read_csv(trainpath, sep='\\t', names=["ID","SENTENCE","LABEL"])
"""

'\ntrainpath = INPUT_PATH\ntrain = pd.read_csv(trainpath, sep=\'\\t\', names=["ID","SENTENCE","LABEL"])\n'

In [13]:
"""
print(train['SENTENCE'][0].lower())
train_char_features = batchToTensor(train['SENTENCE'], n_letters)
"""

"\nprint(train['SENTENCE'][0].lower())\ntrain_char_features = batchToTensor(train['SENTENCE'], n_letters)\n"

In [14]:
"""
print(train_char_features.shape)
for in_tensor in train_char_features:
    print(in_tensor.shape)
    break
# train_char_features[0].shape
"""

'\nprint(train_char_features.shape)\nfor in_tensor in train_char_features:\n    print(in_tensor.shape)\n    break\n# train_char_features[0].shape\n'

In [251]:
class CharShallowCNN(nn.Module):
    """
    Single-stage TextCNN over token indices, based on
    https://github.com/bentrevett/pytorch-sentiment-analysis/blob/master/4%20-%20Convolutional%20Sentiment%20Analysis.ipynb

    Args:
        vocab_size: number of rows of the embedding table.
        embed_dim: width of each embedding vector.
        conv0_f_nums: number of filters per kernel size.
        conv0_f_sizes: iterable of kernel heights (tokens covered per filter).
        output_dim: number of output logits.
        dropout: dropout probability applied to the pooled features.
    """
    def __init__(self, vocab_size, embed_dim,
                 conv0_f_nums, conv0_f_sizes,
                 output_dim, dropout):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        # Start from one-hot rows (identity matrix). That is only well defined
        # when embed_dim == vocab_size; previously the table was overwritten
        # with a vocab_size x vocab_size matrix regardless of embed_dim, which
        # broke the convolutions below whenever the two differed. For any
        # other embed_dim the default random initialisation is kept.
        if embed_dim == vocab_size:
            with torch.no_grad():
                self.embedding.weight.copy_(torch.eye(vocab_size))
        # The embedding is left trainable (requires_grad is not switched off).

        # One convolution per kernel size; each kernel spans the full
        # embedding width so the output width collapses to 1.
        self.conv_0 = nn.ModuleList([
                nn.Conv2d(in_channels = 1,
                          out_channels = conv0_f_nums,
                          kernel_size = (fs, embed_dim))
                for fs in conv0_f_sizes
        ])

        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(len(conv0_f_sizes) * conv0_f_nums, output_dim)

    def forward(self, text):
        """Return logits of shape [batch_size, output_dim].

        `text` is indexable; only text[0], the index tensor of shape
        [batch_size, sentence_length], is used.
        """
        in_data = text[0]
        # embedded = [batch_size, sentence_length, embedding_dimension]
        embedded = self.embedding(in_data)
        # add the channel axis: [batch_size, 1, sentence_length, embedding_dimension]
        embedded = embedded.unsqueeze(1)
        # each entry = [batch_size, n_filters, sentence_length - fs + 1]
        conved_0 = [F.relu(conv(embedded)).squeeze(3) for conv in self.conv_0]

        # max over the whole sequence: each entry = [batch_size, n_filters]
        cnn_output = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved_0]

        # cat = [batch_size, n_filters * len(filter_sizes)]
        cat = self.dropout(torch.cat(cnn_output, dim=1))
        logit = self.fc(cat)

        return logit
        

In [252]:
class WordShallowCNN(nn.Module):
    """
    Single-stage TextCNN over two frozen embedding tables, based on
    https://github.com/bentrevett/pytorch-sentiment-analysis/blob/master/4%20-%20Convolutional%20Sentiment%20Analysis.ipynb

    The same token indices are looked up in both tables and the two results
    are fed to the convolutions as two input channels.

    Args:
        vocab_size: number of rows of each embedding table.
        embed_dim: width of each embedding vector.
        conv0_f_nums: number of filters per kernel size.
        conv0_f_sizes: iterable of kernel heights.
        output_dim: number of output logits.
        dropout: dropout probability applied to the pooled features.
        lang1_weights: optional tensor used as the first table's weights.
        lang2_weights: optional tensor used as the second table's weights.
    """
    def __init__(self, vocab_size, embed_dim,
                 conv0_f_nums, conv0_f_sizes,
                 output_dim, dropout,
                 lang1_weights=None, lang2_weights=None):
        super().__init__()
        self.lang1_embedding = nn.Embedding(vocab_size, embed_dim)
        self.lang2_embedding = nn.Embedding(vocab_size, embed_dim)
        tables = ((self.lang1_embedding, lang1_weights),
                  (self.lang2_embedding, lang2_weights))
        for table, pretrained in tables:
            # Use the supplied weights when given, and freeze the table
            # either way.
            if pretrained is not None:
                table.weight = nn.Parameter(pretrained)
            table.weight.requires_grad = False

        # One two-channel convolution per kernel size, spanning the full
        # embedding width.
        branches = []
        for fs in conv0_f_sizes:
            branches.append(nn.Conv2d(in_channels=2,
                                      out_channels=conv0_f_nums,
                                      kernel_size=(fs, embed_dim)))
        self.conv_0 = nn.ModuleList(branches)

        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(len(conv0_f_sizes) * conv0_f_nums, output_dim)

    def forward(self, text):
        """Return logits of shape [batch_size, output_dim].

        Only text[0], the index tensor [batch_size, sentence_length], is used.
        """
        token_ids = text[0]
        # Stack both lookups on a new channel axis:
        # [batch_size, 2, sentence_length, embedding_dimension]
        embedded = torch.stack((self.lang1_embedding(token_ids),
                                self.lang2_embedding(token_ids)), dim=1)

        pooled = []
        for conv in self.conv_0:
            # [batch_size, n_filters, sentence_length - fs + 1]
            feature_map = F.relu(conv(embedded)).squeeze(3)
            # max over the whole sequence -> [batch_size, n_filters]
            pooled.append(F.max_pool1d(feature_map, feature_map.shape[2]).squeeze(2))

        # dropout against overfitting; [batch_size, n_filters * len(filter_sizes)]
        features = self.dropout(torch.cat(pooled, dim=1))
        return self.fc(features)

In [286]:
class CharDeepCNN(nn.Module):
    """
    Two-stage TextCNN over token indices, based on
    https://github.com/bentrevett/pytorch-sentiment-analysis/blob/master/4%20-%20Convolutional%20Sentiment%20Analysis.ipynb

    Each first-stage branch (one per entry of conv0_f_sizes) is followed by
    the second-stage convolution at the same position in conv1_f_sizes.

    Args:
        vocab_size: number of rows of the embedding table.
        embed_dim: width of each embedding vector.
        conv0_f_nums: number of filters per first-stage kernel size.
        conv0_f_sizes: iterable of first-stage kernel heights.
        conv1_f_nums: number of filters per second-stage convolution.
        conv1_f_sizes: iterable of second-stage kernel sizes, paired
            position by position with conv0_f_sizes.
        output_dim: number of output logits.
        dropout: dropout probability applied to the pooled features.
    """
    def __init__(self, vocab_size, embed_dim,
                 conv0_f_nums, conv0_f_sizes,
                 conv1_f_nums, conv1_f_sizes,
                 output_dim, dropout):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        # Start from one-hot rows (identity matrix). That is only well defined
        # when embed_dim == vocab_size; previously the table was overwritten
        # with a vocab_size x vocab_size matrix regardless of embed_dim, which
        # broke the first convolution whenever the two differed. For any other
        # embed_dim the default random initialisation is kept.
        if embed_dim == vocab_size:
            with torch.no_grad():
                self.embedding.weight.copy_(torch.eye(vocab_size))
        # The embedding is left trainable (requires_grad is not switched off).

        # First stage: one convolution per kernel size, spanning the full
        # embedding width.
        self.conv_0 = nn.ModuleList([
                nn.Conv2d(in_channels = 1,
                          out_channels = conv0_f_nums,
                          kernel_size = (fs, embed_dim))
                for fs in conv0_f_sizes
        ])
        # Second stage: 1-D convolutions over the first-stage feature maps.
        self.conv_1 = nn.ModuleList([
                nn.Conv1d(in_channels = conv0_f_nums,
                          out_channels = conv1_f_nums,
                          kernel_size = fs)
                for fs in conv1_f_sizes
        ])

        self.dropout = nn.Dropout(dropout)
        # forward() zips the two stages, so the number of pooled branches is
        # the shorter of the two lists. Sizing the classifier from that count
        # keeps it consistent with forward(); it is unchanged whenever
        # len(conv1_f_sizes) <= len(conv0_f_sizes).
        n_branches = min(len(conv0_f_sizes), len(conv1_f_sizes))
        self.fc = nn.Linear(n_branches * conv1_f_nums, output_dim)

    def forward(self, text):
        """Return logits of shape [batch_size, output_dim].

        `text` is indexable; only text[0], the index tensor of shape
        [batch_size, sentence_length], is used.
        """
        in_data = text[0]
        # embedded = [batch_size, sentence_length, embedding_dimension]
        embedded = self.embedding(in_data)
        # add the channel axis: [batch_size, 1, sentence_length, embedding_dimension]
        embedded = embedded.unsqueeze(1)
        # each entry = [batch_size, conv0_f_nums, sentence_length - fs + 1]
        conved_0 = [F.relu(conv(embedded)).squeeze(3) for conv in self.conv_0]

        # second stage, paired position by position with the first stage
        conved_1 = [F.relu(conv1(conv0)) for conv1, conv0 in zip(self.conv_1, conved_0)]
        # max over the whole remaining sequence: each entry = [batch_size, conv1_f_nums]
        cnn_output = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved_1]

        # cat = [batch_size, conv1_f_nums * number of branches]
        cat = self.dropout(torch.cat(cnn_output, dim=1))
        logit = self.fc(cat)

        return logit

In [254]:
class WordDeepCNN(nn.Module):
    """
    Two-stage TextCNN over two frozen embedding tables, based on
    https://github.com/bentrevett/pytorch-sentiment-analysis/blob/master/4%20-%20Convolutional%20Sentiment%20Analysis.ipynb

    The same token indices are looked up in both tables and the results are
    fed to the first stage as two input channels. Each first-stage branch is
    followed by the second-stage convolution at the same list position.

    Args:
        vocab_size: number of rows of each embedding table.
        embed_dim: width of each embedding vector.
        conv0_f_nums: number of filters per first-stage kernel size.
        conv0_f_sizes: iterable of first-stage kernel heights.
        conv1_f_nums: number of filters per second-stage convolution.
        conv1_f_sizes: iterable of second-stage kernel sizes, paired
            position by position with conv0_f_sizes.
        output_dim: number of output logits.
        dropout: dropout probability applied to the pooled features.
        lang1_weights: optional tensor used as the first table's weights.
        lang2_weights: optional tensor used as the second table's weights.
    """
    def __init__(self, vocab_size, embed_dim,
                 conv0_f_nums, conv0_f_sizes,
                 conv1_f_nums, conv1_f_sizes,
                 output_dim, dropout,
                 lang1_weights=None, lang2_weights=None):
        super().__init__()
        self.lang1_embedding = nn.Embedding(vocab_size, embed_dim)
        self.lang2_embedding = nn.Embedding(vocab_size, embed_dim)
        if lang1_weights is not None:
            self.lang1_embedding.weight = nn.Parameter(lang1_weights)
        if lang2_weights is not None:
            self.lang2_embedding.weight = nn.Parameter(lang2_weights)
        # both embedding tables are frozen
        self.lang1_embedding.weight.requires_grad=False
        self.lang2_embedding.weight.requires_grad=False
        # First stage: one two-channel convolution per kernel size, spanning
        # the full embedding width.
        self.conv_0 = nn.ModuleList([
                nn.Conv2d(in_channels = 2,
                          out_channels = conv0_f_nums,
                          kernel_size = (fs, embed_dim))
                for fs in conv0_f_sizes
        ])
        # Second stage: 1-D convolutions over the first-stage feature maps.
        self.conv_1 = nn.ModuleList([
                nn.Conv1d(in_channels = conv0_f_nums,
                          out_channels = conv1_f_nums,
                          kernel_size = fs)
                for fs in conv1_f_sizes
        ])

        self.dropout = nn.Dropout(dropout)
        # forward() zips the two stages, so the number of pooled branches is
        # the shorter of the two lists. Sizing the classifier from that count
        # keeps it consistent with forward(); it is unchanged whenever
        # len(conv1_f_sizes) <= len(conv0_f_sizes).
        n_branches = min(len(conv0_f_sizes), len(conv1_f_sizes))
        self.fc = nn.Linear(n_branches * conv1_f_nums, output_dim)

    def forward(self, text):
        """Return logits of shape [batch_size, output_dim].

        Only text[0], the index tensor [batch_size, sentence_length], is used.
        """
        in_data = text[0]
        lang1_embedded = self.lang1_embedding(in_data)
        lang1_embedded = lang1_embedded.unsqueeze(1)
        lang2_embedded = self.lang2_embedding(in_data)
        lang2_embedded = lang2_embedded.unsqueeze(1)
        # embedded = [batch_size, 2, sentence_length, embedding_dimension]
        embedded = torch.cat((lang1_embedded, lang2_embedded), dim=1)
        # each entry = [batch_size, conv0_f_nums, sentence_length - fs + 1]
        conved_0 = [F.relu(conv(embedded)).squeeze(3) for conv in self.conv_0]

        # second stage, paired position by position with the first stage
        conved_1 = [F.relu(conv1(conv0)) for conv1, conv0 in zip(self.conv_1, conved_0)]
        # max over the whole remaining sequence: each entry = [batch_size, conv1_f_nums]
        cnn_output = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved_1]

        # cat = [batch_size, conv1_f_nums * number of branches]
        cat = self.dropout(torch.cat(cnn_output, dim=1))
        logit = self.fc(cat)

        return logit
        

In [287]:
# train_char_features = [num_of_tweets, max. length of each tweet, embedding_size]
"""
num_features = list(train_char_features.shape)
print(num_features)
max_tweet_length = num_features[1]
embedding_dim = num_features[2] # vocab_size
"""
if not is_character_level:
    # Word level: stack the per-token vectors into weight matrices.
    # en_embeddings is always the first language; the second one is
    # es_embeddings when the data file name contains "spanglish",
    # otherwise hi_embeddings.
    lang1_weights = torch.stack(en_embeddings)
    lang2_weights = torch.stack(es_embeddings if "spanglish" in filename else hi_embeddings)
    print(lang1_weights.shape, lang2_weights.shape)
    # rows = vocabulary entries, columns = embedding width
    input_dim = lang1_weights.shape[0]
    embedding_dim = lang1_weights.shape[1]
else:
    # Character level: the embedding width equals the vocabulary size, which
    # is what the one-hot initialisation in the Char* models relies on.
    input_dim = embedding_dim = len(text_field.vocab)
    
# First stage: one branch per kernel height (3- to 6-gram windows),
# with conv0_filter_nums filters each.
conv0_filter_sizes = [3, 4, 5, 6]
conv0_filter_nums = 5
# Second stage (deep models only): the deep models pair the two lists
# position by position, so one kernel size of 3 is given per first-stage branch.
conv1_filter_sizes = [3] * len(conv0_filter_sizes)
conv1_filter_nums = 5
# Three classes come out of label2int: 0, 1 and 2.
output_dim = 3
dropout = 0.5

# True builds the single-stage models, False the two-stage ones.
is_shallow = False

# Build the model selected by the two switches:
#   is_character_level -> Char* (one-hot embedding) vs Word* (two pretrained tables)
#   is_shallow         -> one convolution stage vs two
if is_character_level:
    if is_shallow:
        model = CharShallowCNN(input_dim, embedding_dim,
                               conv0_filter_nums, conv0_filter_sizes,
                               output_dim, dropout)
    else:
        model = CharDeepCNN(input_dim, embedding_dim,
                            conv0_filter_nums, conv0_filter_sizes,
                            conv1_filter_nums, conv1_filter_sizes,
                            output_dim, dropout)
elif is_shallow:
    model = WordShallowCNN(input_dim, embedding_dim,
                           conv0_filter_nums, conv0_filter_sizes,
                           output_dim, dropout,
                           lang1_weights=lang1_weights, lang2_weights=lang2_weights)
else:
    model = WordDeepCNN(input_dim, embedding_dim,
                        conv0_filter_nums, conv0_filter_sizes,
                        conv1_filter_nums, conv1_filter_sizes,
                        output_dim, dropout,
                        lang1_weights=lang1_weights, lang2_weights=lang2_weights)

In [288]:
# checking the parameters
def count_parameters(model):
    """Return the total element count of the model's trainable parameters.

    Parameters whose requires_grad is False are not counted.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Report the trainable-parameter count. For the Char* models this includes
# the embedding table, because their embedding is not frozen.
print('The model has {} trainable parameters'.format(count_parameters(model)))

The model has 358378 trainable parameters


In [289]:
# training
import torch.optim as optim

# Move the model and the loss to the target device first: the torch.optim
# documentation asks for a model to be moved before its optimizer is
# constructed, so the optimizer is created afterwards.
model = model.to(device)

criterion = nn.CrossEntropyLoss()
criterion = criterion.to(device)

optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [274]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def get_results(logits, labels):
    """Score one batch of predictions against its labels.

    The predicted class is the index of the largest logit along dimension 1.

    Returns:
        dict with keys 'accuracy', 'precision', 'recall' and 'f1'; the last
        three are macro-averaged.
    """
    predicted = torch.max(logits, 1)[1]
    # scikit-learn works on numpy arrays, so bring both tensors to the CPU
    y_pred = predicted.cpu().numpy()
    y_true = labels.cpu().numpy()

    accuracy = accuracy_score(y_true, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='macro')
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }

In [275]:
def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over `iterator`.

    For every batch: reset gradients, run the model on batch.text, compute
    the loss against batch.label, back-propagate and step the optimizer.

    Returns:
        tuple (loss, accuracy, f1, precision, recall), each the mean of the
        per-batch values (sum divided by len(iterator)).

    NOTE(review): this function shares its name with the `train` dataset
    split created earlier in the notebook and rebinds that name.
    """
    metric_keys = ('accuracy', 'f1', 'precision', 'recall')
    loss_total = 0
    metric_totals = dict.fromkeys(metric_keys, 0)

    model.train()
    for batch in iterator:
        optimizer.zero_grad()

        predictions = model(batch.text).squeeze(1)
        loss = criterion(predictions, batch.label)
        ret = get_results(predictions, batch.label)

        loss.backward()
        optimizer.step()

        loss_total += loss.item()
        for key in metric_keys:
            metric_totals[key] += ret[key]

    n_batches = len(iterator)
    return (loss_total / n_batches,) + tuple(metric_totals[key] / n_batches
                                             for key in metric_keys)

In [276]:
def evaluate(model, iterator, criterion):
    """Score the model on `iterator` without updating it.

    The model is put in eval mode and gradients are disabled for the pass.

    Returns:
        tuple (loss, accuracy, f1, precision, recall), each the mean of the
        per-batch values (sum divided by len(iterator)).
    """
    # running sums in return order: loss, accuracy, f1, precision, recall
    sums = [0, 0, 0, 0, 0]

    model.eval()
    with torch.no_grad():
        for batch in iterator:
            predictions = model(batch.text).squeeze(1)
            loss = criterion(predictions, batch.label)
            ret = get_results(predictions, batch.label)

            batch_values = (loss.item(), ret['accuracy'], ret['f1'],
                            ret['precision'], ret['recall'])
            for slot, value in enumerate(batch_values):
                sums[slot] += value

    n_batches = len(iterator)
    return tuple(total / n_batches for total in sums)

In [277]:
import time

def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into whole minutes
    and the remaining whole seconds; both values are truncated to int.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    return whole_minutes, int(duration - (whole_minutes * 60))

In [None]:
# Checkpoint file name: <depth><level>_<corpus>.pt, derived from the switches
# set earlier and from the data file name.
is_spanglish = "spanglish" in filename
model_name = "{}{}{}.pt".format(
    "shallow" if is_shallow else "deep",
    "CharCNN" if is_character_level else "WordCNN",
    "_spanglish" if is_spanglish else "_higlish",
)
print("Model will be saved into {}".format(model_name))

N_EPOCHS = 100

# Lowest validation loss seen so far; the weights achieving it are the ones
# kept on disk. Previously this value was initialised but never used, so the
# last-epoch model was both tested and saved regardless of validation loss.
best_valid_loss = float('inf')
train_losses = []
valid_losses = []

for epoch in range(N_EPOCHS):

    start_time = time.time()

    train_loss, train_acc, train_f1, train_prec, train_recall = train(model, train_iter, optimizer, criterion)
    valid_loss, valid_acc, valid_f1, valid_prec, valid_recall = evaluate(model, val_iter, criterion)

    end_time = time.time()

    # add losses for plotting
    train_losses.append(train_loss)
    valid_losses.append(valid_loss)

    # checkpoint whenever the validation loss improves
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), model_name)

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    print('Epoch: {} | Epoch Time: {}m {}s'.format(epoch+1, epoch_mins, epoch_secs))
    print('\tTrain Loss: {0:.4f} | Acc: {1:.4f}% | F1: {2:.4f} | Prec.: {3:.4f} | Recall: {4:.4f}'.format(train_loss, train_acc*100, train_f1, train_prec, train_recall))
    print('\tVal. Loss: {0:.4f} | Acc: {1:.4f}% | F1: {2:.4f} | Prec.: {3:.4f} | Recall: {4:.4f}'.format(valid_loss, valid_acc*100, valid_f1, valid_prec, valid_recall))

if best_valid_loss == float('inf'):
    # No epoch produced a comparable validation loss (e.g. every value was
    # NaN, or N_EPOCHS is 0): keep the old behaviour of saving the final model.
    torch.save(model.state_dict(), model_name)
else:
    # restore the best-validation weights before touching the test set
    model.load_state_dict(torch.load(model_name, map_location=device))

# testing time
_, test_acc, test_f1, test_prec, test_recall = evaluate(model, test_iter, criterion)
print('============================')
print('Test results: Acc: {0:.4f}% | F1: {1:.4f} | Prec.: {2:.4f} | Recall: {3:.4f}'.format(test_acc*100, test_f1, test_prec, test_recall))

Model will be saved into deepCharCNN_spanglish.pt
Epoch: 1 | Epoch Time: 0m 8s
	Train Loss: 1.0148 | Acc: 48.9083% | F1: 0.2479 | Prec.: 0.2325 | Recall: 0.3365
	Val. Loss: 0.9834 | Acc: 49.8005% | F1: 0.2202 | Prec.: 0.1660 | Recall: 0.3333
Epoch: 2 | Epoch Time: 0m 8s
	Train Loss: 0.9788 | Acc: 50.1500% | F1: 0.2391 | Prec.: 0.2221 | Recall: 0.3419
	Val. Loss: 0.9615 | Acc: 50.5319% | F1: 0.2589 | Prec.: 0.2585 | Recall: 0.3579
Epoch: 3 | Epoch Time: 0m 8s
	Train Loss: 0.9627 | Acc: 50.8917% | F1: 0.2620 | Prec.: 0.2708 | Recall: 0.3568
	Val. Loss: 0.9539 | Acc: 51.0543% | F1: 0.2775 | Prec.: 0.3039 | Recall: 0.3665
Epoch: 4 | Epoch Time: 0m 8s
	Train Loss: 0.9531 | Acc: 50.7833% | F1: 0.2752 | Prec.: 0.2999 | Recall: 0.3631
	Val. Loss: 0.9491 | Acc: 52.1847% | F1: 0.3253 | Prec.: 0.3735 | Recall: 0.3996
Epoch: 5 | Epoch Time: 0m 8s
	Train Loss: 0.9436 | Acc: 51.4167% | F1: 0.3067 | Prec.: 0.3607 | Recall: 0.3817
	Val. Loss: 0.9548 | Acc: 51.4533% | F1: 0.2883 | Prec.: 0.3363 | Recal

In [None]:
# plot losses
