In [36]:
# import pytorch libraries
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import numpy as np
import random
random.seed(0)
seed = 0

In [38]:
"""
To get an ID of an available GPU
"""
import numpy as np
import subprocess as sp

ACCEPTABLE_AVAILABLE_MEMORY = 11167

# https://github.com/yselivonchyk/TensorFlow_DCIGN/blob/master/utils.py
def _output_to_list(output):
  return output.decode('ascii').split('\n')[:-1]


def get_idle_gpu(leave_unmasked=1, random=True):
  try:
    command = "nvidia-smi --query-gpu=memory.free --format=csv"
    memory_free_info = _output_to_list(sp.check_output(command.split()))[1:]
    memory_free_values = [int(x.split()[0]) for i, x in enumerate(memory_free_info)]
    available_gpus = [i for i, x in enumerate(memory_free_values) if x > ACCEPTABLE_AVAILABLE_MEMORY]

    if len(available_gpus) <= leave_unmasked:
      print('Found only %d usable GPUs in the system' % len(available_gpus))
      return -1

    if random:
      available_gpus = np.asarray(available_gpus)
      np.random.shuffle(available_gpus)

    gpu_to_use = available_gpus[0]
    print("Using GPU: ", gpu_to_use)
    
    return int(gpu_to_use)
    """
    # update CUDA variable
    gpus = available_gpus[:leave_unmasked]
    setting = ','.join(map(str, gpus))
    os.environ["CUDA_VISIBLE_DEVICES"] = setting
    print('Left next %d GPU(s) unmasked: [%s] (from %s available)'
          % (leave_unmasked, setting, str(available_gpus)))
    """
  except FileNotFoundError as e:
    print('"nvidia-smi" is probably not installed. GPUs are not masked')
    print(e)
    return -1
  except sp.CalledProcessError as e:
    print("Error on GPU masking:\n", e.output)
    return -1

In [39]:
filename = 'data/train_conll_spanglish.csv'
if torch.cuda.is_available():
    gpu_id = "cuda:{}".format(get_idle_gpu(leave_unmasked=0))

device = torch.device(gpu_id if torch.cuda.is_available() else 'cpu')
print(device)

import torchtext

def label2int(label):
    if label=='positive':
        return 1
    elif label=='negative':
        return 0
    else:
        return 2

def label2float(label):
    if label=='positive':
        return 1.
    elif label=='negative':
        return 0.
    else:
        return 2.

text_field = torchtext.data.Field(sequential=True,      # text sequence
                                  tokenize=lambda x: x, # because are building a character-RNN
                                  include_lengths=True, # to track the length of sequences, for batching
                                  batch_first=True,
                                  use_vocab=True)       # to turn each character into an integer index
label_field = torchtext.data.Field(sequential=False,    # not a sequence
                                   use_vocab=False,     # don't need to track vocabulary
                                   is_target=True,
                                   batch_first=True,
                                   preprocessing=lambda x: label2int(x)) # convert text to 0 and 1

fields = [('id', None),('text', text_field), ('label', label_field)]
dataset = torchtext.data.TabularDataset(filename, # name of the file
                                        "tsv",               # fields are separated by a tab
                                        fields)

Using GPU:  0
cuda:0


In [40]:
for i in range(0,10):
    print(dataset[i].text, "---", dataset[i].label)


So that means tomorrow cruda segura lol --- 1
Tonight peda segura --- 2
Eres tan mala vieja bruja interesada#jamming --- 0
Yo kiero Pretzels lol --- 2
Fuck that ni ke el me vaya a mantener toda la vida lol --- 0
I always tell my dad ke me kiero kasar con una vieja rika and me regaña telling me ke no sea interesada ha --- 0
Ke me compre un carrito pa irme con mis friends and party lol --- 2
Why can I just find a rich bitch ke me mantenga y ya ha --- 2
Since I started working ya ni disfruto la vida lol --- 0
My dad me regano cuzs I was telling that to my brother and lo andaba molestando lol --- 0


In [41]:
train, val, test = dataset.split(split_ratio=[0.8,0.1,0.1])

In [42]:
text_field.build_vocab(dataset)
# text_field.vocab.stoi
# text_field.vocab.itos

In [43]:
len(text_field.vocab)
print(label_field)

<torchtext.data.field.Field object at 0x7f079e364668>


In [44]:
train_iter = torchtext.data.BucketIterator(train,
                                           batch_size=32,
                                           sort_key=lambda x: len(x.text), # to minimize padding
                                           sort_within_batch=True,        # sort within each batch
                                           repeat=False, # repeat the iterator for multiple epochs
                                           device=device)
val_iter = torchtext.data.BucketIterator(val,
                                         batch_size=32,
                                         sort_key=lambda x: len(x.text), # to minimize padding
                                         sort_within_batch=True,        # sort within each batch
                                         repeat=False, # repeat the iterator for multiple epochs
                                         device=device)
test_iter = torchtext.data.BucketIterator(test,
                                          batch_size=32,
                                          sort_key=lambda x: len(x.text), # to minimize padding
                                          sort_within_batch=True,        # sort within each batch
                                          repeat=False, # repeat the iterator for multiple epochs
                                          device=device)

In [45]:
"""
for i, batch in enumerate(train_iter):
    if i >= 2:
        break
    print(batch.text)
#     print(batch.text[0].shape)
    print(batch.label)
"""

'\nfor i, batch in enumerate(train_iter):\n    if i >= 2:\n        break\n    print(batch.text)\n#     print(batch.text[0].shape)\n    print(batch.label)\n'

In [None]:
"""
Another version of preprocessing data
"""
"""
import sklearn
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
import os

INPUT_PATH = "data/train_conll_spanglish.csv"
MAX_TWEET = 280

char_to_ind = {}
ind_to_char = {}

char_to_ind.update({"UNK":0})
ind_to_char.update({0:"UNK"})

count = 1

with open(INPUT_PATH, 'r') as f:
    for line in f:
        for char in line.split('\t')[1]:
            if char.lower() not in char_to_ind:
                char_to_ind.update({char.lower():count})
                ind_to_char.update({count:char.lower()})
                count += 1

#print(char_to_ind)
#print(ind_to_char)

n_letters = len(char_to_ind)
"""

In [None]:
"""
def letterToTensor(letter, n_letters):
    tensor = torch.zeros(1, n_letters)
    tensor[0][char_to_ind[letter]] = 1
    return tensor

def lineToTensor(line, n_letters):
    tensor = torch.zeros(len(line), n_letters)
    for li, letter in enumerate(line):
        tensor[li][char_to_ind[letter]] = 1
    return tensor

def batchToTensor(batch, n_letters):
    tensor = torch.zeros(len(batch),MAX_TWEET,n_letters)
    for sentence, line in enumerate(batch):
        for li, letter in enumerate(line):
            tensor[sentence][li][char_to_ind[letter.lower()]] = 1
    return tensor


#print(letterToTensor('o'))
print(lineToTensor('hello how are tou').shape)
print(batchToTensor(['hello friend', 'linear svm is better']))
print(batchToTensor(['hello friend', 'linear svm is better']).shape)
"""

In [None]:
"""
trainpath = INPUT_PATH
train = pd.read_csv(trainpath, sep='\\t', names=["ID","SENTENCE","LABEL"])
"""

In [None]:
"""
print(train['SENTENCE'][0].lower())
train_char_features = batchToTensor(train['SENTENCE'], n_letters)
"""

In [None]:
"""
print(train_char_features.shape)
for in_tensor in train_char_features:
    print(in_tensor.shape)
    break
# train_char_features[0].shape
"""

In [46]:
class TextCNN(nn.Module):
    """
    TextCNN implementation based on
    https://github.com/bentrevett/pytorch-sentiment-analysis/blob/master/4%20-%20Convolutional%20Sentiment%20Analysis.ipynb
    """
    def __init__(self, vocab_size, embed_dim, n_filters, filter_sizes, output_dim, dropout):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        # first convolutional layer (three layers)
        self.conv_0 = nn.ModuleList([
                nn.Conv2d(in_channels = 1,
                          out_channels = n_filters,
                          kernel_size = (fs, embed_dim))
                for fs in filter_sizes
        ])
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
    
    def forward(self, text):
        # text = (tensor of input, tensor of input length)
#         print(text[0].shape)
        # convert input to embeddings
        in_data = text[0]
        # in_data = [batch_size, sentence_length]
        embedded = self.embedding(in_data)
        # embedded = [batch_size, sentence_length, embedding_dimension]
        embedded = embedded.unsqueeze(1)
        # embedded = [batch_size, 1, sentence_length, embedding_dimension]
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.conv_0]
        # conved_n = [batch_size, n_filters, sentence_length - filter_size[n] - 1]
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        # pooled_n = [batch_size, n_filters]
        cat = self.dropout(torch.cat(pooled, dim=1))
        # cat = [batch_size, n_filters * len(filter_sizes)]
        logit = self.fc(cat)
        
        return logit
        

In [47]:
# train_char_features = [num_of_tweets, max. length of each tweet, embedding_size]
"""
num_features = list(train_char_features.shape)
print(num_features)
max_tweet_length = num_features[1]
embedding_dim = num_features[2] # vocab_size
"""
input_dim = embedding_dim = len(text_field.vocab)
n_filters = 3 # number of filters
filter_sizes = [3, 4, 5] # like character 3-gram, 4-gram, 5-gram
output_dim = 3
dropout = 0.5

model = TextCNN(input_dim, embedding_dim, n_filters, filter_sizes, output_dim, dropout)

In [48]:
# checking the parameters
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print('The model has {} trainable parameters'.format(count_parameters(model)))

The model has 368164 trainable parameters


In [49]:
# training
import torch.optim as optim

optimizer = optim.Adam(model.parameters(), lr=1e-4)

criterion = nn.CrossEntropyLoss()

model = model.to(device)
criterion = criterion.to(device)

In [50]:
def get_accuracy(logits, labels):
    correct, total = 0, 0
    _, predicted = torch.max(logits, 1)
#     print(predicted, labels)
#     print(predicted.shape, labels.shape)
    total += labels.size(0)
    correct += (predicted == labels).sum().item()
    return correct / total

In [51]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    for i, batch in enumerate(iterator):
        
        optimizer.zero_grad()
        
        predictions = model(batch.text).squeeze(1)
#         print(predictions, predictions.shape)
#         print(batch.label, batch.label.shape)
        loss = criterion(predictions, batch.label)
        
        acc = get_accuracy(predictions, batch.label)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc
        """
        if i % 10 == 0:
            print("batch: {}, loss: {}, acc: {}".format(i, loss, acc*100))
        """
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [52]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            predictions = model(batch.text).squeeze(1)
            
            loss = criterion(predictions, batch.label)
            
            acc = get_accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [53]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [54]:
N_EPOCHS = 100

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iter, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, val_iter, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut4-model.pt')
    
    print('Epoch: {} | Epoch Time: {}m {}s'.format(epoch+1, epoch_mins, epoch_secs))
    print('\tTrain Loss: {} | Train Acc: {}%'.format(train_loss, train_acc*100))
    print('\tVal. Loss: {} |  Val. Acc: {}%'.format(valid_loss, valid_acc*100))

Epoch: 1 | Epoch Time: 0m 3s
	Train Loss: 1.10106595993042 | Train Acc: 38.608333333333334%
	Val. Loss: 1.0006959400278457 |  Val. Acc: 49.85752279635259%
Epoch: 2 | Epoch Time: 0m 3s
	Train Loss: 1.0448423498471577 | Train Acc: 42.4%
	Val. Loss: 0.9915222398778225 |  Val. Acc: 49.40159574468085%
Epoch: 3 | Epoch Time: 0m 3s
	Train Loss: 1.0358972846666972 | Train Acc: 42.95833333333333%
	Val. Loss: 0.9916330182805975 |  Val. Acc: 49.1451367781155%
Epoch: 4 | Epoch Time: 0m 3s
	Train Loss: 1.0148214836120606 | Train Acc: 44.983333333333334%
	Val. Loss: 0.9804358355542446 |  Val. Acc: 49.2686170212766%
Epoch: 5 | Epoch Time: 0m 3s
	Train Loss: 1.0044216448465983 | Train Acc: 46.19166666666666%
	Val. Loss: 0.9784309787953154 |  Val. Acc: 49.7815349544073%
Epoch: 6 | Epoch Time: 0m 2s
	Train Loss: 0.9945557249387106 | Train Acc: 47.575%
	Val. Loss: 0.9755334017124582 |  Val. Acc: 50.39893617021277%
Epoch: 7 | Epoch Time: 0m 3s
	Train Loss: 0.9911596272786458 | Train Acc: 48.39166666666666

Epoch: 54 | Epoch Time: 0m 2s
	Train Loss: 0.9013598254521688 | Train Acc: 54.25833333333333%
	Val. Loss: 0.9803611598116286 |  Val. Acc: 51.05433130699089%
Epoch: 55 | Epoch Time: 0m 2s
	Train Loss: 0.899087917804718 | Train Acc: 54.25%
	Val. Loss: 0.9809864125353225 |  Val. Acc: 50.98784194528876%
Epoch: 56 | Epoch Time: 0m 2s
	Train Loss: 0.9023212135632833 | Train Acc: 54.15833333333333%
	Val. Loss: 0.9824108164361183 |  Val. Acc: 50.33244680851063%
Epoch: 57 | Epoch Time: 0m 2s
	Train Loss: 0.8985627559026083 | Train Acc: 54.108333333333334%
	Val. Loss: 0.9822953216573025 |  Val. Acc: 50.379939209726444%
Epoch: 58 | Epoch Time: 0m 2s
	Train Loss: 0.9007677076657613 | Train Acc: 53.925%
	Val. Loss: 0.9860163153486049 |  Val. Acc: 50.56990881458966%
Epoch: 59 | Epoch Time: 0m 2s
	Train Loss: 0.8955888768831889 | Train Acc: 54.44166666666666%
	Val. Loss: 0.9838657899105803 |  Val. Acc: 50.256458966565354%
Epoch: 60 | Epoch Time: 0m 2s
	Train Loss: 0.8915649959246318 | Train Acc: 54.5