In [1]:
# import pytorch libraries
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import numpy as np
import random
# Single source of truth for every RNG seeded in this notebook.
seed = 0
random.seed(seed)
# Seed PyTorch as well so that weight initialisation is repeatable between runs;
# previously only the stdlib `random` generator was seeded and `seed` was unused.
torch.manual_seed(seed)

In [2]:
"""
To get an ID of an available GPU
"""
import numpy as np
import subprocess as sp

# Free-memory threshold used by get_idle_gpu: a GPU counts as available only when the
# number parsed from nvidia-smi's "memory.free" column is strictly greater than this.
# NOTE(review): the unit is whatever nvidia-smi reports (presumably MiB) — confirm.
ACCEPTABLE_AVAILABLE_MEMORY = 11167

# https://github.com/yselivonchyk/TensorFlow_DCIGN/blob/master/utils.py
def _output_to_list(output):
  return output.decode('ascii').split('\n')[:-1]


def get_idle_gpu(leave_unmasked=1, random=True):
  """Return the index of a GPU whose free memory exceeds the threshold, or -1.

  Runs ``nvidia-smi --query-gpu=memory.free --format=csv`` and keeps the GPUs
  whose reported free memory is strictly greater than
  ``ACCEPTABLE_AVAILABLE_MEMORY``.

  Args:
    leave_unmasked: a GPU is returned only when MORE than this many GPUs
      qualify; otherwise a message is printed and -1 is returned.
    random: when true, the qualifying GPUs are shuffled with
      ``np.random.shuffle`` before the first one is taken; when false the
      lowest qualifying index is used. (The name shadows the ``random``
      module inside this function; it is kept for caller compatibility.)

  Returns:
    int: the chosen GPU index, or -1 when too few GPUs qualify, when
    ``nvidia-smi`` cannot be found, or when it exits with an error.
  """
  try:
    command = "nvidia-smi --query-gpu=memory.free --format=csv"
    # The first CSV row is skipped as the column header.
    memory_free_info = _output_to_list(sp.check_output(command.split()))[1:]
    # The first whitespace-separated token of each row is the numeric amount.
    memory_free_values = [int(row.split()[0]) for row in memory_free_info]
    available_gpus = [i for i, free in enumerate(memory_free_values)
                      if free > ACCEPTABLE_AVAILABLE_MEMORY]

    if len(available_gpus) <= leave_unmasked:
      print('Found only %d usable GPUs in the system' % len(available_gpus))
      return -1

    if random:
      available_gpus = np.asarray(available_gpus)
      np.random.shuffle(available_gpus)

    gpu_to_use = available_gpus[0]
    print("Using GPU: ", gpu_to_use)

    # Cast so callers get a plain int even when a numpy scalar was selected.
    return int(gpu_to_use)
  except FileNotFoundError as e:
    print('"nvidia-smi" is probably not installed. GPUs are not masked')
    print(e)
    return -1
  except sp.CalledProcessError as e:
    print("Error on GPU masking:\n", e.output)
    return -1

In [3]:
filename = 'data/train_conll_hinglish.csv'

# Start from the CPU and switch to a GPU only when CUDA is present AND an idle GPU
# was found. get_idle_gpu returns -1 when nothing usable exists; formatting that
# into "cuda:-1" would not name a real device, so fall back to the CPU instead.
gpu_id = 'cpu'
if torch.cuda.is_available():
    idle_gpu = get_idle_gpu(leave_unmasked=0)
    if idle_gpu >= 0:
        gpu_id = "cuda:{}".format(idle_gpu)

device = torch.device(gpu_id)
print(device)
# Every tweet is padded/truncated to this many characters by the text field.
tweet_max_len = 280

import torchtext

def label2int(label):
    """Map a sentiment label to its integer class id.

    'positive' -> 1, 'negative' -> 0, anything else -> 2.
    """
    return 1 if label == 'positive' else (0 if label == 'negative' else 2)

def label2float(label):
    """Float-valued counterpart of the label mapping.

    'positive' -> 1.0, 'negative' -> 0.0, anything else -> 2.0.
    """
    if label == 'positive':
        return 1.
    if label == 'negative':
        return 0.
    return 2.

# Character-level text field. The tokenizer is the identity, so the raw string itself
# is the sequence and each character becomes one vocabulary entry.
text_field = torchtext.data.Field(
    sequential=True,            # the column holds a sequence
    use_vocab=True,             # map every character to an integer index
    tokenize=lambda x: x,       # identity: keep the string, i.e. work on characters
    lower=True,                 # lower-case the text
    fix_length=tweet_max_len,   # fixed sequence length (280 characters)
    include_lengths=True,       # also return the sequence lengths with each batch
    batch_first=True,
)

# Label field: the raw label string is converted by label2int, which yields
# 0 (negative), 1 (positive) or 2 (anything else), so no vocabulary is needed.
label_field = torchtext.data.Field(
    sequential=False,           # a single value, not a sequence
    use_vocab=False,            # values are already integers after preprocessing
    is_target=True,
    batch_first=True,
    preprocessing=label2int,
)

# Column order of the input file; the id column is mapped to None rather than a field.
fields = [('id', None), ('text', text_field), ('label', label_field)]
# The file is parsed as tab-separated values despite its .csv extension.
dataset = torchtext.data.TabularDataset(path=filename, format="tsv", fields=fields)

Using GPU:  7
cuda:7


In [4]:
# Peek at the first ten parsed examples: processed text, a separator, then the label.
for i in range(10):
    example = dataset[i]
    print(example.text, "---", example.label)


@ adilnisarbutt pakistan ka ghra tauq he pakistan israel ko tasleem nahein kerta isko palestine kehta he- occupied palestine --- 0
madarchod mulle ye mathura me nahi dikha tha jab mullo ne hindu ko iss liye mara ki vo lasse ki paise mag liye the… https// t. co/ oxf8tr3bly --- 0
@ narendramodi manya pradhan mantri mahoday shriman narendra modi ji pradhanmantri banne par hardik badhai tahe dil… https// t. co/ prnomskkn1 --- 1
@ atheist_ krishna jcb full trend me chal rahi aa --- 1
@ abhisharsharma_@ ravishkumarblog loksabha me janta sirf modi ko vote de rahi thi na ki kisi mp or bjp ko without m… https// t. co/ shtbwcb7fm --- 1
@ noirnaveed@ angelahana6@ cricketworldcup bhosdike tum pechvade ki tatti hi rahoge bc --- 0
love u bhaijan...♥♥ father+ son..# bharat# iambharat# bharatthiseid best pic from entire# promotions... mashallah… https// t. co/ s2xhwu6lud --- 1
@ manojgajjar111 tumhara pass abh deemagh hai nahi islea google ko apna deemagh banaya hua hai. har koi tumhari tarh… https// 

In [5]:
# Split the full dataset into three subsets with relative sizes 0.8 / 0.1 / 0.1.
# NOTE(review): the name `train` is re-bound by the `def train(...)` cell further down,
# so this cell (and the iterator cell that reads `train`) must run before that one.
train, val, test = dataset.split(split_ratio=[0.8,0.1,0.1])

In [6]:
# Build the vocabulary of text_field from the examples in `dataset`.
# NOTE(review): `dataset` is the full, unsplit data, so validation and test examples
# also contribute to the vocabulary — TODO confirm this is intended (vs. `train` only).
text_field.build_vocab(dataset)
# text_field.vocab.stoi
# text_field.vocab.itos

In [7]:
# A bare expression is only echoed when it is the last line of a cell, so the
# vocabulary size was previously computed and silently discarded. Print it explicitly.
print('Vocabulary size:', len(text_field.vocab))
print(label_field)

<torchtext.data.field.Field object at 0x7f4cae2b1fd0>


In [8]:
def _bucket_iterator(split):
    """Wrap one dataset split in a BucketIterator with the shared batching settings."""
    return torchtext.data.BucketIterator(
        split,
        batch_size=32,
        sort_key=lambda x: len(x.text),  # group examples of similar text length
        sort_within_batch=True,          # sort within each batch
        repeat=False,                    # do not repeat the iterator for multiple epochs
        device=device,
    )


train_iter = _bucket_iterator(train)
val_iter = _bucket_iterator(val)
test_iter = _bucket_iterator(test)

In [9]:
"""
for i, batch in enumerate(train_iter):
    if i >= 2:
        break
    print(batch.text)
#     print(batch.text[0].shape)
    print(batch.label)
"""

'\nfor i, batch in enumerate(train_iter):\n    if i >= 2:\n        break\n    print(batch.text)\n#     print(batch.text[0].shape)\n    print(batch.label)\n'

In [10]:
"""
Another version of preprocessing data
"""
"""
import sklearn
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
import os

INPUT_PATH = "data/train_conll_spanglish.csv"
MAX_TWEET = 280

char_to_ind = {}
ind_to_char = {}

char_to_ind.update({"UNK":0})
ind_to_char.update({0:"UNK"})

count = 1

with open(INPUT_PATH, 'r') as f:
    for line in f:
        for char in line.split('\t')[1]:
            if char.lower() not in char_to_ind:
                char_to_ind.update({char.lower():count})
                ind_to_char.update({count:char.lower()})
                count += 1

#print(char_to_ind)
#print(ind_to_char)

n_letters = len(char_to_ind)
"""

'\nimport sklearn\nimport pandas as pd\nimport numpy as np\nfrom sklearn.feature_extraction.text import TfidfVectorizer\nfrom sklearn.naive_bayes import MultinomialNB\nfrom sklearn.svm import SVC\nimport os\n\nINPUT_PATH = "data/train_conll_spanglish.csv"\nMAX_TWEET = 280\n\nchar_to_ind = {}\nind_to_char = {}\n\nchar_to_ind.update({"UNK":0})\nind_to_char.update({0:"UNK"})\n\ncount = 1\n\nwith open(INPUT_PATH, \'r\') as f:\n    for line in f:\n        for char in line.split(\'\t\')[1]:\n            if char.lower() not in char_to_ind:\n                char_to_ind.update({char.lower():count})\n                ind_to_char.update({count:char.lower()})\n                count += 1\n\n#print(char_to_ind)\n#print(ind_to_char)\n\nn_letters = len(char_to_ind)\n'

In [11]:
"""
def letterToTensor(letter, n_letters):
    tensor = torch.zeros(1, n_letters)
    tensor[0][char_to_ind[letter]] = 1
    return tensor

def lineToTensor(line, n_letters):
    tensor = torch.zeros(len(line), n_letters)
    for li, letter in enumerate(line):
        tensor[li][char_to_ind[letter]] = 1
    return tensor

def batchToTensor(batch, n_letters):
    tensor = torch.zeros(len(batch),MAX_TWEET,n_letters)
    for sentence, line in enumerate(batch):
        for li, letter in enumerate(line):
            tensor[sentence][li][char_to_ind[letter.lower()]] = 1
    return tensor


#print(letterToTensor('o'))
print(lineToTensor('hello how are tou', n_letters).shape)
print(batchToTensor(['hello friend', 'linear svm is better'], n_letters))
print(batchToTensor(['hello friend', 'linear svm is better'], n_letters).shape)
"""

"\ndef letterToTensor(letter, n_letters):\n    tensor = torch.zeros(1, n_letters)\n    tensor[0][char_to_ind[letter]] = 1\n    return tensor\n\ndef lineToTensor(line, n_letters):\n    tensor = torch.zeros(len(line), n_letters)\n    for li, letter in enumerate(line):\n        tensor[li][char_to_ind[letter]] = 1\n    return tensor\n\ndef batchToTensor(batch, n_letters):\n    tensor = torch.zeros(len(batch),MAX_TWEET,n_letters)\n    for sentence, line in enumerate(batch):\n        for li, letter in enumerate(line):\n            tensor[sentence][li][char_to_ind[letter.lower()]] = 1\n    return tensor\n\n\n#print(letterToTensor('o'))\nprint(lineToTensor('hello how are tou', n_letters).shape)\nprint(batchToTensor(['hello friend', 'linear svm is better'], n_letters))\nprint(batchToTensor(['hello friend', 'linear svm is better'], n_letters).shape)\n"

In [12]:
"""
trainpath = INPUT_PATH
train = pd.read_csv(trainpath, sep='\\t', names=["ID","SENTENCE","LABEL"])
"""

'\ntrainpath = INPUT_PATH\ntrain = pd.read_csv(trainpath, sep=\'\\t\', names=["ID","SENTENCE","LABEL"])\n'

In [13]:
"""
print(train['SENTENCE'][0].lower())
train_char_features = batchToTensor(train['SENTENCE'], n_letters)
"""

"\nprint(train['SENTENCE'][0].lower())\ntrain_char_features = batchToTensor(train['SENTENCE'], n_letters)\n"

In [14]:
"""
print(train_char_features.shape)
for in_tensor in train_char_features:
    print(in_tensor.shape)
    break
# train_char_features[0].shape
"""

'\nprint(train_char_features.shape)\nfor in_tensor in train_char_features:\n    print(in_tensor.shape)\n    break\n# train_char_features[0].shape\n'

In [73]:
class TextShallowCNN(nn.Module):
    """
    Single-stage TextCNN classifier, based on
    https://github.com/bentrevett/pytorch-sentiment-analysis/blob/master/4%20-%20Convolutional%20Sentiment%20Analysis.ipynb

    One Conv2d branch is created per entry of ``conv0_f_sizes``; each branch spans
    the full embedding width, is max-pooled over the sequence axis, and the pooled
    branch outputs are concatenated and fed to a linear layer.

    Args:
        vocab_size: number of rows of the embedding table.
        embed_dim: embedding width (also the kernel width of every branch).
        conv0_f_nums: number of filters (output channels) per branch.
        conv0_f_sizes: kernel heights, one per branch.
        output_dim: number of output logits.
        dropout: probability for ``self.dropout`` (constructed, but not applied
            in ``forward``).
    """

    def __init__(self, vocab_size, embed_dim,
                 conv0_f_nums, conv0_f_sizes,
                 output_dim, dropout):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        # One-hot alternative that is currently disabled:
        #   self.embedding.weight.data = torch.eye(vocab_size)
        #   self.embedding.weight.requires_grad = False

        branches = []
        for kernel_height in conv0_f_sizes:
            branches.append(nn.Conv2d(in_channels=1,
                                      out_channels=conv0_f_nums,
                                      kernel_size=(kernel_height, embed_dim)))
        self.conv_0 = nn.ModuleList(branches)

        self.dropout = nn.Dropout(dropout)
        n_features = len(conv0_f_sizes) * conv0_f_nums
        self.fc = nn.Linear(n_features, output_dim)

    def forward(self, text):
        """Compute logits for ``text``; only its first element (``text[0]``) is used."""
        tokens = text[0]
        # [batch, seq] -> [batch, seq, embed] -> [batch, 1, seq, embed]
        embedded = self.embedding(tokens).unsqueeze(1)

        pooled = []
        for conv in self.conv_0:
            # The kernel covers the whole embedding axis, leaving a size-1 last dim.
            feature_map = F.relu(conv(embedded)).squeeze(3)
            # Max over every remaining position -> [batch, n_filters]
            pooled.append(F.max_pool1d(feature_map, feature_map.shape[2]).squeeze(2))

        # [batch, n_filters * number_of_branches]; dropout is not applied here.
        features = torch.cat(pooled, dim=1)
        return self.fc(features)
        

In [74]:
class TextDeepCNN(nn.Module):
    """
    Two-stage TextCNN classifier, based on
    https://github.com/bentrevett/pytorch-sentiment-analysis/blob/master/4%20-%20Convolutional%20Sentiment%20Analysis.ipynb

    Stage 0 creates one Conv2d branch per entry of ``conv0_f_sizes`` (each kernel
    spans the full embedding width). Stage 1 applies one Conv1d per entry of
    ``conv1_f_sizes`` to the matching stage-0 branch; the two lists are paired
    positionally with ``zip``. Each stage-1 output is max-pooled over the sequence
    axis, and the pooled vectors are concatenated and fed to a linear layer.

    Args:
        vocab_size: number of rows of the embedding table.
        embed_dim: embedding width (also the kernel width of every stage-0 branch).
        conv0_f_nums: number of filters per stage-0 branch.
        conv0_f_sizes: stage-0 kernel heights, one per branch.
        conv1_f_nums: number of filters per stage-1 branch.
        conv1_f_sizes: stage-1 kernel sizes, one per branch.
        output_dim: number of output logits.
        dropout: probability for ``self.dropout`` (constructed, but not applied
            in ``forward``).
    """
    def __init__(self, vocab_size, embed_dim,
                 conv0_f_nums, conv0_f_sizes,
                 conv1_f_nums, conv1_f_sizes,
                 output_dim, dropout):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        # One-hot alternative that is currently disabled:
        #   self.embedding.weight.data = torch.eye(vocab_size)
        #   self.embedding.weight.requires_grad = False

        # Stage 0: one 2-D convolution per kernel height over the embedded sequence.
        self.conv_0 = nn.ModuleList([
                nn.Conv2d(in_channels = 1,
                          out_channels = conv0_f_nums,
                          kernel_size = (fs, embed_dim))
                for fs in conv0_f_sizes
        ])
        # Stage 1: one 1-D convolution per branch, consuming the stage-0 channels.
        self.conv_1 = nn.ModuleList([
                nn.Conv1d(in_channels = conv0_f_nums,
                          out_channels = conv1_f_nums,
                          kernel_size = fs)
                for fs in conv1_f_sizes
        ])

        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(len(conv1_f_sizes) * conv1_f_nums, output_dim)

    def forward(self, text):
        """Compute logits for ``text``; only its first element (``text[0]``) is used."""
        in_data = text[0]
        # in_data = [batch_size, sentence_length]
        embedded = self.embedding(in_data)
        # embedded = [batch_size, sentence_length, embedding_dimension]
        embedded = embedded.unsqueeze(1)
        # embedded = [batch_size, 1, sentence_length, embedding_dimension]
        conved_0 = [F.relu(conv(embedded)).squeeze(3) for conv in self.conv_0]
        # conved_0[i] = [batch_size, conv0_f_nums, reduced_length]

        # The previous debug print here referenced an undefined name (`pooled_0`)
        # and made every forward pass raise NameError; it has been removed along
        # with the per-branch shape prints.
        conved_1 = [F.relu(conv1(conv0)) for conv1, conv0 in zip(self.conv_1, conved_0)]
        # pooled output: max over every remaining position of each branch
        cnn_output = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved_1]

        # cnn_output[i] = [batch_size, conv1_f_nums]; dropout is not applied here.
        cat = torch.cat(cnn_output, dim=1)
        # cat = [batch_size, conv1_f_nums * number_of_branches]
        logit = self.fc(cat)

        return logit
        

In [75]:
# Disabled earlier variant: sizes were read from the one-hot tensor
# train_char_features = [num_of_tweets, max. length of each tweet, embedding_size]
#   num_features = list(train_char_features.shape)
#   print(num_features)
#   max_tweet_length = num_features[1]
#   embedding_dim = num_features[2] # vocab_size

# The embedding width is set equal to the vocabulary size.
input_dim = len(text_field.vocab)
embedding_dim = input_dim

# First convolution stage: like character 3-gram, 4-gram, 5-gram, 6-gram detectors.
conv0_filter_sizes = [3, 4, 5, 6]
conv0_filter_nums = 5  # number of filters per kernel size
# Second convolution stage (only consumed by the TextDeepCNN variant below).
conv1_filter_sizes = [3, 3, 3, 3]
conv1_filter_nums = 5
output_dim = 3
dropout = 0.5

model = TextShallowCNN(vocab_size=input_dim,
                       embed_dim=embedding_dim,
                       conv0_f_nums=conv0_filter_nums,
                       conv0_f_sizes=conv0_filter_sizes,
                       output_dim=output_dim,
                       dropout=dropout)

# model = TextDeepCNN(input_dim, embedding_dim,
#                 conv0_filter_nums, conv0_filter_sizes,
#                 conv1_filter_nums, conv1_filter_sizes,
#                 output_dim, dropout)

In [76]:
# checking the parameters
def count_parameters(model):
    """Return the total element count of the model's parameters that require gradients."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Report how many parameter elements will be updated during training.
n_trainable = count_parameters(model)
print('The model has {} trainable parameters'.format(n_trainable))

The model has 889194 trainable parameters


In [77]:
# training
import torch.optim as optim

# Move the model and the loss module to the target device BEFORE constructing the
# optimizer, as the torch.optim documentation recommends, so the optimizer is built
# over the parameters that are actually trained.
model = model.to(device)

criterion = nn.CrossEntropyLoss()
criterion = criterion.to(device)

# NOTE(review): in the logged run the accuracy falls to ~38% while validation loss
# keeps rising; lr=1e-2 may be too aggressive for Adam here — worth revisiting.
optimizer = optim.Adam(model.parameters(), lr=1e-2)

In [78]:
def get_accuracy(logits, labels):
    """Return the fraction of rows whose highest-scoring class equals the label.

    The predicted class of each row is the index of its maximum along dim 1.
    """
    predicted = torch.max(logits, 1)[1]
    n_correct = (predicted == labels).sum().item()
    n_total = labels.size(0)
    return n_correct / n_total

In [79]:
def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over ``iterator`` and return mean loss and accuracy.

    For every batch the gradients are reset, ``model(batch.text)`` is scored
    against ``batch.label`` with ``criterion``, and one optimizer step is taken.

    Returns:
        (mean_loss, mean_accuracy): per-batch values summed and divided by
        ``len(iterator)``.
    """
    loss_total = 0
    acc_total = 0

    model.train()
    for batch in iterator:
        optimizer.zero_grad()

        logits = model(batch.text).squeeze(1)
        batch_loss = criterion(logits, batch.label)
        batch_acc = get_accuracy(logits, batch.label)

        batch_loss.backward()
        optimizer.step()

        loss_total += batch_loss.item()
        acc_total += batch_acc

    n_batches = len(iterator)
    return loss_total / n_batches, acc_total / n_batches

In [80]:
def evaluate(model, iterator, criterion):
    """Score ``model`` on ``iterator`` without gradients; return mean loss and accuracy.

    The model is switched to eval mode and every batch is scored under
    ``torch.no_grad()``.

    Returns:
        (mean_loss, mean_accuracy): per-batch values summed and divided by
        ``len(iterator)``.
    """
    loss_total = 0
    acc_total = 0

    model.eval()
    with torch.no_grad():
        for batch in iterator:
            logits = model(batch.text).squeeze(1)
            loss_total += criterion(logits, batch.label).item()
            acc_total += get_accuracy(logits, batch.label)

    n_batches = len(iterator)
    return loss_total / n_batches, acc_total / n_batches

In [81]:
import time

def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into (minutes, seconds).

    Both parts are truncated to integers with ``int``.
    """
    duration = end_time - start_time
    minutes = int(duration / 60)
    seconds = int(duration - 60 * minutes)
    return minutes, seconds

In [None]:
N_EPOCHS = 100

# Lowest validation loss seen so far.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    # Time one training pass plus one validation pass.
    start_time = time.time()
    train_loss, train_acc = train(model, train_iter, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, val_iter, criterion)
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Checkpoint the weights whenever the validation loss reaches a new minimum.
    if best_valid_loss > valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut4-model.pt')

    print('\n'.join([
        'Epoch: {} | Epoch Time: {}m {}s'.format(epoch+1, epoch_mins, epoch_secs),
        '\tTrain Loss: {} | Train Acc: {}%'.format(train_loss, train_acc*100),
        '\tVal. Loss: {} |  Val. Acc: {}%'.format(valid_loss, valid_acc*100),
    ]))

Epoch: 1 | Epoch Time: 0m 8s
	Train Loss: 1.5124360588735515 | Train Acc: 44.234645265318086%
	Val. Loss: 1.2832356194655101 |  Val. Acc: 38.93952546296296%
Epoch: 2 | Epoch Time: 0m 8s
	Train Loss: 0.9800451957770576 | Train Acc: 49.89097771914394%
	Val. Loss: 0.9699447515110174 |  Val. Acc: 49.660011574074076%
Epoch: 3 | Epoch Time: 0m 8s
	Train Loss: 0.9587001022059559 | Train Acc: 52.0100410436822%
	Val. Loss: 0.9814755531648794 |  Val. Acc: 50.86082175925925%
Epoch: 4 | Epoch Time: 0m 8s
	Train Loss: 0.9556486954160605 | Train Acc: 51.52631193198476%
	Val. Loss: 0.9893438580135504 |  Val. Acc: 50.47743055555556%
Epoch: 5 | Epoch Time: 0m 8s
	Train Loss: 0.9802465331900403 | Train Acc: 50.950967458223396%
	Val. Loss: 1.0503458008170128 |  Val. Acc: 48.90769675925925%
Epoch: 6 | Epoch Time: 0m 8s
	Train Loss: 0.9924766295503501 | Train Acc: 49.17362943418352%
	Val. Loss: 1.0863170213997364 |  Val. Acc: 45.20399305555556%
Epoch: 7 | Epoch Time: 0m 8s
	Train Loss: 1.0072979236655626 |

Epoch: 54 | Epoch Time: 0m 8s
	Train Loss: 1.0795943096948488 | Train Acc: 38.25307827616535%
	Val. Loss: 2.376678249488274 |  Val. Acc: 37.3119212962963%
Epoch: 55 | Epoch Time: 0m 8s
	Train Loss: 1.0797695110215362 | Train Acc: 38.23200674289065%
	Val. Loss: 2.3771719709038734 |  Val. Acc: 37.14554398148148%
Epoch: 56 | Epoch Time: 0m 8s
	Train Loss: 1.0790884619023366 | Train Acc: 38.253078276165354%
	Val. Loss: 2.3773137057820954 |  Val. Acc: 37.3119212962963%
Epoch: 57 | Epoch Time: 0m 8s
	Train Loss: 1.0792842262023987 | Train Acc: 38.25307827616535%
	Val. Loss: 2.378100593884786 |  Val. Acc: 37.3119212962963%
Epoch: 58 | Epoch Time: 0m 8s
	Train Loss: 1.0793218601662125 | Train Acc: 38.27414980944005%
	Val. Loss: 2.3765408247709274 |  Val. Acc: 37.3119212962963%
Epoch: 59 | Epoch Time: 0m 8s
	Train Loss: 1.0794609612084944 | Train Acc: 38.25307827616535%
	Val. Loss: 2.3782654826839766 |  Val. Acc: 37.14554398148148%
Epoch: 60 | Epoch Time: 0m 8s
	Train Loss: 1.079507201831385 | 