In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

In [1]:
from utils.datautils import prepare_data, create_vocab
prepare_data('train_pos_full.txt', 'train_neg_full.txt', 'data')

(134000, 2)
(66000, 2)


In [2]:
VOCAB, MAX_LEN = create_vocab('data')

In [3]:
EMBEDDING_DIM = 100
K = 200
BATCH_SIZE = 64

In [7]:
from utils.glove import load_glove
word_to_idx, embeddings, PAD_IDX, words = load_glove('embeddings/glove/glove.twitter.27B.100d.txt', EMBEDDING_DIM)

In [8]:
VOCAB_SIZE = len(embeddings)
VOCAB_SIZE

1193515

In [9]:
from datasets.glove_dataset import GloveDataset

train_data = GloveDataset('data/train.csv', MAX_LEN, word_to_idx, PAD_IDX, VOCAB)
valid_data = GloveDataset('data/valid.csv', MAX_LEN, word_to_idx, PAD_IDX, VOCAB)

In [10]:
train_data.__len__(), valid_data.__len__()

(134000, 66000)

In [12]:
train_data.__getitem__(378)

(array([    199,      73,      11,     456,      50,      36,      10,
             36,      11,     236,      11,     456,      50,      36,
             17,     236,      10,      36,      38,      72,      11,
            210,     210,     199,      28,      73,      50,      72,
            187,     369,      49,     236,      50,      36,     199,
             49,      73,     334,      51,     293,     236,      50,
             73,     293,     199,      50,     334,     187,      73,
             11,      36,     137,     351,      50,      73,     187,
             11,     187,      10,      50,      36,       1,     199,
             73,      11,     456,      50,      36,      10,      36,
             11,     236,      11,     456,      50,      36,      10,
            137,     187,     369,      49,     351,      49,      73,
            334,      49,     210,     187,     236,      11,      47,
              1,       1,       1,      71,      51,      73,     293,
      

In [13]:
from torch.utils.data import DataLoader
training_params = {"batch_size": BATCH_SIZE,
                       "shuffle": True,
                       "drop_last": True}

validation_params = {"batch_size": BATCH_SIZE,
                      "shuffle": False,
                      "drop_last": True}
train_iterator = DataLoader(train_data, **training_params)
valid_iterator = DataLoader(valid_data, **validation_params)

In [14]:
from models.lstm import RNN as RNN

In [15]:
INPUT_DIM = VOCAB_SIZE # len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5
PAD_IDX = PAD_IDX # :)

model = RNN(INPUT_DIM, 
            EMBEDDING_DIM, 
            HIDDEN_DIM, 
            OUTPUT_DIM, 
            N_LAYERS, 
            BIDIRECTIONAL, 
            DROPOUT, 
            PAD_IDX)


In [15]:
VOCAB_SIZE

1193515

In [16]:

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 121,662,157 trainable parameters


In [17]:
import torch 
model.embedding.weight.data.copy_(torch.Tensor(embeddings))

tensor([[ 0.6301,  0.6518,  0.2555,  ...,  0.5510,  0.6471, -0.6093],
        [ 0.1821, -0.0485,  0.2397,  ..., -0.3358,  0.1888, -0.4079],
        [ 1.0674,  0.4572,  0.5146,  ...,  0.1397,  0.7649, -0.1731],
        ...,
        [-0.3831,  0.0646,  0.2738,  ..., -0.1640, -0.2697, -0.6994],
        [-0.0288, -0.7261, -0.8277,  ..., -0.6969, -0.7652, -1.0901],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]])

In [18]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [19]:
import torch.optim as optim
import torch.nn as nn

optimizer = optim.Adam(model.parameters())

criterion = nn.BCEWithLogitsLoss()

model = model.to(device)
criterion = criterion.to(device)

In [20]:
from utils.mterics import AverageMeter, binary_accuracy

In [21]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    num_iter_per_epoch = len(iterator)
    losses = AverageMeter()
    accuracies = AverageMeter()
    model.train()
    iter = 0
    for batch in iterator:
        iter += 1
        optimizer.zero_grad()
        x_wrd, lengths, labels = batch
        x_wrd = torch.LongTensor(x_wrd)
        if torch.cuda.is_available():
            x_wrd = x_wrd.cuda()
            lengths = lengths.cuda()
            labels = labels.cuda()
        
        predictions = model(x_wrd, lengths).squeeze(1)
        
        loss = criterion(predictions, labels.float())
        
        acc = binary_accuracy(predictions, labels)
        
        loss.backward()
        
        optimizer.step()
        
        losses.update(loss.data, x_wrd.size(0))
        accuracy = binary_accuracy(predictions, labels)
        accuracies.update(accuracy, x_wrd.size(0))

        if (iter % 20 == 0) and (iter > 0):
                print("[Train - Epoch: {}] , Iteration: {}/{} , Loss: {}, Accuracy: {}".format(
                epoch + 1,
                iter,
                num_iter_per_epoch,
                losses.avg,
                accuracies.avg
            ))
        
    return losses.avg.item(), accuracies.avg.item()

In [22]:
def evaluate(model, iterator, criterion):
    
    model.eval()
    losses = AverageMeter()
    accuracies = AverageMeter()
    num_iter_per_epoch = len(iterator)

    iter = 0
    for batch in iterator:
        iter += 1
        x_wrd, lengths, labels = batch
        x_wrd = torch.LongTensor(x_wrd)
        if torch.cuda.is_available():
            x_wrd = x_wrd.cuda()
            lenghts = lengths.cuda()
            labels = labels.cuda()

        with torch.no_grad():
            predictions = model(x_wrd, lengths).squeeze(1)
        
        loss = criterion(predictions, labels.float())
        accuracy = binary_accuracy(predictions, labels)
        losses.update(loss.data, x_wrd.size(0))
        accuracies.update(accuracy, x_wrd.size(0))

        if (iter % 20 == 0) and (iter > 0):
            print("[Validation - Epoch: {}] , Iteration: {}/{} , Loss: {}, Accuracy: {}".format(
                epoch + 1,
                iter,
                num_iter_per_epoch,
                losses.avg,
                accuracies.avg
            ))


    return losses.avg.item(), accuracies.avg.item()

In [23]:

import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [1]:
N_EPOCHS = 15

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), model.filename)
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

NameError: name 'time' is not defined

In [2]:
from utils.datautils import prepare_test_data

prepare_test_data('test_data.txt', 'data')

ModuleNotFoundError: No module named 'utils'

In [3]:
def predict(model, iterator):
    model.eval()
    decimal_output, binary_output = [], []
    id = 1
    for tweet in iterator: 
        x_wrd, lengths, label = tweet
        x_wrd = torch.LongTensor(x_wrd).unsqueeze(0)
        lengths = torch.LongTensor([lengths])
        if torch.cuda.is_available():
            x_wrd = x_wrd.cuda()
            lenghts = lengths.cuda()
        prediction = torch.sigmoid(model(x_wrd, lengths))
        decimal_output.append([id, prediction.item()])
        if int(torch.round(prediction).item()) == 0:
            binary_output.append([id, -1])
        elif int(torch.round(prediction).item()) == 1:
            binary_output.append([id, 1])
        else:
            raise Exception("This should never happen")
        id += 1

        if id % 1000 == 0:
            print('Predicting example {}/{}'.format(id, len(iterator)))
    
    return decimal_output, binary_output


In [2]:
test_iterator = GloveDataset('data/test.csv', MAX_LEN, word_to_idx, PAD_IDX, VOCAB)

NameError: name 'GloveDataset' is not defined

In [None]:
optimal_model = RNN(INPUT_DIM, 
            EMBEDDING_DIM, 
            HIDDEN_DIM, 
            OUTPUT_DIM, 
            N_LAYERS, 
            BIDIRECTIONAL, 
            DROPOUT, 
            PAD_IDX)
optimal_model.load_state_dict(torch.load(model.filename))
optimal_model = optimal_model.to(device)

In [None]:
decimal_pred, binary_pred = predict(optimal_model, test_iterator)

In [None]:
from utils.datautils import write_predictions
binary_pred_df = write_predictions(binary_pred, 'LSTM-Binary-Predictions.csv', data_dir='predfiles')