In [1]:
from utils.datautils import prepare_data, create_vocab
prepare_data('train_pos_full.txt', 'train_neg_full.txt', 'data')

(1675000, 2)
(825000, 2)


In [4]:
VOCAB, MAX_LEN = create_vocab('data')

In [5]:
EMBEDDING_DIM = 100
K = 200
BATCH_SIZE = 64

In [6]:
from utils.glove import load_glove
word_to_idx, embeddings, PAD_IDX, words = load_glove('embeddings/glove/glove.twitter.27B.100d.txt', EMBEDDING_DIM)

In [7]:
VOCAB_SIZE = len(embeddings)
VOCAB_SIZE

1193515

In [17]:
from datasets.glove_dataset import GloveDataset

train_data = GloveDataset('data/train.csv', MAX_LEN, word_to_idx, PAD_IDX, VOCAB, False)
valid_data = GloveDataset('data/valid.csv', MAX_LEN, word_to_idx, PAD_IDX, VOCAB, False)

TypeError: __init__() takes from 5 to 6 positional arguments but 7 were given

In [9]:
train_data.__len__(), valid_data.__len__()

(134000, 66000)

In [10]:
train_data.__getitem__(78)

(array([     71,      51,     137,      49,      73,      60, 1193514,
            187,     369,      11,     187, 1193514,     236,      50,
             51,     293,     199, 1193514,      76,      49, 1193514,
             11,     293,     293, 1193514,      51,      36,     351,
            293,      51,     456,     456,      49,     199, 1193514,
            506, 1193514,      11,      72,      11,    1016,      10,
             36,     456, 1193514,       4, 1193514,      11,      72,
             11,    1016,      10,      36,     456, 1193514,       4,
        1193514,      11,      72,      11,    1016,      10,      36,
            456, 1193514,       9, 1193514,       9, 1193514,      76,
             73,      50,      51,     456,     369,     187, 1193514,
             72,      49, 1193514,     187,      50, 1193514,     187,
             49,      11,      73,     137, 1193514,       1,       1,
        1193514,      76,      49,      11,      51,     187,      10,
      

In [11]:
from torch.utils.data import DataLoader
training_params = {"batch_size": BATCH_SIZE,
                       "shuffle": True,
                       "drop_last": True}

validation_params = {"batch_size": BATCH_SIZE,
                      "shuffle": False,
                      "drop_last": True}
train_iterator = DataLoader(train_data, **training_params)
valid_iterator = DataLoader(valid_data, **validation_params)

In [12]:
from models.cnn import CNN as CNN

In [13]:
N_FILTERS = 100
FILTER_SIZES = [3]
OUTPUT_DIM = 1
DROPOUT = 0.5

model = CNN(VOCAB_SIZE, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX)

In [14]:
VOCAB_SIZE

1193515

In [15]:

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 119,381,701 trainable parameters


In [16]:
import torch 
model.embedding.weight.data.copy_(torch.Tensor(embeddings))

tensor([[ 0.6301,  0.6518,  0.2555,  ...,  0.5510,  0.6471, -0.6093],
        [ 0.1821, -0.0485,  0.2397,  ..., -0.3358,  0.1888, -0.4079],
        [ 1.0674,  0.4572,  0.5146,  ...,  0.1397,  0.7649, -0.1731],
        ...,
        [-0.3831,  0.0646,  0.2738,  ..., -0.1640, -0.2697, -0.6994],
        [-0.0288, -0.7261, -0.8277,  ..., -0.6969, -0.7652, -1.0901],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]])

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
import torch.optim as optim
import torch.nn as nn

optimizer = optim.Adam(model.parameters())

criterion = nn.BCEWithLogitsLoss()

model = model.to(device)
criterion = criterion.to(device)

In [None]:
from utils.mterics import AverageMeter, binary_accuracy

In [None]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    num_iter_per_epoch = len(iterator)
    losses = AverageMeter()
    accuracies = AverageMeter()
    model.train()
    iter = 0
    for batch in iterator:
        iter += 1
        optimizer.zero_grad()
        x_wrd, labels = batch
        x_wrd = torch.LongTensor(x_wrd)
        if torch.cuda.is_available():
            x_wrd = x_wrd.cuda()
            labels = labels.cuda()
        
        predictions = model(x_wrd).squeeze(1)
        
        loss = criterion(predictions, labels.float())
        
        acc = binary_accuracy(predictions, labels)
        
        loss.backward()
        
        optimizer.step()
        
        losses.update(loss.data, x_wrd.size(0))
        accuracy = binary_accuracy(predictions, labels)
        accuracies.update(accuracy, x_wrd.size(0))

        if (iter % 20 == 0) and (iter > 0):
                print("[Train - Epoch: {}] , Iteration: {}/{} , Loss: {}, Accuracy: {}".format(
                epoch + 1,
                iter,
                num_iter_per_epoch,
                losses.avg,
                accuracies.avg
            ))
        
    return losses.avg.item(), accuracies.avg.item()

In [None]:
def evaluate(model, iterator, criterion):
    
    model.eval()
    losses = AverageMeter()
    accuracies = AverageMeter()
    num_iter_per_epoch = len(iterator)

    iter = 0
    for batch in iterator:
        iter += 1
        x_wrd, labels = batch
        x_wrd = torch.LongTensor(x_wrd)
        if torch.cuda.is_available():
            x_wrd = x_wrd.cuda()
            labels = labels.cuda()

        with torch.no_grad():
            predictions = model(x_wrd).squeeze(1)
        
        loss = criterion(predictions, labels.float())
        accuracy = binary_accuracy(predictions, labels)
        losses.update(loss.data, x_wrd.size(0))
        accuracies.update(accuracy, x_wrd.size(0))

        if (iter % 20 == 0) and (iter > 0):
            print("[Validation - Epoch: {}] , Iteration: {}/{} , Loss: {}, Accuracy: {}".format(
                epoch + 1,
                iter,
                num_iter_per_epoch,
                losses.avg,
                accuracies.avg
            ))


    return losses.avg.item(), accuracies.avg.item()

In [None]:

import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [None]:
N_EPOCHS = 15

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), model.filename)
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

In [None]:
from utils.datautils import prepare_test_data

prepare_test_data('test_data.txt', 'data')

In [None]:
def predict(model, iterator):
    model.eval()
    decimal_output, binary_output = [], []
    id = 1
    for tweet in iterator: 
        x_wrd, label = tweet
        x_wrd = torch.LongTensor(x_wrd).unsqueeze(0)
        if torch.cuda.is_available():
            x_wrd = x_wrd.cuda()
        prediction = torch.sigmoid(model(x_wrd))
        decimal_output.append([id, prediction.item()])
        if int(torch.round(prediction).item()) == 0:
            binary_output.append([id, -1])
        elif int(torch.round(prediction).item()) == 1:
            binary_output.append([id, 1])
        else:
            raise Exception("This should never happen")
        id += 1

        if id % 1000 == 0:
            print('Predicting example {}/{}'.format(id, len(iterator)))
    
    return decimal_output, binary_output


In [None]:
test_iterator = GloveDataset('data/test.csv', MAX_LEN, word_to_idx, PAD_IDX, VOCAB, False)

In [None]:
optimal_model = CNN(VOCAB_SIZE, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX)
optimal_model.load_state_dict(torch.load(model.filename))
optimal_model = optimal_model.to(device)

In [None]:
decimal_pred, binary_pred = predict(optimal_model, test_iterator)

In [None]:
from utils.datautils import write_predictions
binary_pred_df = write_predictions(binary_pred, 'Binary-GloVe-CNN.csv', data_dir='predfiles')