In [1]:
import sys
import json
import os
import timeit
import random
SEED = 340

sys.path.insert(0, '../')
from src.anke.data_loader import ClassificationDataset
from src.anke.sentence_classifier import SentenceClassifier
from src.anke.train import train, evaluate

from torch.utils.data import DataLoader
import torch.optim as optim
import torch.nn as nn
import torch

from torch.utils.tensorboard import SummaryWriter

In [2]:
def file_to_list(path):
    list_ = []
    f = open(path, 'r')
    lines = f.readlines()
    for line in lines:
        list_.append(line.replace('\n',''))
    f.close()

    return list_

def list_to_file(path, list_):
    with open(path, 'w') as f:
        for item in list_:
            f.write(item + '\n')

In [3]:
with open('../utils/params_anke_all.json', 'r') as f:
    params = json.load(f)

embedding_type = 'glove'
embeddings_path = '../utils/glove.6B.100d.txt'

ROOT_DIR = ".."
all_train = "data/classification/all/train"
all_val = "data/classification/all/val"
all_test = "data/classification/all/test"
undersampling_train = "data/classification/all/undersampling_train"

train_data_path = os.path.join(ROOT_DIR, undersampling_train)
valid_data_path = os.path.join(ROOT_DIR, all_val)
test_data_path = os.path.join(ROOT_DIR, all_test)

In [4]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

In [41]:
def train_and_evaluate(train_dataset, valid_dataset, test_dataset, model_path, summarywriter_suffix):
    train_loader = DataLoader(train_dataset, batch_size=params['batch_size'], num_workers=2)
    valid_loader = DataLoader(valid_dataset, batch_size=params['batch_size'], num_workers=2)

    word_vectors = train_dataset.glove_matrix

    model = SentenceClassifier(params, word_vectors)

    optimizer = optim.Adam(model.parameters(), lr=params['learning_rate'])
    criterion = nn.BCELoss()

    # Push to cuda if available
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(device)
    model = model.to(device)

    # Training
    SEED = 2020
    torch.manual_seed(SEED)
    torch.backends.cudnn.deterministic = True

    writer = SummaryWriter(filename_suffix=summarywriter_suffix)

    # Model Architecture
    print(model)

    print(f'The model has {count_parameters(model):,} trainable parameters')

    best_valid_loss = float('inf')

    print("Training Started")
    train_start_time = timeit.default_timer()
    for epoch in range(10):

        epoch_start_time = timeit.default_timer()

        # Train the model
        model.train()
        train_iterator = iter(train_loader)
        train_loss, train_acc, train_f1 = train(model, train_iterator, optimizer, criterion)

        model.eval()
        # Evaluate the model
        valid_iterator = iter(valid_loader)
        valid_loss, valid_acc, valid_f1 = evaluate(model, valid_iterator, criterion)

        # Save the best model
        if valid_loss < best_valid_loss:
            print("New model saved with valid loss-", best_valid_loss, "at", model_path)
            best_valid_loss = valid_loss
            torch.save(model.state_dict(), model_path)

        epoch_end_time = timeit.default_timer()

        print(f'Epoch: {epoch} | time taken: {epoch_end_time - epoch_start_time}')
        print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.4f}% | Train F1: {train_f1:.6f}')
        print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.4f}% | Valid F1: {valid_f1:.6f}')
        writer.add_scalar('Loss/train', train_loss, epoch)
        writer.add_scalar('Loss/validation', valid_loss, epoch)
        writer.add_scalar('F1/train', train_f1, epoch)
        writer.add_scalar('F1/validation', valid_f1, epoch)

    train_end_time = timeit.default_timer()
    print(f'Total training time: {train_end_time - train_start_time}')
    
    # Evaluation

    train_iterator = iter(train_loader)
    valid_iterator = iter(valid_loader)

    test_loader = DataLoader(test_dataset, batch_size=params['batch_size'], num_workers=2)
    test_iterator = iter(test_loader)

    model.load_state_dict(torch.load(model_path))
    model.eval()

    valid_loss, valid_acc, valid_f1 = evaluate(model, valid_iterator, criterion)
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.4f}% | Valid F1: {valid_f1:.6f}')


    model.load_state_dict(torch.load(model_path))
    model.eval()

    valid_loss, valid_acc, valid_f1 = evaluate(model, test_iterator, criterion)
    print(f'\t Test Loss: {valid_loss:.3f} |  Test Acc: {valid_acc*100:.4f}% | Test F1: {valid_f1:.6f}')

In [34]:
train_data_path_0 = os.path.join(train_data_path, '0')
train_data_path_1= os.path.join(train_data_path, '1')
train_data_path_2 = os.path.join(train_data_path, '2')
train_data_path_3 = os.path.join(train_data_path, '3')
train_data_path_4 = os.path.join(train_data_path, '4')

train_dataset_0 = ClassificationDataset(train_data_path_0, embedding_type, embeddings_path)
train_dataset_1 = ClassificationDataset(train_data_path_1, embedding_type, embeddings_path)
train_dataset_2 = ClassificationDataset(train_data_path_2, embedding_type, embeddings_path)
train_dataset_3 = ClassificationDataset(train_data_path_3, embedding_type, embeddings_path)
train_dataset_4 = ClassificationDataset(train_data_path_4, embedding_type, embeddings_path)

valid_dataset = ClassificationDataset(valid_data_path, embedding_type, embeddings_path)
test_dataset = ClassificationDataset(test_data_path, embedding_type, embeddings_path)

In [42]:
model_path = '../saved_models/anke_all_undersampling_0.pt'
summarywriter_suffix = 'undersampling_0'
train_and_evaluate(train_dataset_0, valid_dataset, test_dataset, model_path, summarywriter_suffix)

cuda
SentenceClassifier(
  (embedding): Embedding(400002, 100)
  (conv): Conv1d(100, 100, kernel_size=(3,), stride=(1,), padding=(1,))
  (maxpool): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (dropout): Dropout(p=0.5, inplace=True)
  (lstm): LSTM(100, 100, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=200, out_features=1, bias=True)
)
The model has 191,901 trainable parameters
Training Started
New model saved with valid loss- inf at ../saved_models/anke_all_undersampling_0.pt
Epoch: 0 | time taken: 39.250284690991975
	Train Loss: 0.619 | Train Acc: 80.7939% | Train F1: 0.323889
	 Val. Loss: 2.011 |  Val. Acc: 17.1385% | Valid F1: 0.289455
New model saved with valid loss- 2.0108061027377375 at ../saved_models/anke_all_undersampling_0.pt
Epoch: 1 | time taken: 8.468030366115272
	Train Loss: 0.764 | Train Acc: 57.3467% | Train F1: 0.287582
	 Val. Loss: 1.290 |  Val. Acc: 17.1385% | Valid F1: 0.289455
New model saved with valid loss- 1.2

In [43]:
model_path = '../saved_models/anke_all_undersampling_1.pt'
summarywriter_suffix = 'undersampling_1'
train_and_evaluate(train_dataset_1, valid_dataset, test_dataset, model_path, summarywriter_suffix)

cuda
SentenceClassifier(
  (embedding): Embedding(400002, 100)
  (conv): Conv1d(100, 100, kernel_size=(3,), stride=(1,), padding=(1,))
  (maxpool): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (dropout): Dropout(p=0.5, inplace=True)
  (lstm): LSTM(100, 100, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=200, out_features=1, bias=True)
)
The model has 191,901 trainable parameters
Training Started
New model saved with valid loss- inf at ../saved_models/anke_all_undersampling_1.pt
Epoch: 0 | time taken: 7.257950491970405
	Train Loss: 0.623 | Train Acc: 80.4639% | Train F1: 0.321056
	 Val. Loss: 2.019 |  Val. Acc: 17.1385% | Valid F1: 0.289455
New model saved with valid loss- 2.0193755252981633 at ../saved_models/anke_all_undersampling_1.pt
Epoch: 1 | time taken: 7.111807515961118
	Train Loss: 0.764 | Train Acc: 57.5116% | Train F1: 0.285707
	 Val. Loss: 1.273 |  Val. Acc: 17.1385% | Valid F1: 0.289455
New model saved with valid loss- 1.27

In [44]:
model_path = '../saved_models/anke_all_undersampling_2.pt'
summarywriter_suffix = 'undersampling_2'
train_and_evaluate(train_dataset_2, valid_dataset, test_dataset, model_path, summarywriter_suffix)

cuda
SentenceClassifier(
  (embedding): Embedding(400002, 100)
  (conv): Conv1d(100, 100, kernel_size=(3,), stride=(1,), padding=(1,))
  (maxpool): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (dropout): Dropout(p=0.5, inplace=True)
  (lstm): LSTM(100, 100, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=200, out_features=1, bias=True)
)
The model has 191,901 trainable parameters
Training Started
New model saved with valid loss- inf at ../saved_models/anke_all_undersampling_2.pt
Epoch: 0 | time taken: 7.177982227993198
	Train Loss: 0.616 | Train Acc: 81.0753% | Train F1: 0.326727
	 Val. Loss: 2.030 |  Val. Acc: 17.1385% | Valid F1: 0.289455
New model saved with valid loss- 2.0299374055712947 at ../saved_models/anke_all_undersampling_2.pt
Epoch: 1 | time taken: 7.2454009969951585
	Train Loss: 0.766 | Train Acc: 57.2981% | Train F1: 0.285448
	 Val. Loss: 1.283 |  Val. Acc: 17.1385% | Valid F1: 0.289455
New model saved with valid loss- 1.2

In [45]:
model_path = '../saved_models/anke_all_undersampling_3.pt'
summarywriter_suffix = 'undersampling_3'
train_and_evaluate(train_dataset_3, valid_dataset, test_dataset, model_path, summarywriter_suffix)

cuda
SentenceClassifier(
  (embedding): Embedding(400002, 100)
  (conv): Conv1d(100, 100, kernel_size=(3,), stride=(1,), padding=(1,))
  (maxpool): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (dropout): Dropout(p=0.5, inplace=True)
  (lstm): LSTM(100, 100, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=200, out_features=1, bias=True)
)
The model has 191,901 trainable parameters
Training Started
New model saved with valid loss- inf at ../saved_models/anke_all_undersampling_3.pt
Epoch: 0 | time taken: 7.517769415979274
	Train Loss: 0.622 | Train Acc: 80.6871% | Train F1: 0.323241
	 Val. Loss: 2.008 |  Val. Acc: 17.1385% | Valid F1: 0.289455
New model saved with valid loss- 2.00841339927482 at ../saved_models/anke_all_undersampling_3.pt
Epoch: 1 | time taken: 7.516769345966168
	Train Loss: 0.765 | Train Acc: 57.3806% | Train F1: 0.286310
	 Val. Loss: 1.257 |  Val. Acc: 17.1385% | Valid F1: 0.289455
New model saved with valid loss- 1.2567

In [46]:
model_path = '../saved_models/anke_all_undersampling_4.pt'
summarywriter_suffix = 'undersampling_4'
train_and_evaluate(train_dataset_4, valid_dataset, test_dataset, model_path, summarywriter_suffix)

cuda
SentenceClassifier(
  (embedding): Embedding(400002, 100)
  (conv): Conv1d(100, 100, kernel_size=(3,), stride=(1,), padding=(1,))
  (maxpool): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (dropout): Dropout(p=0.5, inplace=True)
  (lstm): LSTM(100, 100, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=200, out_features=1, bias=True)
)
The model has 191,901 trainable parameters
Training Started
New model saved with valid loss- inf at ../saved_models/anke_all_undersampling_4.pt
Epoch: 0 | time taken: 7.349027410033159
	Train Loss: 0.620 | Train Acc: 80.7356% | Train F1: 0.323427
	 Val. Loss: 2.049 |  Val. Acc: 17.1385% | Valid F1: 0.289455
New model saved with valid loss- 2.048695451787273 at ../saved_models/anke_all_undersampling_4.pt
Epoch: 1 | time taken: 7.226410419913009
	Train Loss: 0.767 | Train Acc: 57.4922% | Train F1: 0.284457
	 Val. Loss: 1.262 |  Val. Acc: 17.1385% | Valid F1: 0.289455
New model saved with valid loss- 1.262

In [49]:
train_data_path = os.path.join(ROOT_DIR, 'data/classification/all/train')
model_path = '../saved_models/anke_all_glove.pt'
summarywriter_suffix = 'anke_all_glove'
train_dataset = ClassificationDataset(train_data_path, embedding_type, embeddings_path)
train_and_evaluate(train_dataset, valid_dataset, test_dataset, model_path, summarywriter_suffix)

cuda
SentenceClassifier(
  (embedding): Embedding(400002, 100)
  (conv): Conv1d(100, 100, kernel_size=(3,), stride=(1,), padding=(1,))
  (maxpool): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (dropout): Dropout(p=0.5, inplace=True)
  (lstm): LSTM(100, 100, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=200, out_features=1, bias=True)
)
The model has 191,901 trainable parameters
Training Started
New model saved with valid loss- inf at ../saved_models/anke_all_glove.pt
Epoch: 0 | time taken: 16.287920363014564
	Train Loss: 0.450 | Train Acc: 82.8939% | Train F1: 0.003414
	 Val. Loss: 0.408 |  Val. Acc: 82.9937% | Valid F1: 0.015418
New model saved with valid loss- 0.40835279599999935 at ../saved_models/anke_all_glove.pt
Epoch: 1 | time taken: 14.284547388087958
	Train Loss: 0.399 | Train Acc: 83.5445% | Train F1: 0.137826
	 Val. Loss: 0.396 |  Val. Acc: 83.6648% | Valid F1: 0.170354
New model saved with valid loss- 0.39584441179392105 a