In [1]:
# Reload modules
%load_ext autoreload
%autoreload 2

In [2]:
from preprocess.preprocess import clean_str, build_data

In [5]:
from model.model import CNN1d, binary_accuracy, train, evaluate, epoch_time

In [29]:
import torch
from torchtext.data import TabularDataset, Field, LabelField, BucketIterator
import torch.optim as optim
import torch.nn as nn
import time
from gensim.models import KeyedVectors
import numpy as np

In [13]:
BATCH_SIZE = 50
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

TEXT = Field(sequential = True, # text: sequential data
             tokenize = str.split, 
             batch_first = True, 
             fix_length = 56, # padding size: max length of data text
             lower = True)
LABEL = LabelField(sequential = False,
                   dtype = torch.float)

w2v = KeyedVectors.load_word2vec_format('./model/GoogleNews-vectors-negative300.bin.gz', binary = True)

In [26]:
# make dataset for 10-fold
data_dir = './preprocess'
train_paths, val_paths = build_data(data_dir)

save the kfold 0 data
save the kfold 1 data
save the kfold 2 data
save the kfold 3 data
save the kfold 4 data
save the kfold 5 data
save the kfold 6 data
save the kfold 7 data
save the kfold 8 data
save the kfold 9 data


In [14]:
N_EPOCHS = 10
EMBEDDING_DIM = 300
N_FILTERS = 100
FILTER_SIZES = [3,4,5]
OUTPUT_DIM = 1
DROPOUT = 0.5
test_acc_lists = []

In [27]:
for kfold in range(10):
    # make datasets
    train_path = train_paths[kfold]
    val_path = val_paths[kfold]
    train_data = TabularDataset(path= train_path, skip_header = True,
        format='csv', fields=[('label', LABEL), ('text', TEXT)])
    test_data = TabularDataset(path= val_path, skip_header = True,
        format='csv', fields=[('label', LABEL), ('text', TEXT)])

    TEXT.build_vocab(train_data)
    LABEL.build_vocab(train_data)

    # for pretrained embedding vectors
    w2v_vectors = []
    for token, idx in TEXT.vocab.stoi.items():
        # pad token -> zero 
        if idx == 1:
            w2v_vectors.append(torch.zeros(EMBEDDING_DIM))
        # if word in word2vec vocab -> replace with pretrained word2vec
        elif token in w2v.wv.vocab.keys():
            w2v_vectors.append(torch.FloatTensor(w2v[token]))
        # oov -> randomly initialized uniform distribution
        else: 
            w2v_vectors.append(torch.distributions.Uniform(-0.25, +0.25).sample((EMBEDDING_DIM,)))

    TEXT.vocab.set_vectors(TEXT.vocab.stoi, w2v_vectors, EMBEDDING_DIM)
    pretrained_embeddings = torch.FloatTensor(TEXT.vocab.vectors)

    # make iterators
    train_iterator,  test_iterator = BucketIterator.splits(
        (train_data, test_data), 
        batch_size = BATCH_SIZE, 
        device = device, sort=False, shuffle = True)
    
    # define a model
    INPUT_DIM = len(TEXT.vocab)

    model = CNN1d(pretrained_embeddings, INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT)
    optimizer = optim.Adadelta(model.parameters(), rho=0.95)
    criterion = nn.BCEWithLogitsLoss()

    model = model.to(device)
    criterion = criterion.to(device)

    # train
    best_test_acc = -float('inf')
    model_name= './model/model' + str(kfold) + '.pt'
    print('kfold', kfold)
    for epoch in range(N_EPOCHS):

        start_time = time.time()

        train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
        test_loss, test_acc = evaluate(model, test_iterator, criterion)

        end_time = time.time()

        epoch_mins, epoch_secs = epoch_time(start_time, end_time)

        if test_acc > best_test_acc:
            best_test_acc = test_acc
            torch.save(model.state_dict(), model_name)

        
        print(f'\tEpoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
        print(f'\t\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
        print(f'\t\tTest. Loss: {test_loss:.3f} |  Val. Acc: {test_acc*100:.2f}%')
    

    model.load_state_dict(torch.load(model_name))

    test_loss, test_acc = evaluate(model, test_iterator, criterion)
    test_acc_lists.append(test_acc)
    print(f'============== last test accuracy: {test_acc}')
    print()



kfold 0
	Epoch: 01 | Epoch Time: 0m 48s
		Train Loss: 0.557 | Train Acc: 70.39%
		Test. Loss: 0.440 |  Val. Acc: 78.95%
	Epoch: 02 | Epoch Time: 0m 51s
		Train Loss: 0.434 | Train Acc: 79.65%
		Test. Loss: 0.415 |  Val. Acc: 78.86%
	Epoch: 03 | Epoch Time: 0m 51s
		Train Loss: 0.349 | Train Acc: 84.71%
		Test. Loss: 0.424 |  Val. Acc: 80.95%
	Epoch: 04 | Epoch Time: 0m 51s
		Train Loss: 0.259 | Train Acc: 88.92%
		Test. Loss: 0.469 |  Val. Acc: 81.24%
	Epoch: 05 | Epoch Time: 0m 51s
		Train Loss: 0.188 | Train Acc: 92.96%
		Test. Loss: 0.483 |  Val. Acc: 81.05%
	Epoch: 06 | Epoch Time: 0m 52s
		Train Loss: 0.116 | Train Acc: 95.84%
		Test. Loss: 0.655 |  Val. Acc: 77.71%
	Epoch: 07 | Epoch Time: 0m 51s
		Train Loss: 0.068 | Train Acc: 97.77%
		Test. Loss: 0.513 |  Val. Acc: 80.48%
	Epoch: 08 | Epoch Time: 0m 52s
		Train Loss: 0.040 | Train Acc: 98.99%
		Test. Loss: 0.651 |  Val. Acc: 78.95%
	Epoch: 09 | Epoch Time: 0m 52s
		Train Loss: 0.023 | Train Acc: 99.51%
		Test. Loss: 0.599 |  V


kfold 7
	Epoch: 01 | Epoch Time: 1m 3s
		Train Loss: 0.553 | Train Acc: 70.64%
		Test. Loss: 0.491 |  Val. Acc: 74.63%
	Epoch: 02 | Epoch Time: 1m 4s
		Train Loss: 0.421 | Train Acc: 80.89%
		Test. Loss: 0.537 |  Val. Acc: 72.00%
	Epoch: 03 | Epoch Time: 1m 6s
		Train Loss: 0.346 | Train Acc: 84.74%
		Test. Loss: 0.446 |  Val. Acc: 79.08%
	Epoch: 04 | Epoch Time: 1m 3s
		Train Loss: 0.260 | Train Acc: 89.44%
		Test. Loss: 0.474 |  Val. Acc: 79.17%
	Epoch: 05 | Epoch Time: 1m 0s
		Train Loss: 0.184 | Train Acc: 93.01%
		Test. Loss: 0.506 |  Val. Acc: 79.82%
	Epoch: 06 | Epoch Time: 0m 59s
		Train Loss: 0.107 | Train Acc: 95.91%
		Test. Loss: 0.553 |  Val. Acc: 79.56%
	Epoch: 07 | Epoch Time: 1m 0s
		Train Loss: 0.066 | Train Acc: 97.92%
		Test. Loss: 0.584 |  Val. Acc: 80.07%
	Epoch: 08 | Epoch Time: 0m 58s
		Train Loss: 0.039 | Train Acc: 98.96%
		Test. Loss: 0.632 |  Val. Acc: 79.93%
	Epoch: 09 | Epoch Time: 0m 59s
		Train Loss: 0.022 | Train Acc: 99.53%
		Test. Loss: 0.693 |  Val. A

In [30]:
print('============== 10 - fold test accuracy ==============')
print(f'Mean acc {np.mean(test_acc_lists)  * 100 :.2f}%')
print(test_acc_lists)

Mean acc 81.01%
[0.821313121102073, 0.7948148142207753, 0.821904752935682, 0.7987674957229978, 0.8052173824537368, 0.8094202876091003, 0.8169924758729481, 0.8101185741631881, 0.809130425038545, 0.8091925467763629, 0.8098989860578016, 0.814705881205472]
