In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (for example the local preprocess/ and model/ packages used below)
# are picked up on the next cell execution without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [1]:
from preprocess.preprocess import clean_str, build_data

In [2]:
from model.model import CNN1d, binary_accuracy, train, evaluate, epoch_time

In [3]:
import random
import time

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from gensim.models import KeyedVectors
from torchtext.data import TabularDataset, Field, LabelField, BucketIterator

In [4]:
# Run configuration: RNG seed, batch size, compute device and the torchtext
# fields that describe how the two CSV columns are turned into tensors.
SEED = 1234
BATCH_SIZE = 50
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Seed the Python, NumPy and PyTorch generators up front so that everything
# downstream that draws from them (e.g. the random <unk> embedding sampled in
# the training cell) can be repeated from a fresh kernel.
# NOTE(review): GPU kernels may still be non-deterministic; see the PyTorch
# reproducibility notes if bit-exact runs are required.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)  # silently ignored when CUDA is unavailable

# Text column: whitespace-tokenised, lower-cased, batch dimension first, and
# every example padded to a fixed length.
TEXT = Field(sequential = True, # text: sequential data
             tokenize = str.split,
             batch_first = True,
             fix_length = 56, # padding size: max length of data text
             lower = True)

# Label column: a single non-sequential value, stored as float because the
# training cell uses nn.BCEWithLogitsLoss.
LABEL = LabelField(sequential = False,
                   dtype = torch.float)

In [5]:
# make dataset for 10-fold
data_dir = './preprocess'
train_paths, val_paths = build_data(data_dir)

save the kfold 0 data
save the kfold 1 data
save the kfold 2 data
save the kfold 3 data
save the kfold 4 data
save the kfold 5 data
save the kfold 6 data
save the kfold 7 data
save the kfold 8 data
save the kfold 9 data


In [6]:
# Hyper-parameters consumed by the k-fold training cell below.
N_EPOCHS = 10  # training epochs per fold
EMBEDDING_DIM = 300  # width of one word vector; also sizes the <pad>/<unk> rows set in the training cell
N_FILTERS = 100  # passed to CNN1d — presumably feature maps per filter size; confirm in model/model.py
FILTER_SIZES = [3,4,5]  # passed to CNN1d — presumably convolution window widths in tokens; confirm
OUTPUT_DIM = 1  # passed to CNN1d; the loss is BCEWithLogitsLoss, so presumably one logit per example
DROPOUT = 0.5  # passed to CNN1d — presumably the dropout probability; confirm
test_acc_lists = []  # the training cell appends one held-out accuracy per fold

In [7]:
# Pretrained English fastText word vectors; handed to TEXT.build_vocab in the
# training cell so the vocabulary carries an embedding row per token.
# NOTE(review): torchtext is expected to download and cache these vectors on
# first use, and they must be EMBEDDING_DIM (300) wide — confirm both.
from torchtext.vocab import FastText
vectors=FastText(language='en')

AttributeError: 'FastText' object has no attribute 'shape'

In [8]:
# 10-fold cross-validation: for every fold, rebuild the vocabulary from that
# fold's training file, train a fresh CNN1d for N_EPOCHS epochs, keep the
# checkpoint with the best held-out accuracy, and record that checkpoint's
# accuracy in test_acc_lists.
#
# NOTE(review): the checkpoint is selected on the same held-out fold whose
# accuracy is then reported, so the reported numbers are optimistically biased.
# Carve a separate validation split out of the training data if an unbiased
# estimate is needed.

# Start from an empty result list so that re-running this cell does not append
# a second set of fold scores to the ones already collected.
test_acc_lists = []

# Column layout shared by the training and held-out CSV files of every fold.
csv_fields = [('label', LABEL), ('text', TEXT)]

for kfold in range(10):
    # Load this fold's training and held-out CSV files (header row skipped).
    train_path = train_paths[kfold]
    val_path = val_paths[kfold]
    train_data = TabularDataset(path=train_path, skip_header=True,
                                format='csv', fields=csv_fields)
    test_data = TabularDataset(path=val_path, skip_header=True,
                               format='csv', fields=csv_fields)

    # Vocabularies are built from the training split only.
    TEXT.build_vocab(train_data, vectors=vectors)
    LABEL.build_vocab(train_data)

    # Embedding matrix for this fold's vocabulary: the <pad> row is zeroed and
    # the <unk> row is drawn uniformly from [-0.25, 0.25).
    pretrained_embeddings = torch.FloatTensor(TEXT.vocab.vectors)
    PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
    UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
    pretrained_embeddings[PAD_IDX] = torch.zeros(EMBEDDING_DIM)
    pretrained_embeddings[UNK_IDX] = torch.distributions.Uniform(-0.25, +0.25).sample((EMBEDDING_DIM,))

    # Batch iterators over both splits.
    train_iterator, test_iterator = BucketIterator.splits(
        (train_data, test_data),
        batch_size=BATCH_SIZE,
        device=device, sort=False, shuffle=True)

    # A fresh model, optimizer and loss for every fold.
    INPUT_DIM = len(TEXT.vocab)

    model = CNN1d(pretrained_embeddings, INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT)
    optimizer = optim.Adadelta(model.parameters(), rho=0.95)
    criterion = nn.BCEWithLogitsLoss()

    model = model.to(device)
    criterion = criterion.to(device)

    # Train, checkpointing whenever the held-out accuracy improves.
    best_test_acc = -float('inf')
    model_name = './model/model_ft' + str(kfold) + '.pt'
    print('kfold', kfold)
    for epoch in range(N_EPOCHS):

        start_time = time.time()

        train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
        test_loss, test_acc = evaluate(model, test_iterator, criterion)

        end_time = time.time()

        epoch_mins, epoch_secs = epoch_time(start_time, end_time)

        if test_acc > best_test_acc:
            best_test_acc = test_acc
            torch.save(model.state_dict(), model_name)

        print(f'\tEpoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
        print(f'\t\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
        # Both numbers on this line come from the held-out (test) iterator, so
        # both are labelled "Test." (the accuracy used to be labelled "Val.").
        print(f'\t\tTest. Loss: {test_loss:.3f} | Test. Acc: {test_acc*100:.2f}%')

    # Restore the best checkpoint of this fold and record its held-out accuracy.
    model.load_state_dict(torch.load(model_name))

    test_loss, test_acc = evaluate(model, test_iterator, criterion)
    test_acc_lists.append(test_acc)
    print(f'============== last test accuracy: {test_acc}')
    print()

kfold 0
	Epoch: 01 | Epoch Time: 1m 1s
		Train Loss: 0.588 | Train Acc: 67.86%
		Test. Loss: 0.539 |  Val. Acc: 73.45%
	Epoch: 02 | Epoch Time: 1m 0s
		Train Loss: 0.470 | Train Acc: 77.17%
		Test. Loss: 0.500 |  Val. Acc: 74.82%
	Epoch: 03 | Epoch Time: 1m 0s
		Train Loss: 0.400 | Train Acc: 81.99%
		Test. Loss: 0.534 |  Val. Acc: 76.09%
	Epoch: 04 | Epoch Time: 1m 0s
		Train Loss: 0.342 | Train Acc: 85.15%
		Test. Loss: 0.498 |  Val. Acc: 77.36%
	Epoch: 05 | Epoch Time: 1m 11s
		Train Loss: 0.273 | Train Acc: 89.00%
		Test. Loss: 0.547 |  Val. Acc: 76.36%
	Epoch: 06 | Epoch Time: 1m 19s
		Train Loss: 0.212 | Train Acc: 91.51%
		Test. Loss: 0.574 |  Val. Acc: 78.36%
	Epoch: 07 | Epoch Time: 1m 16s
		Train Loss: 0.149 | Train Acc: 94.33%
		Test. Loss: 0.638 |  Val. Acc: 77.55%
	Epoch: 08 | Epoch Time: 1m 20s
		Train Loss: 0.115 | Train Acc: 95.89%
		Test. Loss: 0.688 |  Val. Acc: 75.45%
	Epoch: 09 | Epoch Time: 1m 19s
		Train Loss: 0.065 | Train Acc: 98.05%
		Test. Loss: 0.823 |  Val. 


kfold 7
	Epoch: 01 | Epoch Time: 1m 6s
		Train Loss: 0.590 | Train Acc: 67.70%
		Test. Loss: 0.510 |  Val. Acc: 73.79%
	Epoch: 02 | Epoch Time: 0m 58s
		Train Loss: 0.475 | Train Acc: 77.51%
		Test. Loss: 0.441 |  Val. Acc: 78.78%
	Epoch: 03 | Epoch Time: 0m 57s
		Train Loss: 0.407 | Train Acc: 81.75%
		Test. Loss: 0.447 |  Val. Acc: 76.83%
	Epoch: 04 | Epoch Time: 1m 0s
		Train Loss: 0.338 | Train Acc: 85.19%
		Test. Loss: 0.454 |  Val. Acc: 78.87%
	Epoch: 05 | Epoch Time: 0m 58s
		Train Loss: 0.271 | Train Acc: 88.59%
		Test. Loss: 0.453 |  Val. Acc: 78.40%
	Epoch: 06 | Epoch Time: 0m 58s
		Train Loss: 0.213 | Train Acc: 91.19%
		Test. Loss: 0.489 |  Val. Acc: 79.63%
	Epoch: 07 | Epoch Time: 0m 57s
		Train Loss: 0.149 | Train Acc: 94.31%
		Test. Loss: 0.486 |  Val. Acc: 78.93%
	Epoch: 08 | Epoch Time: 0m 58s
		Train Loss: 0.100 | Train Acc: 96.53%
		Test. Loss: 0.552 |  Val. Acc: 80.16%
	Epoch: 09 | Epoch Time: 0m 59s
		Train Loss: 0.077 | Train Acc: 97.42%
		Test. Loss: 0.588 |  Va

In [9]:
# Summarise the cross-validation run: banner, mean accuracy over the collected
# fold scores (as a percentage with two decimals), then the raw per-fold values.
print(
    '============== 10 - fold test accuracy ==============',
    'Mean acc {:.2f}%'.format(np.mean(test_acc_lists) * 100),
    test_acc_lists,
    sep='\n',
)

Mean acc 80.19%
[0.7754545374350115, 0.8145454580133612, 0.8272727213122628, 0.7871957648368109, 0.8094197267835791, 0.8002006567042806, 0.8269696973619007, 0.8016148987023727, 0.7983957149765708, 0.7775842036519732]
