In [1]:
#%% import packages
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from torch.optim.lr_scheduler import *
from tqdm import tqdm
from sklearn.metrics import f1_score
import numpy as np

In [2]:
from collections import Counter
import gensim
import time

#%% build Dictionary
def build_vocab(List : list) -> dict: 
    """
    Build a word -> index dictionary from whitespace-tokenised corpus files.

    Every line is expected to look like ``<label> <word> <word> ...``; the
    first token (the label) is skipped. Words receive consecutive indices
    0, 1, 2, ... in order of first appearance across the given files.

    Parameters
    ----------
    List : list
        Paths of the UTF-8 text files to scan (read in the given order).

    Returns
    -------
    dict
        A ``collections.Counter`` (a ``dict`` subclass) mapping word -> index.
        Use ``.get()`` or ``in`` for lookups: indexing a Counter with an
        unknown key yields 0 instead of raising ``KeyError``.
    """
    print("Starting to build Dictionary..."); start_time = time.time()

    Vocab = Counter()
    for file in List:
        with open(file,"r",encoding="utf-8") as f:
            # iterate lazily instead of materialising the whole file with readlines()
            for line in f:
                # e.g. "1\t<word> <word> ..." -> tokens; token 0 is the label
                for voca in line.split()[1:]:
                    if voca not in Vocab:
                        Vocab[voca] = len(Vocab)
    elapsed_time = time.time() - start_time
    # the original message said "Build vector ended!", which belongs to build_vector
    print("Build Dictionary ended! Elapsed time: {:.2f} seconds".format(elapsed_time))
    return Vocab

# The vocabulary is built from the training split only; words that occur only in
# other files fall back to index len(vocab) inside build_dataset (vocab.get default).
vocab =  build_vocab(["../Dataset/train.txt"]) # only train.txt is scanned here

#%%build sentence vector
n_exist = 0  # number of vocabulary words missing from the pre-trained model (set by build_vector)
def build_vector(List : list, Vocab : dict) -> np.ndarray: 
    """
    Build the embedding matrix whose row ``Vocab[word]`` is the pre-trained
    word2vec vector of ``word``.

    Parameters
    ----------
    List : list
        Not read by this function; kept so existing call sites keep working.
    Vocab : dict
        Mapping word -> row index, as produced by ``build_vocab``.

    Returns
    -------
    np.ndarray
        Array of shape ``(len(Vocab) + 1, preModel.vector_size)``. Rows of words
        that are absent from the pre-trained model stay all-zero, and so does
        the extra last row (index ``len(Vocab)``).

    Side effects
    ------------
    Sets the module-level ``n_exist`` to the number of words of ``Vocab`` that
    were not found during *this* call. It used to accumulate across calls, so
    re-running the cell inflated the reported number.
    """
    print("Starting to build vector..."); start_time = time.time()
    
    global n_exist
    # loaded pre-trained model
    preModel = gensim.models.KeyedVectors.load_word2vec_format("../Dataset/wiki_word2vec_50.bin",binary=True)
    # one row per vocabulary word, +1 trailing all-zero row
    vector = np.zeros((len(Vocab)+1, preModel.vector_size))

    missing = 0
    for voca, index in Vocab.items():
        try:
            vector[index] = preModel[voca]
        except KeyError:
            # Word is not in the pre-trained vocabulary: keep its zero row.
            # Only KeyError is caught so unrelated failures (e.g. a shape
            # mismatch) surface instead of being silently counted as misses.
            missing += 1
    n_exist = missing
            
    end_time = time.time(); elapsed_time = end_time - start_time
    print("Building vector ended! Elapsed time: {:.2f} seconds".format(elapsed_time))
    print("There are ",n_exist," words that doesn't exist in Model (include duplication)")
    print("Length of sentence vector is ",len(vector))# 53735~59290
    return vector

# Embedding matrix with one row per index of `vocab` plus one extra trailing row.
# NOTE(review): build_vector never reads its first argument, so the file list is
# effectively ignored here.
s_vectors = build_vector(["../Dataset/train.txt"],vocab)

#%% parse data function to build dataset
def build_dataset(path : str,vocab : dict,max_length=50): # length of single sentence to use from data
    """
    Parse a ``<label> <word> <word> ...`` text file into index/label arrays.

    Parameters
    ----------
    path : str
        UTF-8 text file to read (undecodable bytes are ignored).
    vocab : dict
        Mapping word -> index. Words missing from it are mapped to
        ``len(vocab)``, which is also the padding index.
    max_length : int
        Number of word indices kept per sentence; longer sentences are
        truncated and shorter ones are right-padded with ``len(vocab)``.

    Returns
    -------
    words : np.ndarray
        Integer array of shape ``(n_sentences, max_length)``.
    labels : np.ndarray
        ``float64`` array of shape ``(n_sentences,)`` holding ``int(label)``.

    Notes
    -----
    Rows are collected in Python lists and converted once at the end; the
    previous per-line ``np.vstack`` / ``np.append`` re-copied the whole array
    on every line. Blank lines are skipped (they used to raise ``IndexError``)
    and an empty file now yields a ``(0, max_length)`` array.
    """
    print("Starting to parse data from ",path,"..."); start_time = time.time()
    
    pad_index = len(vocab)  # shared index for padding and out-of-vocabulary words
    rows, targets = [], []
    with open(path,encoding='utf-8',errors='ignore') as f:
        for line in f:
            sentence = line.strip().split() # [label, word, word, ...]
            if not sentence:
                continue  # blank line: no label, nothing to parse
            # keep at most max_length words after the label
            indices = [vocab.get(word, pad_index) for word in sentence[1:max_length + 1]]
            # right-pad short sentences up to max_length
            indices.extend([pad_index] * (max_length - len(indices)))
            rows.append(indices)
            # label, pos -> 1, neg -> 0
            targets.append(int(sentence[0]))

    if rows:
        words = np.array(rows, dtype=np.int_)
    else:
        words = np.empty((0, max_length), dtype=np.int_)
    labels = np.array(targets, dtype=np.float64) # kept as float64, as before
    
    end_time = time.time();elapsed_time = end_time - start_time
    print("Parsing data ended! Elapsed time: {:.2f} seconds-------------------------------".format(elapsed_time))
    return words, labels


Starting to build Dictionary...
Build vector ended! Elapsed time: 0.17 seconds
Starting to build vector...
Building vector ended! Elapsed time: 2.77 seconds
There are  9066  words that doesn't exist in Model (include duplication)
Length of sentence vector is  53338


In [3]:
# Hyper-parameters (values decided in main.py)
max_length = 50       # number of word indices kept per sentence (passed to build_dataset)
batch_size = 100      # mini-batch size passed to the DataLoaders
learning_rate = 1e-3  # learning rate passed to the Adam optimizer

In [4]:
# load data: parse every split into (index matrix, label vector)
train_contents , train_labels = build_dataset("../Dataset/train.txt",vocab, max_length)
val_contents, val_labels = build_dataset("../Dataset/validation.txt",vocab,max_length) 
test_contents, test_labels = build_dataset("../Dataset/test.txt",vocab,max_length)


def _as_loader(contents, labels, shuffle):
    """
    Wrap one parsed split into a TensorDataset and a DataLoader.

    contents are converted to float tensors and labels to long tensors;
    the loader uses the notebook-level batch_size and a single worker.
    Returns the pair (dataset, dataloader).
    """
    features = torch.from_numpy(contents).type(torch.float)
    targets = torch.from_numpy(labels).type(torch.long)
    dataset = TensorDataset(features, targets)
    loader = DataLoader(
        dataset=dataset, batch_size=batch_size, shuffle=shuffle, num_workers=1
    )
    return dataset, loader


# train split: the only one that is reshuffled
train_dataset, train_dataloader = _as_loader(train_contents, train_labels, shuffle=True)

# validation split: fixed order
val_dataset, val_dataloader = _as_loader(val_contents, val_labels, shuffle=False)

# test split: fixed order
test_dataset, test_dataloader = _as_loader(test_contents, test_labels, shuffle=False)

Starting to parse data from  ../Dataset/train.txt ...
Parsing data ended! Elapsed time: 5.00 seconds-------------------------------
Starting to parse data from  ../Dataset/validation.txt ...
Parsing data ended! Elapsed time: 0.41 seconds-------------------------------
Starting to parse data from  ../Dataset/test.txt ...
Parsing data ended! Elapsed time: 0.02 seconds-------------------------------


In [5]:
# models
import torch.nn.functional as F

class model_config():
    """
    Hyper-parameter container read by the models in this notebook.

    All settings are class attributes, evaluated once when this class body
    runs, so the notebook globals `vocab` and `s_vectors` must already exist.

    Background, quoted from the paper https://arxiv.org/pdf/1408.5882.pdf:
        For all datasets we use: rectified linear units,
        filter windows (h) of 3, 4, 5 with 100 feature maps each,
        dropout rate (p) of 0.5, l2 constraint (s) of 3,
        and mini-batch size of 50.
        These values were chosen via a grid search on the SST-2 dev set.
    The values set below are not identical to the quoted ones
    (e.g. kernel_num is 20 here, not 100).
    """
    update_w2v = True           # whether the embedding weights are trainable (sets requires_grad)
    vocab_size = len(vocab)+1   # +1 for the extra padding row (index len(vocab)) of the embedding matrix
    n_classes = 2               # 0 -> neg, 1 -> pos | binary classification
    embedding_dim = 50          # dimension of word embedding; must equal the width of pretrained_embed
    dropout_rate = 0.5          # dropout probability applied before the fully connected layer
    kernel_num = 20             # number of feature maps per kernel size
    kernel_sizes = [3,4,5]      # kernel heights h (window size, in words)
    pretrained_embed = s_vectors# pretrained embedding matrix (numpy array) copied into nn.Embedding
    #------------- RNN ONLY -----------------------------------------------------------------
    hidden_size = 100           # hidden size of rnn
    num_layers = 2              # number of layers of rnn

# Instance handed to the model constructor; it only exposes the class attributes above.
config = model_config()

#%% CNN model
class CNN(nn.Module):
    """
    Convolutional sentence classifier (embedding -> parallel convolutions ->
    max-over-time pooling -> dropout -> linear -> log-softmax).

    One ``nn.Conv2d`` is created per entry of ``config.kernel_sizes`` and
    registered as ``conv1_1``, ``conv1_2``, ... The first three keep the names
    the previously hard-coded layers had, so their ``state_dict`` keys are
    unchanged. Earlier the class always built exactly three convolutions while
    sizing ``fc`` with ``len(kernel_sizes)``, so any list whose length was not
    3 failed.
    """

    def __init__(self, config : "model_config"):
        """
        Parameters
        ----------
        config : model_config
            Object exposing ``update_w2v``, ``vocab_size``, ``n_classes``,
            ``embedding_dim``, ``dropout_rate``, ``kernel_num``,
            ``kernel_sizes`` and ``pretrained_embed`` (a numpy array of shape
            ``(vocab_size, embedding_dim)``).
        """
        super(CNN,self).__init__()
        update_w2v = config.update_w2v
        vocab_size = config.vocab_size
        n_class = config.n_classes
        embedding_dim = config.embedding_dim
        dropout_rate = config.dropout_rate
        kernel_num = config.kernel_num
        kernel_sizes = config.kernel_sizes
        pretrained_embed = config.pretrained_embed

        # embedding layer, initialised from the pre-trained matrix
        self.embedding = nn.Embedding(vocab_size,embedding_dim)
        self.embedding.weight.requires_grad = update_w2v
        self.embedding.weight.data.copy_(torch.from_numpy(pretrained_embed))
        
        # convolution layers
        # input channel size is 1, because we only have one channel (word embedding)
        # kernel_size = (height, width) = (window size h, embedding_dim), so each
        # kernel spans whole word vectors and the output width is 1
        self._conv_names = []
        for position, height in enumerate(kernel_sizes, start=1):
            layer_name = "conv1_{}".format(position)
            self.add_module(layer_name, nn.Conv2d(1, kernel_num, (height, embedding_dim)))
            self._conv_names.append(layer_name)
        
        # pooling layer
        # NOTE: this stores the MaxPool1d class itself and is not used by
        # forward(); pooling is done with F.max_pool1d in conv_and_pool.
        # Kept only so the attribute continues to exist.
        self.pool = nn.MaxPool1d
        
        # dropout
        self.dropout = nn.Dropout(dropout_rate)
        # fully connected layer over the concatenated pooled features
        self.fc = nn.Linear(len(kernel_sizes)*kernel_num,n_class)

    @staticmethod
    def conv_and_pool(x,conv):
        """
        Apply ``conv`` to ``x``, then ReLU and max-pooling over the whole
        remaining sequence axis.

        ``x`` is (batch, 1, seq_len, embedding_dim); the result is
        (batch, out_channels of ``conv``).
        """
        x = conv(x)
        x = F.relu(x.squeeze(3)) # drop the width-1 axis -> (batch, channels, positions)
        x = F.max_pool1d(x, x.size(2)).squeeze(2) # max over all positions -> (batch, channels)
        return x
    
    def forward(self,x):
        """
        Parameters
        ----------
        x : torch.Tensor
            (batch, seq_len) tensor of word indices; any dtype is accepted
            because it is cast to int64 before the embedding lookup.

        Returns
        -------
        torch.Tensor
            (batch, n_classes) log-probabilities (log-softmax over dim 1).
        """
        # (batch_size,1,max_length,embedding_dim): ids -> embeddings, plus a channel axis
        x = self.embedding(x.to(torch.int64)).unsqueeze(1)
        # one (batch_size, kernel_num) feature block per kernel size
        pooled = [self.conv_and_pool(x, getattr(self, layer_name)) for layer_name in self._conv_names]
        # concatenate column-wise, apply dropout, then the fully-connected layer
        x = F.log_softmax(self.fc(self.dropout(torch.cat(pooled,1))),dim=1)
        return x
    
#%% RNN model



#%% MLP model

# Use the first CUDA device when one is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Single notebook-level model instance; the training/evaluation functions use it as a global.
model = CNN(config).to(DEVICE)


In [6]:
# train, validation and test
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
scheduler = StepLR(optimizer, step_size=5)


def train(train_dataloader):
    """
    Run one training epoch over ``train_dataloader``.

    Uses the notebook-level ``model``, ``optimizer``, ``criterion``,
    ``scheduler`` and ``DEVICE``. ``scheduler.step()`` is called once at the
    end, i.e. once per epoch.

    Parameters
    ----------
    train_dataloader : iterable
        Yields ``(sentences, labels)`` tensor batches.

    Returns
    -------
    (train_loss, train_accuracy, train_f1)
        Mean loss per sample, fraction of correct predictions, and binary F1
        score over all samples seen in the epoch.

    Notes
    -----
    The loss is averaged with each batch weighted by its real size. The
    previous ``sum(batch losses) * batch_size / len(dataset)`` was only exact
    when every batch was full and relied on the global ``batch_size``.
    Assumes ``criterion`` returns the mean loss of the batch.
    """
    model.train()
    
    loss_sum = 0.0
    count , correct = 0,0
    full_true = []
    full_pred = []
    for sentences, labels in train_dataloader:
        sentences = sentences.to(DEVICE)
        labels = labels.to(DEVICE)
        
        # forward
        optimizer.zero_grad()
        output = model(sentences)
        loss = criterion(output, labels)
        
        # backward
        loss.backward()
        optimizer.step()
        
        n_batch = len(sentences)
        predictions = output.argmax(1)  # computed once, reused for accuracy and F1
        loss_sum += loss.item() * n_batch  # batch mean -> batch sum
        correct += (predictions == labels).sum().item()
        count += n_batch
        full_true.extend(labels.cpu().numpy().tolist())
        full_pred.extend(predictions.cpu().numpy().tolist())
    # raises ZeroDivisionError when the loader yields no samples, as before
    train_loss = loss_sum / count
    train_accuracy = correct / count
    
    scheduler.step()
    train_f1 = f1_score(np.array(full_true),np.array(full_pred),average = "binary")
    return train_loss, train_accuracy, train_f1
# valid and test
def valid_and_test(dataloader):
    """
    Evaluate the notebook-level ``model`` on ``dataloader`` without updating it.

    Uses the notebook-level ``model``, ``criterion`` and ``DEVICE``.

    Parameters
    ----------
    dataloader : iterable
        Yields ``(sentences, labels)`` tensor batches.

    Returns
    -------
    (val_loss, val_acc, f1)
        Mean loss per sample, fraction of correct predictions, and binary F1
        score over all samples of the loader.

    Notes
    -----
    Runs under ``torch.no_grad()`` so no autograd graph is built during
    evaluation. The loss is averaged with each batch weighted by its real
    size instead of assuming every batch holds the global ``batch_size``
    samples. Assumes ``criterion`` returns the mean loss of the batch.
    """
    model.eval()

    loss_sum = 0.0
    count, correct = 0, 0
    full_true = []
    full_pred = []
    with torch.no_grad():
        for sentences, labels in dataloader:
            sentences, labels = sentences.to(DEVICE), labels.to(DEVICE)
            
            # forward
            output = model(sentences) # invokes forward()
            loss = criterion(output, labels)
            
            n_batch = len(sentences)
            predictions = output.argmax(1)  # computed once, reused for accuracy and F1
            loss_sum += loss.item() * n_batch  # batch mean -> batch sum
            correct += (predictions == labels).sum().item()
            count += n_batch
            full_true.extend(labels.cpu().numpy().tolist())
            full_pred.extend(predictions.cpu().numpy().tolist())
        
    # raises ZeroDivisionError when the loader yields no samples, as before
    val_loss = loss_sum / count
    val_acc = correct / count
    f1 = f1_score(np.array(full_true), np.array(full_pred), average="binary")
    return val_loss, val_acc, f1

In [7]:
EPOCHS = 10
# Each epoch: one training pass, then evaluation on the validation and test loaders.
# NOTE(review): tr_f1, val_f1 and test_f1 are computed but never printed.
# NOTE(review): the test split is evaluated every epoch; do not use these numbers
# for model selection, otherwise the test score is no longer an unbiased estimate.
for each in tqdm(range(1, EPOCHS + 1)):
    tr_loss, tr_acc, tr_f1 = train(train_dataloader)
    val_loss, val_acc, val_f1 = valid_and_test(val_dataloader)
    test_loss, test_acc, test_f1 = valid_and_test(test_dataloader)
    print(
        f"for epoch {each}/{EPOCHS}, train_loss: {tr_loss:.4f}, train_acc: {tr_acc:.4f}, val_loss: {val_loss:.4f}, val_acc: {val_acc:.4f}, test_loss: {test_loss:.4f}, test_acc: {test_acc:.4f} (in average)"
    )

 10%|█         | 1/10 [00:16<02:29, 16.59s/it]

for epoch 1/10, train_loss: 0.5846, train_acc: 0.6845, val_loss: 0.4764, val_acc: 0.7838, test_loss: 0.4988, test_acc: 0.7913 (in average)


 20%|██        | 2/10 [00:33<02:15, 16.96s/it]

for epoch 2/10, train_loss: 0.4213, train_acc: 0.8119, val_loss: 0.4147, val_acc: 0.8151, test_loss: 0.4079, test_acc: 0.8238 (in average)


 30%|███       | 3/10 [00:51<02:00, 17.20s/it]

for epoch 3/10, train_loss: 0.3274, train_acc: 0.8655, val_loss: 0.3922, val_acc: 0.8279, test_loss: 0.3731, test_acc: 0.8374 (in average)


 40%|████      | 4/10 [01:08<01:43, 17.24s/it]

for epoch 4/10, train_loss: 0.2524, train_acc: 0.9038, val_loss: 0.3899, val_acc: 0.8300, test_loss: 0.3706, test_acc: 0.8266 (in average)


 50%|█████     | 5/10 [01:24<01:24, 16.85s/it]

for epoch 5/10, train_loss: 0.1894, train_acc: 0.9319, val_loss: 0.4099, val_acc: 0.8303, test_loss: 0.3892, test_acc: 0.8266 (in average)


 60%|██████    | 6/10 [01:41<01:07, 16.94s/it]

for epoch 6/10, train_loss: 0.1349, train_acc: 0.9561, val_loss: 0.4159, val_acc: 0.8328, test_loss: 0.3794, test_acc: 0.8293 (in average)


 70%|███████   | 7/10 [02:00<00:52, 17.52s/it]

for epoch 7/10, train_loss: 0.1287, train_acc: 0.9575, val_loss: 0.4196, val_acc: 0.8319, test_loss: 0.3861, test_acc: 0.8320 (in average)


 80%|████████  | 8/10 [02:19<00:35, 17.91s/it]

for epoch 8/10, train_loss: 0.1238, train_acc: 0.9593, val_loss: 0.4245, val_acc: 0.8325, test_loss: 0.3862, test_acc: 0.8320 (in average)


 90%|█████████ | 9/10 [02:35<00:17, 17.48s/it]

for epoch 9/10, train_loss: 0.1200, train_acc: 0.9616, val_loss: 0.4305, val_acc: 0.8307, test_loss: 0.3884, test_acc: 0.8374 (in average)


100%|██████████| 10/10 [02:51<00:00, 17.18s/it]

for epoch 10/10, train_loss: 0.1138, train_acc: 0.9638, val_loss: 0.4329, val_acc: 0.8319, test_loss: 0.3937, test_acc: 0.8293 (in average)



