In [1]:
import copy
import torch
import torch.nn as nn
import torch.nn.functional as F
import os
from torch.autograd import Variable
from sklearn.metrics import precision_score, recall_score, f1_score

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
fold = 2

In [3]:
class DataProcessor(object):
    def read_text(self,is_train_data):
        #read that data
        #is_train_data==True   reading the training data 
        #is_train_data==False  reading the testing data 
        datas = []
        labels = []
        if(is_train_data):
            #training data folder
            INFOCOM_path = "C://Users//Jin Xu//Desktop//NLP_project//new_data/"+str(fold)+"/train//INFOCOM/"
            ISCAS_path = "C://Users//Jin Xu//Desktop//NLP_project//new_data/"+str(fold)+"/train//ISCAS/"
            SIGGRAPH_path = "C://Users//Jin Xu//Desktop//NLP_project//new_data/"+str(fold)+"/train//SIGGRAPH/"
            VLDB_path = "C://Users//Jin Xu//Desktop//NLP_project//new_data/"+str(fold)+"/train//VLDB/"
            WWW_path = "C://Users//Jin Xu//Desktop//NLP_project//new_data/"+str(fold)+"/train//WWW/" 
        else:
            #testing data folder
            INFOCOM_path = "C://Users//Jin Xu//Desktop//NLP_project//new_data/"+str(fold)+"/valid/INFOCOM/"
            ISCAS_path = "C://Users//Jin Xu//Desktop//NLP_project//new_data/"+str(fold)+"/valid/ISCAS/"
            SIGGRAPH_path = "C://Users//Jin Xu//Desktop//NLP_project//new_data/"+str(fold)+"/valid/SIGGRAPH/"
            VLDB_path = "C://Users//Jin Xu//Desktop//NLP_project//new_data/"+str(fold)+"/valid/VLDB/"
            WWW_path = "C://Users//Jin Xu//Desktop//NLP_project//new_data/"+str(fold)+"/valid/WWW/"
        INFOCOM_files= os.listdir(INFOCOM_path) 
        ISCAS_files = os.listdir(ISCAS_path)
        SIGGRAPH_files = os.listdir(SIGGRAPH_path)
        VLDB_files = os.listdir(VLDB_path)
        WWW_files = os.listdir(WWW_path)
        
        
        for file_name in INFOCOM_files: 
            file_position = INFOCOM_path + file_name
            with open(file_position, "r",encoding='utf-8') as f:
                data = f.read() 
                datas.append(data)
                labels.append([1,0,0,0,0])  #transform the label into vector
        
        for file_name in ISCAS_files:
            file_position = ISCAS_path + file_name 
            with open(file_position, "r",encoding='utf-8') as f:
                data = f.read()
                datas.append(data)
                labels.append([0,1,0,0,0])  #transform the label into vector

        for file_name in SIGGRAPH_files:
            file_position = SIGGRAPH_path + file_name 
            with open(file_position, "r",encoding='utf-8') as f:
                data = f.read()
                datas.append(data)
                labels.append([0,0,1,0,0])  #transform the label into vector

        for file_name in VLDB_files:
            file_position = VLDB_path + file_name 
            with open(file_position, "r",encoding='utf-8') as f:
                data = f.read()
                datas.append(data)
                labels.append([0,0,0,1,0])  #transform the label into vector

        for file_name in WWW_files:
            file_position = WWW_path + file_name 
            with open(file_position, "r",encoding='utf-8') as f:
                data = f.read()
                datas.append(data)
                labels.append([0,0,0,0,1])  #transform the label into vector
        return datas, labels
    
    def word_count(self, datas):
        
        dic = {}
        for data in datas:
            data_list = data.split()
            for word in data_list:
                word = word.lower()
                if(word in dic):
                    dic[word] += 1
                else:
                    dic[word] = 1
        word_count_sorted = sorted(dic.items(), key=lambda item:item[1], reverse=True)
        return  word_count_sorted
    
    def word_index(self, datas, vocab_size):
       
        word_count_sorted = self.word_count(datas)
        word2index = {}
        
        word2index["<unk>"] = 0
        
        word2index["<pad>"] = 1
        
        
        vocab_size = min(len(word_count_sorted), vocab_size)
        for i in range(vocab_size):
            word = word_count_sorted[i][0]
            word2index[word] = i + 2
          
        return word2index, vocab_size
    
    def get_datasets(self, vocab_size, embedding_size, max_len):
        
        train_datas, train_labels = self.read_text(is_train_data=True)
        word2index, vocab_size = self.word_index(train_datas, vocab_size)
        
        test_datas, test_labels = self.read_text(is_train_data = False)
        
        train_features = []
        for data in train_datas:
            feature = []
            data_list = data.split()
            for word in data_list:
                word = word.lower()
                if word in word2index:
                    feature.append(word2index[word])
                else:
                    feature.append(word2index["<unk>"])
                if(len(feature)==max_len):
                    break
            
            feature = feature + [word2index["<pad>"]] * (max_len - len(feature))
            train_features.append(feature)
            
        test_features = []
        for data in test_datas:
            feature = []
            data_list = data.split()
            for word in data_list:
                word = word.lower() 
                if word in word2index:
                    feature.append(word2index[word])
                else:
                    feature.append(word2index["<unk>"]) 
                if(len(feature)==max_len):
                    break
           
            feature = feature + [word2index["<pad>"]] * (max_len - len(feature))
            test_features.append(feature)
            
        train_features = torch.LongTensor(train_features)
        train_labels = torch.FloatTensor(train_labels)
        
        test_features = torch.LongTensor(test_features)
        test_labels = torch.FloatTensor(test_labels)
        
        embed = nn.Embedding(vocab_size + 2, embedding_size)
        train_features = embed(train_features)
        test_features = embed(test_features)
        
        train_features = Variable(train_features, requires_grad=False)
        train_datasets = torch.utils.data.TensorDataset(train_features, train_labels)
        
        test_features = Variable(test_features, requires_grad=False)
        test_datasets = torch.utils.data.TensorDataset(test_features, test_labels)
        return train_datasets, test_datasets

In [30]:
torch.manual_seed(123) 

vocab_size = 31000   #size of vocabulary
embedding_size = 100  #size of word embedding
num_classes = 5    #5 classification
sentence_max_len = 20  #the maximum of sentence length
hidden_size = 16

num_layers = 1 #one layer of lstm
num_directions = 2  #bidirectional lstm
lr = 1e-5
batch_size = 64   
epochs = 20

In [31]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [32]:
#Bi-LSTM model
class BiLSTMModel(nn.Module):
    def __init__(self, embedding_size,hidden_size, num_layers, num_directions, num_classes):
        super(BiLSTMModel, self).__init__()
        
        self.input_size = embedding_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.num_directions = num_directions
        
        
        self.lstm = nn.LSTM(embedding_size, hidden_size, num_layers = num_layers, bidirectional = (num_directions == 2))
        self.attention_weights_layer = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(inplace=True)
        )
        self.liner = nn.Linear(hidden_size, num_classes)
        self.act_func = nn.Softmax(dim=1)
    
    def forward(self, x):
        #lstm's shape [seq_len, batch, input_size]
        #x [batch_size, sentence_length, embedding_size]
        x = x.permute(1, 0, 2)         #[sentence_length, batch_size, embedding_size]
        
        batch_size = x.size(1)
        
        h_0 = torch.randn(self.num_layers * self.num_directions, batch_size, self.hidden_size).to(device)
        c_0 = torch.randn(self.num_layers * self.num_directions, batch_size, self.hidden_size).to(device)
        
        #out[seq_len, batch, num_directions * hidden_size]
        #h_n, c_n [num_layers * num_directions, batch, hidden_size]
        out, (h_n, c_n) = self.lstm(x, (h_0, c_0))
        #print(out.shape) #20, 16, 32
        
        #split the output of lstm into forward and backward
        (forward_out, backward_out) = torch.chunk(out, 2, dim = 2)
        out = forward_out + backward_out  #[seq_len, batch, hidden_size]
        out = out.permute(1, 0, 2)  #[batch, seq_len, hidden_size] #16,20,16
        
        h_n = h_n.permute(1, 0, 2)  #[batch, num_layers * num_directions,  hidden_size]
        h_n = torch.sum(h_n, dim=1) #[batch, 1,  hidden_size]
        h_n = h_n.squeeze(dim=1)  #[batch, hidden_size]
        
        attention_w = self.attention_weights_layer(h_n)  #[batch, hidden_size]
        attention_w = attention_w.unsqueeze(dim=1) #[batch, 1, hidden_size]
        
        #print(out.transpose(1,2).shape) #16, 16, 20
        attention_context = torch.bmm(attention_w, out.transpose(1, 2))  #[batch, 1, seq_len]
        softmax_w = F.softmax(attention_context, dim=-1)  #[batch, 1, seq_len]
        
        x = torch.bmm(softmax_w, out)  #[batch, 1, hidden_size]
        x = x.squeeze(dim=1)  #[batch, hidden_size]
        x = self.liner(x)
        x = self.act_func(x)
        return x

In [33]:
def test(model, test_loader, loss_func):
    model.eval()
    loss_val = 0.0
    corrects = 0.0
    preds_total = []
    labels_total = []
    for datas, labels in test_loader:
        datas = datas.to(device)
        labels = labels.to(device)
        
        preds = model(datas)
        loss = loss_func(preds, labels)
        
        loss_val += loss.item() * datas.size(0)
        
        preds = torch.argmax(preds, dim=1)
        labels = torch.argmax(labels, dim=1)
        preds_total.extend(preds.cpu().numpy().tolist())
        labels_total.extend(labels.cpu().numpy().tolist())
        corrects += torch.sum(preds == labels).item()
    test_loss = loss_val / len(test_loader.dataset)
    test_acc = corrects / len(test_loader.dataset)
    print("Test Loss: {}, Test Acc: {}".format(test_loss, test_acc))
    print("precision ", precision_score(labels_total, preds_total, average='macro'))
    print("recall ", recall_score(labels_total, preds_total, average='macro'))
    print("f1 ", f1_score(labels_total, preds_total, average='macro'))
    return test_acc

In [34]:
def train(model, train_loader,test_loader, optimizer, loss_func, epochs):
    best_val_acc = 0.0
    best_model_params = copy.deepcopy(model.state_dict())
    for epoch in range(epochs):
        print("epoch: ", epoch)
        model.train()
        loss_val = 0.0
        corrects = 0.0
        for datas, labels in train_loader:
            datas = datas.to(device)
            labels = labels.to(device)
            
            preds = model(datas)
            loss = loss_func(preds, labels)
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            loss_val += loss.item() * datas.size(0)
            
            preds = torch.argmax(preds, dim=1)
            labels = torch.argmax(labels, dim=1)
            corrects += torch.sum(preds == labels).item()
        train_loss = loss_val / len(train_loader.dataset)
        train_acc = corrects / len(train_loader.dataset)
        if(epoch % 1 == 0):
            print("Train Loss: {}, Train Acc: {}".format(train_loss, train_acc))
            test_acc = test(model, test_loader, loss_func)
            if(best_val_acc < test_acc):
                best_val_acc = test_acc
                best_model_params = copy.deepcopy(model.state_dict())
    model.load_state_dict(best_model_params)
    return model

In [35]:
processor = DataProcessor()
train_datasets, test_datasets = processor.get_datasets(vocab_size=vocab_size, embedding_size=embedding_size, max_len=sentence_max_len)
train_loader = torch.utils.data.DataLoader(train_datasets, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_datasets, batch_size=batch_size, shuffle=True)

In [36]:
model = BiLSTMModel(embedding_size, hidden_size, num_layers, num_directions, num_classes)

In [37]:
model = model.to(device)

In [38]:
model

BiLSTMModel(
  (lstm): LSTM(100, 16, bidirectional=True)
  (attention_weights_layer): Sequential(
    (0): Linear(in_features=16, out_features=16, bias=True)
    (1): ReLU(inplace=True)
  )
  (liner): Linear(in_features=16, out_features=5, bias=True)
  (act_func): Softmax(dim=1)
)

In [39]:
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [40]:
loss_func = nn.BCELoss()

In [41]:
model = train(model, train_loader, test_loader, optimizer, loss_func, epochs)

epoch:  0
Train Loss: 0.501254797301915, Train Acc: 0.2097938441993417
Test Loss: 0.5002152960552863, Test Acc: 0.23555247341655108
precision  0.14716214471684236
recall  0.18194634619066746
f1  0.14118403149118458
epoch:  1


  _warn_prf(average, modifier, msg_start, len(result))


Train Loss: 0.49808975133119643, Train Acc: 0.2835364093087717
Test Loss: 0.4969376289783889, Test Acc: 0.2979657882570504
precision  0.14719943632776894
recall  0.18857190827677311
f1  0.13466764766023204
epoch:  2


  _warn_prf(average, modifier, msg_start, len(result))


Train Loss: 0.4949253008286741, Train Acc: 0.3263844776808916
Test Loss: 0.4940398166811251, Test Acc: 0.3261673601479427
precision  0.12483707785784937
recall  0.19265748305774766
f1  0.116143329749622
epoch:  3


  _warn_prf(average, modifier, msg_start, len(result))


Train Loss: 0.4920111164014375, Train Acc: 0.33839579603857484
Test Loss: 0.4911234830189677, Test Acc: 0.33587609801202034
precision  0.117281194754479
recall  0.19494736157308726
f1  0.10669934111145774
epoch:  4


  _warn_prf(average, modifier, msg_start, len(result))


Train Loss: 0.4895117497853059, Train Acc: 0.3435930010971877
Test Loss: 0.48876724692885654, Test Acc: 0.3407304669440592
precision  0.0995823917563048
recall  0.19717222275061824
f1  0.10588197655408238
epoch:  5


  _warn_prf(average, modifier, msg_start, len(result))


Train Loss: 0.4876020634849662, Train Acc: 0.3444014552174164
Test Loss: 0.4871238868544072, Test Acc: 0.34257975034674065
precision  0.09749350043330444
recall  0.1977871766216473
f1  0.10422148272250378
epoch:  6


  _warn_prf(average, modifier, msg_start, len(result))


Train Loss: 0.48617681998430584, Train Acc: 0.3449211757232777
Test Loss: 0.4859747832371469, Test Acc: 0.34350439204808136
precision  0.10027819855511537
recall  0.19831979979075517
f1  0.10441652231885094
epoch:  7


  _warn_prf(average, modifier, msg_start, len(result))


Train Loss: 0.4852102849581517, Train Acc: 0.34561413639775945
Test Loss: 0.48504043500345173, Test Acc: 0.343042071197411
precision  0.09045676998368679
recall  0.19787337121932663
f1  0.10356005120375913
epoch:  8


  _warn_prf(average, modifier, msg_start, len(result))


Train Loss: 0.4844215987990771, Train Acc: 0.3454986429520125
Test Loss: 0.4844067501652533, Test Acc: 0.34396671289875175
precision  0.08581510780207849
recall  0.19831593589499716
f1  0.10335232383808095
epoch:  9


  _warn_prf(average, modifier, msg_start, len(result))


Train Loss: 0.48392648509862607, Train Acc: 0.34572962984350636
Test Loss: 0.4840597434557537, Test Acc: 0.3451225150254276
precision  0.09784153549843926
recall  0.1990717733498193
f1  0.10399641656526878
epoch:  10


  _warn_prf(average, modifier, msg_start, len(result))


Train Loss: 0.48348087663359185, Train Acc: 0.3458451232892533
Test Loss: 0.48362523799575263, Test Acc: 0.34489135460009246
precision  0.09777895820191893
recall  0.19893861755754233
f1  0.10390982090609288
epoch:  11


  _warn_prf(average, modifier, msg_start, len(result))


Train Loss: 0.48318602793271914, Train Acc: 0.3458451232892533
Test Loss: 0.48339619973551273, Test Acc: 0.345584835876098
precision  0.09597957288765088
recall  0.19924802644093592
f1  0.10366645191397547
epoch:  12


  _warn_prf(average, modifier, msg_start, len(result))


Train Loss: 0.4827851682820496, Train Acc: 0.34590287001212683
Test Loss: 0.4830550328344865, Test Acc: 0.34581599630143317
precision  0.1068129062209842
recall  0.19947124072665018
f1  0.10410408551499228
epoch:  13


  _warn_prf(average, modifier, msg_start, len(result))


Train Loss: 0.48254023582085714, Train Acc: 0.3460183634578738
Test Loss: 0.4828273056973146, Test Acc: 0.345584835876098
precision  0.09785217103082532
recall  0.19924802644093592
f1  0.1036320458619151
epoch:  14


  _warn_prf(average, modifier, msg_start, len(result))


Train Loss: 0.48215871758657053, Train Acc: 0.3458451232892533
Test Loss: 0.4823959840291967, Test Acc: 0.34581599630143317
precision  0.10463921750771836
recall  0.19947124072665018
f1  0.10413803976235521
epoch:  15


  _warn_prf(average, modifier, msg_start, len(result))


Train Loss: 0.48179413189360415, Train Acc: 0.34607611018074724
Test Loss: 0.48203637901137286, Test Acc: 0.345584835876098
precision  0.08929499072356215
recall  0.19915796794749857
f1  0.1032278788127311
epoch:  16


  _warn_prf(average, modifier, msg_start, len(result))


Train Loss: 0.48159235461160094, Train Acc: 0.3460183634578738
Test Loss: 0.4818081776549074, Test Acc: 0.34535367545076284
precision  0.10000356951633053
recall  0.19911487064865893
f1  0.10356418986702524
epoch:  17


  _warn_prf(average, modifier, msg_start, len(result))


Train Loss: 0.48122326389584136, Train Acc: 0.3460183634578738
Test Loss: 0.48151212763962675, Test Acc: 0.3451225150254276
precision  0.08588494548828578
recall  0.19889165636294465
f1  0.10310696593007773
epoch:  18


  _warn_prf(average, modifier, msg_start, len(result))


Train Loss: 0.48082607725706483, Train Acc: 0.3460183634578738
Test Loss: 0.48112571400676785, Test Acc: 0.34581599630143317
precision  0.08932529561789937
recall  0.19929112373977553
f1  0.10327899035568741
epoch:  19


  _warn_prf(average, modifier, msg_start, len(result))


Train Loss: 0.4804049209358304, Train Acc: 0.3460183634578738
Test Loss: 0.48076425091127967, Test Acc: 0.34581599630143317
precision  0.10266047950502706
recall  0.19938118223321286
f1  0.10370280537608653


  _warn_prf(average, modifier, msg_start, len(result))


In [42]:
accuracy = test(model, test_loader, loss_func)

Test Loss: 0.4830681062025766, Test Acc: 0.34535367545076284
precision  0.09428256326909681
recall  0.19911487064865893
f1  0.10361432501895125


  _warn_prf(average, modifier, msg_start, len(result))


In [35]:
accuracy

0.6999537679149329