In [1]:
import copy
import torch
import torch.nn as nn
import torch.nn.functional as F
import os
from torch.autograd import Variable
from sklearn.metrics import precision_score, recall_score, f1_score

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class DataProcessor(object):
    def read_text(self,is_train_data):
        #read that data
        #is_train_data==True   reading the training data 
        #is_train_data==False  reading the testing data 
        datas = []
        labels = []
        if(is_train_data):
            #training data folder
            INFOCOM_path = "./data/train/INFOCOM/"
            ISCAS_path = "./data/train/ISCAS/"
            SIGGRAPH_path = "./data/train/SIGGRAPH/"
            VLDB_path = "./data/train/VLDB/"
            WWW_path = "./data/train/WWW/" 
        else:
            #testing data folder
            INFOCOM_path = "./data/test/INFOCOM/"
            ISCAS_path = "./data/test/ISCAS/"
            SIGGRAPH_path = "./data/test/SIGGRAPH/"
            VLDB_path = "./data/test/VLDB/"
            WWW_path = "./data/test/WWW/"
        INFOCOM_files= os.listdir(INFOCOM_path)
        ISCAS_files = os.listdir(ISCAS_path)
        SIGGRAPH_files = os.listdir(SIGGRAPH_path)
        VLDB_files = os.listdir(VLDB_path)
        WWW_files = os.listdir(WWW_path)
        
        
        for file_name in INFOCOM_files: 
            file_position = INFOCOM_path + file_name
            with open(file_position, "r",encoding='utf-8') as f:  
                data = f.read()   
                datas.append(data)
                labels.append([1,0,0,0,0]) #transform the label into vector
        
        for file_name in ISCAS_files:
            file_position = ISCAS_path + file_name 
            with open(file_position, "r",encoding='utf-8') as f:
                data = f.read()
                datas.append(data)
                labels.append([0,1,0,0,0]) #transform the label into vector

        for file_name in SIGGRAPH_files:
            file_position = SIGGRAPH_path + file_name 
            with open(file_position, "r",encoding='utf-8') as f:
                data = f.read()
                datas.append(data)
                labels.append([0,0,1,0,0]) #transform the label into vector

        for file_name in VLDB_files:
            file_position = VLDB_path + file_name 
            with open(file_position, "r",encoding='utf-8') as f:
                data = f.read()
                datas.append(data)
                labels.append([0,0,0,1,0]) #transform the label into vector

        for file_name in WWW_files:
            file_position = WWW_path + file_name 
            with open(file_position, "r",encoding='utf-8') as f:
                data = f.read()
                datas.append(data)
                labels.append([0,0,0,0,1]) #transform the label into vector
        return datas, labels
    
    def word_count(self, datas):
        dic = {}
        for data in datas:
            data_list = data.split()
            for word in data_list:
                word = word.lower()
                if(word in dic):
                    dic[word] += 1
                else:
                    dic[word] = 1
        word_count_sorted = sorted(dic.items(), key=lambda item:item[1], reverse=True)
        return  word_count_sorted
    
    def word_index(self, datas, vocab_size):
        word_count_sorted = self.word_count(datas)
        word2index = {}
        word2index["<unk>"] = 0
        word2index["<pad>"] = 1
        
        vocab_size = min(len(word_count_sorted), vocab_size)
        for i in range(vocab_size):
            word = word_count_sorted[i][0]
            word2index[word] = i + 2
          
        return word2index, vocab_size
    
    def get_datasets(self, vocab_size, embedding_size, max_len):
        train_datas, train_labels = self.read_text(is_train_data=True)
        word2index, vocab_size = self.word_index(train_datas, vocab_size)
        
        test_datas, test_labels = self.read_text(is_train_data = False)
        
        train_features = []
        for data in train_datas:
            feature = []
            data_list = data.split()
            for word in data_list:
                word = word.lower() 
                if word in word2index:
                    feature.append(word2index[word])
                else:
                    feature.append(word2index["<unk>"]) 
                if(len(feature)==max_len): 
                    break

            feature = feature + [word2index["<pad>"]] * (max_len - len(feature))
            train_features.append(feature)
            
        test_features = []
        for data in test_datas:
            feature = []
            data_list = data.split()
            for word in data_list:
                word = word.lower()
                if word in word2index:
                    feature.append(word2index[word])
                else:
                    feature.append(word2index["<unk>"])
                if(len(feature)==max_len):
                    break
            
            feature = feature + [word2index["<pad>"]] * (max_len - len(feature))
            test_features.append(feature)
            
        train_features = torch.LongTensor(train_features)
        train_labels = torch.FloatTensor(train_labels)
        
        test_features = torch.LongTensor(test_features)
        test_labels = torch.FloatTensor(test_labels)
        
        embed = nn.Embedding(vocab_size + 2, embedding_size)
        train_features = embed(train_features)
        test_features = embed(test_features)
        
        train_features = Variable(train_features, requires_grad=False)
        train_datasets = torch.utils.data.TensorDataset(train_features, train_labels)
        
        test_features = Variable(test_features, requires_grad=False)
        test_datasets = torch.utils.data.TensorDataset(test_features, test_labels)
        return train_datasets, test_datasets

In [17]:
torch.manual_seed(123)

vocab_size = 31000   #size of vocabulary
embedding_size = 100   #size of word embedding
num_classes = 5    #5 classification
sentence_max_len = 20  #the maximum of sentence length
hidden_size = 16

num_layers = 1  #one layer of lstm
num_directions = 2  #bidirectional lstm
lr = 1e-3
batch_size = 32   
epochs = 20

In [18]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [19]:
#Bi-LSTM model
class BiLSTMModel(nn.Module):
    def __init__(self, embedding_size,hidden_size, num_layers, num_directions, num_classes):
        super(BiLSTMModel, self).__init__()
        
        self.input_size = embedding_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.num_directions = num_directions
        
        
        self.lstm = nn.LSTM(embedding_size, hidden_size, num_layers = num_layers, bidirectional = (num_directions == 2))
        #self.attention_weights_layer = nn.Sequential(
         #   nn.Linear(hidden_size, hidden_size),
         #   nn.ReLU(inplace=True)
        #)
        self.liner = nn.Linear(hidden_size, num_classes)
        self.act_func = nn.Softmax(dim=1)
    
    def forward(self, x):
        #lstm's shape [seq_len, batch, input_size]
        #x [batch_size, sentence_length, embedding_size]
        x = x.permute(1, 0, 2)         #[sentence_length, batch_size, embedding_size]
        
        batch_size = x.size(1)
        
        h_0 = torch.randn(self.num_layers * self.num_directions, batch_size, self.hidden_size).to(device)
        c_0 = torch.randn(self.num_layers * self.num_directions, batch_size, self.hidden_size).to(device)
        
        #out[seq_len, batch, num_directions * hidden_size]
        #h_n, c_n [num_layers * num_directions, batch, hidden_size]
        out, (h_n, c_n) = self.lstm(x, (h_0, c_0))
        #print(out.shape) #20, 16, 32
        
        #split the output of lstm into forward and backward
        #(forward_out, backward_out) = torch.chunk(out, 2, dim = 2)
        #out = forward_out + backward_out  #[seq_len, batch, hidden_size]
        #out = out.permute(1, 0, 2)  #[batch, seq_len, hidden_size] #16,20,16
        
        h_n = h_n.permute(1, 0, 2)  #[batch, num_layers * num_directions,  hidden_size]
        h_n = torch.sum(h_n, dim=1) #[batch, 1,  hidden_size]
        h_n = h_n.squeeze(dim=1)  #[batch, hidden_size]
        
        #attention_w = self.attention_weights_layer(h_n)  #[batch, hidden_size]
        #attention_w = attention_w.unsqueeze(dim=1) #[batch, 1, hidden_size]
        
        #print(out.transpose(1,2).shape) #16, 16, 20
        #attention_context = torch.bmm(attention_w, out.transpose(1, 2))  #[batch, 1, seq_len]
        #softmax_w = F.softmax(attention_context, dim=-1)  #[batch, 1, seq_len]
        
        #x = torch.bmm(softmax_w, out)  #[batch, 1, hidden_size]
        #x = x.squeeze(dim=1)  #[batch, hidden_size]
        x = self.liner(h_n)
        x = self.act_func(x)
        return x

In [20]:
def test(model, test_loader, loss_func):
    model.eval()
    loss_val = 0.0
    corrects = 0.0
    preds_total = []
    labels_total = []
    for datas, labels in test_loader:
        datas = datas.to(device)
        labels = labels.to(device)
        
        preds = model(datas)
        loss = loss_func(preds, labels)
        
        loss_val += loss.item() * datas.size(0)
        
        preds = torch.argmax(preds, dim=1)
        labels = torch.argmax(labels, dim=1)
        preds_total.extend(preds.cpu().numpy().tolist())
        labels_total.extend(labels.cpu().numpy().tolist())
        corrects += torch.sum(preds == labels).item()
    test_loss = loss_val / len(test_loader.dataset)
    test_acc = corrects / len(test_loader.dataset)
    print("Test Loss: {}, Test Acc: {}".format(test_loss, test_acc))
    print("precision ", precision_score(labels_total, preds_total, average='macro'))
    print("recall ", recall_score(labels_total, preds_total, average='macro'))
    print("f1 ", f1_score(labels_total, preds_total, average='macro'))
    return test_acc

In [21]:
def train(model, train_loader,test_loader, optimizer, loss_func, epochs):
    best_val_acc = 0.0
    best_model_params = copy.deepcopy(model.state_dict())
    for epoch in range(epochs):
        print("epoch: ", epoch)
        model.train()
        loss_val = 0.0
        corrects = 0.0
        #i = 0
        #preds_total = []
        #labels_total = []
        for datas, labels in train_loader:
            #i = i+1
            #print(i/len(train_loader)*100, "%")
            datas = datas.to(device)
            labels = labels.to(device)
            
            preds = model(datas)
            loss = loss_func(preds, labels)
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            loss_val += loss.item() * datas.size(0)
            
            preds = torch.argmax(preds, dim=1)
            labels = torch.argmax(labels, dim=1)
            #print(preds)
            #print(labels)
            #preds_total.extend(preds.cpu().numpy().tolist())
            #labels_total.extend(labels.cpu().numpy().tolist())
            corrects += torch.sum(preds == labels).item()
        train_loss = loss_val / len(train_loader.dataset)
        train_acc = corrects / len(train_loader.dataset)
        if(epoch % 1== 0):
            print("Train Loss: {}, Train Acc: {}".format(train_loss, train_acc))
            #print(precision_score(labels_total, preds_total, average='macro'))
            #print(recall_score(labels_total, preds_total, average='macro'))
            #print(f1_score(labels_total, preds_total, average='macro'))
            test_acc = test(model, test_loader, loss_func)
            if(best_val_acc < test_acc):
                best_val_acc = test_acc
                best_model_params = copy.deepcopy(model.state_dict())
    model.load_state_dict(best_model_params)
    return model

In [22]:
processor = DataProcessor()
train_datasets, test_datasets = processor.get_datasets(vocab_size=vocab_size, embedding_size=embedding_size, max_len=sentence_max_len)
train_loader = torch.utils.data.DataLoader(train_datasets, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_datasets, batch_size=batch_size, shuffle=True)

In [23]:
train_datasets[3][0].shape

torch.Size([20, 100])

In [24]:
model = BiLSTMModel(embedding_size, hidden_size, num_layers, num_directions, num_classes)

In [25]:
model = model.to(device)

In [26]:
model

BiLSTMModel(
  (lstm): LSTM(100, 16, bidirectional=True)
  (liner): Linear(in_features=16, out_features=5, bias=True)
  (act_func): Softmax(dim=1)
)

In [27]:
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [28]:
loss_func = nn.BCELoss()

In [29]:
model = train(model, train_loader, test_loader, optimizer, loss_func, epochs)

epoch:  0
Train Loss: 0.40347099904707134, Train Acc: 0.49309245483528164
Test Loss: 0.3253706007921975, Test Acc: 0.6350429884375927
precision  0.4706953254317243
recall  0.47926499517650034
f1  0.45907739445408646
epoch:  1
Train Loss: 0.2992464621732797, Train Acc: 0.6620154322413714
Test Loss: 0.27704943037513735, Test Acc: 0.6919656092499259
precision  0.5731887731634194
recall  0.5507231240311464
f1  0.5267687136783612
epoch:  2
Train Loss: 0.25287434533641184, Train Acc: 0.7256849789770364
Test Loss: 0.27498105935268646, Test Acc: 0.6890008894159502
precision  0.5500026719142835
recall  0.5780226897014432
f1  0.5361988656216854
epoch:  3
Train Loss: 0.22592063300965295, Train Acc: 0.7590444947558102
Test Loss: 0.28274089823375637, Test Acc: 0.68781500148236
precision  0.5492891491493117
recall  0.614125381010189
f1  0.558657327917528
epoch:  4
Train Loss: 0.2078586101672436, Train Acc: 0.7808991359793005
Test Loss: 0.2651351072775515, Test Acc: 0.7103468722205751
precision  0.57

In [30]:
accuracy = test(model, test_loader, loss_func)

Test Loss: 0.2976014317142267, Test Acc: 0.724873999407056
precision  0.581392194613668
recall  0.625047549188562
f1  0.58948431846565


In [17]:
accuracy

0.7147939519715387