In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import BertTokenizer,BertModel
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
np.random.seed(2020)
torch.manual_seed(2020)
USE_CUDA = torch.cuda.is_available()
if USE_CUDA:
    torch.cuda.manual_seed(2020)

In [3]:
fold = 4

In [4]:
#read the training data and change the label into 0 to 4
dblp = pd.read_table(r"C:\Users\Jin Xu\Desktop\NLP_project\new_data\\" + str(fold) + "_train.txt", sep='\t', header=None)
dblp.columns = ['idx', 'conference', 'title']
dblp_new = dblp.replace('INFOCOM', 0).replace('ISCAS', 1).replace('SIGGRAPH', 2).replace('VLDB', 3).replace('WWW', 4)

In [5]:
#remove some punctuations
def pretreatment(comments):
    result_comments=[]
    punctuation='。，？！：%&~（）、；“”&|,.?!:%&~();""'
    for comment in comments:
        comment= ''.join([c for c in comment if c not in punctuation])
        comment= ''.join(comment.split())   #\xa0
        result_comments.append(comment)
    
    return result_comments

In [6]:
result_comments=pretreatment(list(dblp_new['title'].values))

In [7]:
len(result_comments)

17317

In [8]:
#import the pretrained bert model

In [7]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

In [8]:
result_comments_id = tokenizer(dblp_new['title'].values.tolist(),padding=True,truncation=True,max_length=20,return_tensors='pt')

In [11]:
#split the training data into train-valid data

In [9]:
X=result_comments_id['input_ids']
y=torch.from_numpy(dblp_new['conference'].values).float()

X_train,X_valid, y_train, y_valid =train_test_split(X,y,test_size=0.3,shuffle=True,stratify=y,random_state=2020)

In [10]:
len(X_train),len(X_valid)

(12121, 5196)

In [11]:
dblp_test = pd.read_table(r"C:\Users\Jin Xu\Desktop\NLP_project\new_data\\" + str(fold) + "_valid.txt", sep='\t', header=None)
dblp_test.columns = ['idx', 'conference', 'title']
dblp_test_new = dblp_test.replace('INFOCOM', 0).replace('ISCAS', 1).replace('SIGGRAPH', 2).replace('VLDB', 3).replace('WWW', 4)

In [12]:
result_comments_id_test = tokenizer(dblp_test_new['title'].values.tolist(),padding=True,truncation=True,max_length=20,return_tensors='pt')

In [13]:
# create Tensor datasets
train_data = TensorDataset(X_train, y_train)
valid_data = TensorDataset(X_valid, y_valid)
test_data = TensorDataset(result_comments_id_test['input_ids'], torch.from_numpy(dblp_test_new['conference'].values).float())

# dataloaders
batch_size = 32

# make sure the SHUFFLE your training data
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size,drop_last=True)
valid_loader = DataLoader(valid_data, shuffle=True, batch_size=batch_size,drop_last=True)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size,drop_last=True)


In [14]:
if(USE_CUDA):
    print('Training on GPU.')
else:
    print('No GPU available, training on CPU.')

Training on GPU.


In [15]:
class bert_lstm(nn.Module):
    def __init__(self, hidden_dim,output_size,n_layers,bidirectional=True, drop_prob=0.5):
        super(bert_lstm, self).__init__()
 
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        self.bidirectional = bidirectional
        
        #Bert
        self.bert=BertModel.from_pretrained("bert-base-cased")
        for param in self.bert.parameters():
            param.requires_grad = True
        
        # LSTM layers
        self.lstm = nn.LSTM(768, hidden_dim, n_layers, batch_first=True,bidirectional=bidirectional)
        self.attention_weights_layer = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(inplace=True)
        )
        
        # dropout layer
        self.dropout = nn.Dropout(drop_prob)
        
        # linear and sigmoid layers
        if bidirectional:
            self.fc = nn.Linear(hidden_dim*2, output_size)
        else:
            self.fc = nn.Linear(hidden_dim, output_size)
          
        #self.sig = nn.Sigmoid()
 
    def forward(self, x, hidden):
        batch_size = x.size(0)
        #bert word vector
        x=self.bert(x)[0]
        
        # lstm_out
        #x = x.float()
        lstm_out, (hidden_last,cn_last) = self.lstm(x, hidden)
        #print(lstm_out.shape,'lstm_out')   #[32,20,768]
        #print(hidden_last.shape, 'hidden_last')   #[4, 32, 384]
        #print(cn_last.shape)    #[4, 32, 384]
        
        #split the output into forward and backward
        (forward_out, backward_out) = torch.chunk(lstm_out, 2, dim = 2)
        hidden_last_out = forward_out + backward_out  #[batch, seq_len, hidden_size]
        #out = out.permute(1, 0, 2)  #[batch, seq_len, hidden_size] #16,20,16
        #cope with the bidirectional 
        #if self.bidirectional:
            #forward last layer, last time slot
            #hidden_last_L=hidden_last[-2]
            #print(hidden_last_L.shape)  #[32, 384]
            #backward last layer, last time slot
            #hidden_last_R=hidden_last[-1]
            #print(hidden_last_R.shape)   #[32, 384]
            #contatenate
            #hidden_last_out=torch.cat([hidden_last_L,hidden_last_R],dim=-1)
            #print(hidden_last_out.shape,'hidden_last_out')   #[32, 768]
        #else:
            #hidden_last_out=hidden_last[-1]   #[32, 384]
            
        hidden_last = hidden_last.permute(1, 0, 2)  #[batch, num_layers * num_directions,  hidden_size]
        hidden_last = torch.sum(hidden_last, dim=1) #[batch, 1,  hidden_size]
        hidden_last = hidden_last.squeeze(dim=1)  #[batch, hidden_size]
        
        attention_w = self.attention_weights_layer(hidden_last)  #[batch, hidden_size]
        attention_w = attention_w.unsqueeze(dim=1) #[batch, 1, hidden_size]
        
        #print(attention_w.shape)
        #print(hidden_last_out.transpose(1, 2).shape)
        attention_context = torch.bmm(attention_w, hidden_last_out.transpose(1, 2))  #[batch, 1, seq_len]
        softmax_w = F.softmax(attention_context, dim=-1)  #[batch, 1, seq_len],权重归一化
        
        x = torch.bmm(softmax_w, lstm_out)  #[batch, 1, hidden_size]
        x = x.squeeze(dim=1)  #[batch, hidden_size]
        
        # dropout and fully-connected layer
        out = self.dropout(x)
        #print(out.shape)    #[32,768]
        out = self.fc(out)
        
        return out
    
    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        
        number = 1
        if self.bidirectional:
            number = 2
        
        if (USE_CUDA):
            hidden = (weight.new(self.n_layers*number, batch_size, self.hidden_dim).zero_().float().cuda(),
                      weight.new(self.n_layers*number, batch_size, self.hidden_dim).zero_().float().cuda()
                     )
        else:
            hidden = (weight.new(self.n_layers*number, batch_size, self.hidden_dim).zero_().float(),
                      weight.new(self.n_layers*number, batch_size, self.hidden_dim).zero_().float()
                     )
        
        return hidden

In [16]:
output_size = 5
hidden_dim = 384   #768/2
n_layers = 2
bidirectional = True  #true means bidirectional

net = bert_lstm(hidden_dim, output_size,n_layers, bidirectional)

#print(net)

In [17]:
# loss and optimization functions
lr=2e-4
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=lr)

# training params
epochs = 5
# batch_size=50
print_every = 100
clip=5 # gradient clipping
 
# move model to GPU, if available
if(USE_CUDA):
    net.cuda()

In [18]:
net.train()
# train for some number of epochs
for e in range(epochs):
    # initialize hidden state
    h = net.init_hidden(batch_size)
    counter = 0
 
    # batch loop
    for inputs, labels in train_loader:
        counter += 1
        
        if(USE_CUDA):
            inputs, labels = inputs.cuda(), labels.cuda()
        h = tuple([each.data for each in h])
        net.zero_grad()
        output= net(inputs, h)
        #print(output.shape)
        #labels = labels.to(torch.int64)
        #print(labels.shape)
        loss = criterion(output, labels.long())
        loss.backward()
        optimizer.step()
 
        # loss stats
        if counter % print_every == 0:
            net.eval()
            with torch.no_grad():
                val_h = net.init_hidden(batch_size)
                val_losses = []
                total_acc_val = 0
                for inputs, labels in valid_loader:
                    val_h = tuple([each.data for each in val_h])

                    if(USE_CUDA):
                        inputs, labels = inputs.cuda(), labels.cuda()

                    output = net(inputs, val_h)
                    acc = (output.argmax(dim=1) == labels).sum().item()
                    total_acc_val = total_acc_val + acc
                    val_loss = criterion(output.squeeze(), labels.long())

                    val_losses.append(val_loss.item())
 
            net.train()
            print("Epoch: {}/{}...".format(e+1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}...".format(loss.item()),
                  "Val Loss: {:.6f}".format(np.mean(val_losses)),
                 "Accuracy: {:.6f}...".format(total_acc_val / len(valid_data))
                 )


Epoch: 1/5... Step: 100... Loss: 0.899543... Val Loss: 0.734099 Accuracy: 0.728253...
Epoch: 1/5... Step: 200... Loss: 0.737573... Val Loss: 0.688296 Accuracy: 0.721901...
Epoch: 1/5... Step: 300... Loss: 0.419533... Val Loss: 0.598504 Accuracy: 0.794265...
Epoch: 2/5... Step: 100... Loss: 0.554378... Val Loss: 0.562518 Accuracy: 0.825828...
Epoch: 2/5... Step: 200... Loss: 0.654045... Val Loss: 0.648887 Accuracy: 0.799846...
Epoch: 2/5... Step: 300... Loss: 1.465117... Val Loss: 1.544356 Accuracy: 0.346613...
Epoch: 3/5... Step: 100... Loss: 1.554122... Val Loss: 1.539853 Accuracy: 0.346420...
Epoch: 3/5... Step: 200... Loss: 1.594955... Val Loss: 1.544990 Accuracy: 0.346613...
Epoch: 3/5... Step: 300... Loss: 1.574809... Val Loss: 1.546288 Accuracy: 0.346228...
Epoch: 4/5... Step: 100... Loss: 1.563314... Val Loss: 1.540844 Accuracy: 0.346805...
Epoch: 4/5... Step: 200... Loss: 1.516192... Val Loss: 1.543340 Accuracy: 0.346420...
Epoch: 4/5... Step: 300... Loss: 1.622187... Val Loss:

In [19]:
test_losses = [] # track loss
num_correct = 0
 
# init hidden state
h = net.init_hidden(batch_size)
 
net.eval()
preds_total = []
labels_total = []
# iterate over test data
for inputs, labels in test_loader:
    h = tuple([each.data for each in h])
    if(USE_CUDA):
        inputs, labels = inputs.cuda(), labels.cuda()
    output = net(inputs, h)
    test_loss = criterion(output.squeeze(), labels.long())
    test_losses.append(test_loss.item())
    
    output=torch.nn.Softmax(dim=1)(output)
    pred=torch.max(output, 1)[1]
    preds_total.extend(pred.cpu().numpy().tolist())
    labels_total.extend(labels.cpu().numpy().tolist())
    
    # compare predictions to true label
    correct_tensor = pred.eq(labels.float().view_as(pred))
    correct = np.squeeze(correct_tensor.numpy()) if not USE_CUDA else np.squeeze(correct_tensor.cpu().numpy())
    num_correct += np.sum(correct)

print("Test loss: {:.3f}".format(np.mean(test_losses)))
 
# accuracy over all test data
test_acc = num_correct/len(test_loader.dataset)
print("Test accuracy: {:.3f}".format(test_acc))
print("precision ", precision_score(labels_total, preds_total, average='macro'))
print("recall ", recall_score(labels_total, preds_total, average='macro'))
print("f1 ", f1_score(labels_total, preds_total, average='macro'))


Test loss: 1.539
Test accuracy: 0.347
precision  0.06953703703703704
recall  0.2
f1  0.10319477842665752


  _warn_prf(average, modifier, msg_start, len(result))
