In [1]:
import pandas as pd
from gensim.models import Word2Vec
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.nn import functional as F
import numpy as np
import torch.optim as optim
from torch.utils.data import Dataset,DataLoader

In [2]:
column_names = ['type','title','text']
df = pd.read_csv('./all_after_mapping.tsv',sep='\t',names=column_names)


In [3]:
tokenlizeword = np.load('tokenlizeword0225_nopunct.npy',allow_pickle=True)


In [4]:
wmodel = Word2Vec(tokenlizeword, size=300, window=5, min_count=0)
wmodel.save("word2vec.model")

In [5]:
labels = df['type'].values
labels = np.array(labels)

In [6]:
max_size = 512
x_train = []
for k in range(tokenlizeword.shape[0]):
  # every article have max_size * 300 embedding matrix
    embedding_matrix = np.zeros((max_size,300))
    for i in range(len(tokenlizeword[k])):
        if(i>=max_size):
            break
        embedding_matrix[i] = wmodel[tokenlizeword[k][i]]
    x_train.append(embedding_matrix)

  if __name__ == '__main__':


In [7]:
x_train = np.array(x_train)


In [8]:
x_train.shape

(35546, 512, 300)

In [9]:
dev_x = x_train[:5000]
dev_y = labels[:5000]

test_x = x_train[5000:10000]
test_y = labels[5000:10000]

train_x = x_train[10000:]
train_y = labels[10000:]


In [10]:
embedding_dim = 300
n_hidden = 128 # number of hidden units in one cell
num_classes = 7  
BATCH_SIZE = 64

class BiLSTM_Attention(nn.Module):
    def __init__(self):
        super(BiLSTM_Attention, self).__init__()

#         self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, n_hidden, bidirectional=True)
        self.out = nn.Linear(n_hidden * 2, num_classes)

    # lstm_output : [batch_size, n_step, n_hidden * num_directions(=2)], F matrix
    def attention_net(self, lstm_output, final_state):
        hidden = final_state.view(-1, n_hidden * 2, 1)   # hidden : [batch_size, n_hidden * num_directions(=2), 1(=n_layer)]
        attn_weights = torch.bmm(lstm_output, hidden).squeeze(2) # attn_weights : [batch_size, n_step]
        soft_attn_weights = F.softmax(attn_weights, 1)
        # [batch_size, n_hidden * num_directions(=2), n_step] * [batch_size, n_step, 1] = [batch_size, n_hidden * num_directions(=2), 1]
        context = torch.bmm(lstm_output.transpose(1, 2), soft_attn_weights.unsqueeze(2)).squeeze(2)
#         return context, soft_attn_weights.data.numpy() # context : [batch_size, n_hidden * num_directions(=2)]
        return context, soft_attn_weights # context : [batch_size, n_hidden * num_directions(=2)]


    def forward(self, X):
#         input = self.embedding(X) # input : [batch_size, len_seq, embedding_dim]
        input = X
        input = input.permute(1, 0, 2) # input : [len_seq, batch_size, embedding_dim]
#         print(input)
        
        hidden_state = Variable(torch.zeros(1*2, BATCH_SIZE, n_hidden)) # [num_layers(=1) * num_directions(=2), batch_size, n_hidden]
        cell_state = Variable(torch.zeros(1*2, BATCH_SIZE, n_hidden)) # [num_layers(=1) * num_directions(=2), batch_size, n_hidden]
        hidden_state = hidden_state.double()
        cell_state = cell_state.double()
        hidden_state = hidden_state.to(device)
        cell_state = cell_state.to(device)
#         print(cell_state)
#         print(hidden_state)
#         print(hidden_state)
        # final_hidden_state, final_cell_state : [num_layers(=1) * num_directions(=2), batch_size, n_hidden]
        output, (final_hidden_state, final_cell_state) = self.lstm(input, (hidden_state, cell_state))
        output = output.permute(1, 0, 2) # output : [batch_size, len_seq, n_hidden]
        attn_output, attention = self.attention_net(output, final_hidden_state)
#         print('attn_output.shape',attn_output.shape)
#         print('attention.shape',attention.shape)
        return self.out(attn_output), attention # model : [batch_size, num_classes], attention : [batch_size, n_step]

In [11]:
class Lstmdataset(Dataset):
    def __init__(self, x,y):
        self.x = torch.from_numpy(x).double()
        self.y = torch.from_numpy(y).double()
        self.len = x.shape[0]
    def __getitem__(self, index):
        x = self.x[index]
        y = self.y[index]
        return x,y

    def __len__(self):
        return self.len

In [12]:
trainset = Lstmdataset(train_x,train_y)
trainloader = DataLoader(trainset,batch_size=BATCH_SIZE,drop_last=True)


In [13]:
model = BiLSTM_Attention()


device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# device = torch.device('cpu')
print('device:',device)

model = model.double()
model = torch.load('model_lstmattention_epoch10.pkl')
model = model.to(device)

model.train()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-5)

for epoch in range(10):
    running_loss = 0
    for data in trainloader:
        x,y = [t.to(device) for t in data]
        optimizer.zero_grad()
        output, attention = model(x)
        y = y.long()
        loss = criterion(output, y)
#         if (epoch + 1) % 10 == 0:
#         print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print('Epoch:',epoch+1,'loss=',running_loss)
    torch.save(model, 'model_lstmattention_epoch10.pkl')
    

device: cuda:0
Epoch: 1 loss= 745.1874724410842


  "type " + obj.__name__ + ". It won't be checked "


Epoch: 2 loss= 620.1792342531356
Epoch: 3 loss= 480.8146599279366
Epoch: 4 loss= 381.27264161745336
Epoch: 5 loss= 328.90335454120634
Epoch: 6 loss= 297.3497408487988
Epoch: 7 loss= 275.87431695183085
Epoch: 8 loss= 262.22184053166876
Epoch: 9 loss= 248.27353264785492
Epoch: 10 loss= 237.59630483884214


In [14]:
testset = Lstmdataset(test_x,test_y)
testloader = DataLoader(testset,batch_size=BATCH_SIZE,drop_last=True)


In [15]:
def get_predictions(model, dataloader, compute_acc=False):
    predictions = None
    predictions_withoutmax = None
    correct = 0
    total = 0
    with torch.no_grad():
        # 遍巡整個資料集
        for data in dataloader:
            # 將所有 tensors 移到 GPU 上
            if next(model.parameters()).is_cuda:
                x,y = [t.to("cuda:0") for t in data if t is not None]
                
            outputs , attention_output = model(x)
            after_softmax = F.softmax(outputs, dim=1)
            _, pred = torch.max(after_softmax, 1)

            # 用來計算訓練集的分類準確率
            if compute_acc:
                total += y.shape[0]
                correct += (pred == y).sum().item()

            # 將當前 batch 記錄下來
            if predictions is None:
                predictions = pred
            else:
                predictions = torch.cat((predictions, pred))
                
            if predictions_withoutmax is None:
                predictions_withoutmax = after_softmax
            else:
                predictions_withoutmax = torch.cat((predictions_withoutmax,after_softmax))
    
    if compute_acc:
        acc = correct / total
        return predictions , predictions_withoutmax, acc
    return predictions_withoutmax
    

In [16]:
pred , pred_probi ,acc = get_predictions(model,testloader,True)
print(acc)

0.7483974358974359
