In [1]:
import copy
from statistics import median
import pickle
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import torch.utils.data as Data
torch.manual_seed(1)    # reproducible
from model import HierAttnNet 
from tqdm import trange
from sklearn.model_selection import train_test_split
import csv 
from keras.utils import to_categorical


def load_x(path='./yct/train_data3.tsv'):
    """Read the document texts (first column) from a tab-separated file.

    Parameters
    ----------
    path : str, optional
        Location of the TSV file. Defaults to the training file used by
        this notebook, so existing ``load_x()`` calls behave as before.

    Returns
    -------
    list of str
        The first field of every non-blank row, in file order.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    # newline='' lets the csv module do its own line-ending handling, as the
    # csv documentation recommends for files passed to csv.reader.
    with open(path, newline='') as tsvfile:
        reader = csv.reader(tsvfile, delimiter='\t')
        # csv.reader yields [] for a blank line; row[0] would raise
        # IndexError on it, so blank lines are skipped.
        return [row[0] for row in reader if row]

def onehot_y(path='./yct/train_data3.tsv'):
    """Map the category column (second field) of a TSV file to integer ids.

    Despite the name, no one-hot encoding is performed: each distinct
    category string is assigned an integer id in order of first appearance,
    starting at 1 (id 0 is never assigned).

    Parameters
    ----------
    path : str, optional
        Location of the TSV file. Defaults to the training file used by
        this notebook, so existing ``onehot_y()`` calls behave as before.

    Returns
    -------
    (list of int, int)
        The per-row category ids in file order, and the next unused id,
        i.e. ``number_of_distinct_categories + 1``.

    Raises
    ------
    OSError
        If the file cannot be opened.
    IndexError
        If a non-blank row has fewer than two fields.
    """
    mapping = {}
    y_cate_list = []

    # newline='' lets the csv module do its own line-ending handling.
    with open(path, newline='') as tsvfile:
        for row in csv.reader(tsvfile, delimiter='\t'):
            if not row:
                # csv.reader yields [] for a blank line; nothing to label.
                continue
            # setdefault returns the existing id, or registers the next one
            # (len(mapping) + 1 keeps the ids 1-based and contiguous).
            y_cate_list.append(mapping.setdefault(row[1], len(mapping) + 1))

    return y_cate_list, len(mapping) + 1

Using TensorFlow backend.


In [2]:
from numpy import array
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer


# Load the raw document strings (first TSV column) via load_x().
docs = load_x()
# Fit a Keras Tokenizer on the corpus so each word gets an integer index.
t = Tokenizer()
t.fit_on_texts(docs)
# One more than the number of indexed words. NOTE(review): presumably the
# extra slot is for index 0, which the inference cell treats as "no word"
# (padding) -- confirm against the Keras Tokenizer documentation.
vocab_size = len(t.word_index) + 1
# Convert every document into its list of word indices.
encoded_docs = t.texts_to_sequences(docs)
# Fixed sequence length passed to pad_sequences below and again at inference.
max_length = 15
# NOTE(review): pad_sequences is called with its defaults, so which end gets
# padded / truncated depends on the installed Keras -- confirm.
padded_docs = pad_sequences(encoded_docs, maxlen=max_length)
#print(padded_docs)

In [3]:
# settings
epochs = 50                  # number of passes over the training loader
vali_ratio = 0.01            # passed as test_size to train_test_split below
batch_size = 64              # mini-batch size for the DataLoader
optimizer_str = 'Adam'       # 'SGD' or 'Adam'; selected in the model-definition cell
default_dropout_rate = 0.2   # forwarded to HierAttnNet
hidden_dim = 70              # forwarded to HierAttnNet


# Padded token-id matrix produced by the tokenization cell.
X_seq = np.array(padded_docs)
# First element returned by onehot_y(): the per-row integer category ids.
# Note that this call re-reads the TSV file.
ycate = np.array(onehot_y()[0])

# train valid split (fixed random_state so the split is repeatable)
x_train, x_valid, y_train, y_valid = train_test_split(X_seq, ycate, random_state=127, test_size = vali_ratio)


# put data to dataloader (gpu version)
# x_train = torch.from_numpy(x_train).to(torch.long).cuda()
# y_train = torch.from_numpy(y_train).to(torch.long).cuda()
# x_valid = torch.from_numpy(x_valid).to(torch.long).cuda()
# y_valid = torch.from_numpy(y_valid).to(torch.long).cuda()

# put data to dataloader (cpu version)
# Both inputs and targets are converted to int64 (torch.long) tensors.
x_train = torch.from_numpy(x_train).to(torch.long)
y_train = torch.from_numpy(y_train).to(torch.long)
# NOTE(review): x_valid / y_valid are converted here but not used by any
# later cell visible in this notebook.
x_valid = torch.from_numpy(x_valid).to(torch.long)
y_valid = torch.from_numpy(y_valid).to(torch.long)


# Wrap the tensors in a Dataset that torch can consume
torch_dataset = Data.TensorDataset(x_train, y_train)

# Feed the dataset into a DataLoader
loader = Data.DataLoader(
        dataset=torch_dataset,      # torch TensorDataset format
        batch_size=batch_size,      # mini batch size
        shuffle=True,               # reshuffle the training data every epoch
        num_workers=8,             # number of loader worker subprocesses
        pin_memory=True            # NOTE(review): pinned host memory only helps host-to-GPU copies; likely unnecessary on this CPU path -- confirm
        )

In [4]:
# define model
# doc_len is the padded sequence length (second dimension of x_train).
# tagset_size is the second value returned by onehot_y(), i.e. the number of
# distinct categories + 1, because category ids start at 1 (note: this call
# re-reads the TSV file).
model = HierAttnNet(
        doc_len=x_train.shape[1],
        embedding_dim=150,
        hidden_dim=hidden_dim,
        vocab_size=vocab_size,
        tagset_size=onehot_y()[1],
        default_dropout_rate=default_dropout_rate,
        embedding_pretrained=None,
        embedding_freeze=False)

# NLLLoss expects log-probabilities; the inference cell treats the model's
# first output as log softmax.
loss_function = nn.NLLLoss()

# Pick the optimizer named in the settings cell.
if optimizer_str == 'SGD':
    optimizer = optim.SGD(model.parameters(), lr=0.1)
elif optimizer_str == 'Adam':
    optimizer = optim.Adam(model.parameters(), lr=0.0001)
else:
    # Raising a bare string is itself a TypeError in Python 3 (exceptions must
    # derive from BaseException), which would hide the real message.
    raise ValueError('optimizer_str is not set correctly!')

In [5]:
print("Start training...")
# Make sure dropout is active: if this cell is re-run after a cell that called
# model.eval(), the model would otherwise train in evaluation mode.
model.train()
for epoch in range(epochs):   # train for `epochs` epochs
    for step, (batch_x, batch_y) in enumerate(loader):  # one mini-batch per step
        # The model returns (log-probabilities, attention weights); only the
        # first is used for the loss.
        output, at_weight = model(batch_x)
        loss = loss_function(output, batch_y)   # negative log-likelihood loss
        optimizer.zero_grad()            # clear gradients for this training step
        # NOTE(review): retain_graph=True is kept from the original; a fresh
        # graph is built on every forward pass, so it is probably unnecessary
        # -- confirm against the model implementation before removing.
        loss.backward(retain_graph=True) # backpropagation, compute gradients
        optimizer.step()                 # apply gradients
        if step % 100 == 0:
            # Progress marker only; no tensors are involved, so no no_grad
            # context is needed here.
            print("Epoch: %d, Step: %d:" %(epoch, step))
# Pickles the whole model object (not just its state_dict); the target
# directory must already exist.
torch.save(model, "./model_att/best")

Start training...
Epoch: 0, Step: 0:
Epoch: 0, Step: 100:
Epoch: 0, Step: 200:
Epoch: 0, Step: 300:
Epoch: 0, Step: 400:
Epoch: 0, Step: 500:
Epoch: 0, Step: 600:
Epoch: 1, Step: 0:
Epoch: 1, Step: 100:
Epoch: 1, Step: 200:
Epoch: 1, Step: 300:
Epoch: 1, Step: 400:
Epoch: 1, Step: 500:
Epoch: 1, Step: 600:
Epoch: 2, Step: 0:
Epoch: 2, Step: 100:
Epoch: 2, Step: 200:
Epoch: 2, Step: 300:
Epoch: 2, Step: 400:
Epoch: 2, Step: 500:
Epoch: 2, Step: 600:
Epoch: 3, Step: 0:
Epoch: 3, Step: 100:
Epoch: 3, Step: 200:
Epoch: 3, Step: 300:
Epoch: 3, Step: 400:
Epoch: 3, Step: 500:
Epoch: 3, Step: 600:
Epoch: 4, Step: 0:
Epoch: 4, Step: 100:
Epoch: 4, Step: 200:
Epoch: 4, Step: 300:
Epoch: 4, Step: 400:
Epoch: 4, Step: 500:
Epoch: 4, Step: 600:
Epoch: 5, Step: 0:
Epoch: 5, Step: 100:
Epoch: 5, Step: 200:
Epoch: 5, Step: 300:
Epoch: 5, Step: 400:
Epoch: 5, Step: 500:
Epoch: 5, Step: 600:
Epoch: 6, Step: 0:
Epoch: 6, Step: 100:
Epoch: 6, Step: 200:
Epoch: 6, Step: 300:
Epoch: 6, Step: 400:
Epoch: 6,

  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


In [6]:
# import copy
# from statistics import median
# import pickle
# import numpy as np
# import torch
# import torch.nn as nn
# import torch.optim as optim
# import torch.nn.functional as F

# import torch.utils.data as Data
# torch.manual_seed(1)    # reproducible
# from model import HierAttnNet


# test_idx = [18,2049]


# for doc in np.array(docs)[test_idx]:
#     print(doc)
#     print()
    
# X_seq_test = X_seq[test_idx]
# x_test = torch.from_numpy(X_seq_test).to(torch.long)
# GT = ycate[test_idx]


# model = torch.load("./model_att/best")
# model.eval()
# pred_distrib, att_weight = model(x_test) #log softmax 
# pred_y = torch.max(pred_distrib, 1)[1].data.cpu().numpy()
# top3 = torch.topk(att_weight, k=3)[1].cpu().numpy()

# for i in range(len(X_seq_test)):
#     for idx in top3[i]:
#         print('%0.4f' % (float(att_weight[i][idx])), end=' ')
        
#         if X_seq_test[i][idx] != 0:
#             print(t.index_word[X_seq_test[i][idx]])
#         else:
#             print()
#     print("-"*100)


In [7]:
# Headline to classify. An alternative example is kept commented out; the
# original assigned it and immediately overwrote it on the next line.
# input_title = 'China is recording so few new coronavirus infections that South Korea looks like the new center of the epidemic'
input_title = 'Nathaniel Woods executed in Alabama after Supreme Court denies stay'
print(input_title)
print()

# Encode with the tokenizer fitted on the training corpus and pad to the same
# length used for training.
X_seq_test = pad_sequences(t.texts_to_sequences([input_title]), maxlen=max_length)
x_test = torch.from_numpy(X_seq_test).to(torch.long)


# NOTE(security): torch.load unpickles the file and can execute arbitrary
# code; only load checkpoints produced by this notebook / a trusted source.
model = torch.load("./model_att/best")
model.eval()
# Inference only: skip autograd bookkeeping for the forward pass.
with torch.no_grad():
    pred_distrib, att_weight = model(x_test) #log softmax
# Index of the highest-scoring class per input (computed but not printed below).
pred_y = torch.max(pred_distrib, 1)[1].cpu().numpy()
# Positions of the three largest attention weights per input.
# assumes att_weight is (batch, doc_len) -- TODO confirm against the model
top3 = torch.topk(att_weight, k=3)[1].cpu().numpy()

for i in range(len(X_seq_test)):
    for idx in top3[i]:
        print('%0.4f' % (float(att_weight[i][idx])), end=' ')

        # Token id 0 is treated as "no word" (padding): print an empty line
        # instead of looking it up in the tokenizer's index.
        if X_seq_test[i][idx] != 0:
            print(t.index_word[X_seq_test[i][idx]])
        else:
            print()
    print("-"*100)

Nathaniel Woods executed in Alabama after Supreme Court denies stay

0.2057 woods
0.1069 
0.1060 
----------------------------------------------------------------------------------------------------
