In [1]:
import gensim

model = gensim.models.KeyedVectors.load_word2vec_format('../chapter_7/GoogleNews-vectors-negative300.bin', binary=True)

unable to import 'smart_open.gcs', disabling that module


In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import optuna

class RNN(nn.Module):
    def __init__(self, trial, input_size, hidden_size, output_size):
        super(RNN, self).__init__()
        
        num_layer = trial.suggest_int('num_layer', 1, 3)
        
        self.hidden_size = hidden_size
        layers = []
        num_input = input_size + hidden_size
        power_two = 1
        
        while power_two < hidden_size:
            power_two *= 2
        num_output_last = power_two
        
        for i in range(num_layer):
            power_two = 1
            
            while power_two < num_input:
                power_two *= 2
            if i == num_layer-1:
                num_output = num_output_last
            else:
                num_output = power_two / 2
            layers.append(nn.Linear(int(num_input), int(num_output)))
            num_input = num_output
            
        # 順方向のRNN
        self.layers = nn.ModuleList(layers)
        self.fc2h = nn.Linear(num_output_last, hidden_size)
        
        # 逆方向のRNN
        self.layers_b = nn.ModuleList(layers)
        self.fc_b2h_b = nn.Linear(num_output_last, hidden_size)
        
        # output
        self.h2o = nn.Linear(hidden_size + hidden_size, output_size)
        
    def forward(self, input, input_b, hidden, hidden_b):
        # 順方向のRNN
        combined = torch.cat((input, hidden), 1)
        for layer in self.layers:
            combined = F.relu(layer(combined))
        hidden = F.relu(self.fc2h(combined))
        
        # 逆方向のRNN
        combined_b = torch.cat((input_b, hidden_b), 1)
        for layer in self.layers_b:
            combined_b = F.relu(layer(combined_b))
        hidden_b = F.relu(self.fc_b2h_b(combined_b))
        
        # output
        combined_o = torch.cat((hidden, hidden_b), 1)
        output = self.h2o(combined_o)
        return output, hidden, hidden_b
        
    def initHidden(self, batch_size):
        return torch.zeros(batch_size, self.hidden_size)

In [3]:
def make_id_dic(fname):
    id_dic = {}
    
    with open(fname) as file:
        for i, line in enumerate(file, 1):
            line = line.rstrip('\n')
            id_dic[line] = i
            
    return id_dic

In [4]:
def words2id(sentence, id_dic):
    import re
    import snowballstemmer
    
    words_id = []
    
    # 文字種の統一
    sentence = sentence.lower()
    
    # 数字の置き換え
    sentence = re.sub(r'[0-9]+', '0', sentence)
    
    # '-'を' 'に変換
    sentence = sentence.replace('-', ' ')
    
    words = sentence.split()
    
    # ステミング処理
    stemmer = snowballstemmer.stemmer('english')
    words2 = [stemmer.stemWord(word) for word in words]
    words = words2
    
    for word in words:
        if word in id_dic.keys():
            words_id.append(id_dic[word])
        else:
            words_id.append(0)
            
    return words_id

In [5]:
def id2vec(words_id, id_dic):
    words_vec = []
    
    for id in words_id:
        word_list = [k for k, v in id_dic.items() if v == id]
        if len(word_list) > 0:
            try:
                words_vec.append(torch.from_numpy(model[word_list[0]]).view(1,-1))
            except KeyError:
                words_vec.append(torch.zeros(1, 300))
        else:
            words_vec.append(torch.zeros(1, 300))
    
    return torch.cat(words_vec)

In [6]:
def make_dataset(fname, id_dic):
    with open(fname) as file:
        lines = file.readlines()
        lines_vec = []
        labels = np.zeros([len(lines), 1])
        
        for i, line in enumerate(lines):
            
            line = line.rstrip('\n')
            category = line.strip('\t')[0]
            title = line.split('\t')[1]
            
            # lines_vecの処理
            words_id = words2id(title, id_dic)
            words_vec = id2vec(words_id, id_dic)
            lines_vec.append(words_vec)
            
            # labelsの処理
            if category == 'b':
                labels[i] = 0
            elif category == 't':
                labels[i] = 1
            elif category == 'e':
                labels[i] = 2
            elif category == 'm':
                labels[i] = 3
    labels = torch.from_numpy(labels).long()
                
    return lines_vec, labels

In [7]:
def train(rnn, criterion, optimizer, batch_size, words_vec_batch, words_vec_batch_b, label_batch):
    hidden, hidden_b = rnn.initHidden(batch_size), rnn.initHidden(batch_size)
    rnn.zero_grad()
    optimizer.zero_grad()
    hidden, hidden_b = hidden.detach(), hidden_b.detach()
    
    for i in range(len(words_vec_batch)):
        output, hidden, hidden_b = rnn(words_vec_batch[i], words_vec_batch_b[i], hidden, hidden_b)
    loss = criterion(output, label_batch)
    loss.backward(retain_graph=True)
    optimizer.step()

In [8]:
def get_loss_and_accyracy(rnn, criterion, lines_vec, labels):
    running_loss = 0
    correct_count = 0
    
    for words_vec, label in zip(lines_vec, labels):
        words_vec_batch, words_vec_batch_b, label_batch = batch_process([words_vec], [label])
        hidden, hidden_b = rnn.initHidden(1), rnn.initHidden(1)
        
        for i in range(len(words_vec_batch)):
            output, hidden, hidden_b = rnn(words_vec_batch[i], words_vec_batch_b[i], hidden, hidden_b)
        loss = criterion(output, label)
        running_loss += loss
        pre_label = torch.max(output, 1)[1]
        if pre_label == label:
            correct_count += 1
            
    return running_loss / len(labels), correct_count / len(labels)

In [9]:
def batch_process(words_vec_list, label_list):
    # words_vec_batchの処理
    len_max = 0
    for words_vec in words_vec_list:
        if words_vec.size()[0] > len_max:
            len_max = words_vec.size()[0]
    words_vec_batch = torch.zeros(len_max, len(words_vec_list), 300)
    for i in range(len(words_vec_list)):
        for j in range(len(words_vec_list[i])):
            words_vec_batch[len_max - len(words_vec_list[i]) + j, i, :] = words_vec_list[i][j]
            
    # words_vec_batch_bの処理
    words_vec_batch_b = torch.zeros(len_max, len(words_vec_list), 300)
    for i in range(len(words_vec_list)):
        words_vec_batch_b[:, i, :] = words_vec_batch[:, len(words_vec_list) - 1 - i, :]
    
    # label_batchの処理
    label_batch = torch.zeros(len(label_list)).long()
    for i, label in enumerate(label_list):
        label_batch[i] = label
        
    return words_vec_batch, words_vec_batch_b, label_batch

In [10]:
def make_optimizer(trial, rnn):
    lr = trial.suggest_loguniform('learning_rate', 1e-4, 1e-0)
    wd = trial.suggest_loguniform('weight_decay', 1e-16, 1e-12)
    
    optimizer = optim.SGD(rnn.parameters(), lr=lr, weight_decay = wd)
    return optimizer

In [11]:
def train_all(trial, rnn, criterion, optimizer, lines_vec, labels, lines_vec_test, labels_test):
    batch_size = trial.suggest_int('batch_size', 16, 256)
    
    for epoch in range(10):
        words_vec_list = []
        label_list = []
        
        for i, dataset in enumerate(zip(lines_vec, labels), 1):
            words_vec_list.append(dataset[0])
            label_list.append(dataset[1])
            
            if i % batch_size == 0 or i == len(lines_vec):
                words_vec_batch, words_vec_batch_b, label_batch = batch_process(words_vec_list, label_list)
                train(rnn, criterion, optimizer, len(words_vec_list), words_vec_batch, words_vec_batch_b, label_batch)
                words_vec_list = []
                label_list = []

In [12]:
def objective(trial):
    rnn = RNN(trial, 300, 50, 4) # RNN(d_w, d_h, L)
    criterion = nn.CrossEntropyLoss()
    optimizer = make_optimizer(trial, rnn)
    
    id_dic = make_id_dic('../chapter_6/train.feature.txt')
    lines_vec, labels = make_dataset('../chapter_6/train.txt', id_dic)
    lines_vec_test, labels_test = make_dataset('../chapter_6/test.txt', id_dic)
    
    train_all(trial, rnn, criterion, optimizer, lines_vec, labels, lines_vec_test, labels_test)
    
    loss_test, accuracy_test = get_loss_and_accyracy(rnn, criterion, lines_vec_test, labels_test)
    
    return 1 - accuracy_test

In [13]:
study = optuna.create_study()
study.optimize(objective, n_trials=100)

[I 2020-04-30 19:24:27,356] Finished trial#0 with value: 0.5809595202398801 with parameters: {'num_layer': 2, 'learning_rate': 0.9807899752017868, 'weight_decay': 1.9756967914508553e-16, 'batch_size': 152}. Best is trial#0 with value: 0.5809595202398801.
[I 2020-04-30 19:25:51,222] Finished trial#1 with value: 0.5809595202398801 with parameters: {'num_layer': 2, 'learning_rate': 0.028348561430050993, 'weight_decay': 8.736424780316326e-14, 'batch_size': 183}. Best is trial#0 with value: 0.5809595202398801.
[I 2020-04-30 19:27:17,685] Finished trial#2 with value: 0.2946026986506747 with parameters: {'num_layer': 2, 'learning_rate': 0.07841298670752679, 'weight_decay': 2.0047605638462836e-15, 'batch_size': 160}. Best is trial#2 with value: 0.2946026986506747.
[I 2020-04-30 19:28:44,360] Finished trial#3 with value: 0.5809595202398801 with parameters: {'num_layer': 2, 'learning_rate': 0.0014033979058862703, 'weight_decay': 3.611812460762474e-15, 'batch_size': 154}. Best is trial#2 with val

[I 2020-04-30 20:07:34,383] Finished trial#30 with value: 0.5809595202398801 with parameters: {'num_layer': 3, 'learning_rate': 0.08424739149274865, 'weight_decay': 2.369627568354325e-15, 'batch_size': 129}. Best is trial#11 with value: 0.2931034482758621.
[I 2020-04-30 20:09:01,193] Finished trial#31 with value: 0.487256371814093 with parameters: {'num_layer': 2, 'learning_rate': 0.06603283467196047, 'weight_decay': 3.324852814611492e-13, 'batch_size': 191}. Best is trial#11 with value: 0.2931034482758621.
[I 2020-04-30 20:10:28,358] Finished trial#32 with value: 0.5809595202398801 with parameters: {'num_layer': 2, 'learning_rate': 0.025102086731717536, 'weight_decay': 3.0989603275796884e-13, 'batch_size': 160}. Best is trial#11 with value: 0.2931034482758621.
[I 2020-04-30 20:11:53,805] Finished trial#33 with value: 0.4212893553223388 with parameters: {'num_layer': 2, 'learning_rate': 0.18387751577124956, 'weight_decay': 9.045653006281493e-13, 'batch_size': 215}. Best is trial#11 wit

[I 2020-04-30 20:55:01,878] Finished trial#61 with value: 0.44677661169415295 with parameters: {'num_layer': 2, 'learning_rate': 0.06538821700107052, 'weight_decay': 2.8832561388491653e-15, 'batch_size': 149}. Best is trial#11 with value: 0.2931034482758621.
[I 2020-04-30 20:56:33,596] Finished trial#62 with value: 0.5734632683658171 with parameters: {'num_layer': 2, 'learning_rate': 0.12008820647841462, 'weight_decay': 4.017040229337247e-15, 'batch_size': 139}. Best is trial#11 with value: 0.2931034482758621.
[I 2020-04-30 20:58:05,155] Finished trial#63 with value: 0.6071964017991005 with parameters: {'num_layer': 2, 'learning_rate': 0.0001045406452591979, 'weight_decay': 1.014224432795627e-15, 'batch_size': 129}. Best is trial#11 with value: 0.2931034482758621.
[I 2020-04-30 20:59:33,270] Finished trial#64 with value: 0.5809595202398801 with parameters: {'num_layer': 2, 'learning_rate': 0.1927544560196321, 'weight_decay': 6.18821469447794e-16, 'batch_size': 169}. Best is trial#11 wi

[I 2020-04-30 21:41:26,764] Finished trial#92 with value: 0.4497751124437781 with parameters: {'num_layer': 2, 'learning_rate': 0.05954263081705069, 'weight_decay': 3.9808017288812485e-15, 'batch_size': 154}. Best is trial#11 with value: 0.2931034482758621.
[I 2020-04-30 21:42:55,831] Finished trial#93 with value: 0.29760119940029983 with parameters: {'num_layer': 2, 'learning_rate': 0.07804569153550524, 'weight_decay': 2.7127375838729446e-15, 'batch_size': 176}. Best is trial#11 with value: 0.2931034482758621.
[I 2020-04-30 21:44:25,405] Finished trial#94 with value: 0.5764617691154423 with parameters: {'num_layer': 2, 'learning_rate': 0.050922077090722, 'weight_decay': 2.576604981415326e-15, 'batch_size': 138}. Best is trial#11 with value: 0.2931034482758621.
[I 2020-04-30 21:45:51,053] Finished trial#95 with value: 0.6071964017991005 with parameters: {'num_layer': 2, 'learning_rate': 0.07802746861386897, 'weight_decay': 8.53524247945522e-15, 'batch_size': 167}. Best is trial#11 with

In [14]:
print('best value: {0}'.format(study.best_value))
print('best params: {0}'.format(study.best_params))

best value: 0.2931034482758621
best params: {'num_layer': 1, 'learning_rate': 0.08696842859366748, 'weight_decay': 6.246141038939574e-13, 'batch_size': 250}
