In [1]:
import gensim

# Path to the pretrained word2vec vectors in the binary word2vec format.
WORD2VEC_PATH = '../chapter_7/GoogleNews-vectors-negative300.bin'

# Loaded once at notebook level; id2vec() reads this global to look up word vectors.
model = gensim.models.KeyedVectors.load_word2vec_format(
    WORD2VEC_PATH,
    binary=True,
)

unable to import 'smart_open.gcs', disabling that module


In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import optuna

class Net(nn.Module):
    """Convolutional text classifier with an Optuna-tuned fully connected head.

    A single convolution over windows of 3 consecutive 300-dimensional word
    vectors produces ``d_h`` feature maps, which are max-pooled over the word
    axis and passed through 1-4 linear layers ending in 4 output logits.

    Args:
        trial: Object providing ``suggest_int(name, low, high)`` (an Optuna
            trial); used to choose the number of linear layers.
        d_h: Number of convolution output channels, i.e. the size of the
            pooled feature vector fed to the first linear layer.
    """

    def __init__(self, trial, d_h):
        super(Net, self).__init__()

        num_layer = trial.suggest_int('num_layer', 1, 4)

        layers = []
        num_input = int(d_h)
        num_output_last = 4

        for i in range(num_layer):
            if i == num_layer - 1:
                # The final layer always emits one logit per class.
                num_output = num_output_last
            else:
                # Hidden width: half of the smallest power of two that is
                # >= the layer's input width (e.g. 50 -> 64 -> 32).
                power_two = 1
                while power_two < num_input:
                    power_two *= 2
                # Integer arithmetic throughout; never shrink below one unit.
                num_output = max(1, power_two // 2)
            layers.append(nn.Linear(num_input, num_output))
            num_input = num_output

        # The channel count must equal d_h because the first linear layer is
        # sized from d_h; a hard-coded 50 here breaks every other d_h.
        self.conv1 = nn.Conv2d(1, int(d_h), (3, 300), padding=(1, 0))
        self.layers = nn.ModuleList(layers)

    def forward(self, x):
        """Return class logits for ``x``.

        ``x`` is indexed as (batch, channel=1, words, 300); the callers in this
        notebook pass a single sentence with batch size 1.
        """
        # The convolution pads the word axis by 1 on each side with a kernel
        # height of 3, so the number of positions stays equal to words_len.
        words_len = x.size(2)

        # The kernel spans the full embedding width, so the last axis has
        # size 1 after the convolution and is dropped.
        x = F.relu(self.conv1(x)).squeeze(3)
        # Pool with a window covering the whole word axis: one value per channel.
        x = F.max_pool2d(x, (1, words_len)).squeeze(2)
        for i, layer in enumerate(self.layers):
            if i == len(self.layers) - 1:
                # No activation on the output layer: raw logits are returned.
                x = layer(x)
            else:
                x = F.relu(layer(x))
        return x

In [3]:
def make_id_dic(fname):
    """Build a ``{line text: ID}`` mapping from a file with one entry per line.

    IDs are 1-based line numbers. If the same text appears on several lines,
    the number of its last occurrence is kept.

    Args:
        fname: Path of the text file to read.

    Returns:
        dict mapping each line (without its trailing newline) to its ID.
    """
    with open(fname) as feature_file:
        return {
            entry.rstrip('\n'): line_no
            for line_no, entry in enumerate(feature_file, 1)
        }

In [4]:
def words2id(sentence, id_dic):
    """Convert a sentence into a list of vocabulary IDs.

    The sentence is lower-cased, every run of digits is replaced by ``'0'``,
    hyphens become spaces, and the text is split on whitespace. Each token is
    stemmed with the English Snowball stemmer and looked up in ``id_dic``.

    Args:
        sentence: Raw text to convert.
        id_dic: Mapping from stemmed token to ID.

    Returns:
        list of IDs, one per token; tokens missing from ``id_dic`` become 0.
    """
    import re
    import snowballstemmer

    # Unify case, collapse digit runs to '0', then treat '-' as a separator.
    normalized = re.sub(r'[0-9]+', '0', sentence.lower()).replace('-', ' ')

    # Stemming
    english_stemmer = snowballstemmer.stemmer('english')
    stems = (english_stemmer.stemWord(token) for token in normalized.split())

    # ID 0 stands for "not in the vocabulary".
    return [id_dic.get(stem, 0) for stem in stems]

In [5]:
def id2vec(words_id, id_dic):
    """Turn a sequence of vocabulary IDs into a matrix of word vectors.

    Each ID is mapped back to its word through ``id_dic`` and the word is
    looked up in the notebook-level ``model``. IDs with no word in ``id_dic``
    and words for which the lookup raises ``KeyError`` get an all-zero row.

    Args:
        words_id: Non-empty sequence of IDs (``torch.cat`` rejects an empty list).
        id_dic: Mapping from word to ID, as used to produce ``words_id``.

    Returns:
        Tensor with one 300-wide row per ID, in input order.
    """
    # Invert the vocabulary once instead of scanning it for every ID.
    # setdefault keeps the first word for an ID, matching a front-to-back scan.
    id_to_word = {}
    for word, word_id in id_dic.items():
        id_to_word.setdefault(word_id, word)

    words_vec = []
    for word_id in words_id:
        if word_id not in id_to_word:
            # Unknown ID (e.g. 0, which no vocabulary word is assigned).
            words_vec.append(torch.zeros(1, 300))
            continue
        try:
            words_vec.append(torch.from_numpy(model[id_to_word[word_id]]).view(1, -1))
        except KeyError:
            # The word is in the vocabulary but has no pretrained vector.
            words_vec.append(torch.zeros(1, 300))

    return torch.cat(words_vec)

In [6]:
def make_dataset(fname, id_dic):
    """Read a ``category<TAB>title`` file into word-vector matrices and labels.

    Args:
        fname: Path of the dataset file, one example per line.
        id_dic: Word-to-ID vocabulary passed on to ``words2id`` and ``id2vec``.

    Returns:
        Tuple ``(lines_vec, labels)``: ``lines_vec`` is a list with one tensor
        per title (the output of ``id2vec``); ``labels`` is a long tensor of
        shape (number of lines, 1) holding the class index of each line.

    Raises:
        ValueError: If a line has no tab-separated title or its category is
            not one of ``'b'``, ``'t'``, ``'e'``, ``'m'``.
    """
    label_of = {'b': 0, 't': 1, 'e': 2, 'm': 3}

    lines_vec = []
    label_ids = []

    with open(fname) as file:
        for line_no, line in enumerate(file, 1):
            fields = line.rstrip('\n').split('\t')
            if len(fields) < 2:
                raise ValueError(
                    '{0}:{1}: expected "category<TAB>title"'.format(fname, line_no))
            # The category is the whole first column, not just the line's
            # first character.
            category, title = fields[0], fields[1]

            # Fail loudly instead of silently filing unknown categories under 0.
            if category not in label_of:
                raise ValueError(
                    '{0}:{1}: unknown category {2!r}'.format(fname, line_no, category))

            # Word-vector matrix for this title
            words_id = words2id(title, id_dic)
            lines_vec.append(id2vec(words_id, id_dic))

            # Class index for this title
            label_ids.append(label_of[category])

    # Shape (N, 1) so that iterating yields one 1-element target per example.
    labels = torch.tensor(label_ids, dtype=torch.long).unsqueeze(1)

    return lines_vec, labels

In [7]:
def train(cnn, criterion, optimizer, words_vec, label):
    """Perform one optimisation step on a single example.

    Args:
        cnn: Model to update; called with ``words_vec`` after two leading
            singleton axes have been added.
        criterion: Callable ``(output, label) -> loss`` whose result supports
            ``backward()``.
        optimizer: Optimiser whose ``step()`` applies the update.
        words_vec: Tensor for one example.
        label: Target passed to ``criterion``.
    """
    # Drop gradients left over from the previous step.
    cnn.zero_grad()
    optimizer.zero_grad()

    # Prepend two singleton axes before the forward pass.
    batch = words_vec[None, None]
    prediction = cnn(batch)

    criterion(prediction, label).backward()
    optimizer.step()

In [8]:
def get_loss_and_accyracy(cnn, criterion, lines_vec, labels):
    """Compute the mean loss and the accuracy of ``cnn`` over a dataset.

    Args:
        cnn: Model; called with each example after two leading singleton axes
            have been added.
        criterion: Callable ``(output, label) -> loss``.
        lines_vec: Iterable of per-example tensors.
        labels: Targets aligned with ``lines_vec``; each must hold exactly one
            class index.

    Returns:
        Tuple ``(mean_loss, accuracy)``: the summed loss divided by
        ``len(labels)``, and the fraction of examples whose arg-max prediction
        equals the label.

    Raises:
        ZeroDivisionError: If ``labels`` is empty.
    """
    running_loss = 0
    correct_count = 0

    # Evaluation only: without no_grad every per-example loss would keep its
    # autograd graph alive while being summed, growing memory with dataset size.
    with torch.no_grad():
        for words_vec, label in zip(lines_vec, labels):
            output = cnn(words_vec.unsqueeze(0).unsqueeze(0))
            running_loss += criterion(output, label)
            # Index of the largest logit along the class axis.
            pre_label = torch.max(output, 1)[1]
            if (pre_label == label).item():
                correct_count += 1

    return running_loss / len(labels), correct_count / len(labels)

In [9]:
def make_optimizer(trial, cnn):
    """Create an SGD optimiser with trial-suggested hyperparameters.

    Args:
        trial: Object providing ``suggest_loguniform(name, low, high)``.
        cnn: Module whose parameters are optimised.

    Returns:
        ``optim.SGD`` over ``cnn.parameters()`` with the learning rate drawn
        from [1e-5, 1e-1] and the weight decay from [1e-11, 1e-7].
    """
    # Dict entries are evaluated in order: learning rate first, then weight decay.
    hyperparams = {
        'lr': trial.suggest_loguniform('learning_rate', 1e-5, 1e-1),
        'weight_decay': trial.suggest_loguniform('weight_decay', 1e-11, 1e-7),
    }
    return optim.SGD(cnn.parameters(), **hyperparams)

In [10]:
def train_all(cnn, criterion, optimizer, lines_vec, labels, lines_vec_test, labels_test, num_epochs=10):
    """Train ``cnn`` one example at a time for a number of passes over the data.

    Args:
        cnn: Model to train.
        criterion: Loss callable forwarded to ``train``.
        optimizer: Optimiser forwarded to ``train``.
        lines_vec: Iterable of per-example training tensors.
        labels: Training targets aligned with ``lines_vec``.
        lines_vec_test: Not used; kept so existing call sites keep working.
        labels_test: Not used; kept so existing call sites keep working.
        num_epochs: Number of passes over the training data (default 10).
    """
    for _ in range(num_epochs):
        # Examples are visited in their given order on every pass.
        for words_vec, label in zip(lines_vec, labels):
            train(cnn, criterion, optimizer, words_vec, label)

In [11]:
# Vectorised datasets keyed by their source paths. The files do not depend on
# the trial, so they are built once and reused by every call to objective().
# Re-running this cell clears the cache.
_DATASET_CACHE = {}


def _load_datasets(feature_path, train_path, test_path):
    """Return (train vectors, train labels, test vectors, test labels), building them at most once per path triple."""
    key = (feature_path, train_path, test_path)
    if key not in _DATASET_CACHE:
        id_dic = make_id_dic(feature_path)
        train_vecs, train_labels = make_dataset(train_path, id_dic)
        test_vecs, test_labels = make_dataset(test_path, id_dic)
        _DATASET_CACHE[key] = (train_vecs, train_labels, test_vecs, test_labels)
    return _DATASET_CACHE[key]


def objective(trial):
    """Optuna objective: train a freshly built ``Net`` and return its test error.

    Args:
        trial: Optuna trial used to sample the number of layers, the learning
            rate and the weight decay.

    Returns:
        ``1 - accuracy`` on the test set, so smaller is better.
    """
    d_h = 50  # d_h: size of the feature vector at each position after the convolution
    cnn = Net(trial, d_h)
    criterion = nn.CrossEntropyLoss()
    optimizer = make_optimizer(trial, cnn)

    # Loading and vectorising the data is identical for every trial, so it is
    # done once instead of being repeated on each call.
    lines_vec, labels, lines_vec_test, labels_test = _load_datasets(
        '../chapter_6/train.feature.txt',
        '../chapter_6/train.txt',
        '../chapter_6/test.txt',
    )

    train_all(cnn, criterion, optimizer, lines_vec, labels, lines_vec_test, labels_test)

    # Only the accuracy is needed for the objective value.
    _, accuracy_test = get_loss_and_accyracy(cnn, criterion, lines_vec_test, labels_test)

    return 1 - accuracy_test

In [12]:
# Number of hyperparameter configurations to evaluate.
N_TRIALS = 100

# objective() returns 1 - test accuracy, so the study minimises it
# ('minimize' is also Optuna's default direction).
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=N_TRIALS)

[I 2020-04-30 19:50:45,301] Finished trial#0 with value: 0.5809595202398801 with parameters: {'num_layer': 4, 'learning_rate': 5.0789136669157204e-05, 'weight_decay': 1.731946870163098e-08}. Best is trial#0 with value: 0.5809595202398801.
[I 2020-04-30 19:53:28,516] Finished trial#1 with value: 0.5074962518740629 with parameters: {'num_layer': 2, 'learning_rate': 1.4855639948409303e-05, 'weight_decay': 1.7515473024875947e-11}. Best is trial#1 with value: 0.5074962518740629.
[I 2020-04-30 19:55:54,073] Finished trial#2 with value: 0.25337331334332835 with parameters: {'num_layer': 1, 'learning_rate': 0.00010356358897314089, 'weight_decay': 6.5448868034125844e-09}. Best is trial#2 with value: 0.25337331334332835.
[I 2020-04-30 19:58:16,427] Finished trial#3 with value: 0.1296851574212894 with parameters: {'num_layer': 1, 'learning_rate': 0.06059216879820546, 'weight_decay': 2.4983708748477233e-09}. Best is trial#3 with value: 0.1296851574212894.
[I 2020-04-30 20:00:52,748] Finished trial

[I 2020-04-30 21:17:06,461] Finished trial#32 with value: 0.11544227886056968 with parameters: {'num_layer': 1, 'learning_rate': 0.009580012264652788, 'weight_decay': 5.739438899915268e-08}. Best is trial#23 with value: 0.1139430284857571.
[I 2020-04-30 21:19:36,021] Finished trial#33 with value: 0.1139430284857571 with parameters: {'num_layer': 1, 'learning_rate': 0.008700213345486478, 'weight_decay': 7.431015319395308e-08}. Best is trial#23 with value: 0.1139430284857571.
[I 2020-04-30 21:22:26,729] Finished trial#34 with value: 0.13418290854572712 with parameters: {'num_layer': 2, 'learning_rate': 0.00831104408284212, 'weight_decay': 6.492417923420167e-08}. Best is trial#23 with value: 0.1139430284857571.
[I 2020-04-30 21:25:06,799] Finished trial#35 with value: 0.15292353823088456 with parameters: {'num_layer': 1, 'learning_rate': 0.0007358048950142827, 'weight_decay': 2.6434621077041074e-08}. Best is trial#23 with value: 0.1139430284857571.
[I 2020-04-30 21:27:33,949] Finished tri

[I 2020-04-30 22:30:44,652] Finished trial#66 with value: 0.1214392803598201 with parameters: {'num_layer': 1, 'learning_rate': 0.014035979438997965, 'weight_decay': 1.3310450728646295e-08}. Best is trial#23 with value: 0.1139430284857571.
[I 2020-04-30 22:32:37,428] Finished trial#67 with value: 0.11994002998500752 with parameters: {'num_layer': 1, 'learning_rate': 0.00270370891973323, 'weight_decay': 4.4118452013256474e-09}. Best is trial#23 with value: 0.1139430284857571.
[I 2020-04-30 22:34:27,100] Finished trial#68 with value: 0.1281859070464768 with parameters: {'num_layer': 1, 'learning_rate': 0.025279166944358052, 'weight_decay': 8.690810072551375e-09}. Best is trial#23 with value: 0.1139430284857571.
[I 2020-04-30 22:36:20,161] Finished trial#69 with value: 0.13193403298350825 with parameters: {'num_layer': 1, 'learning_rate': 0.055243463744373125, 'weight_decay': 3.4116534439631504e-08}. Best is trial#23 with value: 0.1139430284857571.
[I 2020-04-30 22:38:14,467] Finished tri

In [13]:
# Summarise the best trial: lowest objective value and the parameters that produced it.
best_value = study.best_value
best_params = study.best_params

print('best value: {0}'.format(best_value))
print('best params: {0}'.format(best_params))

best value: 0.11244377811094453
best params: {'num_layer': 1, 'learning_rate': 0.012652708009751688, 'weight_decay': 1.8718998497570655e-08}
