In [1]:
from google.colab import drive
drive.mount('./drive')

Drive already mounted at ./drive; to attempt to forcibly remount, call drive.mount("./drive", force_remount=True).


In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()
        
        self.hidden_size = hidden_size
        
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(input_size + hidden_size, output_size)
        
    def forward(self, input, hidden):
        combined = torch.cat((input, hidden), 1)
        hidden = self.i2h(combined)
        output = self.i2o(combined)
        return output, hidden
    
    def initHidden(self, batch_size):
        return torch.zeros(batch_size, self.hidden_size)

In [0]:
def make_id_dic(fname):
    id_dic = {}
    
    with open(fname) as file:
        for i, line in enumerate(file, 1):
            line = line.rstrip('\n')
            id_dic[line] = i
            
    return id_dic

In [0]:
def words2id(sentence, id_dic):
    import re
    import snowballstemmer
    
    words_id = []
    
    # 文字種の統一
    sentence = sentence.lower()
    
    # 数字の置き換え
    sentence = re.sub(r'[0-9]+', '0', sentence)
    
    # '-'を' 'に変換
    sentence = sentence.replace('-', ' ')
    
    words = sentence.split()
    
    # ステミング処理
    stemmer = snowballstemmer.stemmer('english')
    words2 = [stemmer.stemWord(word) for word in words]
    words = words2
    
    for word in words:
        if word in id_dic.keys():
            words_id.append(id_dic[word])
        else:
            words_id.append(0)
            
    return words_id

In [0]:
def id2vec(words_id, embeds):
    words_vec = embeds(torch.Tensor(words_id).long())
    
    return words_vec

In [0]:
def make_dataset(fname, id_dic, embeds):
    with open(fname) as file:
        lines = file.readlines()
        lines_vec = []
        labels = np.zeros([len(lines), 1])
        
        for i, line in enumerate(lines):
            
            line = line.rstrip('\n')
            category = line.strip('\t')[0]
            title = line.split('\t')[1]
            
            # lines_vecの処理
            words_id = words2id(title, id_dic)
            words_vec = id2vec(words_id, embeds)
            lines_vec.append(words_vec)
            
            # labelsの処理
            if category == 'b':
                labels[i] = 0
            elif category == 't':
                labels[i] = 1
            elif category == 'e':
                labels[i] = 2
            elif category == 'm':
                labels[i] = 3
    labels = torch.from_numpy(labels).long()
                
    return lines_vec, labels

In [0]:
def train(rnn, criterion, optimizer, batch_size, words_vec_batch, label_batch):
    hidden = rnn.initHidden(batch_size)
    rnn.zero_grad()
    optimizer.zero_grad()
    hidden = hidden.detach()
    
    for i in range(len(words_vec_batch)):
        output, hidden = rnn(words_vec_batch[i].to(device), hidden.to(device))
    loss = criterion(output, label_batch.to(device))
    loss.backward(retain_graph=True)
    optimizer.step()
    
    return loss.item()

In [0]:
def get_loss_and_accyracy(rnn, criterion, lines_vec, labels):
    running_loss = 0
    correct_count = 0
    words_vec_list = []
    label_list = []
    
    for words_vec, label in zip(lines_vec, labels):
        words_vec_list.append(words_vec)
        label_list.append(label)
    words_vec_all, label_all = batch_process(words_vec_list, label_list)
    
    hidden = rnn.initHidden(len(lines_vec))
    
    for i in range(len(words_vec_all)):
        output, hidden = rnn(words_vec_all[i].to(device), hidden.to(device))
    loss = criterion(output, label_all.to(device))
    pre_labels = torch.max(output, 1)[1]
    accuracy = (pre_labels == label_all.to(device)).sum().item() / len(labels)
    
    return loss, accuracy

In [0]:
def batch_process(words_vec_list, label_list):
    # words_vec_batchの処理
    len_max = 0
    for words_vec in words_vec_list:
        if words_vec.size()[0] > len_max:
            len_max = words_vec.size()[0]
    words_vec_batch = torch.zeros(len_max, len(words_vec_list), 300)
    for i in range(len(words_vec_list)):
        for j in range(len(words_vec_list[i])):
            words_vec_batch[len_max - len(words_vec_list[i]) + j, i, :] = words_vec_list[i][j]
        
    # label_batchの処理
    label_batch = torch.zeros(len(label_list)).long()
    for i, label in enumerate(label_list):
        label_batch[i] = label
        
    return words_vec_batch, label_batch

In [13]:
import numpy as np
import torch.optim as optim

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print('{0}\n'.format(device))

rnn = RNN(300, 50, 4).to(device) # RNN(d_w, d_h, L)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(rnn.parameters(), lr=pow(10, -3)) # lr: 検証の結果、10エポック目で最も精度が高かった数値

id_dic = make_id_dic('drive/My Drive/Colab Notebooks/100_knocks/chapter_6/train.feature.txt')
embeds = nn.Embedding(len(id_dic)+1, 300)
lines_vec, labels = make_dataset('drive/My Drive/Colab Notebooks/100_knocks/chapter_6/train.txt', id_dic, embeds)
lines_vec_test, labels_test = make_dataset('drive/My Drive/Colab Notebooks/100_knocks/chapter_6/test.txt', id_dic, embeds)
batch_size = 8

for epoch in range(10):
    running_loss = 0
    words_vec_list = []
    label_list = []
    
    for i, dataset in enumerate(zip(lines_vec, labels), 1):
        words_vec_list.append(dataset[0])
        label_list.append(dataset[1])
        
        if i % batch_size == 0 or i == len(lines_vec):
            words_vec_batch, label_batch = batch_process(words_vec_list, label_list)
            loss = train(rnn, criterion, optimizer, len(words_vec_list), words_vec_batch, label_batch)
            running_loss += loss
            words_vec_list = []
            label_list = []
            
    loss_train, accuracy_train = get_loss_and_accyracy(rnn, criterion, lines_vec, labels)
    loss_test, accuracy_test = get_loss_and_accyracy(rnn, criterion, lines_vec_test, labels_test)
    print('[{0}] 訓練データ上の損失: {1}\t訓練データ上の正解率: {2}\t評価データ上の損失: {3}\t評価データ上の正解率: {4}'.format(epoch + 1, loss_train, accuracy_train, loss_test, accuracy_test))

cuda:0

[1] 訓練データ上の損失: 1.17604660987854	訓練データ上の正解率: 0.508519003931848	評価データ上の損失: 1.194509506225586	評価データ上の正解率: 0.507496251874063
[2] 訓練データ上の損失: 1.094875454902649	訓練データ上の正解率: 0.5619734132184984	評価データ上の損失: 1.1291295289993286	評価データ上の正解率: 0.5449775112443778
[3] 訓練データ上の損失: 1.050013542175293	訓練データ上の正解率: 0.5837858079011421	評価データ上の損失: 1.0972168445587158	評価データ上の正解率: 0.5584707646176912
[4] 訓練データ上の損失: 1.019877314567566	訓練データ上の正解率: 0.5961430443737128	評価データ上の損失: 1.0783015489578247	評価データ上の正解率: 0.56071964017991
[5] 訓練データ上の損失: 0.9968733787536621	訓練データ上の正解率: 0.6043812020220932	評価データ上の損失: 1.065365195274353	評価データ上の正解率: 0.5749625187406296
[6] 訓練データ上の損失: 0.9775402545928955	訓練データ上の正解率: 0.6117768208200711	評価データ上の損失: 1.0552279949188232	評価データ上の正解率: 0.5787106446776612
[7] 訓練データ上の損失: 0.9600287079811096	訓練データ上の正解率: 0.6202022093240966	評価データ上の損失: 1.0462276935577393	評価データ上の正解率: 0.5854572713643178
[8] 訓練データ上の損失: 0.9430308938026428	訓練データ上の正解率: 0.6279722898333645	評価データ上の損失: 1.0371863842010498	評価データ上の正解率: 0.593703148425