In [1]:
import gensim

# Load pretrained word2vec vectors from a binary-format file stored under
# ../chapter_7 (300-dimensional, going by the file name and the zero vectors
# used elsewhere in this notebook).
model = gensim.models.KeyedVectors.load_word2vec_format(
    '../chapter_7/GoogleNews-vectors-negative300.bin',
    binary=True,
)

unable to import 'smart_open.gcs', disabling that module


In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    """CNN text classifier: one convolution, max-over-time pooling, one linear layer.

    Args:
        d_h: Number of convolution filters, i.e. the size of the feature
            vector produced for each word position and consumed by the
            output layer.
    """

    def __init__(self, d_h):
        super(Net, self).__init__()
        # Each filter covers 3 consecutive words and the full 300-wide word
        # vector; padding of 1 along the word axis keeps the sequence length.
        # The filter count must be d_h because fc1 takes d_h input features
        # (it used to be hard-coded to 50, which broke any other d_h).
        self.conv1 = nn.Conv2d(1, d_h, (3, 300), padding=(1, 0))
        self.fc1 = nn.Linear(d_h, 4)

    def forward(self, x):
        """Compute the four class scores for a batch of sentences.

        Args:
            x: Tensor of shape (batch, 1, words, 300).

        Returns:
            Tensor of shape (batch, 4) holding unnormalised scores.
        """
        # (batch, 1, words, 300) -> (batch, d_h, words, 1) -> (batch, d_h, words)
        x = F.relu(self.conv1(x)).squeeze(3)
        # Max over the word axis: (batch, d_h, words) -> (batch, d_h)
        x = F.max_pool1d(x, x.size(2)).squeeze(2)
        x = self.fc1(x)
        return x

In [3]:
def make_id_dic(fname):
    """Build a lookup from each line of a text file to its 1-based line number.

    Args:
        fname: Path of the file to read. Every line, with its trailing
            newline removed, becomes one key.

    Returns:
        dict mapping line text to line number. If the same text occurs on
        several lines, the number of the last occurrence is kept.
    """
    with open(fname) as source:
        return {text.rstrip('\n'): number for number, text in enumerate(source, 1)}

In [4]:
def words2id(sentence, id_dic):
    """Convert a sentence into a list of word IDs.

    The sentence is lower-cased, every run of digits is replaced by a single
    '0', hyphens are turned into spaces, and the text is split on whitespace.
    Each token is then stemmed with the English Snowball stemmer and looked up
    in ``id_dic``.

    Args:
        sentence: Text to convert.
        id_dic: Mapping from stemmed word to its ID.

    Returns:
        list with one ID per token; tokens missing from ``id_dic`` get 0.
    """
    import re
    import snowballstemmer

    # Normalise case, collapse digit runs to '0', treat '-' as a separator.
    normalised = re.sub(r'[0-9]+', '0', sentence.lower()).replace('-', ' ')

    # Stem every whitespace-separated token.
    english = snowballstemmer.stemmer('english')
    stems = [english.stemWord(token) for token in normalised.split()]

    # Unknown stems fall back to ID 0.
    return [id_dic.get(stem, 0) for stem in stems]

In [5]:
def id2vec(words_id, id_dic):
    """Turn a list of word IDs into a matrix of word vectors.

    Each ID is mapped back to its word through ``id_dic`` and that word is
    looked up in the module-level ``model``. IDs that have no word in
    ``id_dic`` and words for which ``model`` raises ``KeyError`` are
    represented by a zero vector.

    Args:
        words_id: Iterable of word IDs.
        id_dic: Mapping from word to ID (the inverse lookup is built here).

    Returns:
        Tensor of shape (len(words_id), 300); shape (0, 300) for empty input.
    """
    # Invert the vocabulary once instead of scanning it for every ID.
    # setdefault keeps the first word seen for an ID, which is the word the
    # previous per-ID scan would have picked.
    id_to_word = {}
    for word, word_id in id_dic.items():
        id_to_word.setdefault(word_id, word)

    words_vec = []
    for word_id in words_id:
        vec = None
        if word_id in id_to_word:
            try:
                vec = torch.from_numpy(model[id_to_word[word_id]]).view(1, -1)
            except KeyError:
                # Word is in the vocabulary but has no pretrained vector.
                vec = None
        if vec is None:
            vec = torch.zeros(1, 300)
        words_vec.append(vec)

    # torch.cat rejects an empty list, so return an empty matrix explicitly.
    if not words_vec:
        return torch.zeros(0, 300)
    return torch.cat(words_vec)

In [6]:
def make_dataset(fname, id_dic):
    """Read a tab-separated "category<TAB>title" file into vectors and labels.

    Args:
        fname: Path of the data file, one example per line.
        id_dic: Word-to-ID mapping passed on to ``words2id`` and ``id2vec``.

    Returns:
        Tuple ``(lines_vec, labels)``: ``lines_vec`` is a list holding the
        result of ``id2vec`` for each title, and ``labels`` is an int64 tensor
        of shape (number of lines, 1) with the class index of each line.

    Raises:
        ValueError: If a line has no tab-separated title or its category is
            not one of 'b', 't', 'e', 'm'.
    """
    # Category letter -> class index.
    category_ids = {'b': 0, 't': 1, 'e': 2, 'm': 3}

    with open(fname) as file:
        lines = file.readlines()

    lines_vec = []
    # Built directly as an integer tensor, so no NumPy round-trip is needed.
    labels = torch.zeros(len(lines), 1, dtype=torch.long)

    for i, line in enumerate(lines):
        fields = line.rstrip('\n').split('\t')
        if len(fields) < 2:
            raise ValueError(
                'line {0} of {1} is not "category<TAB>title": {2!r}'.format(i + 1, fname, line))
        category, title = fields[0], fields[1]

        # Fail loudly rather than silently labelling unknown categories as 0.
        if category not in category_ids:
            raise ValueError(
                'line {0} of {1} has unknown category {2!r}'.format(i + 1, fname, category))
        labels[i] = category_ids[category]

        # Title -> word IDs -> matrix of word vectors.
        words_id = words2id(title, id_dic)
        lines_vec.append(id2vec(words_id, id_dic))

    return lines_vec, labels

In [7]:
def train(cnn, criterion, optimizer, words_vec, label):
    """Perform one optimisation step on a single example.

    Args:
        cnn: Model to update; called with a 4-D tensor.
        criterion: Loss function called as ``criterion(output, label)``.
        optimizer: Optimizer whose ``step()`` applies the update.
        words_vec: 2-D tensor for one sentence; two leading axes of size 1
            are added before it is passed to ``cnn``.
        label: Target passed to ``criterion``.
    """
    # Clear any gradients left from the previous step, via both handles.
    cnn.zero_grad()
    optimizer.zero_grad()

    # Add the two leading singleton axes, then forward, backward, update.
    batch = words_vec[None, None]
    criterion(cnn(batch), label).backward()
    optimizer.step()

In [8]:
def get_loss_and_accyracy(cnn, criterion, lines_vec, labels):
    """Evaluate the model on a dataset, one example at a time.

    Args:
        cnn: Model to evaluate; called with a 4-D tensor per example.
        criterion: Loss function called as ``criterion(output, label)``.
        lines_vec: Sequence of 2-D tensors, one per example.
        labels: Sequence of targets aligned with ``lines_vec``.

    Returns:
        Tuple ``(mean_loss, accuracy)``: the summed per-example loss divided
        by ``len(labels)``, and the fraction of examples whose highest-scoring
        class equals the label.
    """
    running_loss = 0
    correct_count = 0

    # Evaluation never back-propagates. Without no_grad, summing the loss
    # tensors keeps every example's autograd graph alive for the whole pass.
    with torch.no_grad():
        for words_vec, label in zip(lines_vec, labels):
            output = cnn(words_vec.unsqueeze(0).unsqueeze(0))
            running_loss += criterion(output, label)
            # Index of the highest score along the class axis.
            pre_label = torch.max(output, 1)[1]
            if pre_label == label:
                correct_count += 1

    return running_loss / len(labels), correct_count / len(labels)

In [10]:
import numpy as np
import torch.optim as optim

# Seed PyTorch's RNG so weight initialisation is the same on every run.
# Results will differ from numbers produced by earlier, unseeded runs.
SEED = 0
torch.manual_seed(SEED)

d_h = 50 # d_h: dimensionality of the per-position vector after the convolution
cnn = Net(d_h)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(cnn.parameters(), lr=pow(10, -2)) # lr: the value that gave the highest accuracy at epoch 10 during validation

# The vocabulary comes from the chapter 6 feature file; both splits share it.
id_dic = make_id_dic('../chapter_6/train.feature.txt')
lines_vec, labels = make_dataset('../chapter_6/train.txt', id_dic)
lines_vec_test, labels_test = make_dataset('../chapter_6/test.txt', id_dic)

for epoch in range(10):
    # One parameter update per training example, in file order.
    for words_vec, label in zip(lines_vec, labels):
        train(cnn, criterion, optimizer, words_vec, label)
    # Report loss and accuracy on both splits after every epoch.
    loss_train, accuracy_train = get_loss_and_accyracy(cnn, criterion, lines_vec, labels)
    loss_test, accuracy_test = get_loss_and_accyracy(cnn, criterion, lines_vec_test, labels_test)
    print('[{0}] 訓練データ上の損失: {1}\t訓練データ上の正解率: {2}\t評価データ上の損失: {3}\t評価データ上の正解率: {4}'.format(epoch + 1, loss_train, accuracy_train, loss_test, accuracy_test))

[1] 訓練データ上の損失: 0.3290466368198395	訓練データ上の正解率: 0.8892529488859764	評価データ上の損失: 0.41012105345726013	評価データ上の正解率: 0.8583208395802099
[2] 訓練データ上の損失: 0.22339335083961487	訓練データ上の正解率: 0.9313798914061038	評価データ上の損失: 0.36543741822242737	評価データ上の正解率: 0.8770614692653673
[3] 訓練データ上の損失: 0.15620648860931396	訓練データ上の正解率: 0.9569369032016476	評価データ上の損失: 0.3525507152080536	評価データ上の正解率: 0.8770614692653673
[4] 訓練データ上の損失: 0.10955225676298141	訓練データ上の正解率: 0.9722898333645384	評価データ上の損失: 0.3574272394180298	評価データ上の正解率: 0.8800599700149925
[5] 訓練データ上の損失: 0.07729192823171616	訓練データ上の正解率: 0.9828683767084816	評価データ上の損失: 0.3787167966365814	評価データ上の正解率: 0.883808095952024
[6] 訓練データ上の損失: 0.05431031808257103	訓練データ上の正解率: 0.989421456656057	評価データ上の損失: 0.3995789587497711	評価データ上の正解率: 0.8778110944527736
[7] 訓練データ上の損失: 0.0399080254137516	訓練データ上の正解率: 0.9938213817637147	評価データ上の損失: 0.41736680269241333	評価データ上の正解率: 0.8748125937031485
[8] 訓練データ上の損失: 0.0319071039557457	訓練データ上の正解率: 0.9951319977532297	評価データ上の損失: 0.4317984879016876	評価データ上の正解率: 0.877