In [1]:
def make_features_dic(fname):
    """Map every line of a feature-list file to its zero-based line number.

    Args:
        fname: Path of a text file holding one feature per line.

    Returns:
        dict: line text (trailing newline removed) -> index of that line.
        If the same text appears on several lines, the last index wins.
    """
    feature_to_index = {}
    with open(fname) as feature_file:
        for index, raw_line in enumerate(feature_file):
            feature_to_index[raw_line.rstrip('\n')] = index
    return feature_to_index

In [2]:
def make_queue(fname, features_dic):
    """Build the feature matrix and one-hot label matrix for a data file.

    Each line of ``fname`` is expected to be tab-separated, with the category
    in the first field and the headline in the second. The headline is
    lower-cased, digit runs are removed, hyphens become spaces, English stop
    words are dropped, the remaining words are stemmed, and tokens for which
    ``str.islower()`` is false (e.g. pure punctuation) are discarded.

    Args:
        fname: Path of the tab-separated data file.
        features_dic: Mapping from feature word to column index.

    Returns:
        tuple: ``(queue_x, queue_y)``.
            ``queue_x`` has shape ``(n_lines, len(features_dic))``; entry
            ``[i, j]`` is 1 when line ``i`` contains the feature mapped to
            column ``j``, else 0.
            ``queue_y`` has shape ``(n_lines, 4)`` and is one-hot over the
            categories ``[b, t, e, m]``; a line with any other category keeps
            an all-zero row.

    Raises:
        IndexError: If a line has no tab-separated second field.
    """
    # Kept as function-scope imports (as in the original cell), but executed
    # once per call instead of once per input line.
    import re
    from nltk.corpus import stopwords
    import snowballstemmer

    # Loop-invariant resources: built once, reused for every line.
    stop_words = set(stopwords.words('english'))  # set -> O(1) membership
    stemmer = snowballstemmer.stemmer('english')
    digit_run = re.compile(r'[0-9]+')
    category_columns = {'b': 0, 't': 1, 'e': 2, 'm': 3}

    with open(fname) as file:
        lines = file.readlines()

    queue_x = np.zeros([len(lines), len(features_dic)])
    queue_y = np.zeros([len(lines), len(category_columns)])  # one-hot: [b, t, e, m]

    for i, line in enumerate(lines):
        # Split once; the original took the first *character* of the line via
        # strip('\t')[0], which only worked for one-letter categories.
        fields = line.rstrip('\n').split('\t')
        category = fields[0]
        title = fields[1]

        # --- queue_x ---
        # Unify case, remove digit runs, and treat '-' as a word separator.
        title = digit_run.sub('', title.lower()).replace('-', ' ')

        # Drop stop words, then stem what is left.
        words = [word for word in title.split() if word not in stop_words]
        words = [stemmer.stemWord(word) for word in words]

        # Symbol removal: keep only tokens that str.islower() accepts.
        for word in words:
            if word.islower() and word in features_dic:
                queue_x[i, features_dic[word]] = 1

        # --- queue_y ---
        column = category_columns.get(category)
        if column is not None:
            queue_y[i, column] = 1

    return queue_x, queue_y

In [3]:
def softmax(queue):
    """Row-wise softmax of a 2-D array of scores.

    Args:
        queue: 2-D array-like; each row is normalised independently
            (the reduction runs over axis 1).

    Returns:
        numpy.ndarray: Array of the same shape whose rows are non-negative
        and sum to 1.
    """
    scores = np.asarray(queue)
    if scores.size == 0:
        # Nothing to normalise; np.max cannot reduce a zero-length axis.
        return np.exp(scores)

    # Subtracting each row's maximum does not change the mathematical result
    # but keeps np.exp from overflowing to inf (which produced NaN rows).
    shifted = scores - np.max(scores, axis=1, keepdims=True)
    exp_scores = np.exp(shifted)
    return exp_scores / np.sum(exp_scores, axis=1, keepdims=True)

In [4]:
def valid(queue_x, queue_y, queue_w, vector_b):
    """Return the fraction of samples whose predicted class matches the label.

    The predicted class of a sample is the arg-max of
    ``queue_x @ queue_w + vector_b``. A sample counts as correct only when its
    row in ``queue_y`` equals the one-hot vector of the predicted class, so a
    row that is not one-hot (e.g. all zeros) is never counted as correct.

    Args:
        queue_x: Feature matrix, one sample per row.
        queue_y: Label matrix (numpy array), one row per sample.
        queue_w: Weight matrix applied to ``queue_x``.
        vector_b: Bias added to every row of ``queue_x @ queue_w``.

    Returns:
        float: ``correct / len(queue_y)``.

    Raises:
        ZeroDivisionError: If ``queue_y`` has no rows.
    """
    sample_count = len(queue_y)

    # Softmax is strictly increasing per row, so the arg-max can be taken on
    # the raw scores; this also avoids NaN rows when exp() would overflow.
    scores = np.dot(queue_x, queue_w) + vector_b
    class_labels = np.argmax(scores, axis=1)

    # One-hot encode all predictions at once.
    pred_queue_y = np.zeros(queue_y.shape)
    pred_queue_y[np.arange(sample_count), class_labels] = 1

    # A row is correct only if every entry matches the label row.
    row_matches = np.all(queue_y == pred_queue_y, axis=1)
    correct_count = int(np.count_nonzero(row_matches))

    return correct_count / sample_count

In [5]:
def get_accuracy(fname, features_fname='train.feature.txt',
                 weights_fname='results/52_result.npy'):
    """Compute the accuracy of the saved model on one data file.

    Args:
        fname: Path of the data file to evaluate.
        features_fname: Path of the feature-list file passed to
            ``make_features_dic``. Defaults to ``'train.feature.txt'``.
        weights_fname: Path of the ``.npy`` file holding the trained
            parameters. Defaults to ``'results/52_result.npy'``.

    Returns:
        The accuracy returned by ``valid`` for the data in ``fname``.
    """
    features_dic = make_features_dic(features_fname)
    queue_x, queue_y = make_queue(fname, features_dic)

    # The saved array is split by row: everything but the last row is used as
    # the weight matrix, and the last row as the bias vector.
    w_and_b = np.load(weights_fname)
    queue_w, vector_b = w_and_b[:-1], w_and_b[-1]

    return valid(queue_x, queue_y, queue_w, vector_b)

In [6]:
import numpy as np

# Score the saved model on the training split and on the evaluation split.
train_accuracy = get_accuracy('train.txt')
test_accuracy = get_accuracy('test.txt')

# Report both accuracies, one per line
# (labels read "training-data accuracy" / "evaluation-data accuracy").
report_lines = [
    '学習データの正解率: {0}'.format(train_accuracy),
    '評価データの正解率: {0}'.format(test_accuracy),
]
print('\n'.join(report_lines))

学習データの正解率: 0.9964426137427448
評価データの正解率: 0.9092953523238381
