In [8]:
def make_features_dic(fname):
    """Build a feature-name -> column-index lookup from a text file.

    Each line of ``fname`` (with its trailing newline removed) becomes a key
    whose value is the zero-based line number. When the same line text occurs
    more than once, the index of its last occurrence is kept.

    Args:
        fname: Path of the feature list file, one feature per line.

    Returns:
        dict mapping line text to line index.
    """
    features = {}
    with open(fname) as file:
        for index, line in enumerate(file):
            features[line.rstrip('\n')] = index
    return features

In [9]:
def make_queue(fname, features_dic, stop_words=None, stemmer=None):
    """Turn a ``category<TAB>title`` file into feature and label matrices.

    Every line is split on tabs; field 0 is the category and field 1 is the
    title (any further fields are ignored). The title is normalised with the
    following steps, in this order:

    1. lower-cased,
    2. runs of ASCII digits removed,
    3. ``'-'`` replaced by a space,
    4. split on whitespace,
    5. stop words dropped,
    6. each remaining word stemmed,
    7. tokens for which ``str.islower()`` is false dropped (i.e. tokens that
       contain no letters, such as bare punctuation).

    Args:
        fname: Path of the tab-separated input file.
        features_dic: Mapping from (stemmed) word to column index in the
            feature matrix.
        stop_words: Optional iterable of words to discard. Defaults to NLTK's
            English stop-word list.
        stemmer: Optional object exposing ``stemWord(word)``. Defaults to the
            English Snowball stemmer from ``snowballstemmer``.

    Returns:
        ``(queue_x, queue_y)`` where ``queue_x`` has shape
        ``(n_lines, len(features_dic))`` with a 1 for every known word present
        in the title, and ``queue_y`` has shape ``(n_lines, 4)`` holding the
        one-hot category in the order ``[b, t, e, m]``. A line whose category
        is none of these keeps an all-zero label row.

    Raises:
        IndexError: If a line does not contain a tab-separated title field.
    """
    import re

    # Loop-invariant resources are prepared once instead of once per line.
    if stop_words is None:
        from nltk.corpus import stopwords
        stop_words = stopwords.words('english')
    if stemmer is None:
        import snowballstemmer
        stemmer = snowballstemmer.stemmer('english')

    stop_words = frozenset(stop_words)  # constant-time membership tests
    digits = re.compile(r'[0-9]+')
    label_columns = {'b': 0, 't': 1, 'e': 2, 'm': 3}  # one-hot order: [b, t, e, m]

    with open(fname) as file:
        lines = file.readlines()

    queue_x = np.zeros([len(lines), len(features_dic)])
    queue_y = np.zeros([len(lines), len(label_columns)])

    for i, line in enumerate(lines):
        fields = line.rstrip('\n').split('\t')
        # The category is the whole first field (the original took the first
        # character of the line via str.strip, which only worked by accident).
        category = fields[0]
        title = fields[1]

        # --- features (queue_x) ---
        # Unify case, remove digits, and treat hyphens as word separators.
        title = digits.sub('', title.lower()).replace('-', ' ')

        # Remove stop words, then stem what is left.
        words = [word for word in title.split() if word not in stop_words]
        words = [stemmer.stemWord(word) for word in words]

        # Remove symbol-only tokens: islower() is false when a token has no
        # cased characters at all.
        words = [word for word in words if word.islower()]

        for word in words:
            if word in features_dic:
                queue_x[i, features_dic[word]] = 1

        # --- label (queue_y) ---
        if category in label_columns:
            queue_y[i, label_columns[category]] = 1

    return queue_x, queue_y

In [10]:
def softmax(queue):  # softmax function
    """Row-wise softmax of a 2-D array of scores.

    The maximum along axis 1 is subtracted before exponentiating. This leaves
    the result mathematically unchanged but keeps ``np.exp`` from overflowing
    to ``inf`` (and the quotient from becoming ``nan``) for large scores.

    Args:
        queue: Array-like of shape ``(n_samples, n_classes)``.

    Returns:
        Array of the same shape in which every slice along axis 1 sums to 1.
    """
    queue = np.asarray(queue)
    shifted = queue - np.max(queue, axis=1, keepdims=True)
    exp = np.exp(shifted)  # computed once and reused for the normaliser
    return exp / np.sum(exp, axis=1, keepdims=True)

In [11]:
def calculate_gradient(queue_x, queue_y, queue_w, vector_b):
    """Gradient of the summed cross-entropy loss for softmax regression.

    The predicted probabilities are ``phi = softmax(queue_x @ queue_w +
    vector_b)``. The gradients returned are those of
    ``-sum(queue_y * log(phi))`` with respect to the weights and the bias.
    The loss value itself is not needed for the update and is therefore not
    computed (the original evaluated ``np.log(phi)`` on every call and then
    discarded the result).

    Args:
        queue_x: Feature matrix, one row per sample.
        queue_y: One-hot label matrix, one row per sample.
        queue_w: Weight matrix.
        vector_b: Bias row vector, broadcast over the samples.

    Returns:
        ``(dw, db)``: ``dw`` has the shape of ``queue_w``; ``db`` has shape
        ``(1, n_classes)``.
    """
    phi = softmax(np.dot(queue_x, queue_w) + vector_b)
    error = phi - queue_y  # prediction error per sample and class
    dw = np.dot(queue_x.T, error)
    db = np.sum(error, axis=0, keepdims=True)  # keepdims preserves the (1, n_classes) shape

    return dw, db

In [12]:
def train(queue_x, queue_y, queue_w, vector_b, lr):
    """Perform one full-batch gradient-descent step.

    The gradients come from ``calculate_gradient``. The updated parameters are
    returned as new arrays; ``queue_w`` and ``vector_b`` passed in are left
    untouched, so a caller may safely keep references to earlier parameter
    values (e.g. the best ones seen so far).

    Args:
        queue_x: Feature matrix of the training samples.
        queue_y: One-hot label matrix of the training samples.
        queue_w: Current weight matrix.
        vector_b: Current bias row vector.
        lr: Learning rate (step size).

    Returns:
        ``(queue_w, vector_b)`` after the update.
    """
    # Fixed: the original called the misspelt name `caluculate_gradient`.
    dw, db = calculate_gradient(queue_x, queue_y, queue_w, vector_b)

    # Out-of-place update: `-=` would modify the caller's arrays in place.
    return queue_w - lr * dw, vector_b - lr * db

In [13]:
def valid(queue_x, queue_y, queue_w, vector_b):
    """Return the fraction of samples whose predicted one-hot row equals the label row.

    The predicted class of a sample is the arg-max of
    ``softmax(queue_x @ queue_w + vector_b)`` along axis 1. A sample counts as
    correct only when its one-hot prediction matches every entry of the
    corresponding row of ``queue_y``.

    Args:
        queue_x: Feature matrix, one row per sample.
        queue_y: Label matrix, one row per sample.
        queue_w: Weight matrix.
        vector_b: Bias row vector.

    Returns:
        Number of exactly matching rows divided by ``len(queue_y)``.
    """
    probabilities = softmax(np.dot(queue_x, queue_w) + vector_b)
    predicted = np.argmax(probabilities, axis=1)
    n_samples = len(queue_y)

    # One-hot encode all predictions at once.
    one_hot = np.zeros(queue_y.shape)
    one_hot[np.arange(n_samples), predicted] = 1

    # A row is a hit when every column agrees with the label row.
    hits = int(np.count_nonzero(np.all(queue_y == one_hot, axis=1)))

    return hits / n_samples

In [14]:
import numpy as np

# Build the vocabulary lookup and the train / validation matrices.
features_dic = make_features_dic('train.feature.txt')
train_queue_x, train_queue_y = make_queue('train.txt', features_dic)
valid_queue_x, valid_queue_y = make_queue('valid.txt', features_dic)
lr = 1e-3  # learning rate: the value that gave the highest accuracy in validation
queue_w = np.ones([len(features_dic), 4])
vector_b = np.ones([1, 4])
accuracy, best_accuracy, pre_best_accuracy = 0, 0, 0
count = 1

# Start the "best" snapshot from the initial parameters so that the names
# below are always defined, even if the accuracy never rises above 0.
best_queue_w = queue_w.copy()
best_vector_b = vector_b.copy()
best_count = 0

while True:
    queue_w, vector_b = train(train_queue_x, train_queue_y, queue_w, vector_b, lr)
    accuracy = valid(valid_queue_x, valid_queue_y, queue_w, vector_b)
    if accuracy > best_accuracy:
        # Copy the arrays: a plain assignment would only alias them, and an
        # in-place update in a later step would overwrite the "best" values.
        best_queue_w = queue_w.copy()
        best_vector_b = vector_b.copy()
        best_accuracy = accuracy
        best_count = count
    if count % 1000 == 0:
        print('[{0}] {1}'.format(count, accuracy))
        # Stop once the best accuracy has not improved during the last 1000 steps.
        if best_accuracy == pre_best_accuracy:
            break
        pre_best_accuracy = best_accuracy
    count += 1
print('')

print('RESULT')
print('[{0}] {1}'.format(best_count, best_accuracy))
# Saved as one array: the weight rows followed by the single bias row.
np.save('results/52_result', np.concatenate([best_queue_w, best_vector_b]))

[1000] 0.9137931034482759
[2000] 0.9160419790104948
[3000] 0.9145427286356822

RESULT
[1166] 0.9167916041979011
