인명 분류기
====

* 음절 길이가 3이고 품사 태그가 NNP인 단일 형태소에 대해 인명 여부를 분류하는 분류기
* word2vec은 주최측에서 제공하는 것을 사용 (위키로부터 추출)
* 주변 문맥에서 자질을 추출하고, SVM으로 분류기를 구현

In [1]:
from __future__ import unicode_literals
from __future__ import print_function

import sys
sys.path.append('../lib')
import word2vec
import sentence

In [2]:
# Load the word2vec dictionary used to build the feature vectors.
# NOTE(review): the '.pkl' extension suggests a pickle file -- confirm, and
# only load it from a trusted source.

w2v_dic = word2vec.load('../data/word2vec.pkl')

In [3]:
# Load the JSON corpora (train/dev).
#
# The files are opened through context managers so the handles are closed
# deterministically instead of being left to the garbage collector.
# io.open with an explicit UTF-8 encoding behaves the same on Python 2 and 3
# and does not depend on the platform's default locale encoding.

import io
import json

with io.open('../data/distribution2016/dev.json', encoding='utf-8') as fin:
    dev_json = json.load(fin)
with io.open('../data/distribution2016/train.json', encoding='utf-8') as fin:
    train_json = json.load(fin)

In [4]:
# Builds the training set (feature vectors and labels) from a corpus.

def feature_label(w2v_dic, sentences, window):
    """
    Extract context-window features and person labels from a corpus.

    Every morpheme whose lemma has length 3 and whose tag is 'NNP' yields one
    sample.  Its feature is the concatenation of the vectors looked up for the
    morphemes at offsets -window..+window around it; offsets that fall outside
    the sentence use the vector returned by word2vec.eos().  Its label is 1
    when sent.is_person() is truthy for the morpheme's id, otherwise 0.

    :param  w2v_dic:    word2vec dictionary
    :param  sentences:  sentences JSON array
    :param  window:     window size (number of morphemes on each side)
    :return:            (features, labels) pair of parallel lists
    """
    features, labels = [], []
    offsets = range(-window, window + 1)
    for sent_obj in sentences:
        sent = sentence.Sentence(sent_obj)
        morps = sent.morps
        num_morps = len(morps)
        for target in morps:
            is_candidate = len(target.lemma()) == 3 and target.tag() == 'NNP'
            if not is_candidate:
                continue
            # The morpheme id is used directly as an index into sent.morps.
            center = target.id()
            context = []
            for offset in offsets:
                pos = center + offset
                if 0 <= pos < num_morps:
                    neighbor = morps[pos]
                    vec = word2vec.get(w2v_dic, neighbor.lemma(), neighbor.tag())
                else:
                    # Outside the sentence boundary: pad with the EOS vector.
                    vec = word2vec.eos(w2v_dic)
                context.extend(vec)
            features.append(context)
            labels.append(int(bool(sent.is_person(center))))
    return features, labels

SVM Classifier 비교
----

In [5]:
# Fix the window size at 2 and compare accuracy across SVM classifier types
# using 10-fold cross-validation on the training corpus.

from sklearn import svm

# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; cross_val_score now lives in sklearn.model_selection.  Fall back to
# the legacy module only for releases that predate model_selection.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score

X_train, y_train = feature_label(w2v_dic, train_json['sentence'], 2)
for idx, clf in enumerate([svm.SVC(), svm.NuSVC(), svm.LinearSVC()]):
    scores = cross_val_score(clf, X_train, y_train, cv=10)
    # Print the mean accuracy first, then the individual fold scores.
    print('(%d) %f' % (idx, scores.mean()))
    print(scores)
    print()

(0) 0.841134
[ 0.75945017  0.87628866  0.84536082  0.88659794  0.85223368  0.83161512
  0.82130584  0.83848797  0.87241379  0.82758621]

(1) 0.849041
[ 0.76975945  0.87628866  0.85223368  0.89003436  0.86941581  0.83848797
  0.83505155  0.84879725  0.87241379  0.83793103]

(2) 0.847328
[ 0.81099656  0.86254296  0.84536082  0.83505155  0.82817869  0.82817869
  0.87972509  0.85910653  0.86896552  0.85517241]



큰 차이는 없지만 NuSVC가 가장 좋다.

윈도우 크기별
----

In [6]:
# Fix the classifier to NuSVC and compare accuracy across window sizes 0..4
# using 10-fold cross-validation on the training corpus.

from sklearn import svm

# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; cross_val_score now lives in sklearn.model_selection.  Fall back to
# the legacy module only for releases that predate model_selection.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score

for window in range(5):
    # Features depend on the window size, so they are rebuilt per iteration.
    X_train, y_train = feature_label(w2v_dic, train_json['sentence'], window)
    scores = cross_val_score(svm.NuSVC(), X_train, y_train, cv=10)
    # Print the mean accuracy first, then the individual fold scores.
    print('(%d) %f' % (window, scores.mean()))
    print(scores)
    print()

(0) 0.815007
[ 0.73539519  0.81099656  0.80756014  0.85223368  0.82474227  0.79381443
  0.82130584  0.83505155  0.86551724  0.80344828]

(1) 0.843538
[ 0.75601375  0.8556701   0.83505155  0.89347079  0.87285223  0.83848797
  0.83161512  0.8556701   0.86551724  0.83103448]

(2) 0.849041
[ 0.76975945  0.87628866  0.85223368  0.89003436  0.86941581  0.83848797
  0.83505155  0.84879725  0.87241379  0.83793103]

(3) 0.844234
[ 0.76632302  0.87285223  0.84879725  0.84536082  0.86941581  0.83848797
  0.83161512  0.84879725  0.87586207  0.84482759]

(4) 0.844926
[ 0.7628866   0.86941581  0.85910653  0.8419244   0.87285223  0.84879725
  0.81443299  0.84536082  0.87931034  0.85517241]



역시 큰 차이는 없지만 윈도우 크기 2가 가장 적당하다.

평가
====

In [7]:
# Train on the whole train corpus and evaluate on the dev corpus.

from sklearn import svm
from sklearn.metrics import accuracy_score

# Build both data sets with window size 2.
X_train, y_train = feature_label(w2v_dic, train_json['sentence'], 2)
X_dev, y_dev = feature_label(w2v_dic, dev_json['sentence'], 2)

# Fit NuSVC on the training features, then predict labels for the dev set.
clf = svm.NuSVC()
clf.fit(X_train, y_train)
y_predict = clf.predict(X_dev)

# Left as a bare expression so the notebook displays the accuracy.
accuracy_score(y_true=y_dev, y_pred=y_predict)

0.89635316698656431