# Verifications: NLTK

- Purpose: To perform sentiment analysis of the movie review corpus using NLTK
- Keywords: Sentiment Analysis, Classification, Naive Bayes, Logistic Regression

In [1]:
from nltk.corpus import movie_reviews
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy
import collections


## 特徴量抽出

In [2]:
# Load the movie review corpus from NLTK and display its class categories
movie_reviews.categories()

['neg', 'pos']

In [3]:
# As an example, display the first ten file ids that belong to the negative category
movie_reviews.fileids(categories=['neg'])[0:10]

['neg/cv000_29416.txt',
 'neg/cv001_19502.txt',
 'neg/cv002_17424.txt',
 'neg/cv003_12683.txt',
 'neg/cv004_12641.txt',
 'neg/cv005_29357.txt',
 'neg/cv006_17022.txt',
 'neg/cv007_4992.txt',
 'neg/cv008_29326.txt',
 'neg/cv009_29417.txt']

NLTK分類器の訓練データのフォーマットは、[(辞書,ラベル),(辞書,ラベル),..]。例えば<br>
train_feats = [<br>
...     ({'a': 1, 'b': 1, 'c': 1}, 'y'),<br>
...     ({'a': 5, 'b': 5, 'c': 5}, 'x'),<br>
...     ({'a': 0.9, 'b': 0.9, 'c': 0.9}, 'y'),<br>
...]<br>
<br>
train_feats = [<br>
...               ({"a": 5, "b": 2, "c": 1}, "ham"),<br>
...               ({"a": 0, "b": 3, "c": 4}, "spam"),<br>
...               ({"a": 5, "b": 1, "c": 1}, "ham"),<br>
...               ({"a": 1, "b": 4, "c": 3}, "spam")]<br>
今回の訓練データもデータフォーマットを合わせて作る必要がある。
例えば、<br>
train_feats = [<br>
...               ({"plot": True, "two": True, "teen": True, ...}, "neg"),<br>
...               ({"happy": True, "nice": True, "good": True}, "pos")]<br>
といった具合に。

In [4]:
def bag_of_words(words):
    """Convert an iterable of words into a presence-feature dict.

    Every distinct word becomes a key mapped to ``True``; repeated words
    collapse into a single key. Keys appear in first-occurrence order
    (dicts preserve insertion order on Python 3.7+).

    Args:
        words: Iterable of hashable tokens, e.g. a list of strings.

    Returns:
        A dict mapping each distinct word to ``True``.

    >>> bag_of_words(['the', 'quick', 'brown', 'fox'])
    {'the': True, 'quick': True, 'brown': True, 'fox': True}
    """
    # dict.fromkeys avoids building an intermediate list of (word, True)
    # tuples; sharing the single True value across keys is safe because
    # it is immutable.
    return dict.fromkeys(words, True)

In [5]:
def label_feats_from_corpus(corp, feature_detector=bag_of_words):
    """Group per-file feature sets of a categorized corpus by label.

    For every category reported by ``corp.categories()`` and every file id
    in that category, the file's words (``corp.words(fileids=[fileid])``,
    e.g. ``['plot', ':', 'two', 'teen', 'couples', ...]``) are passed to
    ``feature_detector`` and the result is appended to that label's list.

    Args:
        corp: Corpus object exposing ``categories()``, ``fileids(categories=...)``
            and ``words(fileids=...)``.
        feature_detector: Callable turning a word sequence into a feature
            set; defaults to ``bag_of_words``.

    Returns:
        A ``collections.defaultdict(list)`` mapping label -> list of
        feature sets, one entry per file.
    """
    feats_by_label = collections.defaultdict(list)

    # Lazily walk (label, file id) pairs; file ids for a label are only
    # requested once that label is reached.
    labelled_files = (
        (category, doc_id)
        for category in corp.categories()
        for doc_id in corp.fileids(categories=[category])
    )

    for category, doc_id in labelled_files:
        doc_words = corp.words(fileids=[doc_id])
        feats_by_label[category].append(feature_detector(doc_words))

    return feats_by_label


In [6]:
# Build the label -> list-of-feature-dicts mapping for the movie_reviews corpus
# using the default bag_of_words feature detector.
lfeats = label_feats_from_corpus(movie_reviews)
# lfeats.keys()  => dict_keys(['pos', 'neg'])
# len(lfeats.get('neg')) => 1000
# len(lfeats.get('pos')) => 1000

In [7]:
def split_label_feats(lfeats, split=0.75):
    """Split per-label feature sets into training and test examples.

    For each label, the first ``int(len(feats) * split)`` feature sets go
    to the training list and the remainder to the test list. Every example
    is emitted as a ``(feature_set, label)`` tuple.

    Args:
        lfeats: Mapping of label -> sequence of feature sets.
        split: Fraction of each label's feature sets used for training.

    Returns:
        A ``(train_feats, test_feats)`` tuple of lists of
        ``(feature_set, label)`` pairs.
    """
    training, held_out = [], []

    for tag, samples in lfeats.items():
        boundary = int(len(samples) * split)
        head, tail = samples[:boundary], samples[boundary:]
        training += [(sample, tag) for sample in head]
        held_out += [(sample, tag) for sample in tail]

    return training, held_out

In [8]:
# Split each label's feature sets into training and test examples
# using the default split ratio of 0.75.
train_feats, test_feats = split_label_feats(lfeats)
# len(train_feats) => 1500
# len(test_feats) => 500

In [9]:
# The feature set is a dict, so use items() to display its contents
train_feats[0][0].items()

dict_items([('plot', True), (':', True), ('two', True), ('teen', True), ('couples', True), ('go', True), ('to', True), ('a', True), ('church', True), ('party', True), (',', True), ('drink', True), ('and', True), ('then', True), ('drive', True), ('.', True), ('they', True), ('get', True), ('into', True), ('an', True), ('accident', True), ('one', True), ('of', True), ('the', True), ('guys', True), ('dies', True), ('but', True), ('his', True), ('girlfriend', True), ('continues', True), ('see', True), ('him', True), ('in', True), ('her', True), ('life', True), ('has', True), ('nightmares', True), ('what', True), ("'", True), ('s', True), ('deal', True), ('?', True), ('watch', True), ('movie', True), ('"', True), ('sorta', True), ('find', True), ('out', True), ('critique', True), ('mind', True), ('-', True), ('fuck', True), ('for', True), ('generation', True), ('that', True), ('touches', True), ('on', True), ('very', True), ('cool', True), ('idea', True), ('presents', True), ('it', True), (

## ナイーブベイズ型訓練モデルの生成と精度評価

In [10]:
# Train a Naive Bayes classifier on the (feature dict, label) training pairs
nb_classifier = NaiveBayesClassifier.train(train_feats)

In [11]:
# Evaluate the Naive Bayes classifier's accuracy on the held-out test pairs
accuracy(nb_classifier, test_feats)

0.728

## ロジスティック回帰型の訓練モデル生成と精度評価

In [12]:
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.linear_model import LogisticRegression
# Wrap scikit-learn's LogisticRegression (default settings) in NLTK's
# SklearnClassifier and train it on the same training pairs.
sk_classifier = SklearnClassifier(LogisticRegression()).train(train_feats)

In [13]:
# Evaluate the logistic regression classifier's accuracy on the held-out test pairs
accuracy(sk_classifier, test_feats)

0.892

## テスト

In [14]:
# Classify a hand-written word list phrased as a negative review
negfeat = bag_of_words(['the', 'plot', 'was', 'ludicrous'])
sk_classifier.classify(negfeat)

'neg'

In [15]:
# Classify a hand-written word list phrased as a positive review
posfeat = bag_of_words(['that', 'was', 'a', 'wonderful','moment'])
sk_classifier.classify(posfeat)

'pos'