# 感情分析

## 映画レビューデータセットを取得する

In [56]:
import tarfile

In [57]:
import pyprind
import pandas as pd
import os

In [58]:
basepath = "aclImdb"

In [59]:
labels = {"pos":1, "neg":0}

In [60]:
pbar = pyprind.ProgBar(50000)

In [61]:
df = pd.DataFrame()

In [62]:
for s in ("test", "train"):
    for l in ("pos", "neg"):
        path = os.path.join(basepath, s, l)
        for file in os.listdir(path):
            with open(os.path.join(path, file), "r", encoding="utf-8") as infile:
                txt = infile.read()
            df = df.append([[txt, labels[l]]], ignore_index=True)
            pbar.update()

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:55


In [63]:
df.columns = ["review", "sentiment"]

In [64]:
df.head()

Unnamed: 0,review,sentiment
0,"With the obvious exception of Fools & Horses, ...",1
1,Progeny is about a husband and wife who experi...,1
2,This is truly a funny movie. His dance scene d...,1
3,As a long-time fan of Studio Ghibli and especi...,1
4,I went into this film thinking I wasn't going ...,1


In [65]:
import numpy as np
np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))
df.to_csv("movie_data.csv", index = False, encoding = "utf-8")

In [66]:
df = pd.read_csv("movie_data.csv")
df.head(3)

Unnamed: 0,review,sentiment
0,This documentary about the life and comedy of ...,1
1,"Produced by International Playhouse Pictures, ...",0
2,This can't be Mandy Schaffer's last film. Some...,0


In [12]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

In [13]:
count = CountVectorizer()
docs = np.array([
    "The sun is shining",
    "The weather is sweet",
    "The sun is shining, the weather is sweet, and one and one is two"])
bag = count.fit_transform(docs)

In [14]:
print(count.vocabulary_)

{'the': 6, 'sun': 4, 'is': 1, 'shining': 3, 'weather': 8, 'sweet': 5, 'and': 0, 'one': 2, 'two': 7}


In [15]:
bag.toarray()

array([[0, 1, 0, 1, 1, 0, 1, 0, 0],
       [0, 1, 0, 0, 0, 1, 1, 0, 1],
       [2, 3, 2, 1, 1, 1, 2, 1, 1]], dtype=int64)

## TF-IDFを使って単語の関連性を評価する

In [16]:
from sklearn.feature_extraction.text import TfidfTransformer

In [17]:
tfidf = TfidfTransformer(use_idf=True, norm="l2", smooth_idf=True)
np.set_printoptions(precision = 2)
bag = count.fit_transform(docs)
print(tfidf.fit_transform(bag).toarray())

[[0.   0.43 0.   0.56 0.56 0.   0.43 0.   0.  ]
 [0.   0.43 0.   0.   0.   0.56 0.43 0.   0.56]
 [0.5  0.45 0.5  0.19 0.19 0.19 0.3  0.25 0.19]]


In [18]:
df.loc[0, "review"][-50:]

'dy familiar with him.<br /><br />I rated it a ten.'

In [19]:
import re

In [20]:
#:)のような文字以外のいらない文字を削除
def preprocessor(text):
    text = re.sub("<[^>]*>", "", text)
    emoticons = re.findall("(?::|;|=)(?:-)?(?:\)|\(|D|P)", text)
    text = (re.sub("[\W]+", " ", text.lower()) + "".join(emoticons).replace("-", ""))
    return text

In [21]:
df["review"] = df["review"].apply(preprocessor)

# 文書をトークン化する

In [22]:
def tokenizer(text):
    return text.split()

In [23]:
tokenizer("runners like running and thus they run")

['runners', 'like', 'running', 'and', 'thus', 'they', 'run']

# 単語を原型に変換

In [24]:
from nltk.stem.porter import PorterStemmer

In [25]:
porter = PorterStemmer()

In [26]:
def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

In [27]:
tokenizer_porter("runners like running and thus they run")

['runner', 'like', 'run', 'and', 'thu', 'they', 'run']

# ストップワードを削除

In [28]:
import nltk

In [29]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/hiroshi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [30]:
from nltk.corpus import stopwords

In [31]:
stop = stopwords.words("english")
[w for w in tokenizer_porter("a runner likes running and runs a lot")[-10:] if w not in stop]

['runner', 'like', 'run', 'run', 'lot']

## 文書を分類するロジスティック回帰モデルのトレーニング

In [32]:
X_train = df.loc[:25000, "review"].values
y_train = df.loc[:25000, "sentiment"].values
X_test = df.loc[25000:, "review"].values
y_test = df.loc[25000:, "sentiment"].values

In [33]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

In [34]:
tfidf = TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None)

In [36]:
param_grid = [{"vect__ngram_range": [(1,1)],
               "vect__stop_words": [stop, None],
               "vect__tokenizer": [tokenizer, tokenizer_porter],#単純に単語に分割するか、原型に変換してから分割するか
               "clf__penalty": ["l1", "l2"],
               "clf__C": [1.0, 10.0, 100.0]},
              {"vect__ngram_range": [(1,1)],
               "vect__stop_words": [stop, None],
               "vect__tokenizer": [tokenizer, tokenizer_porter],
               "vect__use_idf": [False],
               "vect__norm": [None],
               "clf__penalty": ["l1", "l2"],
               "clf__C": [1.0, 10.0, 100.0]}]

In [37]:
lr_tfidf = Pipeline([("vect", tfidf),
                     ("clf", LogisticRegression(random_state = 0))])

In [40]:
gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid, scoring = "accuracy", cv = 5, verbose = 1, n_jobs = -1)

In [41]:
gs_lr_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed: 20.0min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed: 27.5min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...e, penalty='l2', random_state=0, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid=[{'vect__ngram_range': [(1, 1)], 'vect__stop_words': [['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's...se_idf': [False], 'vect__norm': [None], 'clf__penalty': ['l1', 'l2'], 'clf__C': [1.0, 10.0, 100.0]}],
       pre_dispatch='2*n_jobs', refit=T

In [44]:
print("Best parameter set: %s"%gs_lr_tfidf.best_params_)

Best parameter set: {'clf__C': 10.0, 'clf__penalty': 'l2', 'vect__ngram_range': (1, 1), 'vect__stop_words': None, 'vect__tokenizer': <function tokenizer at 0x7f2044c54d08>}


In [45]:
print("CV Accuracy: %.3f" % gs_lr_tfidf.best_score_)

CV Accuracy: 0.895


In [47]:
clf = gs_lr_tfidf.best_estimator_
print("Test Accuracy: %.3f" % clf.score(X_test, y_test))

Test Accuracy: 0.899


# さらに大規模なデータの処理：オンラインアルゴリズムとアウトオブコア学習

In [85]:
import numpy as np
import re
from nltk.corpus import stopwords

In [86]:
stop = stopwords.words("english")

In [87]:
def tokenizer(text):
    text = re.sub("<[^>]*>", "", text)
    emoticons = re.findall("(?::|;|=)(?:-)?(?:\)|\(|D|P)", text.lower())
    text = (re.sub("[\W]+", " ", text.lower()) + "".join(emoticons).replace("-", ""))
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized

In [88]:
def stream_docs(path):
    with open(path, "r", encoding="utf-8") as csv:
        next(csv)
        for line in csv:
            text, label = line[:-3], int(line[-2])
            yield text, label

In [89]:
next(stream_docs(path="movie_data.csv"))

('"This documentary about the life and comedy of Bill Hicks features bits from Hicks\' ""Revelations"" and other stand-up gigs. It also features interviews with fellow comedians and people in the industry who knew him, as well as reporters and journalists who talk about how his political commentary was raw and brutal.<br /><br />I enjoyed it very much. I had already seen ""Revelations"" but the comedy clips were still refreshing. It\'s a nice balance of comedy and documentary that will explain Hicks\' popularity to non-fans and please those who are already familiar with him.<br /><br />I rated it a ten."',
 1)

In [92]:
def get_minibatch(doc_stream, size):
    docs, y = [], []
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        return None, None
    return docs, y

In [104]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

In [105]:
#アウトアブコア学習のためにHashingVectorizerを使う
vect = HashingVectorizer(decode_error="ignore", n_features=2**21, preprocessor=None, tokenizer=tokenizer)

In [106]:
clf = SGDClassifier(loss = "log", random_state = 1, n_iter = -1)

In [107]:
doc_stream = stream_docs(path = "movie_data.csv")

In [108]:
import pyprind
pbar = pyprind.ProgBar(45)
classes = np.array([0, 1])

In [109]:
for _ in range(45):
    X_train, y_train = get_minibatch(doc_stream, size = 1000)
    if not X_train:
        break
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes = classes)
    pbar.update()



0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:15


In [110]:
X_test, y_test = get_minibatch(doc_stream, size = 5000)
X_test = vect.transform(X_test)
print("Accuracy: %.3f" % clf.score(X_test, y_test))

Accuracy: 0.875


In [112]:
clf = clf.partial_fit(X_test, y_test)



# 学習済みのscikit-learn推定器をシリアライズする

In [113]:
import pickle
import os

In [115]:
dest = os.path.join("movieclassifier", "pkl_objects")

In [117]:
if not os.path.exists(dest):
    os.makedirs(dest)

In [118]:
pickle.dump(stop, open(os.path.join(dest, "stopwords.pkl"), "wb"), protocol=4)

In [119]:
pickle.dump(clf, open(os.path.join(dest, "classifier.pkl"), "wb"), protocol=4)

# 潜在ディリクレ配分によるトピックモデルの構築

In [1]:
import pandas as pd
df = pd.read_csv("movie_data.csv", encoding = "utf-8")
df

Unnamed: 0,review,sentiment
0,This documentary about the life and comedy of ...,1
1,"Produced by International Playhouse Pictures, ...",0
2,This can't be Mandy Schaffer's last film. Some...,0
3,I thought this move was very good. There were ...,1
4,The next time you are at a party and someone a...,0
5,"As the maker of ""This Darkness,"" I admit we ne...",1
6,I went to see Ashura as 2005 Fantasia Festival...,1
7,I was surprised that I liked this movie. But i...,1
8,There's a legion of Mick Garris haters out the...,1
9,There is no greater disservice to do to histor...,1


In [3]:
from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer(stop_words="english", max_df = .1, max_features=5000)
X = count.fit_transform(df["review"].values)

In [4]:
from sklearn.decomposition import LatentDirichletAllocation

In [6]:
#文書から10種類のトピックを推定させる
lda = LatentDirichletAllocation(n_topics = 10, random_state = 123, learning_method="batch", n_jobs=-1)
X_topics = lda.fit_transform(X)

In [7]:
lda.components_.shape

(10, 5000)

In [8]:
lda.components_

array([[9.31387196e+01, 1.08792686e+02, 3.57086206e+02, ...,
        3.13126474e+02, 2.05674342e+02, 3.06089228e+01],
       [2.84500939e+01, 1.08509630e+01, 5.37843836e+01, ...,
        1.00004446e-01, 1.00002647e-01, 8.85449064e+00],
       [1.13746018e+01, 1.65017683e+02, 1.01038176e+02, ...,
        1.00008048e-01, 1.00010216e-01, 7.84940248e+00],
       ...,
       [1.87164979e+00, 1.55578883e+01, 1.04561791e+01, ...,
        1.00009355e-01, 1.00009965e-01, 1.84953170e+02],
       [1.24125241e+01, 2.77687153e+01, 7.01562786e+01, ...,
        1.00011692e-01, 1.00015403e-01, 2.34807663e-01],
       [1.00019939e-01, 3.01301462e+01, 9.56749360e+01, ...,
        1.00014430e-01, 1.00017765e-01, 4.96200507e+00]])