## 準備

In [3]:
import numpy as np
import re
from nltk.corpus import stopwords

stop = stopwords.words('english')
def tokenizer(text):
    text = re.sub('<[^>]*>' , '',text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    text = re.sub('[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', '')
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized

def stream_docs(path):
    with open(path, 'r', encoding='utf-8') as csv:
        next(csv)
        for line in csv:
            text, label = line[: -3], int(line[-2])
            yield text, label

# 確認
#print(next(stream_docs(path='../../movie_data.csv')))

def get_minibatch(doc_stream, size):
    docs, y = [], []
    try:
         for _ in range(size):
                text, label = next(doc_stream)
                docs.append(text)
                y.append(label)
    except StopIteration:
        return None, None
    return docs, y

# アウトオブメモリで使うため、TfidfVectorizerの代わりにHshingVectorizerを使って文書を特徴量に分割する。
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier
vect = HashingVectorizer(decode_error='ignore', n_features=2**21, preprocessor=None,tokenizer=tokenizer)
clf = SGDClassifier(loss='log', random_state=1, n_iter=1)
doc_stream = stream_docs(path='../../movie_data.csv')

import pyprind
pbar = pyprind.ProgBar(45)
classes = np.array([0,1])
for _ in range(45):
    X_train, y_train = get_minibatch(doc_stream, size=1000)
    if not X_train:
        break
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)
    pbar.update()

0%                          100%
[##############################] | ETA: 00:00:00
Total time elapsed: 00:00:41


In [4]:
X_test, y_test = get_minibatch(doc_stream, 5000)
X_test = vect.transform(X_test)
print('Accuracy: %.3f' % clf.score(X_test, y_test))
# テストで使ったデータを使ってモデルを更新
clf = clf.partial_fit(X_test, y_test)


Accuracy: 0.867


## 9.1

### シリアライズ(一回やればOK)

In [10]:
import pickle
import os 

dest = os.path.join('movieclassifier', 'pkl_objects')
if not os.path.exists(dest):
    os.makedirs(dest)

pickle.dump(stop, open(os.path.join(dest,'stopwords.pkl'), 'wb'), protocol=4)
pickle.dump(clf, open(os.path.join(dest,'classifier.pkl'), 'wb'), protocol=4)

NameError: name 'stop' is not defined

### デシリアライズ確認

In [12]:

import pickle
import re
import os
from movieeclassifier.vectorizer import vect
clf = pickle.load(open(os.path.join('movieclassifier', 'pkl_objects', 'classifier.pkl'), 'rb'))


In [14]:
import numpy as np
label = {0:'negative', 1:'positive'}
example = ['I love this movie']
X = vect.transform(example)
print('Prediction:%s\nProbability: %.2f%%' % (label[clf.predict(X)[0]], np.max(clf.predict_proba(X))*100))
print(clf.predict_proba(X))


Prediction:positive
Probability: 82.52%
[[ 0.17480325  0.82519675]]


## 9.2 sqlite準備

In [1]:
import sqlite3
import os
conn = sqlite3.connect("movieclassifier/reviews.sqlite")
c = conn.cursor()
c.execute("create table review_db ( review TEXT, sentiment INTEGER, date TEXT)")
example1 = "I love this movie"
c.execute("insert into review_db (review, sentiment, date) values (?,?, DATETIME('now'))", (example1, 1))
example2 = "I disliked this movie"
c.execute("insert into review_db (review, sentiment, date) values (?,?, DATETIME('now'))", (example2, 1))
conn.commit()
conn.close()


In [2]:
conn = sqlite3.connect("movieclassifier/reviews.sqlite")
c = conn.cursor()

c.execute("select * from review_db where date between '2016-01-01 00:00:00' and DATETIME('now')")
results = c.fetchall()
conn.close()
print(results)

[('I love this movie', 1, '2016-12-15 00:11:26'), ('I disliked this movie', 1, '2016-12-15 00:11:26')]


## 9.3 Flask