In [2]:
from janome.tokenizer import Tokenizer

In [3]:
# Create one tokenizer instance; the following cells reuse it.
t = Tokenizer()

# Sample Japanese sentence to tokenize.
s = 'すもももももももものうち'

In [4]:
# Check what kind of object tokenize() returns.
# NOTE(review): the recorded output shows a list; other janome versions may
# return a different iterable type - confirm against the installed version.
print(type(t.tokenize(s)))


<class 'list'>


In [5]:
# Inspect the type of a single token.
# tokenize() returns a list in older janome releases but a generator in newer
# ones (0.4 and later), and a generator cannot be indexed with [0].
# next(iter(...)) fetches the first token in both cases.
print(type(next(iter(t.tokenize(s)))))


<class 'janome.tokenizer.Token'>


In [6]:
# Print every token of the sample sentence, one per line. In the recorded
# output each line is the surface form, a tab, then comma-separated fields.
for token in t.tokenize(s):
    print(token)

すもも	名詞,一般,*,*,*,*,すもも,スモモ,スモモ
も	助詞,係助詞,*,*,*,*,も,モ,モ
もも	名詞,一般,*,*,*,*,もも,モモ,モモ
も	助詞,係助詞,*,*,*,*,も,モ,モ
もも	名詞,一般,*,*,*,*,もも,モモ,モモ
の	助詞,連体化,*,*,*,*,の,ノ,ノ
うち	名詞,非自立,副詞可能,*,*,*,うち,ウチ,ウチ


In [7]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd


In [8]:
# Fix the NumPy seed so the synthetic data below is identical on every run.
np.random.seed(42)

# class 0: 100 points with x1 and x2 drawn from a standard normal
df_a = pd.DataFrame({'x1': np.random.randn(100),
                     'x2': np.random.randn(100),
                     'y' : 0})
# class 1: 100 points with the same noise, shifted by +5 in x1 and +3 in x2
df_b = pd.DataFrame({'x1': np.random.randn(100) + 5,
                     'x2': np.random.randn(100) + 3,
                     'y' : 1})
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
# pd.concat stacks the two frames the same way (original row labels are kept,
# exactly as append did).
df = pd.concat([df_a, df_b])

# Split into training data and test data (20% of the rows go to the test set)
X_train, X_test, y_train, y_test = train_test_split(
    df[['x1', 'x2']],
    df['y'],
    test_size=0.2,
)


In [9]:
# Steps 1 and 2: create the model instance and train it.
# fit() returns the fitted estimator, so construction and training are chained.
clf = SVC().fit(X_train, y_train)

# Step 3: predict the labels of the test split
y_pred = clf.predict(X_test)




In [10]:
# Display the class labels predicted for X_test.
y_pred

array([0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1])

In [11]:
import gensim
from gensim import corpora


In [12]:
# Nine short English titles used as the toy corpus for the topic model below.
documents = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

In [13]:
# Common English function words to drop before building the dictionary.
stop_words = {*'for a of the and to in'.split()}

In [14]:
# Lower-case each document, split it on whitespace and drop the stop words.
texts = [
    [term for term in doc.lower().split() if term not in stop_words]
    for doc in documents
]

In [15]:
from pprint import pprint

In [16]:
# Show the tokenized documents after stop-word removal.
pprint(texts)

[['human', 'machine', 'interface', 'lab', 'abc', 'computer', 'applications'],
 ['survey', 'user', 'opinion', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'management', 'system'],
 ['system', 'human', 'system', 'engineering', 'testing', 'eps'],
 ['relation', 'user', 'perceived', 'response', 'time', 'error', 'measurement'],
 ['generation', 'random', 'binary', 'unordered', 'trees'],
 ['intersection', 'graph', 'paths', 'trees'],
 ['graph', 'minors', 'iv', 'widths', 'trees', 'well', 'quasi', 'ordering'],
 ['graph', 'minors', 'survey']]


In [17]:
from collections import defaultdict

# Tally how often every token occurs across the whole corpus.
frequency = defaultdict(int)
for doc_tokens in texts:
    for word in doc_tokens:
        frequency[word] += 1

# Keep only the tokens that occur more than once in the corpus.
texts = [
    [word for word in doc_tokens if frequency[word] > 1]
    for doc_tokens in texts
]

In [18]:
# Show the documents again now that tokens occurring only once are removed.
pprint(texts)

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]


In [19]:
# Build the dictionary (token <-> integer id mapping) from the filtered tokens.
dictionary = corpora.Dictionary(texts)
# It can be saved to a file
dictionary.save('/tmp/deerwester.dict')
# It can also be saved as a text file
dictionary.save_as_text('/tmp/deerwester.dict.txt')

In [20]:
cat /tmp/deerwester.dict.txt

9
0	computer	2
8	eps	2
10	graph	3
1	human	2
2	interface	2
11	minors	2
3	response	2
4	survey	2
5	system	3
6	time	2
9	trees	3
7	user	3


In [21]:
# Convert every token list into a bag-of-words: a list of (token id, count)
# pairs, using the ids assigned by `dictionary`.
corpus = [dictionary.doc2bow(text) for text in texts]
# The corpus can be saved to a file
corpora.MmCorpus.serialize('/tmp/deerwester.mm', corpus)

In [22]:
# Show the bag-of-words vectors, one list of (token id, count) pairs per document.
pprint(corpus)

[[(0, 1), (1, 1), (2, 1)],
 [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)],
 [(2, 1), (5, 1), (7, 1), (8, 1)],
 [(1, 1), (5, 2), (8, 1)],
 [(3, 1), (6, 1), (7, 1)],
 [(9, 1)],
 [(9, 1), (10, 1)],
 [(9, 1), (10, 1), (11, 1)],
 [(4, 1), (10, 1), (11, 1)]]


In [23]:
# Train an LDA model with 5 topics (num_topics=5).
# LDA training is randomized. random_state pins the seed so the topics, and
# every later result derived from this model, are the same on each run.
lda = gensim.models.ldamodel.LdaModel(corpus=corpus, num_topics=5, id2word=dictionary,
                                      random_state=42)

In [24]:
# Show each topic as a weighted list of its words.
pprint(lda.show_topics())

[(0,
  '0.187*"time" + 0.187*"user" + 0.187*"response" + 0.184*"trees" + '
  '0.032*"system" + 0.032*"graph" + 0.032*"minors" + 0.032*"interface" + '
  '0.032*"computer" + 0.032*"human"'),
 (1,
  '0.177*"user" + 0.177*"system" + 0.097*"time" + 0.097*"survey" + '
  '0.097*"response" + 0.097*"computer" + 0.096*"interface" + 0.096*"eps" + '
  '0.017*"trees" + 0.017*"graph"'),
 (2,
  '0.233*"system" + 0.233*"human" + 0.127*"eps" + 0.127*"computer" + '
  '0.127*"interface" + 0.023*"trees" + 0.022*"minors" + 0.022*"graph" + '
  '0.022*"user" + 0.022*"response"'),
 (3,
  '0.296*"trees" + 0.296*"graph" + 0.161*"minors" + 0.028*"system" + '
  '0.028*"user" + 0.028*"interface" + 0.027*"response" + 0.027*"survey" + '
  '0.027*"eps" + 0.027*"computer"'),
 (4,
  '0.221*"graph" + 0.220*"survey" + 0.220*"minors" + 0.039*"trees" + '
  '0.038*"system" + 0.038*"eps" + 0.038*"user" + 0.038*"interface" + '
  '0.037*"response" + 0.037*"human"')]


In [25]:
# Define the sentence to classify
test_documents = [
    "Computer themselves and software yet to be developed will revolutionize the way we learn",
]

# Lower-case each sentence and split it into words
test_texts = [sentence.lower().split() for sentence in test_documents]

# Build the corpus with the existing dictionary, so the test sentence uses the
# same token ids as the training corpus.
test_corpus = [dictionary.doc2bow(text) for text in test_texts]


In [27]:
# Show the bag-of-words vector of the test sentence.
pprint(test_corpus)

[[(0, 1)]]


In [28]:
# Apply the trained model to each test document and print the result, which
# in the recorded output is a list of (topic id, probability) pairs.
for topics_per_document in lda[test_corpus]:
    pprint(topics_per_document)


[(0, 0.100038156),
 (1, 0.1030303),
 (2, 0.596856),
 (3, 0.10003165),
 (4, 0.10004392)]
