In [1]:
!python -V

Python 3.6.0 :: Continuum Analytics, Inc.


In [2]:
import gensim
import numpy as np

import sklearn
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict, Counter

In [3]:
print(gensim.__version__, np.__version__, sklearn.__version__)

3.7.3 1.16.4 0.20.2


In [4]:
# サンプルデータで sklearn にある The 20 newsgroups text dataset を利用する
newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))

# カテゴリーをみる
print(list(newsgroups_train.target_names))

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [5]:
# 本文の例
print(newsgroups_train.data[0])

I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.


In [6]:
# 学習用として1000件を用いる
raw_documents = newsgroups_train.data[0:1000]

In [7]:
# Word2Vec にて学習し，単語の表現ベクトルを得る
model = gensim.models.Word2Vec([doc.split() for doc in raw_documents], size=300, window=5)
w2v = dict(zip(model.wv.index2word, model.wv.syn0))

  app.launch_new_instance()


In [8]:
class TfidfEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.word2weight = None
        self.dim = len(next(iter(w2v.values())))

    def fit(self, X, y=None):
        tfidf = TfidfVectorizer(analyzer=lambda x: x)
        tfidf.fit(X)
        max_idf = max(tfidf.idf_)
        self.word2weight = defaultdict(
            lambda: max_idf,
            [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])

        return self

    def transform(self, X):
        return np.array([
                np.mean([self.word2vec[w] * self.word2weight[w]
                         for w in words if w in self.word2vec] or
                        [np.zeros(self.dim)], axis=0)
                for words in X
            ])

In [9]:
# 上記のw2vを用いて，文章の加重平均ベクトルを学習させる

vectorizer = TfidfEmbeddingVectorizer(w2v)
vectorizer.fit(raw_documents)

<__main__.TfidfEmbeddingVectorizer at 0x120f194e0>

In [10]:
# あるセンテンスとあるセンテンスの類似度を測定する

sentence1 = "I like a sushi"
vec1 = vectorizer.transform([sentence1])[0]

sentence2 = "I like a apple"
vec2 = vectorizer.transform([sentence2])[0]

# cosine類似度で測る
print(np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2)))

0.99987483
