<a href="https://colab.research.google.com/github/irasin/jp_w2v/blob/master/jp_word2vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Google Colabの使い方



*   各ブロックにコードやmarkdown形式のテキストデータが記されている
*   各ブロックの左側の開始ボタンを押すか、クリックしてshift+enterを同時に押すかでブロックを実行できる

詳しく知りたい人は、次のページを参照してください

https://medium.com/deep-learning-turkey/google-colab-free-gpu-tutorial-e113627b9f5d

https://qiita.com/tomo_makes/items/b3c60b10f7b25a0a5935

https://qiita.com/shoji9x9/items/0ff0f6f603df18d631ab

## Chainerのバージョンを更新
最初に実行したとき、下記のようなWARNINGが表示されたら、メニューの`ランタイム/ランタイムを再起動`を選んで、ランタイムを再起動してください。


WARNING: The following packages were previously imported in this runtime:

 　 [typing]

You must restart the runtime in order to use newly installed versions.

In [2]:
!pip install -U chainer

Requirement already up-to-date: chainer in /usr/local/lib/python3.6/dist-packages (6.3.0)


## GPUが使えるように、対応するCupyのバージョンを更新

In [5]:
!pip install 'cupy-cuda100>=6.2.0,<7.0.0'



## Word2Vecの実装

下記のブロックを実行する際に、CuPyのバージョンが合わないというエラーが出たら、上記のCuPyの更新をもう一度実行してください。

In [0]:
import argparse
import collections
import os
import six
import warnings

import numpy as np

import chainer
from chainer.backends import cuda
import chainer.functions as F
import chainer.initializers as I
import chainer.links as L
import chainer.optimizers as O
from chainer import reporter
from chainer import training
from chainer.training import extensions


# Continuous Bag-of-Words (CBOW) model.
class ContinuousBoW(chainer.Chain):
    """Continuous Bag of Words model.

    Every context word is embedded, the embeddings are averaged over the
    context axis, and the averaged vector is handed to ``loss_func``
    together with the center words.

    Args:
        n_vocab: Number of rows of the embedding table.
        n_units: Dimensionality of each embedding vector.
        loss_func: Callable invoked as ``loss_func(h, x)`` to obtain the loss.
    """

    def __init__(self, n_vocab, n_units, loss_func):
        super(ContinuousBoW, self).__init__()

        # Uniform initializer scaled by the inverse of the vector size.
        embed_init = I.Uniform(1. / n_units)
        with self.init_scope():
            # Embedding layer and loss function are both assigned inside
            # the chain's init scope.
            self.embed = L.EmbedID(n_vocab, n_units, initialW=embed_init)
            self.loss_func = loss_func

    def forward(self, x, contexts):
        """Compute the loss for center words ``x`` and their ``contexts``.

        The loss is also reported under the key ``'loss'``.
        """
        embedded = self.embed(contexts)
        n_context = contexts.shape[1]
        # Mean over axis 1, written as a sum scaled by 1 / n_context.
        averaged = F.sum(embedded, axis=1) * (1. / n_context)
        loss = self.loss_func(averaged, x)
        reporter.report({'loss': loss}, self)
        return loss

# Skip-gram model.
class SkipGram(chainer.Chain):
    """Skip-gram model.

    Every context word is embedded and paired with its center word; the
    flattened (embedding, center word) pairs are handed to ``loss_func``.

    Args:
        n_vocab: Number of rows of the embedding table.
        n_units: Dimensionality of each embedding vector.
        loss_func: Callable invoked as ``loss_func(e, x)`` to obtain the loss.
    """

    def __init__(self, n_vocab, n_units, loss_func):
        super(SkipGram, self).__init__()

        # Uniform initializer scaled by the inverse of the vector size.
        embed_init = I.Uniform(1. / n_units)
        with self.init_scope():
            # Embedding layer and loss function are both assigned inside
            # the chain's init scope.
            self.embed = L.EmbedID(n_vocab, n_units, initialW=embed_init)
            self.loss_func = loss_func

    def forward(self, x, contexts):
        """Compute the loss for center words ``x`` and their ``contexts``.

        The loss is also reported under the key ``'loss'``.
        """
        embedded = self.embed(contexts)
        batch_size, n_context, n_units = embedded.shape
        n_pairs = batch_size * n_context

        # Repeat each center word once per context slot, then flatten so
        # that row k of the embeddings lines up with entry k of the centers.
        centers = F.broadcast_to(x[:, None], (batch_size, n_context))
        flat_centers = F.reshape(centers, (n_pairs,))
        flat_embedded = F.reshape(embedded, (n_pairs, n_units))

        loss = self.loss_func(flat_embedded, flat_centers)
        reporter.report({'loss': loss}, self)
        return loss

    
# Softmax cross-entropy loss.
# A linear layer is applied to the input first.
class SoftmaxCrossEntropyLoss(chainer.Chain):
    """Softmax cross entropy loss preceded by a linear transformation.

    Args:
        n_in: Input size of the linear layer.
        n_out: Output size of the linear layer.
    """

    def __init__(self, n_in, n_out):
        super(SoftmaxCrossEntropyLoss, self).__init__()
        # The linear layer is created with ``initialW=0``.
        linear = L.Linear(n_in, n_out, initialW=0)
        with self.init_scope():
            self.out = linear

    def forward(self, x, t):
        """Return the softmax cross entropy of ``self.out(x)`` against ``t``."""
        logits = self.out(x)
        return F.softmax_cross_entropy(logits, t)

    
# Dataset iterator.
# Returns pairs of center words and their context words.
class WindowIterator(chainer.dataset.Iterator):
    """Dataset iterator to create a batch of sequences at different positions.

    This iterator returns a pair of the current words and the context words.

    Args:
        dataset: Sequence of word indices; it is copied into an ``int32``
            NumPy array.
        window (int): Context window parameter, at least 1. For every batch
            a half-width ``w`` is drawn from ``1 .. window - 1`` (``w`` is
            fixed to 1 when ``window`` is 1) and ``w`` words on each side of
            the center word are used as context.
        batch_size (int): Number of center positions per mini-batch. The
            last batch of a sweep may be shorter.
        repeat (bool): If ``False``, ``StopIteration`` is raised once the
            first epoch has been completed.

    Raises:
        ValueError: If ``window`` is smaller than 1, or if ``dataset`` is not
            longer than ``2 * window`` (no position would have a full window
            on both sides).
    """

    def __init__(self, dataset, window, batch_size, repeat=True):
        if window < 1:
            raise ValueError(
                'window must be at least 1, got {}'.format(window))
        if len(dataset) <= window * 2:
            # Without this check ``order`` would be empty (or the
            # permutation call would fail with an unrelated message).
            raise ValueError(
                'dataset must contain more than 2 * window = {} elements, '
                'got {}'.format(window * 2, len(dataset)))

        self.dataset = np.array(dataset, np.int32)
        self.window = window  # size of context window
        self.batch_size = batch_size
        self._repeat = repeat
        # order is the array which is shuffled ``[window, window + 1, ...,
        # len(dataset) - window - 1]``
        self.order = np.random.permutation(
            len(dataset) - window * 2).astype(np.int32)
        self.order += window
        self.current_position = 0
        # Number of completed sweeps over the dataset. In this case, it is
        # incremented if every word is visited at least once after the last
        # increment.
        self.epoch = 0
        # True if the epoch is incremented at the last iteration.
        self.is_new_epoch = False

    def __next__(self):
        """This iterator returns a list representing a mini-batch.

        Each item indicates a different position in the original sequence.

        Returns:
            tuple: ``(center, contexts)`` where ``center`` holds the words at
            the selected positions and ``contexts`` holds, per position, the
            ``2 * w`` surrounding words.

        Raises:
            StopIteration: If ``repeat`` is ``False`` and one epoch is done.
        """
        if not self._repeat and self.epoch > 0:
            raise StopIteration

        i = self.current_position
        i_end = i + self.batch_size
        position = self.order[i:i_end]

        # ``np.random.randint(0)`` raises ValueError, so a window of 1 is
        # handled separately; larger windows sample exactly as before.
        if self.window > 1:
            w = np.random.randint(self.window - 1) + 1
        else:
            w = 1

        # Offsets -w..-1 and 1..w relative to each center position.
        offset = np.concatenate([np.arange(-w, 0), np.arange(1, w + 1)])
        pos = position[:, None] + offset[None, :]
        contexts = self.dataset.take(pos)
        center = self.dataset.take(position)

        if i_end >= len(self.order):
            # The whole order has been consumed: reshuffle and start over.
            np.random.shuffle(self.order)
            self.epoch += 1
            self.is_new_epoch = True
            self.current_position = 0
        else:
            self.is_new_epoch = False
            self.current_position = i_end

        return center, contexts

    @property
    def epoch_detail(self):
        """Completed epochs plus the consumed fraction of the current one."""
        return self.epoch + float(self.current_position) / len(self.order)

    def serialize(self, serializer):
        """Pass the iterator state through ``serializer``.

        ``current_position``, ``epoch`` and ``is_new_epoch`` are replaced by
        the values the serializer returns; ``order`` is only handed over.
        """
        self.current_position = serializer('current_position',
                                           self.current_position)
        self.epoch = serializer('epoch', self.epoch)
        self.is_new_epoch = serializer('is_new_epoch', self.is_new_epoch)
        if self.order is not None:
            serializer('order', self.order)

# Converter that sends a training batch to the given device.
@chainer.dataset.converter()
def convert(batch, device):
    """Send both halves of a ``(center, contexts)`` batch to ``device``.

    Args:
        batch: Pair ``(center, contexts)``.
        device: Object whose ``send`` method is applied to each half.

    Returns:
        tuple: ``(center, contexts)`` as returned by ``device.send``.
    """
    center, contexts = batch
    return device.send(center), device.send(contexts)

## Google Driveの認証
実行すると認証用のリンクが表示されるので、自分のGoogleアカウントで認証してください。
表示された認証コードを下の入力欄に貼り付けて確定してください。

In [0]:
# Mount Google Drive at /content/drive so that files stored on the drive can
# be accessed through the file system.
from google.colab import drive
drive.mount('/content/drive')

## 自分のドライブの中身を確認する

tokenized_data.txtがあるはず



In [0]:
!ls -l drive/My\ Drive/ 

## tokenized_data.txtに対して、Word2Vecの学習を行うための前処理を行う

In [0]:
# Load the pre-tokenized corpus and split it on whitespace.
# The encoding is given explicitly so that reading the Japanese text does
# not depend on the locale of the machine running the notebook.
with open('drive/My Drive/tokenized_data.txt', 'r', encoding='utf-8') as f:
    data = f.read().split()

import collections

# Word <-> index lookup tables.
# The vocabulary is sorted before numbering so that the same corpus always
# yields the same indices (plain ``set`` iteration order may differ between
# interpreter runs).
index2word = dict(enumerate(sorted(set(data))))
word2index = {word: index for index, word in index2word.items()}

# Training data: the corpus encoded as an int32 array of word indices.
data_array = np.array([word2index[word] for word in data], dtype='int32')

In [0]:
# Dataset setup.
train = data_array[:]

# Occurrence count of every word index in the training data.
counts = collections.Counter(train)
# Vocabulary size = largest word index + 1. ``ndarray.max`` is used instead
# of the built-in ``max`` so the reduction does not loop over the array
# element by element in Python; the result is stored as a plain ``int``.
n_vocab = int(train.max()) + 1

vocab = word2index

In [0]:
# Training hyper-parameters, declared as command-line style options.
parser = argparse.ArgumentParser()
# Device used for training.
parser.add_argument('--device', '-d', type=str, default='0',
                    help='Device specifier. Either ChainerX device '
                    'specifier or an integer. If non-negative integer, '
                    'CuPy arrays with specified device id are used. If '
                    'negative integer, NumPy arrays are used')
# Dimensionality of the word vectors.
parser.add_argument('--unit', '-u', default=200, type=int,
                    help='number of units')
# Context window size.
parser.add_argument('--window', '-w', default=10, type=int,
                    help='window size')
# Mini-batch size.
parser.add_argument('--batchsize', '-b', type=int, default=1000,
                    help='learning minibatch size')
# Number of training epochs.
parser.add_argument('--epoch', '-e', default=3, type=int,
                    help='number of epochs to learn')
# Model choice: CBOW or skip-gram.
parser.add_argument('--model', '-m', choices=['skipgram', 'cbow'],
                    default='skipgram',
                    help='model type ("skipgram", "cbow")')
# Number of samples drawn when negative sampling is used.
parser.add_argument('--negative-size', default=5, type=int,
                    help='number of negative samples')
# Output layer type: hierarchical softmax, negative sampling, or plain
# softmax without approximation.
parser.add_argument('--out-type', '-o', choices=['hsm', 'ns', 'original'],
                    default='hsm',
                    help='output model type ("hsm": hierarchical softmax, '
                    '"ns": negative sampling, "original": '
                    'no approximation)')
# Directory where the training log is written.
parser.add_argument('--out', default='result',
                    help='Directory to output the result')
group = parser.add_argument_group('deprecated arguments')
# GPU ID; stored in the same destination as --device.
group.add_argument('--gpu', '-g', dest='device',
                   type=int, nargs='?', const=0,
                   help='GPU ID (negative value indicates CPU)')
# An empty argument list is parsed, so every option takes its default value.
args = parser.parse_args(args=[])

if chainer.get_dtype() == np.float16:
    warnings.warn(
        'This example may cause NaN in FP16 mode.', RuntimeWarning)

# Resolve the device specifier and make it the current device.
device = chainer.get_device(args.device)
device.use()


# Print a summary of the configuration and the data.
print('Device: {}'.format(device))
print('# unit: {}'.format(args.unit))
print('Window: {}'.format(args.window))
print('Minibatch-size: {}'.format(args.batchsize))
print('# epoch: {}'.format(args.epoch))
print('Training model: {}'.format(args.model))
print('Output type: {}'.format(args.out_type))
print('')

print('n_vocab: %d' % n_vocab)
print('data length: %d' % len(train))

# Build the loss function selected by --out-type.
if args.out_type == 'hsm':
    # Binary hierarchical softmax over a Huffman tree built from the word
    # counts; its weight array is zero-filled.
    HSM = L.BinaryHierarchicalSoftmax
    tree = HSM.create_huffman_tree(counts)
    loss_func = HSM(args.unit, tree)
    loss_func.W.array[...] = 0   
elif args.out_type == 'ns':
    # Negative sampling; ``cs`` lists the count of each word index in index
    # order. The weight array is zero-filled.
    cs = [counts[w] for w in range(len(counts))]
    loss_func = L.NegativeSampling(args.unit, cs, args.negative_size)
    loss_func.W.array[...] = 0
elif args.out_type == 'original':
    # Linear layer followed by softmax cross entropy.
    loss_func = SoftmaxCrossEntropyLoss(args.unit, n_vocab)
else:
    raise Exception('Unknown output type: {}'.format(args.out_type))

# Model selection.
if args.model == 'skipgram':
    model = SkipGram(n_vocab, args.unit, loss_func)
    
elif args.model == 'cbow':
    model = ContinuousBoW(n_vocab, args.unit, loss_func)
    
else:
    raise Exception('Unknown model type: {}'.format(args.model))

# Move the model to the selected device.
model.to_device(device)

# Optimizer setup.
optimizer = O.Adam()
optimizer.setup(model)

# Training iterator setup.
train_iter = WindowIterator(train, args.window, args.batchsize)


# Updater setup; ``convert`` sends each batch to the device.
updater = training.updaters.StandardUpdater(
    train_iter, optimizer, converter=convert, device=device)

# Trainer setup: run for ``args.epoch`` epochs and write results to
# ``args.out``, with logging, loss printing and a progress bar.
trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)
trainer.extend(extensions.LogReport())
trainer.extend(extensions.PrintReport(
    ['epoch', 'main/loss']))
trainer.extend(extensions.ProgressBar())

# Start training.
trainer.run()



In [0]:
# Save the learned embeddings as text: a header line with the vocabulary
# size and the vector size, followed by one "word v1 v2 ..." line per word.
# The file contains Japanese tokens, so the encoding is fixed to UTF-8
# instead of relying on the locale default.
with open('word2vec.model', 'w', encoding='utf-8') as f:
    f.write('%d %d\n' % (len(index2word), args.unit))
    # Copy the embedding matrix to host memory before iterating over rows.
    w = cuda.to_cpu(model.embed.W.array)
    for i, wi in enumerate(w):
        v = ' '.join(map(str, wi))
        f.write('%s %s\n' % (index2word[i], v))

## word2vec.modelの存在を確認

In [0]:
!ls -l 

## 自分のGoogle Driveに学習済みのword2vec.modelを保存
ここでもう一度認証をする必要がある

In [0]:
# Authenticate the current Colab user, then build a client for version 3 of
# the Drive API.
from google.colab import auth
auth.authenticate_user()
from googleapiclient.discovery import build
drive_service = build('drive', 'v3')

In [15]:
from googleapiclient.http import MediaFileUpload

# Name and MIME type under which the file is created on Drive.
file_metadata = dict(name='word2vec.model', mimeType='text/plain')

# Local file to upload, sent as a resumable upload.
media = MediaFileUpload(
    'word2vec.model', mimetype='text/plain', resumable=True)

# Create the file on Drive, asking only for its ID in the response.
create_request = drive_service.files().create(
    body=file_metadata, media_body=media, fields='id')
created = create_request.execute()
print('File ID: {}'.format(created.get('id')))

File ID: 1kF-VZKvj7J75eStnNOYm3OQ3ncqHJowG


## 学習済みのword2vec.modelを試してみる
下記のコードを実行する代わりに、Google Driveからword2vec.modelをローカルにダウンロードして、test.pyを実行しても構いません。

UserWarningは無視して大丈夫

In [16]:
import logging
from gensim.models import KeyedVectors

# Configure logging with a "time : level : message" format at INFO level,
# and also set the root logger's level to INFO directly.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
logging.root.setLevel(level=logging.INFO)


# Load the saved vectors from the text (non-binary) word2vec file.
# NOTE: this rebinds the global name ``model``, which the training cell used
# for the Chainer network.
model = KeyedVectors.load_word2vec_format('word2vec.model', binary=False)

2019-08-29 13:01:05,265 : INFO : loading projection weights from word2vec.model
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
2019-08-29 13:02:03,578 : INFO : loaded (361995, 200) matrix from word2vec.model


In [17]:
# Top 10 entries returned by most_similar for the word "トヨタ" (Toyota).
model.most_similar(positive=["トヨタ"], topn=10)

2019-08-29 13:02:03,584 : INFO : precomputing L2-norms of word weight vectors
  if np.issubdtype(vec.dtype, np.int):


[('レクサス', 0.6787608861923218),
 ('日産', 0.6739553213119507),
 ('マツダ', 0.6163288354873657),
 ('フォルクスワーゲン', 0.6144090294837952),
 ('カワサキ', 0.5994932651519775),
 ('スバル', 0.592795729637146),
 ('ダットサン', 0.5844794511795044),
 ('いすゞ', 0.5839172601699829),
 ('ホンダ', 0.5827198624610901),
 ('ブランド', 0.5732850432395935)]

In [18]:
# Top 10 entries returned by most_similar for the word "声優" (voice actor).
model.most_similar(positive=["声優"], topn=10)

  if np.issubdtype(vec.dtype, np.int):


[('ナレーター', 0.6846827268600464),
 ('俳優', 0.684380292892456),
 ('脚本家', 0.668969452381134),
 ('タレント', 0.6612478494644165),
 ('女優', 0.6289736032485962),
 ('歌手', 0.622085452079773),
 ('。声優', 0.6102961897850037),
 ('演出家', 0.6075438261032104),
 ('子役', 0.5924990177154541),
 ('お笑い芸人', 0.5792001485824585)]

In [19]:
# Top 10 entries returned by most_similar for the word "Python".
model.most_similar(positive=["Python"], topn=10)

  if np.issubdtype(vec.dtype, np.int):


[('C言語', 0.7438325881958008),
 ('C++', 0.6913949251174927),
 ('ライブラリ', 0.6636326313018799),
 ('プログラミング言語', 0.6612603068351746),
 ('実装', 0.6601348519325256),
 ('RPC', 0.6445109248161316),
 ('Java', 0.6394205093383789),
 ('MATLAB', 0.6346997022628784),
 ('NumPy', 0.6321592330932617),
 ('Mops', 0.6255452036857605)]