<a href="https://colab.research.google.com/github/irasin/jp_w2v/blob/master/jp_word2vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install -U chainer

Requirement already up-to-date: chainer in /usr/local/lib/python3.6/dist-packages (6.2.0)


In [3]:
!pip install 'cupy-cuda100>=6.2.0,<7.0.0'



In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
!ls -l drive/My\ Drive/ 

total 728676
-rw------- 1 root root  58070450 Aug  6 10:42  wiki50_tokenzied.txt
-rw------- 1 root root 680778353 Aug  5 13:07 'result word2vec.model'
drwx------ 3 root root      4096 Aug  5 09:29  P
-rw------- 1 root root   7132316 Aug  5 06:49  p1780-tang.pdf
drwx------ 2 root root      4096 Aug  5 06:40  p2v_src
drwx------ 2 root root      4096 Jul 22 05:39  data
-rw------- 1 root root     83492 Jul 21 11:35  c2v.ipynb
drwx------ 2 root root      4096 Jul 16 07:36  201708-201807
drwx------ 2 root root      4096 Jul  8 14:06  ra
-rw------- 1 root root      3552 Jun 20 00:54  create_c2v_w2v.py
drwx------ 2 root root      4096 Jun 14 08:42  VISA更新
-rw------- 1 root root     20862 Jun 12 04:55  paper.tex
drwx------ 2 root root      4096 Jun  3 11:51  コンペ2018
drwx------ 2 root root      4096 May 21 01:19  randd_eight_skill_prediction
drwx------ 2 root root      4096 May 16 04:52  Avatar-Net-Pytorch
drwx------ 2 root root      4096 May 16 04:19  有価証券報告書関連論文
drwx------ 2 root root      40

In [0]:
with open('drive/My Drive/wiki50_tokenzied.txt', 'r') as f:
    data = f.read().split()

import collections
import numpy as np

index2word = {}
word2index = {}


idx = 0
for w in set(data):
    word2index[w] = idx
    index2word[idx] = w
    idx += 1

data_array = []

for w in data:
    data_array.append(word2index[w])

data_array = np.array(data_array, dtype='int32')

In [0]:
# Set up the dataset
train = data_array[:]
val = data_array[-100:]


counts = collections.Counter(train)
counts.update(collections.Counter(val))
n_vocab = max(train) + 1

vocab = word2index  

In [0]:
import argparse
import collections
import os
import six
import warnings

import numpy as np

import chainer
from chainer.backends import cuda
import chainer.functions as F
import chainer.initializers as I
import chainer.links as L
import chainer.optimizers as O
from chainer import reporter
from chainer import training
from chainer.training import extensions


class ContinuousBoW(chainer.Chain):
    """Definition of Continuous Bag of Words Model"""

    def __init__(self, n_vocab, n_units, loss_func):
        super(ContinuousBoW, self).__init__()

        with self.init_scope():
            self.embed = L.EmbedID(
                n_vocab, n_units, initialW=I.Uniform(1. / n_units))
            self.loss_func = loss_func

    def forward(self, x, contexts):
        e = self.embed(contexts)
        h = F.sum(e, axis=1) * (1. / contexts.shape[1])
        loss = self.loss_func(h, x)
        reporter.report({'loss': loss}, self)
        return loss


class SkipGram(chainer.Chain):
    """Definition of Skip-gram Model"""

    def __init__(self, n_vocab, n_units, loss_func):
        super(SkipGram, self).__init__()

        with self.init_scope():
            self.embed = L.EmbedID(
                n_vocab, n_units, initialW=I.Uniform(1. / n_units))
            self.loss_func = loss_func

    def forward(self, x, contexts):
        e = self.embed(contexts)
        batch_size, n_context, n_units = e.shape
        x = F.broadcast_to(x[:, None], (batch_size, n_context))
        e = F.reshape(e, (batch_size * n_context, n_units))
        x = F.reshape(x, (batch_size * n_context,))
        loss = self.loss_func(e, x)
        reporter.report({'loss': loss}, self)
        return loss


class SoftmaxCrossEntropyLoss(chainer.Chain):
    """Softmax cross entropy loss function preceded by linear transformation.
    """

    def __init__(self, n_in, n_out):
        super(SoftmaxCrossEntropyLoss, self).__init__()
        with self.init_scope():
            self.out = L.Linear(n_in, n_out, initialW=0)

    def forward(self, x, t):
        return F.softmax_cross_entropy(self.out(x), t)


class WindowIterator(chainer.dataset.Iterator):
    """Dataset iterator to create a batch of sequences at different positions.
    This iterator returns a pair of the current words and the context words.
    """

    def __init__(self, dataset, window, batch_size, repeat=True):
        self.dataset = np.array(dataset, np.int32)
        self.window = window  # size of context window
        self.batch_size = batch_size
        self._repeat = repeat
        # order is the array which is shuffled ``[window, window + 1, ...,
        # len(dataset) - window - 1]``
        self.order = np.random.permutation(
            len(dataset) - window * 2).astype(np.int32)
        self.order += window
        self.current_position = 0
        # Number of completed sweeps over the dataset. In this case, it is
        # incremented if every word is visited at least once after the last
        # increment.
        self.epoch = 0
        # True if the epoch is incremented at the last iteration.
        self.is_new_epoch = False

    def __next__(self):
        """This iterator returns a list representing a mini-batch.
        Each item indicates a different position in the original sequence.
        """
        if not self._repeat and self.epoch > 0:
            raise StopIteration

        i = self.current_position
        i_end = i + self.batch_size
        position = self.order[i:i_end]
        w = np.random.randint(self.window - 1) + 1
        offset = np.concatenate([np.arange(-w, 0), np.arange(1, w + 1)])
        pos = position[:, None] + offset[None, :]
        contexts = self.dataset.take(pos)
        center = self.dataset.take(position)

        if i_end >= len(self.order):
            np.random.shuffle(self.order)
            self.epoch += 1
            self.is_new_epoch = True
            self.current_position = 0
        else:
            self.is_new_epoch = False
            self.current_position = i_end

        return center, contexts

    @property
    def epoch_detail(self):
        return self.epoch + float(self.current_position) / len(self.order)

    def serialize(self, serializer):
        self.current_position = serializer('current_position',
                                           self.current_position)
        self.epoch = serializer('epoch', self.epoch)
        self.is_new_epoch = serializer('is_new_epoch', self.is_new_epoch)
        if self.order is not None:
            serializer('order', self.order)


@chainer.dataset.converter()
def convert(batch, device):
    center, contexts = batch
    center = device.send(center)
    contexts = device.send(contexts)
    return center, contexts

In [9]:
parser = argparse.ArgumentParser()
parser.add_argument('--device', '-d', type=str, default='0',
                    help='Device specifier. Either ChainerX device '
                    'specifier or an integer. If non-negative integer, '
                    'CuPy arrays with specified device id are used. If '
                    'negative integer, NumPy arrays are used')
parser.add_argument('--unit', '-u', default=200, type=int,
                    help='number of units')
parser.add_argument('--window', '-w', default=10, type=int,
                    help='window size')
parser.add_argument('--batchsize', '-b', type=int, default=1000,
                    help='learning minibatch size')
parser.add_argument('--epoch', '-e', default=3, type=int,
                    help='number of epochs to learn')
parser.add_argument('--model', '-m', choices=['skipgram', 'cbow'],
                    default='skipgram',
                    help='model type ("skipgram", "cbow")')
parser.add_argument('--negative-size', default=5, type=int,
                    help='number of negative samples')
parser.add_argument('--out-type', '-o', choices=['hsm', 'ns', 'original'],
                    default='hsm',
                    help='output model type ("hsm": hierarchical softmax, '
                    '"ns": negative sampling, "original": '
                    'no approximation)')
parser.add_argument('--out', default='result',
                    help='Directory to output the result')
parser.add_argument('--resume', '-r', type=str,
                    help='Resume the training from snapshot')
parser.add_argument('--snapshot-interval', type=int,
                    help='Interval of snapshots')
parser.add_argument('--test', dest='test', action='store_true')
parser.set_defaults(test=False)
group = parser.add_argument_group('deprecated arguments')
group.add_argument('--gpu', '-g', dest='device',
                   type=int, nargs='?', const=0,
                   help='GPU ID (negative value indicates CPU)')
args = parser.parse_args(args=[])

if chainer.get_dtype() == np.float16:
    warnings.warn(
        'This example may cause NaN in FP16 mode.', RuntimeWarning)

device = chainer.get_device(args.device)
device.use()

    
if args.snapshot_interval is None:
    args.snapshot_interval = args.epoch
args.snapshot_interval = min(args.snapshot_interval, args.epoch)

print('Device: {}'.format(device))
print('# unit: {}'.format(args.unit))
print('Window: {}'.format(args.window))
print('Minibatch-size: {}'.format(args.batchsize))
print('# epoch: {}'.format(args.epoch))
print('Training model: {}'.format(args.model))
print('Output type: {}'.format(args.out_type))
print('')

if args.test:
    train = train[:100]
    val = val[:100]

print('n_vocab: %d' % n_vocab)
print('data length: %d' % len(train))

if args.out_type == 'hsm':
    HSM = L.BinaryHierarchicalSoftmax
    tree = HSM.create_huffman_tree(counts)
    loss_func = HSM(args.unit, tree)
    loss_func.W.array[...] = 0
elif args.out_type == 'ns':
    cs = [counts[w] for w in range(len(counts))]
    loss_func = L.NegativeSampling(args.unit, cs, args.negative_size)
    loss_func.W.array[...] = 0
elif args.out_type == 'original':
    loss_func = SoftmaxCrossEntropyLoss(args.unit, n_vocab)
else:
    raise Exception('Unknown output type: {}'.format(args.out_type))

# Choose the model
if args.model == 'skipgram':
    model = SkipGram(n_vocab, args.unit, loss_func)
elif args.model == 'cbow':
    model = ContinuousBoW(n_vocab, args.unit, loss_func)
else:
    raise Exception('Unknown model type: {}'.format(args.model))

model.to_device(device)

# Set up an optimizer
optimizer = O.Adam()
optimizer.setup(model)

# Set up an iterator
train_iter = WindowIterator(train, args.window, args.batchsize)
val_iter = WindowIterator(val, args.window, args.batchsize, repeat=False)

# Set up an updater
updater = training.updaters.StandardUpdater(
    train_iter, optimizer, converter=convert, device=device)

# Set up a trainer
trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

trainer.extend(extensions.Evaluator(
    val_iter, model, converter=convert, device=device))
trainer.extend(extensions.LogReport())
trainer.extend(extensions.PrintReport(
    ['epoch', 'main/loss', 'validation/main/loss']))
trainer.extend(extensions.ProgressBar())

trainer.extend(
    extensions.snapshot(filename='snapshot_epoch_{.updater.epoch}'),
    trigger=(args.snapshot_interval, 'epoch'))

if args.resume is not None:
    chainer.serializers.load_npz(args.resume, trainer)
trainer.run()



Device: @cupy:0
# unit: 200
Window: 10
Minibatch-size: 1000
# epoch: 3
Training model: skipgram
Output type: hsm

n_vocab: 312996
data length: 8964042
epoch       main/loss   validation/main/loss
[J     total [..................................................]  0.37%
this epoch [..................................................]  1.12%
       100 iter, 0 epoch / 3 epochs
       inf iters/sec. Estimated time to finish: 0:00:00.
[4A[J     total [..................................................]  0.74%
this epoch [#.................................................]  2.23%
       200 iter, 0 epoch / 3 epochs
    47.628 iters/sec. Estimated time to finish: 0:09:20.423202.
[4A[J     total [..................................................]  1.12%
this epoch [#.................................................]  3.35%
       300 iter, 0 epoch / 3 epochs
    47.666 iters/sec. Estimated time to finish: 0:09:17.885518.
[4A[J     total [.................................................

In [0]:
# Save the word2vec model
with open('word2vec.model', 'w') as f:
    f.write('%d %d\n' % (len(index2word), args.unit))
    w = cuda.to_cpu(model.embed.W.array)
    for i, wi in enumerate(w):
        v = ' '.join(map(str, wi))
        f.write('%s %s\n' % (index2word[i], v))

In [29]:
!ls -l drive/My\ Drive

total 2161234
drwx------ 2 root root      4096 Jul 16 07:36  201708-201807
drwx------ 2 root root      4096 Jun  3 11:51  コンペ2018
drwx------ 2 root root      4096 May 16 04:52  Avatar-Net-Pytorch
-rw------- 1 root root     83492 Jul 21 11:35  c2v.ipynb
drwx------ 2 root root      4096 Dec 14  2018 'Colab Notebooks'
-rw------- 1 root root      3552 Jun 20 00:54  create_c2v_w2v.py
drwx------ 2 root root      4096 Jul 22 05:39  data
drwx------ 2 root root      4096 May 20  2018  miku
drwx------ 3 root root      4096 Aug  5 09:29  P
-rw------- 1 root root   7132316 Aug  5 06:49  p1780-tang.pdf
drwx------ 2 root root      4096 Aug  5 06:40  p2v_src
-rw------- 1 root root     20862 Jun 12 04:55  paper.tex
drwx------ 2 root root      4096 May 20  2018  programming
drwx------ 2 root root      4096 Jul  8 14:06  ra
drwx------ 2 root root      4096 May 21 01:19  randd_eight_skill_prediction
-rw------- 1 root root 680778353 Aug  5 13:07 'result word2vec.model'
drwx------ 2 root root      4096 Ma

In [0]:
from google.colab import auth
auth.authenticate_user()
from googleapiclient.discovery import build
drive_service = build('drive', 'v3')

In [16]:
from googleapiclient.http import MediaFileUpload

file_metadata = {
  'name': 'word2vec.model',
  'mimeType': 'text/plain'
}
media = MediaFileUpload('word2vec.model', 
                        mimetype='text/plain',
                        resumable=True)
created = drive_service.files().create(body=file_metadata,
                                       media_body=media,
                                       fields='id').execute()
print('File ID: {}'.format(created.get('id')))

File ID: 1p2num0jZubBum_4jg9ggwALRnoktcXP2


In [17]:
import logging
from gensim.models import KeyedVectors

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
logging.root.setLevel(level=logging.INFO)


model = KeyedVectors.load_word2vec_format('word2vec.model', binary=False)

2019-08-06 11:24:42,491 : INFO : loading projection weights from word2vec.model
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
2019-08-06 11:25:44,919 : INFO : loaded (312996, 200) matrix from word2vec.model


In [18]:
model.most_similar(positive=["トヨタ"], topn=10)


2019-08-06 11:25:50,049 : INFO : precomputing L2-norms of word weight vectors
  if np.issubdtype(vec.dtype, np.int):


[('ホンダ', 0.7037820219993591),
 ('WRC', 0.6866999864578247),
 ('ルノー', 0.6658177375793457),
 ('BMW', 0.657233476638794),
 ('F1', 0.6424689292907715),
 ('モータースポーツ', 0.6383865475654602),
 ('ワークス', 0.63553786277771),
 ('日産', 0.6238956451416016),
 ('シャーシ', 0.6003409624099731),
 ('日産自動車', 0.5971512794494629)]

In [26]:
model.most_similar(positive=["声優"], topn=10)

  if np.issubdtype(vec.dtype, np.int):


[('ナレーター', 0.73166823387146),
 ('俳優', 0.7121175527572632),
 ('女優', 0.6773413419723511),
 ('タレント', 0.6704140901565552),
 ('役者', 0.662054181098938),
 ('アイドル', 0.6618469953536987),
 ('子役', 0.6615896224975586),
 ('歌手', 0.6539620757102966),
 ('演出家', 0.6408377289772034),
 ('音響監督', 0.6380734443664551)]

In [23]:
model.most_similar(positive=["任天堂"], topn=10)

  if np.issubdtype(vec.dtype, np.int):


[('HAL研究所', 0.6787598729133606),
 ('ソニー・コンピュータエンタテインメント', 0.6694126129150391),
 ('ニンテンドーDS', 0.6624517440795898),
 ('Wii', 0.6299006938934326),
 ('バンダイ', 0.6289756298065186),
 ('ファミリーコンピュータ', 0.6161845326423645),
 ('バーチャルコンソール', 0.6160342693328857),
 ('SCE', 0.6102538108825684),
 ('アークシステムワークス', 0.6055625677108765),
 ('スーパーファミコン', 0.5968215465545654)]