In [1]:
#!/usr/bin/env python
import json
import os
import shlex
import subprocess
import sys
from pathlib import Path
from shutil import rmtree

import numpy
import spacy
import tqdm
from nltk import tokenize
from spacy.tokens import DocBin
from wasabi import msg

import globals

nlp = spacy.load("en_core_web_sm")


In [2]:
def _run_fasttext(args, stdout=None):
    """Run a fastText command given as an argument list; return its exit status.

    The arguments are handed to the binary directly (no shell), so paths that
    contain spaces or shell metacharacters are passed through verbatim. A
    binary that cannot be started at all is reported and mapped to a non-zero
    status so the caller can treat it like any other failed run.
    """
    try:
        return subprocess.run(args, stdout=stdout).returncode
    except OSError as err:
        msg.warn("Could not execute fastText", str(err))
        return 1


def fasttextTrainVectors(
    fasttext_bin,
    in_dir,
    out_dir,
    n_threads=10,
    min_count=50,
    vector_size=300,
    verbose=2,
):
    """
    Step 4: Train the vectors

    Expects a directory of preprocessed ``.txt`` input files, concatenates
    them (using a temporary file on disk) and uses fastText to train a
    skipgram model. See here for installation instructions:
    https://github.com/facebookresearch/fastText

    Note that this function calls into fastText and expects you to pass in
    the built fasttext binary. Each command is also printed (shell-quoted) in
    case you want to run it separately.

    fasttext_bin: Path to the compiled fastText executable.
    in_dir: Directory holding the ``.txt`` training files.
    out_dir: Output directory. It is deleted and recreated on every call.
    n_threads: Value for fastText's ``-thread`` option.
    min_count: Value for fastText's ``-minCount`` option.
    vector_size: Value for fastText's ``-dim`` option.
    verbose: Value for fastText's ``-verbose`` option.

    Exits with status 1 (via ``msg.fail``) when an input is invalid or when a
    fastText command fails. On success, ``out_dir`` holds the trained model
    (``vectors_w2v_<vector_size>dim.*``) and ``vocab.txt``.
    """
    input_path = Path(in_dir)
    output_path = Path(out_dir)
    if not Path(fasttext_bin).exists():
        msg.fail("Can't find fastText binary", fasttext_bin, exits=1)
    if not input_path.is_dir():
        msg.fail("Not a valid input directory", in_dir, exits=1)
    # Validate the inputs *before* wiping the output directory, so that a bad
    # call cannot destroy previously trained vectors. Sorting makes the merge
    # order (and therefore the training input) reproducible.
    input_files = sorted(p for p in input_path.iterdir() if p.suffix == ".txt")
    if not input_files:
        msg.fail("Input directory contains no .txt files", in_dir, exits=1)
    if output_path.exists():
        rmtree(output_path)
    output_path.mkdir(parents=True)
    print(f"ReCreated vector output directory {out_dir}")
    output_file = output_path / f"vectors_w2v_{vector_size}dim"

    # fastText expects only one input file and only reads from disk and not
    # stdin, so we need to create a temporary file that concatenates the inputs
    tmp_path = input_path / "s2v_input.tmp"
    train_args = [
        str(fasttext_bin),
        "skipgram",
        "-thread",
        str(n_threads),
        "-input",
        str(tmp_path),
        "-output",
        str(output_file),
        "-dim",
        str(vector_size),
        "-minn",
        "0",
        "-maxn",
        "0",
        "-minCount",
        str(min_count),
        "-verbose",
        str(verbose),
    ]
    try:
        # "w" (not "a"): a stale temp file left behind by an aborted run must
        # be overwritten, otherwise its content is trained on a second time.
        with tmp_path.open("w", encoding="utf8") as tmp_file:
            for input_file in input_files:
                with input_file.open("r", encoding="utf8") as f:
                    # Copy in chunks so large corpora are never held in
                    # memory as a single string.
                    for chunk in iter(lambda: f.read(1 << 20), ""):
                        tmp_file.write(chunk)
        msg.info("Created temporary merged input file", tmp_path)

        msg.info("Training vectors")
        print(" ".join(shlex.quote(arg) for arg in train_args))
        train_status = _run_fasttext(train_args)
    finally:
        # Always clean up, including when writing or training raised.
        if tmp_path.exists():
            tmp_path.unlink()
            msg.good("Deleted temporary input file", tmp_path)
    if train_status != 0:
        msg.fail("Failed training vectors", exits=1)
    msg.good("Successfully trained vectors", out_dir)

    msg.info("Creating vocabulary")
    vocab_file = output_path / "vocab.txt"
    dump_args = [str(fasttext_bin), "dump", str(output_file.with_suffix(".bin")), "dict"]
    print(
        " ".join(shlex.quote(arg) for arg in dump_args)
        + " > "
        + shlex.quote(str(vocab_file))
    )
    # Equivalent of the shell redirection `> vocab.txt`, without a shell.
    with vocab_file.open("wb") as vocab_out:
        vocab_status = _run_fasttext(dump_args, stdout=vocab_out)
    if vocab_status != 0:
        msg.fail("Failed creating vocabulary", exits=1)
    msg.good("Successfully created vocabulary file", vocab_file)

def _get_shape(file_):
    """Return a tuple with (number of entries, vector dimensions). Handle
    both word2vec/FastText format, which has a header with this, or GloVe's
    format, which doesn't."""
    first_line = next(file_).split()
    if len(first_line) == 2:
        return tuple(int(size) for size in first_line), file_
    count = 1
    for line in file_:
        count += 1
    file_.seek(0)
    shape = (count, len(first_line) - 1)
    return shape, file_


def export(in_file, vocab_file, out_dir):
    """
    Step 5: Export a sense2vec component

    Expects a text-format vectors file (either word2vec/fastText style with a
    "<count> <dims>" header line, or header-less GloVe style) and a vocab file
    with one "<key> <freq>" entry per line (fastText's trailing " word" column
    is accepted). Exports a component that can be loaded with
    Sense2vec.from_disk.

    in_file: Path to the text vectors file (a ``.bin`` file is rejected).
    vocab_file: Path to the vocab/frequency file.
    out_dir: Output directory. It is deleted and recreated on every call.

    Vector rows whose key is rejected by ``split_key`` (ValueError) are
    skipped. Exits with status 1 (via ``msg.fail``) on a missing/invalid
    input or on a row whose length does not match the vector size.
    """
    # NOTE(review): this function relies on `Sense2Vec` and `split_key` being
    # in scope; no import for them is visible in this part of the notebook --
    # confirm they are imported before this is called.
    input_path = Path(in_file)
    vocab_path = Path(vocab_file)
    output_path = Path(out_dir)
    # Failures exit with a non-zero status, like the other pipeline steps, so
    # that a calling script cannot mistake them for success.
    if not input_path.exists():
        msg.fail("Can't find input file", in_file, exits=1)
    if input_path.suffix == ".bin":
        msg.fail("Need text-based vectors file, not binary", in_file, exits=1)
    if not vocab_path.exists():
        msg.fail("Can't find vocab file", vocab_file, exits=1)
    if output_path.exists():
        rmtree(output_path)
    output_path.mkdir(parents=True)
    print(f"ReCreated vector output directory {out_dir}")

    data = []
    all_senses = set()
    with input_path.open("r", encoding="utf8") as f:
        # Only the dimension is needed here; the row count that is actually
        # exported is len(data), because rows with unparseable keys are dropped.
        (_, vector_size), f = _get_shape(f)
        # Stream the file instead of loading every line into memory first.
        for line in f:
            # Split from the right so that keys containing spaces stay intact.
            item = line.rstrip().rsplit(" ", vector_size)
            key = item[0]
            try:
                _, sense = split_key(key)
            except ValueError:
                continue
            vec = item[1:]
            if len(vec) != vector_size:
                msg.fail(
                    f"Wrong vector size: {len(vec)} (expected {vector_size})",
                    exits=1,
                )
            all_senses.add(sense)
            data.append((key, numpy.asarray(vec, dtype=numpy.float32)))

    s2v = Sense2Vec(shape=(len(data), vector_size), senses=all_senses)
    for key, vector in data:
        s2v.add(key, vector)

    with vocab_path.open("r", encoding="utf8") as f:
        for line in f:
            item = line.rstrip()
            if item.endswith(" word"):  # for fastText vocabs
                item = item[:-5]
            try:
                key, freq = item.rsplit(" ", 1)
            except ValueError:
                # Line without a "<key> <freq>" pair (e.g. a header or blank).
                continue
            s2v.set_freq(key, int(freq))
    print("Created the sense2vec model")
    print(f"{len(data)} vectors, {len(all_senses)} total senses")
    s2v.to_disk(output_path)
    print("Saved model to directory", out_dir)

def precompileCache(
    vectors, gpu_id=-1, n_neighbors=100, batch_size=1024, cutoff=0, start=0, end=None
):
    """
    Step 6: Precompute nearest-neighbor queries (optional)

    Precompute nearest-neighbor queries for every entry in the vocab to make
    Sense2Vec.most_similar faster. The cutoff option lets you define the
    number of earliest rows to limit the neighbors to. For instance, if cutoff
    is 100000, no word will have a nearest neighbor outside of the top 100k
    vectors.

    vectors: Exported sense2vec directory; must contain a ``vectors`` file.
    gpu_id: -1 to compute with numpy, otherwise the CUDA device id for cupy.
    n_neighbors: Neighbors to keep per row (capped at the number of candidates).
    batch_size: Number of query rows processed per matrix multiplication.
    cutoff: Neighbors are drawn from the first ``cutoff`` rows only; values
        below 1 mean "all rows".
    start, end: Half-open range of rows to compute neighbors for. ``end=None``
        means "up to the last row".

    Writes a msgpack file named ``cache`` into the vectors directory with the
    keys "indices", "scores", "start", "end" and "cutoff". Row ``k`` of
    "indices"/"scores" belongs to vector row ``start + k``. Exits with status 1
    (via ``msg.fail``) when the vectors cannot be found.
    """
    # NOTE(review): this function relies on `srsly` being in scope; no import
    # for it is visible in this part of the notebook -- confirm.
    if gpu_id == -1:
        xp = numpy
    else:
        import cupy as xp
        import cupy.cuda.device

        # Only fall back to the local helpers when this cupy build lacks the
        # function; never replace a native implementation.
        if not hasattr(cupy, "take_along_axis"):
            cupy.take_along_axis = take_along_axis
        if not hasattr(cupy, "put_along_axis"):
            cupy.put_along_axis = put_along_axis
        device = cupy.cuda.device.Device(gpu_id)
        device.use()
    vectors_dir = Path(vectors)
    vectors_file = vectors_dir / "vectors"
    if not vectors_dir.is_dir() or not vectors_file.exists():
        err = "Are you passing in the exported sense2vec directory containing a vectors file?"
        msg.fail(f"Can't load vectors from {vectors}", err, exits=1)
    with msg.loading(f"Loading vectors from {vectors}"):
        table = xp.load(str(vectors_file))
    msg.good(f"Loaded {table.shape[0]:,} vectors with dimension {table.shape[1]}")
    norms = xp.linalg.norm(table, axis=1, keepdims=True)
    # Avoid dividing all-zero rows by zero; they simply stay all-zero.
    norms[norms == 0] = 1
    # Normalize to unit norm, so the dot products below are cosine similarities
    table /= norms
    if cutoff < 1:
        cutoff = table.shape[0]
    if end is None:
        end = table.shape[0]
    mean = float(norms.mean())
    var = float(norms.var())
    msg.good(f"Normalized (mean {mean:,.2f}, variance {var:,.2f})")
    msg.info(f"Finding {n_neighbors:,} neighbors among {cutoff:,} most frequent")
    subset = table[:cutoff]
    # A row cannot have more neighbors than there are candidate columns;
    # clamping to the full table size would break the assignments below
    # whenever cutoff < n_neighbors.
    n = min(n_neighbors, subset.shape[0])
    best_rows = xp.zeros((end - start, n), dtype="i")
    scores = xp.zeros((end - start, n), dtype="f")
    for i in tqdm.tqdm(list(range(start, end, batch_size))):
        size = min(batch_size, end - i)
        batch = table[i : i + size]
        sims = xp.dot(batch, subset.T)
        # Set self-similarities to -inf, so that we don't return them. Batch
        # row j is vector i + j, which only has a column in `sims` while
        # i + j < cutoff -- so mask just that (possibly empty) leading part.
        n_self = min(i + size, sims.shape[1]) - i
        if n_self > 0:
            local_rows = xp.arange(n_self)
            sims[local_rows, local_rows + i] = -xp.inf
        # This used to use argpartition, to do a partial sort...But this ended
        # up being a ratsnest of terrible numpy crap. Just sorting the whole
        # list isn't really slower, and it's much simpler to read.
        ranks = xp.argsort(sims, axis=1)
        # Keep the n best columns, reversed into best-first order.
        batch_rows = ranks[:, -n:][:, ::-1]
        batch_scores = xp.take_along_axis(sims, batch_rows, axis=1)
        # The result arrays only span [start, end), so index them relative to
        # `start`; absolute indexing fails for any start > 0.
        offset = i - start
        best_rows[offset : offset + size] = batch_rows
        scores[offset : offset + size] = batch_scores
    msg.info("Saving output")
    # cupy arrays have to be copied back to host memory before serializing.
    if not isinstance(best_rows, numpy.ndarray):
        best_rows = best_rows.get()
    if not isinstance(scores, numpy.ndarray):
        scores = scores.get()
    output = {
        "indices": best_rows,
        "scores": scores.astype("float16"),
        "start": start,
        "end": end,
        "cutoff": cutoff,
    }
    output_file = vectors_dir / "cache"
    with msg.loading("Saving output..."):
        srsly.write_msgpack(output_file, output)
    msg.good(f"Saved cache to {output_file}")


# These functions are missing from cupy, but will be supported in cupy 7.
def take_along_axis(a, indices, axis):
    """Take values from the input array by matching 1d index and data slices.

    Works on cupy arrays and, as a generalization, on numpy arrays as well:
    the array module is picked from ``a`` (numpy is used when cupy is not
    installed).

    Args:
        a (cupy.ndarray or numpy.ndarray): Array to extract elements.
        indices (same array type as ``a``): Integer indices to take along each
            1d slice of ``a``. Must have the same number of dimensions as ``a``
            (1 when ``axis`` is None).
        axis (int or None): The axis to take 1d slices along. ``None`` operates
            on the flattened array.

    Returns:
        The indexed result, of the same array type as ``a``.

    Raises:
        IndexError: ``indices`` is not an integer array, or ``axis`` is out of
            range.
        ValueError: ``indices`` and ``a`` differ in number of dimensions.

    .. seealso:: :func:`numpy.take_along_axis`
    """
    try:
        import cupy

        xp = cupy.get_array_module(a)
    except ImportError:
        import numpy as xp

    if indices.dtype.kind not in ("i", "u"):
        raise IndexError("`indices` must be an integer array")

    if axis is None:
        a = a.ravel()
        axis = 0

    ndim = a.ndim

    if not (-ndim <= axis < ndim):
        raise IndexError("Axis overrun")

    # Map a negative axis onto its non-negative equivalent.
    axis %= ndim

    if ndim != indices.ndim:
        raise ValueError("`indices` and `a` must have the same number of dimensions")

    # Build one index array per dimension: `indices` for the chosen axis, and
    # for every other axis an arange shaped to broadcast along that axis only.
    fancy_index = []
    for i, n in enumerate(a.shape):
        if i == axis:
            fancy_index.append(indices)
        else:
            ind_shape = (1,) * i + (-1,) + (1,) * (ndim - i - 1)
            fancy_index.append(xp.arange(n).reshape(ind_shape))

    # Must be a tuple: a *list* of index arrays is not interpreted as one
    # index per dimension by current NumPy releases.
    return a[tuple(fancy_index)]


def put_along_axis(a, indices, value, axis):
    """Write ``value`` into ``a`` by matching 1d index and data slices.

    In-place counterpart of ``take_along_axis``. Works on cupy arrays and, as
    a generalization, on numpy arrays as well: the array module is picked from
    ``a`` (numpy is used when cupy is not installed).

    Args:
        a (cupy.ndarray or numpy.ndarray): Destination array, modified in place.
        indices (same array type as ``a``): Integer indices to write at along
            each 1d slice of ``a``. Must have the same number of dimensions as
            ``a`` (1 when ``axis`` is None).
        value: Scalar or array broadcastable against the indexed positions.
        axis (int or None): The axis to address 1d slices along. ``None``
            operates on ``a.ravel()``.
            NOTE(review): if ``ravel()`` returns a copy rather than a view,
            the write will not reach the caller's array -- confirm before
            relying on ``axis=None``.

    Returns:
        None.

    Raises:
        IndexError: ``indices`` is not an integer array, or ``axis`` is out of
            range.
        ValueError: ``indices`` and ``a`` differ in number of dimensions.

    .. seealso:: :func:`numpy.put_along_axis`
    """
    try:
        import cupy

        xp = cupy.get_array_module(a)
    except ImportError:
        import numpy as xp

    if indices.dtype.kind not in ("i", "u"):
        raise IndexError("`indices` must be an integer array")

    if axis is None:
        a = a.ravel()
        axis = 0

    ndim = a.ndim

    if not (-ndim <= axis < ndim):
        raise IndexError("Axis overrun")

    # Map a negative axis onto its non-negative equivalent.
    axis %= ndim

    if ndim != indices.ndim:
        raise ValueError("`indices` and `a` must have the same number of dimensions")

    # Build one index array per dimension: `indices` for the chosen axis, and
    # for every other axis an arange shaped to broadcast along that axis only.
    fancy_index = []
    for i, n in enumerate(a.shape):
        if i == axis:
            fancy_index.append(indices)
        else:
            ind_shape = (1,) * i + (-1,) + (1,) * (ndim - i - 1)
            fancy_index.append(xp.arange(n).reshape(ind_shape))

    # Must be a tuple: a *list* of index arrays is not interpreted as one
    # index per dimension by current NumPy releases.
    a[tuple(fancy_index)] = value


In [5]:
# Train the fastText vectors. Every setting comes from the project's
# `globals` configuration module; nothing is hard-coded in this cell.
fasttextTrainVectors(
    fasttext_bin=globals.fasttext_bin,
    in_dir=globals.merged_text_dir,
    out_dir=globals.vectors_out_dir,
    n_threads=globals.n_threads,
    min_count=globals.min_count,
    vector_size=globals.vector_size,
    verbose=globals.verbose,
)


ReCreated vector output directory /data/processed/gaming/fasttext_vectors
[38;5;4mℹ Created temporary merged input file[0m
/data/processed/gaming/merged-terms/s2v_input.tmp
[38;5;4mℹ Training vectors[0m
/data/home/liamca/notebooks/fasttext/fastText/fasttext skipgram -thread 16 -input /data/processed/gaming/merged-terms/s2v_input.tmp -output /data/processed/gaming/fasttext_vectors/vectors_w2v_300dim -dim 300 -minn 0 -maxn 0 -minCount 10 -verbose 2
[38;5;2m✔ Deleted temporary input file[0m
/data/processed/gaming/merged-terms/s2v_input.tmp
[38;5;2m✔ Successfully trained vectors[0m
/data/processed/gaming/fasttext_vectors
[38;5;4mℹ Creating vocabulary[0m
/data/home/liamca/notebooks/fasttext/fastText/fasttext dump /data/processed/gaming/fasttext_vectors/vectors_w2v_300dim.bin dict > /data/processed/gaming/fasttext_vectors/vocab.txt
[38;5;2m✔ Successfully created vocabulary file[0m
/data/processed/gaming/fasttext_vectors/vocab.txt
