In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import gzip

import numpy as np
import gensim.models.keyedvectors

In [3]:
def read_gz_lines(gz_fpath, start=0, end=None, keyvec_format=True, verbose=False):
    """Read a range of lines from a gzip-compressed text file.

    Parameters
    ----------
    gz_fpath : str
        Path of the gzip file to read.
    start : int
        0-based index of the first line to read; earlier lines are skipped.
    end : int or None
        0-based index of the last line to read (inclusive). None reads to the
        end of the file.
    keyvec_format : bool
        If True, parse every line as ``<key> <float> <float> ...``.
        If False, return the stripped lines as strings.
    verbose : bool
        If True, print a progress message whenever the line number is a
        multiple of 100000.

    Returns
    -------
    (keys, vecs) when ``keyvec_format`` is True: a list of key strings and a
    parallel list of numpy arrays. Otherwise a list of stripped line strings.

    Notes
    -----
    Lines that are empty after stripping are skipped. There is no special
    handling for a header line: in key/vector mode a leading
    ``<count> <dimension>`` line is parsed like any other line unless it is
    excluded via ``start``.
    """
    vprint = print if verbose else (lambda msg: None)
    keys, vecs, lines = [], [], []
    with gzip.open(gz_fpath, 'rb') as infile:
        for line_num, line in enumerate(infile):
            # If we're not at start yet, just read and continue
            if line_num < start:
                continue
            # `is not None` so that end=0 means "stop after line 0" instead of "no limit"
            if end is not None and line_num > end:
                break
            if line_num % 100000 == 0:
                vprint(f"On line {line_num}")
            # Use the line the loop already read. Calling infile.readline() here
            # would consume the *next* line and silently drop every other line.
            cur_line = line.decode().strip()
            if not cur_line:
                # Empty line, just continue
                continue
            if keyvec_format:
                line_elts = cur_line.split()
                keys.append(line_elts[0])
                vecs.append(np.array([float(x) for x in line_elts[1:]]))
            else:
                lines.append(cur_line)
    if keyvec_format:
        return keys, vecs
    return lines

In [4]:
#start = 180000
#num_to_load = 100
#end = start + num_to_load
#lines = read_gz_lines(gz_fpath, start=start, end=end)

In [5]:
# Language codes whose embeddings are kept when filtering the full file
langs_to_keep = ["am","de","en","ru","vi","zh"] # "am" is requested, but the language set printed below contains no "am" entries

In [6]:
# Gzipped text file holding the full Numberbatch 19.08 embeddings (relative to this notebook)
gz_fpath = "../Embeddings/numberbatch-19.08.txt.gz"

In [7]:
# Keep only the lines whose language code is in langs_to_keep, and record
# every language code that appears in the file (in first-seen order)
all_langs = []
all_lines = []
counter = 0
with gzip.open(gz_fpath, 'rb') as infile:
    for raw_line in infile:
        if counter % 1000000 == 0:
            print(f"On line {counter}")
        is_header = counter == 0
        counter += 1
        # The very first line is just "{total_num_embeddings} {dimension}", not an embedding
        if is_header:
            continue
        line_str = raw_line.decode()
        # Keys look like "/c/<lang>/<word>": splitting on "/" gives
        # ["", "c", lang, word, ...], so the language code sits at index 2
        lang_str = line_str.split()[0].split("/")[2]
        # Remember each language the first time it shows up
        if lang_str not in all_langs:
            all_langs.append(lang_str)
        # The kept line is stored undecorated (still ending in its newline)
        if lang_str in langs_to_keep:
            all_lines.append(line_str)

On line 0
On line 1000000
On line 2000000
On line 3000000
On line 4000000
On line 5000000
On line 6000000
On line 7000000
On line 8000000
On line 9000000


In [8]:
# Show the distinct language codes found in the file; end="" suppresses the trailing newline
print(set(all_langs), end="")

{'lt', 'en', 'gd', 'ms', 'ang', 'sa', 'no', 'grc', 'cy', 'it', 'he', 'gv', 'sk', 'sw', 'ro', 'ko', 'kk', 'bg', 'fi', 'xcl', 'eu', 'uk', 'fa', 'el', 'tr', 'is', 'mul', 'ka', 'cs', 'pt', 'nl', 'ur', 'fr', 'hi', 'sv', 'io', 'lv', 'ku', 'ja', 'vo', 'hy', 'ru', 'ast', 'sh', 'ar', 'es', 'th', 'fo', 'mg', 'fro', 'nrf', 'la', 'be', 'ga', 'zh', 'mk', 'da', 'nv', 'pl', 'se', 'hu', 'vi', 'af', 'fil', 'az', 'et', 'ta', 'de', 'rup', 'non', 'sl', 'gl', 'hsb', 'oc', 'te', 'sq', 'ca', 'eo'}

In [11]:
# Despite the gz_ prefix, this is the path of the *uncompressed* text file that gets written
gz_output_fpath = "../Embeddings/numberbatch_custom.txt"

In [10]:
# Old: external gzip
# Write the kept lines as plain UTF-8 text. Each entry of all_lines is a decoded,
# unstripped line from the source file, so writelines() needs no added separators.
# The source file's first (header) line was skipped when all_lines was collected,
# so the output has no "{count} {dimension}" header.
with open(gz_output_fpath, 'w', encoding='utf-8') as outfile:
    outfile.writelines(all_lines)
# And then go gzip it via cygwin

In [None]:
# New: gzip on the fly (write the .gz directly instead of gzipping externally)
#with gzip.open(gz_output_fpath + ".gz", 'wt', encoding='utf-8') as gz_outfile:
#    gz_outfile.writelines(all_lines)

### Trying something even more efficient: serializing the KeyedVectors object

(Run this section *after* gzipping the custom text file.)

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import gzip

import numpy as np
import gensim.models.keyedvectors
from tqdm import tqdm

In [3]:
# Empty KeyedVectors container to fill below; 300 is passed as the vector size
# (presumably the Numberbatch embedding dimension — TODO confirm against the file header)
xlang_model = gensim.models.keyedvectors.KeyedVectors(300)

In [4]:
# Load the embs from the now-gzipped file
# NOTE(review): the active path below is the full original dump, while the filtered
# file is commented out. The recorded run further down reports 1,913,806 vectors, not
# the 9,161,913 lines of the full dump, so the filtered file looks like what was
# actually used — confirm which path is intended before re-running.
#gz_fpath = "../Embeddings/numberbatch_custom.txt.gz"
gz_fpath = "../Embeddings/numberbatch-19.08.txt.gz"

In [5]:
# Read every raw (still bytes-encoded) line of the gzip file into memory,
# with tqdm reporting progress as the file is consumed
with gzip.open(gz_fpath, 'rb') as infile:
    gz_lines = list(tqdm(infile))

9161913it [01:18, 117339.40it/s]


In [6]:
# Number of raw lines read from the file
len(gz_lines)

9161913

In [None]:
def get_key(x):
    """Return the first whitespace-separated field of a decoded line (the embedding key)."""
    return x.split()[0]


# One key per raw line, in the same order as gz_lines
keys = [get_key(raw_line.decode()) for raw_line in tqdm(gz_lines)]

  2%|▏         | 188853/9161913 [00:03<02:59, 49958.90it/s]

In [19]:
def line_to_np(gz_line):
    """Convert one raw (bytes) embedding line into a 1-D numpy array of its values.

    The first whitespace-separated field (the key) is dropped; every remaining
    field is parsed with float().
    """
    fields = gz_line.decode().split()
    return np.array(list(map(float, fields[1:])))

In [20]:
# Parse every raw line into its vector, in the same order as gz_lines
gz_np = [line_to_np(gz_line) for gz_line in tqdm(gz_lines)]

100%|██████████| 1913806/1913806 [02:12<00:00, 14408.39it/s]


In [21]:
# Bulk-insert the parsed data; keys and gz_np are both derived from gz_lines in the same order
xlang_model.add_vectors(keys, gz_np)

In [22]:
# Serialize the populated model; the relative filename puts it in the current working directory
xlang_model.save("xlang_model.kv")

In [23]:
# Sanity check: how many keys ended up in the model
len(xlang_model.key_to_index)

1913806

In [24]:
"/c/en/yes" in xlang_model.key_to_index

True

In [25]:
# Peek at 10 consecutive keys to eyeball what was loaded
# (the offset looks arbitrary — any index inside the key list works)
start = 553620
list(xlang_model.key_to_index.keys())[start:start+10]

['/c/de/warenumschlages',
 '/c/de/warenumschlags',
 '/c/de/warenumschlagskredit',
 '/c/de/warenumschläge',
 '/c/de/warenumschlägen',
 '/c/de/warenuntersuchung',
 '/c/de/warenursprung',
 '/c/de/warenverfügbarkeits_garantie',
 '/c/de/warenverkauf',
 '/c/de/warenverkaufe']

In [26]:
'/c/en/weird' in xlang_model.key_to_index

True

In [27]:
# Membership tested on the model object itself rather than on key_to_index
'/c/en/yes_no' in xlang_model

True

### New version, with the language-separated files

From https://zenodo.org/record/4911598

In [1]:
import zipfile

In [4]:
def extract_zip(zip_fpath):
    """Read every member of a zip archive into memory.

    Parameters
    ----------
    zip_fpath : str
        Path of the zip archive to read.

    Returns
    -------
    dict mapping each member name (as listed by ``namelist()``) to that
    member's contents as bytes.
    """
    # The context manager closes the archive handle once all members are read,
    # instead of leaving it open until garbage collection
    with zipfile.ZipFile(zip_fpath) as input_zip:
        return {name: input_zip.read(name) for name in input_zip.namelist()}

In [36]:
# Paths of the Vietnamese per-language archive and of the .bin file extracted from it
vi_fpath = "../Embeddings/numberbatch-19.08-vi.zip"
vi_extracted_fpath = "../Embeddings/numberbatch-19.08-vi/numberbatch-19.08-vi.bin"

In [32]:
#with zipfile.ZipFile(vi_fpath) as myzip:
#    with myzip.open('numberbatch-19.08-vi.bin', 'r') as myfile:
#        result = myfile.read()

In [74]:
import os

import gensim

In [75]:
# Language code of the per-language archive to convert; the zip path is derived from it
# (presumably edited by hand and the cells below re-run for each language — TODO confirm)
lang = "it"
zip_fpath = f"../Embeddings/numberbatch-19.08-{lang}.zip"

In [76]:
# Pull this language's .bin member out of the archive. extract() is called without
# a target directory and returns the path of the file it wrote.
bin_fname = f"numberbatch-19.08-{lang}.bin"
with zipfile.ZipFile(zip_fpath) as lang_zip:
    extracted_fpath = lang_zip.extract(bin_fname)

In [77]:
# Load the extracted file as binary word2vec-format vectors
lang_model = gensim.models.KeyedVectors.load_word2vec_format(extracted_fpath, binary=True)

In [78]:
# Save the per-language model. Create the output directory first so the save
# does not fail on a fresh checkout where ./nb_models does not exist yet.
os.makedirs("./nb_models", exist_ok=True)
lang_model.save(f"./nb_models/nb_{lang}.kv")

In [79]:
# And now we can delete the extracted .bin file
# (the model has already been saved as a .kv file, so only the temporary extraction is removed)
os.remove(extracted_fpath)