In [10]:
import io

def load_vectors(path="datasets/wiki-news-300d-1M.vec"):
    """
    Load a word-vector text file into a dictionary.

    The first line of the file must hold two integers (a header); every
    following line is a token followed by its space-separated components.

    Parameters
    ----------
    path : str
        Path to the ``.vec`` text file.

    Returns
    -------
    dict
        Maps each token (str) to its vector as a ``list`` of ``float``.

    Raises
    ------
    ValueError
        If the header is not made of exactly two integers, or if a vector
        component cannot be parsed as a float.
    """
    # `with` guarantees the handle is closed even when parsing raises.
    with io.open(path, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        # The header is parsed only to validate and skip it; its values are
        # not needed to build the dictionary.
        _count, _dim = map(int, fin.readline().split())
        data = {}
        for line in fin:
            tokens = line.rstrip().split(' ')
            # Materialise the vector: a bare map() is a lazy, one-shot
            # iterator in Python 3 and would be empty after the first read.
            data[tokens[0]] = list(map(float, tokens[1:]))
    return data


In [68]:
import numpy as np
import pandas as pd
import tqdm

def read_english_w2v(path="datasets/wiki-news-300d-1M.vec", lim=100000):
    """
    Read word2vec-style text embeddings into a pandas DataFrame.

    The file starts with a header line ``"<nb_words> <dims>"``; each following
    line holds a word, a space, then the space-separated vector components.

    Parameters
    ----------
    path : str
        Path to the ``.vec`` text file.
    lim : int or None
        Maximum number of words to load, to bound time and memory.
        ``None`` loads every word announced by the header.

    Returns
    -------
    pandas.DataFrame
        One row per word (the index), one float32 column per dimension.
        Holds fewer than ``lim`` rows when the file is shorter than announced.

    Raises
    ------
    ValueError
        If the header is malformed or a row does not contain exactly
        ``dims`` parseable components.
    """
    # Pin the encoding so the result does not depend on the platform default.
    with open(path, encoding="utf-8") as f:
        total_words, dims = [int(field) for field in f.readline().split()]
        nb_words = total_words if lim is None else max(0, min(lim, total_words))
        words = []
        vectors = np.zeros((nb_words, dims), dtype=np.float32)

        # Blank lines (e.g. a trailing newline) carry no vector; skip them.
        rows = (line for line in f if line.strip())

        # zip() stops as soon as range() is exhausted, so at most nb_words
        # rows are consumed and the progress bar is allowed to reach 100%
        # (breaking out of the loop manually left it one step short).
        for current, line in tqdm.tqdm(zip(range(nb_words), rows), total=nb_words):
            word, _, values = line.partition(" ")
            vec = np.fromstring(values, sep=" ", dtype=np.float32)
            if vec.shape[0] != dims:
                raise ValueError(
                    "row %d (%r): expected %d components, got %d"
                    % (current, word, dims, vec.shape[0])
                )
            vectors[current] = vec
            words.append(word)

    # The header may announce more rows than the file contains; keep only the
    # rows actually read so the data and the index always have equal length.
    return pd.DataFrame(vectors[:len(words)], index=words)

In [69]:
# Load the embeddings with the default path and word limit; `df` has one row per word.
df = read_english_w2v()

100%|█████████▉| 99999/100000 [00:02<00:00, 35099.73it/s]


In [71]:
def test_english_w2v():
    d = read_english_w2v()
    assert not df.isnull().values.any()
# Run the check right away so this cell fails if the loaded vectors contain NaNs.
test_english_w2v()

100%|█████████▉| 99999/100000 [00:02<00:00, 35605.32it/s]


In [41]:
# Peek at the first two keys of `d`.
list(d)[:2]

['999994', ',']

In [42]:
# Remove the "999994" key (presumably the file header's word count picked up
# as a token — confirm against the loader that produced `d`).
# Guarded so the cell can be re-run: a bare `del` raises KeyError on the
# second run, or when the loader already skipped the header.
if "999994" in d:
    del d["999994"]

In [43]:
# Display `d` as a DataFrame: one column per key.
pd.DataFrame.from_dict(d, orient="columns")

Unnamed: 0,",",the,.,and,of,to,in,a,"""",:,...,32.37,Majnoun,Bartenura,Melkam,calligraffiti,whitespotted,sacoglossan,Iseya,Bayyah,Vilaya
0,0.1073,0.0897,0.0004,-0.0314,-0.0063,0.0495,-0.0234,0.0047,-0.0899,-0.0221,...,0.0141,0.0056,-0.0238,-0.0314,-0.0134,0.0112,-0.0870,0.0461,-0.0907,0.0855
1,0.0089,0.0160,0.0032,0.0149,-0.0253,0.0411,-0.0268,0.0223,-0.0402,-0.0133,...,-0.0294,-0.0204,0.0075,-0.0084,0.0627,0.0070,-0.0164,-0.0508,-0.0645,0.0531
2,0.0006,-0.0571,-0.0204,-0.0205,-0.0338,0.0041,-0.0838,-0.0087,-0.0220,0.0161,...,0.0236,-0.1280,0.0108,0.0726,-0.0136,0.0031,0.0312,-0.1492,-0.1101,0.0040
3,0.0055,0.0405,0.0479,0.0557,0.0178,0.0309,0.0386,0.0250,0.0476,0.0824,...,-0.0400,-0.0566,0.0106,-0.0413,-0.1702,0.0462,0.0745,-0.0736,0.0780,0.0444
4,-0.0646,-0.0696,-0.0450,0.0205,-0.0966,-0.0044,-0.0321,-0.0660,0.0262,0.1872,...,-0.0036,-0.0172,-0.0046,-0.0105,-0.0258,0.0041,-0.0854,-0.1078,-0.0094,-0.0811
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,0.0276,0.0559,0.0304,-0.0214,0.0517,-0.0327,-0.0179,0.0463,0.0226,-0.0386,...,0.0688,0.1301,0.0212,-0.0600,0.0694,0.0944,0.0599,-0.0228,0.0359,0.0867
296,0.0186,0.0591,0.0290,0.0267,0.0088,0.0070,-0.0142,0.0178,-0.0189,-0.0474,...,0.0821,0.1030,0.0273,0.0605,-0.0358,-0.0071,0.0236,0.0262,0.0772,0.0782
297,0.0050,0.1559,0.2070,0.0980,0.1155,0.2371,0.1048,0.1479,-0.0963,-0.1911,...,-0.0114,-0.0159,-0.0836,-0.2814,0.0646,-0.1502,0.1157,-0.2905,-0.2081,-0.1947
298,0.1173,-0.0254,0.0689,0.0893,0.0073,-0.0298,-0.0148,0.1324,0.0613,-0.0541,...,-0.0156,0.0099,0.0843,0.0494,-0.0019,-0.0010,0.0791,-0.0642,0.0312,-0.0116
