In [1]:
# Load the `autotime` IPython extension for this kernel session.
# NOTE(review): `autotime` does not ship with IPython itself - presumably it
# comes from the third-party ipython-autotime package; confirm it is installed.
%load_ext autotime

In [2]:
import gzip

import numpy as np
from sklearn.preprocessing import normalize

In [3]:
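# Scratch fixture for debugging and testing (kept commented out).
# NOTE: retrofit() iterates `neighbours.items()`, i.e. it expects a dict
# {word_idx: [neighbour_idx, ...]}. To use the fixture below, pair it up first:
#   neighbours = dict(zip(common_idcs.tolist(), lex_neigh_to_comm_idcs))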
# embeddings = np.random.randint(low=0, high=10, size=(10,3))
# common_idcs = np.array([0,1,5,8])

# lex_neigh_to_comm_idcs = [[2,6],
#                       [7,8,9],
#                       [3],
#                       [1,12]]
# embeddings

In [4]:
def read_embeddings(filepath, norm=None):
    r"""Read word embeddings from a whitespace-separated text file.

    Every non-blank line is expected to hold a word followed by the
    components of its vector. Words are lower-cased. A path ending in
    ``.gz`` is read through ``gzip`` in text mode; anything else is read
    with the built-in ``open``.

    Parameters
    ----------
    filepath : str or path-like
        Location of the embeddings file.
    norm : {'l1', 'l2'} or None, optional
        provide norm='l1' or 'l2' in case vectors are not already normalized.
        With the default ``None`` the rows are returned as read.

    Returns
    -------
    w2i : dict
        Maps each lower-cased word to a row index. If the same lower-cased
        word occurs on several lines, it maps to the last of those rows.
    i2w : list
        Row index -> word, one entry per line read.
    embeddings : numpy.ndarray
        One row per line read, in file order.
    """
    vectors = []
    w2i = dict()
    i2w = list()
    open_fn = gzip.open if str(filepath).endswith('.gz') else open

    # Explicit text mode: gzip.open's plain "r" means *binary*, which would
    # hand back bytes instead of str. For the built-in open "rt" == "r".
    with open_fn(filepath, "rt") as f:
        for line in f:
            fields = line.strip().split()
            if not fields:
                # blank (or whitespace-only) line, e.g. a trailing newline
                continue
            word = fields[0].lower()
            w2i[word] = len(i2w)
            i2w.append(word)
            vectors.append([float(value) for value in fields[1:]])

    embeddings = np.array(vectors)
    if norm:
        embeddings = normalize(embeddings, axis=1, norm=norm)

    return w2i, i2w, embeddings

In [5]:
def read_lexicon(filepath, w2i=None):
    """Read a lexicon file of ``head neighbour neighbour ...`` lines.

    Lines are lower-cased and split on whitespace. Blank lines and lines
    whose first token is not purely alphabetic are skipped, as are
    non-alphabetic neighbour tokens. If a head word occurs on several
    lines, the last occurrence wins.

    Parameters
    ----------
    filepath : str
        Location of the lexicon file.
    w2i : dict or None, optional
        Word -> index mapping. When given (even if empty), heads and
        neighbours are translated to indices and anything missing from the
        mapping is dropped. When ``None``, the words themselves are kept.

    Returns
    -------
    dict
        ``{head: [neighbour, ...]}`` keyed by index (``w2i`` given) or by
        word (``w2i`` is ``None``). Heads left without neighbours are omitted.
    """
    # `is not None` rather than truthiness: an empty vocabulary must still
    # produce an index-keyed (and therefore empty) lexicon, not a word-keyed one.
    use_indices = w2i is not None
    lexicon = dict()

    with open(filepath, "r") as f:
        for line in f:
            words = line.lower().strip().split()
            if not words or not words[0].isalpha():
                # blank line, or a head token containing non-letters
                continue
            head, related = words[0], words[1:]
            if use_indices:
                if head in w2i:
                    lexicon[w2i[head]] = [w2i[x] for x in related
                                          if x.isalpha() and x in w2i]
            else:
                lexicon[head] = [w for w in related if w.isalpha()]

    # filter word/word_idx with empty neighbours
    return {k: v for k, v in lexicon.items() if v}

In [6]:
def indexify_lexicon(w2i, lexicon):
    """Convert a word-keyed lexicon into an index-keyed one.

    Head words absent from ``w2i`` are dropped, as are individual
    neighbours absent from ``w2i``. Heads whose neighbour list ends up
    empty are removed from the result.
    """
    mapped = {
        w2i[head]: [w2i[item] for item in related if item in w2i]
        for head, related in lexicon.items()
        if head in w2i
    }
    # discard entries that lost all of their neighbours
    for key in [k for k, items in mapped.items() if not items]:
        del mapped[key]
    return mapped

In [7]:
def retrofit(embeddings, neighbours, n_iter=10):
    """Pull word vectors towards the vectors of their lexicon neighbours.

    For every word ``w`` with ``k >= 1`` usable neighbours, each iteration
    sets::

        new[w] = (k * original[w] + sum(new[n] for n in neighbours of w)) / (2 * k)

    All words are updated together from the previous iteration's values.
    Words without usable neighbours keep their original vector.

    Parameters
    ----------
    embeddings : array-like, 2-D
        One row per word. The input is not modified.
    neighbours : dict
        ``{word_idx: [neighbour_idx, ...]}``. Keys and neighbour indices
        outside ``0 .. n_words - 1`` are ignored.
    n_iter : int, optional
        Number of update iterations (default 10).

    Returns
    -------
    numpy.ndarray
        New array with the same shape as ``embeddings``.
    """
    embeddings = np.asarray(embeddings)
    n_words = embeddings.shape[0]

    # append extra row with all 0s as unk token; padding index -1 points at it,
    # so padded slots add nothing to the neighbour sums
    unk_emb = np.zeros((1, embeddings.shape[1]))
    padded = np.vstack((embeddings, unk_emb))
    new_embeddings = padded.copy()

    update_idcs, neigh_idcs = [], []
    for idx, neigh in neighbours.items():
        # Reject negative indices as well as too-large ones: a negative value
        # would wrap around, and -1 in particular is the padding sentinel.
        if not 0 <= idx < n_words:
            continue
        valid = [j for j in neigh if 0 <= j < n_words]
        if valid:
            update_idcs.append(idx)
            neigh_idcs.append(valid)

    if not update_idcs:
        # nothing to retrofit (empty lexicon or no overlap with the vocabulary)
        return new_embeddings[:-1]

    update_idcs = np.array(update_idcs)
    pad_len = max(map(len, neigh_idcs))
    neigh_idcs = np.array([row + [-1] * (pad_len - len(row))
                           for row in neigh_idcs])
    neigh_counts = (neigh_idcs > -1).sum(axis=1).reshape((-1, 1))

    # loop-invariant parts of the update
    anchor = neigh_counts * padded[update_idcs]
    denom = 2 * neigh_counts

    for _ in range(n_iter):
        neigh_sum = new_embeddings[neigh_idcs].sum(axis=1)
        new_embeddings[update_idcs, :] = (anchor + neigh_sum) / denom

    # remove last appended rows with 0s as unk weights
    return new_embeddings[:-1]

In [8]:
# Input files, given as paths relative to the working directory.
# Text embeddings file, parsed by read_embeddings() below.
w2v_filepath = "sample_vec.txt"
# Lexicon file, parsed by read_lexicon() below.
lexicon_filepath = "framenet.txt"

In [9]:
# Load the vectors as read (norm=None skips normalisation), together with the
# word -> index and index -> word lookups.
w2i, i2w, embeddings = read_embeddings(w2v_filepath, norm=None)
# Passing w2i makes read_lexicon() return row indices instead of words, which
# is the form retrofit() consumes.
neighbours = read_lexicon(lexicon_filepath, w2i=w2i)

In [10]:
# Run 10 retrofitting iterations. retrofit() works on its own copy, so
# `embeddings` is left untouched and `new_embeddings` is a separate array.
new_embeddings = retrofit(embeddings, neighbours, n_iter=10)