In [1]:
import numpy as np
import torch, pdb
import torch.nn as nn
import torch.nn.init as init
from torch.autograd import Variable
from itertools import ifilter
from IPython.core.debugger import set_trace
from random import randint

In [2]:
def get_sim(wrd, k, mat, word2index):
    """Print the ``k`` vocabulary words whose rows in ``mat`` score highest against ``wrd``.

    The score is the dot product between the row of ``wrd`` and every row of
    ``mat``; words are printed one per line, highest score first.

    Args:
        wrd: Query word.
        k: Maximum number of words to print.
        mat: 2-D tensor with one row per vocabulary entry, indexed by the
            values of ``word2index``.
        word2index: Mapping from word to its row index in ``mat``.

    Returns:
        Always ``None``. Nothing is printed when ``wrd`` is not in
        ``word2index``.
    """
    if wrd not in word2index:
        return None

    # Invert the mapping that was passed in rather than relying on a notebook
    # global, so the function always agrees with the matrix it is given.
    idx2word = {idx: word for word, idx in word2index.items()}

    query = mat[word2index[wrd], :].unsqueeze(1)
    scores = torch.mm(mat, query)
    _, order = torch.sort(scores, 0, descending=True)

    # tolist() yields plain Python ints; tensor elements are not usable as
    # dict keys because tensors hash by identity.
    for row in order[:k, 0].tolist():
        print(idx2word[row])

def get_glovedict(glove_path):
    """Collect the lower-cased first token of every line in ``glove_path``.

    Each line is split once on the first space; the part before it is
    stripped and lower-cased, the remainder is ignored.

    Args:
        glove_path: Path of the text file to read.

    Returns:
        A ``set`` of the normalised first tokens.

    Raises:
        ValueError: If a line contains no space (the split cannot be unpacked).
    """
    with open(glove_path) as handle:
        # Unpacking into two targets keeps the failure on space-less lines.
        return {
            token.strip().lower()
            for token, _rest in (row.split(' ', 1) for row in handle)
        }
    
def get_gloveready(glove_path, vocab_size, dim, word2index):
    """Build a ``(vocab_size, dim)`` float tensor of vectors read from ``glove_path``.

    Each line of the file is split on its first space into a word and a
    whitespace-separated list of floats. The word is stripped and lower-cased;
    if it is a key of ``word2index`` its vector is written to that row. When
    several lines normalise to the same word, the last one wins.

    Args:
        glove_path: Path of the embeddings text file.
        vocab_size: Number of rows of the returned tensor.
        dim: Number of columns of the returned tensor; must equal the number
            of floats on every matching line.
        word2index: Mapping from word to row index.

    Returns:
        The filled tensor. Rows whose word never appears in the file are all
        zeros.
    """
    # Start from zeros: torch.FloatTensor(rows, cols) returns uninitialised
    # memory, which would leave arbitrary values in rows that are never filled.
    pretrained_weight = torch.zeros(vocab_size, dim)
    fnd = 0
    with open(glove_path) as f:
        for line in f:
            word, vec = line.split(' ', 1)
            word = word.strip().lower()
            if word in word2index:
                ind = word2index[word]
                pretrained_weight[ind, :] = torch.FloatTensor([float(x) for x in vec.split()])
                fnd += 1

    print('Found {0} words with glove vectors, total was {1}'.format(fnd, vocab_size))
    return pretrained_weight

def process_lines(data):
    """Extract sufficiently different, high-scoring word pairs from ``data``.

    Every line is split on ``"|||"``. The score is read from the fourth field
    (second space-separated token, value after ``"="``) and lines scoring
    below 3.3 are dropped. The second and third fields are the two words; a
    pair is kept only when neither word contains ``".pdf"``, both are purely
    alphabetic, their edit distance exceeds half the shorter length plus two,
    and the pair has not been seen in either order.

    Args:
        data: Iterable of text lines (presumably PPDB-formatted - verify
            against the input file).

    Returns:
        ``(pairs, vocab)`` where ``pairs`` is a set of ``"word1 word2"``
        strings and ``vocab`` maps each word to the number of kept pairs it
        appears in.
    """
    pairs = set()
    vocab = {}

    for raw in data:
        fields = raw.split("|||")
        score = float(fields[3].split(" ")[1].split("=")[1])
        if score < 3.3:
            continue

        first, second = fields[1].strip(), fields[2].strip()

        # Guard clauses: skip file-name-like tokens and anything non-alphabetic.
        if ".pdf" in first or ".pdf" in second:
            continue
        if not (first.isalpha() and second.isalpha()):
            continue

        # Near-identical spellings (small edit distance) are not kept.
        if editdist_score(first, second) <= min(len(first), len(second))/2 + 2:
            continue

        forward = first + " " + second
        if forward in pairs or second + " " + first in pairs:
            continue

        pairs.add(forward)
        for token in (first, second):
            vocab[token] = vocab.get(token, 0) + 1

    return pairs, vocab

def get_vocab(min_freq, flName=None, lines=None):
    """Run ``process_lines`` on a file's lines or on the given ``lines``.

    Args:
        min_freq: Accepted but not used by this function.
        flName: Optional path; when given, its lines replace ``lines``.
        lines: Lines to process when ``flName`` is ``None``.

    Returns:
        Whatever ``process_lines`` returns for the selected lines.
    """
    if flName is None:
        return process_lines(lines)

    with open(flName) as handle:
        file_lines = handle.readlines()
    return process_lines(file_lines)

def get_chunks(lines, cn):
    """Split ``lines`` into consecutive chunks of ``len(lines) // cn`` items.

    When the length is not a multiple of ``cn`` one extra, shorter chunk
    holds the remainder, so every item appears in exactly one chunk and no
    chunk is empty.

    Args:
        lines: Sliceable sequence to split.
        cn: Desired number of full-size chunks; expected to be a positive
            integer (``0`` raises ``ZeroDivisionError``).

    Returns:
        A list of slices of ``lines``; empty when ``lines`` is empty.
    """
    # Clamp to 1 so that fewer than ``cn`` items still produce one-item chunks
    # instead of a zero-width slice that would drop all the data.
    chunk_size = max(1, len(lines) // cn)

    # Step over start offsets rather than counting to chunk_size * cn, which
    # would walk past the end and append empty chunks.
    return [
        lines[start:start + chunk_size]
        for start in range(0, len(lines), chunk_size)
    ]

def editdist_score(p1, p2):
    """Return the edit distance between ``p1`` and ``p2``.

    Insertions, deletions and substitutions each cost 1; matching characters
    cost nothing.

    Args:
        p1: First string.
        p2: Second string.

    Returns:
        The minimum number of single-character edits turning ``p1`` into ``p2``.
    """
    # prev[j] is the distance between the processed prefix of p1 and p2[:j];
    # before any character of p1 is consumed that is simply j insertions.
    prev = list(range(len(p2) + 1))

    for row, ch1 in enumerate(p1, start=1):
        curr = [row]  # distance from p1[:row] to the empty prefix of p2
        for col, ch2 in enumerate(p2, start=1):
            if ch1 == ch2:
                curr.append(prev[col - 1])
            else:
                curr.append(min(prev[col], curr[col - 1], prev[col - 1]) + 1)
        prev = curr

    return prev[-1]

def filter_data(pairs, word2index, out_path="ppdb-processed.txt"):
    """Keep the pairs whose two words are both in ``word2index`` and save them.

    Args:
        pairs: Iterable of ``"word1 word2"`` strings (exactly one space).
        word2index: Container of known words; only membership is tested.
        out_path: File that receives the kept pairs, one per line. It is
            overwritten. Defaults to ``"ppdb-processed.txt"``.

    Returns:
        A ``set`` with the kept pair strings.

    Raises:
        ValueError: If a pair does not split into exactly two words.
    """
    new_pairs = set()
    # The context manager closes the file even when a malformed pair raises.
    with open(out_path, "w") as fp:
        for line in pairs:
            p1, p2 = line.split(" ")
            if p1 in word2index and p2 in word2index:
                new_pairs.add(line)
                fp.write(line + "\n")

    return new_pairs

In [3]:
# Run configuration.
glove_path = "glove.6B.50d.txt"
dim = 50
min_count = 1
neg_exmpl = 10

# Words available in the GloVe file, and the filtered pairs with their
# per-word pair counts.
g_vocab = get_glovedict(glove_path)
pairs, tok_freq = get_vocab(min_count, flName="ppdb-2.0-l-lexical")

# Only words that appear in both sources can be indexed.
vocab = set(tok_freq.keys())
vocab = vocab.intersection(g_vocab)
word2index, index2word = {}, {}

for wrd in vocab:
    if tok_freq[wrd] < min_count:
        # Too rare: zero its count and leave it out of the index.
        tok_freq[wrd] = 0
        continue
    # Both maps grow together, so the next free index is their current size.
    word2index[wrd] = len(word2index)
    index2word[word2index[wrd]] = wrd

# Drop pairs that mention a word without an index (also writes them to disk).
pairs = filter_data(pairs, word2index)
vocab_size = len(index2word)
print("Data ready: {} {} {}".format(vocab_size, len(pairs), len(vocab)))

Data ready: 21232 193269 21232


In [4]:
# Build the (vocab_size x dim) embedding matrix from the GloVe file, then
# rescale it with torch.nn.functional.normalize (library defaults: p=2 along
# dim=1, i.e. unit L2 norm per row) so that the dot products computed in
# get_sim behave as cosine similarities.
pretrained_weight = get_gloveready(glove_path, vocab_size, dim, word2index)
pretrained_weight = torch.nn.functional.normalize(pretrained_weight)

Found 21232 words with glove vectors, total was 21232


In [5]:
# Sanity check: print the 10 words whose rows of pretrained_weight have the
# largest dot product with the row for 'young'.
get_sim('young', 10, pretrained_weight, word2index)

young
who
friends
fellow
younger
man
parents
friend
couple
boys


In [7]:
# Same nearest-word probe for a verb: 'occur'.
get_sim('occur', 10, pretrained_weight, word2index)

occur
occurring
occurs
arise
affect
affected
occurrence
possibly
cause
due


In [9]:
# Same nearest-word probe for 'summer'.
get_sim('summer', 10, pretrained_weight, word2index)

summer
winter
beginning
starting
day
during
days
year
next
years


In [10]:
# Same nearest-word probe for 'eat'.
get_sim('eat', 10, pretrained_weight, word2index)

eat
eating
ate
eaten
eats
cooked
fish
vegetables
feed
eggs


In [11]:
# Same nearest-word probe for 'fear'.
get_sim('fear', 10, pretrained_weight, word2index)

fear
worry
danger
fears
anger
blame
fearing
worried
threatening
cause


In [13]:
# Same nearest-word probe for the short interjection 'hi'.
get_sim('hi', 10, pretrained_weight, word2index)

hi
ho
ai
tu
yo
ya
techs
wow
ok
hurts


In [14]:
# Same nearest-word probe for 'salary'.
get_sim('salary', 10, pretrained_weight, word2index)

salary
salaries
payroll
bonuses
minimum
pay
payments
guaranteed
paid
paying
