In [1]:
import numpy as np
import operator
from utils.methods import length, unit, cosine, find_vecs, sort_disct, find_sim_cos, hyper_test, find_insts
from utils.tree import Tree
from random import shuffle
from utils.corpuscont import retrieve_corpus
import numpy as np

In [2]:
class Glove():
    """In-memory table of GloVe word vectors read from a plain-text file.

    Each word is stored as an entry of the instance ``__dict__`` whose value
    is the word's embedding (a 1-D ``numpy`` array), so ``model[word]``
    returns the vector.  After loading, ``index2word`` maps the 0-based load
    position of every word to the word itself.

    NOTE(review): words share the instance namespace with ``index2word``; a
    vocabulary entry spelled exactly ``index2word`` is replaced by the index
    mapping once loading finishes.
    """

    def __init__(self, glove_filename):
        """Load every vector found in ``glove_filename``."""
        self.loadGloveModel(glove_filename)

    def __getitem__(self, key):
        """Return the value stored under ``key``; raises ``KeyError`` if absent."""
        return self.__dict__[key]

    def __setitem__(self, key, value):
        """Store ``value`` under ``key`` in the instance dictionary."""
        self.__dict__[key] = value

    def loadGloveModel(self, gloveFile):
        """Parse a whitespace-separated ``word v1 v2 ...`` file into this object.

        Args:
            gloveFile: path of the text file to read (decoded as UTF-8).

        Raises:
            OSError: if the file cannot be opened.
            ValueError: if a component of a vector is not a valid float.
        """
        index2word = {}
        cnt = 0
        # The context manager closes the handle even when parsing fails; the
        # explicit encoding keeps loading independent of the system locale.
        with open(gloveFile, 'r', encoding='utf-8') as f:
            print(f)
            for line in f:
                splitLine = line.split()
                if not splitLine:
                    # Blank line (e.g. a trailing newline): nothing to load.
                    continue
                word = splitLine[0]
                self[word] = np.array([float(val) for val in splitLine[1:]])
                index2word[cnt] = word
                cnt += 1

        # Assigned last so it always wins over a same-named vocabulary entry.
        self.index2word = index2word

        print("Done.",cnt," words loaded!")


In [3]:
def find_vecs_norm(lst, model):
    """Look up every word of ``lst`` in ``model`` and pass it through ``unit``.

    Args:
        lst: iterable of words; each one must be a valid key of ``model``.
        model: object supporting ``model[word]`` lookups.

    Returns:
        A list with one ``unit(model[word])`` entry per word, in input order.
    """
    return [unit(model[word]) for word in lst]


class Tree:
    """Root vector derived from a small set of support words.

    The support vectors are summed into a direction ``H``; every support
    vector is then scored with ``length(v) * cosine(v, H)`` and the root is
    the unit vector of ``H`` scaled by the first score returned by
    ``sort_disct``.

    Attributes set by construction:
        word_lst: the support words.
        vec_lst: one vector per support word.
        model: the embedding model (``model[word]`` must return a vector).
        sup_vec_sort: result of ``sort_disct`` over the per-word scores.
        len: first value of ``sup_vec_sort``
            (presumably the smallest score - TODO confirm ``sort_disct`` order).
        root: the root vector.  NOTE: this instance attribute replaces the
            ``root`` method on the instance once it has run.
    """

    def __init__(self, model, word_lst, vec_lst, normalize = False):
        """Build the tree and compute its root.

        Args:
            model: embedding model used to look up vectors.
            word_lst: support words.
            vec_lst: pre-computed vectors aligned with ``word_lst``, or
                ``None`` to look them up in ``model``.
            normalize: when ``vec_lst`` is ``None``, fetch the vectors with
                ``find_vecs_norm`` instead of ``find_vecs``.
        """
        # `is None`, not `== None`: comparing a NumPy array with `==` is
        # element-wise and cannot be used as an `if` condition.
        if vec_lst is None:
            if normalize:
                vec_lst = find_vecs_norm(word_lst, model)
            else:
                vec_lst = find_vecs(word_lst, model)
        self.vec_lst = vec_lst
        self.word_lst = word_lst

        self.model = model
        self.root()

    def _build_root(self, unit_members):
        """Shared implementation of ``root`` and ``root_unit``.

        Args:
            unit_members: if true, every support vector is passed through
                ``unit`` before being added to the sum.

        Raises:
            ValueError: if there are no support vectors.
        """
        if len(self.vec_lst) == 0:
            raise ValueError('Tree needs at least one support vector')

        # Size the accumulator from the data itself, so the model does not
        # have to contain a placeholder word just to reveal its dimension.
        H = np.zeros(np.shape(self.vec_lst[0]))
        for vec in self.vec_lst:
            H = H + (unit(vec) if unit_members else vec)
        h_u = unit(H)

        # Score of each support vector against the summed direction
        # (a scalar projection, assuming `cosine` is cosine similarity).
        sup_vec_lens = {}
        for i, w in enumerate(self.word_lst):
            v = self.vec_lst[i]
            sup_vec_lens[w] = length(v) * cosine(v, H)

        self.sup_vec_sort = sort_disct(sup_vec_lens)
        self.len = list(self.sup_vec_sort.values())[0]
        # Shadows the `root` method on this instance from here on.
        self.root = self.len * h_u

    def root(self):
        """Compute ``self.root`` from the plain sum of the support vectors."""
        self._build_root(unit_members=False)

    def root_unit(self):
        """Compute ``self.root`` from the sum of ``unit``-scaled support vectors."""
        self._build_root(unit_members=True)

    def subvec_properties(self):
        """Return a one-line summary of the support words and their scores.

        Returns:
            A string of the form ``'sup. vec. order:  word score;  ...'``.
        """
        output = 'sup. vec. order: '
        for key in self.sup_vec_sort.keys():
            val = self.sup_vec_sort[key]
            output = output + ' ' + key + ' ' + str(val) + '; '
        return output

    def hyper_test(self, v, h):
        """Return 1 if ``norm(v) * cosine(v, h)`` reaches ``norm(h)``, else 0."""
        cos = cosine(v, h)
        thresh = np.linalg.norm(h)
        v_proj = np.linalg.norm(v)*cos
        allowance = 0.0000000001 # allowance to include support vectors
        return 1 if thresh - allowance < v_proj else 0

    def find_insts(self, voc_size=11000, top_n=52):
        """Rank the vocabulary words that pass ``hyper_test`` against the root.

        Args:
            voc_size: number of leading ``model.index2word`` entries to scan;
                capped at the size of the vocabulary.
            top_n: maximum number of results to return.

        Returns:
            Up to ``top_n`` ``(word, score)`` pairs, highest score first,
            where the score is ``cosine(root, vec) * length(vec)``.
        """
        index2word = self.model.index2word
        limit = min(voc_size, len(index2word))

        len_dict = {}
        for i in range(limit):
            word = index2word[i]
            w_vec = self.model[word]
            if self.hyper_test(w_vec, self.root) == 1:
                len_dict[word] = cosine(self.root, w_vec) * length(w_vec)

        # Ascending sort then reversal, so ties keep the original tie order.
        ranked = sorted(len_dict.items(), key=operator.itemgetter(1))[::-1]
        return ranked[:top_n]
    
def find_insts(model, source, normalized = False, voc_size = 50000):
    """List the vocabulary words that pass ``hyper_test`` against ``source``.

    Args:
        model: embedding model.  Its word list is read from
            ``model.index2word`` or, when that attribute does not exist,
            from ``model.vocabulary.words``; ``model[word]`` must return the
            word's vector.
        source: vector the candidates are tested against.
        normalized: if true, each candidate is passed through ``unit``
            before the test.
        voc_size: number of leading vocabulary entries to scan; capped at
            the size of the vocabulary.

    Returns:
        At most 1002 lower-cased words, ordered by decreasing
        ``cosine(source, vec) * length(vec)``.  Lower-casing can produce
        duplicates.
    """
    # Pick the word list explicitly instead of relying on a bare `except`,
    # which hid unrelated errors and restarted the scan on any failure.
    try:
        vocabulary = model.index2word
    except AttributeError:
        vocabulary = model.vocabulary.words
    limit = min(voc_size, len(vocabulary))

    words = []
    for i in range(limit):
        word = vocabulary[i]
        instance = unit(model[word]) if normalized else model[word]
        if hyper_test(instance, source) == 1:
            words.append(word)

    # NOTE(review): the ranking uses the raw model vector even when
    # `normalized` is true - confirm this is intended.
    len_dict = {}
    for w in words:
        w_vec = model[w]
        len_dict[w] = cosine(source, w_vec) * length(w_vec)

    # Ascending sort then reversal, so ties keep the original tie order.
    ranked = sorted(len_dict.items(), key=operator.itemgetter(1))[::-1]
    return [word.lower() for word, _ in ranked[:1002]]



In [4]:
def run(cats, inst_pp, itter, seed=None):
    """Evaluate category completion and print averaged recall/precision/F1.

    For every fraction in ``inst_pp`` and every category, a random sample of
    the category's words is used to build a ``Tree``; the words returned by
    ``find_insts`` for its root are compared with the full category.

    Relies on two notebook globals that must exist before the call:
    ``model`` (the embedding model) and ``dict_low`` (if true the lower-cased
    words are used as examples, otherwise the words as given).

    Args:
        cats: sized collection of categories, each a list of words.
        inst_pp: fractions (e.g. ``0.1`` for 10%) of each category used as
            examples.
        itter: number of random repetitions averaged per category.
        seed: optional seed making the shuffles reproducible; ``None`` keeps
            the module-level random state.

    Returns:
        None.  Results are printed once per fraction.
    """
    import random  # local import: only `shuffle` is imported at file level

    normalization = True
    mix = random.shuffle if seed is None else random.Random(seed).shuffle

    for pp in inst_pp:
        precision, recall = 0.0, 0.0
        for cat_upper in cats:
            cat = [t.lower() for t in cat_upper]
            if not cat:
                # An empty category has nothing to sample or to recall; it
                # contributes zero to both sums.
                continue
            cat_set = set(cat)
            # Shuffle a private list so the caller's corpus is not reordered.
            pool = cat if dict_low else list(cat_upper)
            # Rounding can give 0 for small categories; a Tree needs at
            # least one support word.
            examples = max(1, int(round(len(cat) * pp)))

            prec, rec = 0.0, 0.0
            for _ in range(itter):
                mix(pool)
                h_mail = Tree(model, pool[:examples], None, normalization)

                instances_all = find_insts(model, h_mail.root, normalization)
                # Order-preserving removal of duplicates.
                instances = list(dict.fromkeys(instances_all))
                hit = sum(1 for ins in instances if ins in cat_set)

                prec += hit / len(instances) if instances else 0.0
                rec += hit / len(cat)

            recall += rec/itter
            precision += prec/itter

        n_cats = len(cats)
        mean_recall = recall/n_cats if n_cats else 0.0
        mean_precision = precision/n_cats if n_cats else 0.0
        if recall + precision > 0:
            # F1 of the two means, written in terms of the running sums.
            f1 = round(2*recall*precision/(recall+precision)/n_cats, 3)
        else:
            f1 = 0.0

        print('pp: ', pp)
        print('Overall results:')
        print('recall   : ', mean_recall)
        print('precision: ', mean_precision)
        print('F1: ', f1)
        print(' ')

In [5]:
# Load the category corpora.
# `dict_low` is passed to `retrieve_corpus` and is also read as a global by
# `run`, where it selects the lower-cased words as examples
# (presumably it asks `retrieve_corpus` for lower-cased text - TODO confirm).
dict_low = True
path1 = "datasets/googe_cat_corpus.txt"
# path2 = "datasets/wordnet_cat_corpus.txt"
path3 = "datasets/closed_cat_corpus.txt"

googe_cat_corpus = retrieve_corpus(path1, dict_low)
# wordnet_cat_corpus = retrieve_corpus(path2, dict_low)
closed_cat_corpus = retrieve_corpus(path3, dict_low)

# Load the embedding model; `run` reads `model` as a global, so this cell
# must be executed before any experiment cell.
model = Glove('data/glove/glove.6B.300d.txt') 

<_io.TextIOWrapper name='data/glove/glove.6B.300d.txt' mode='r' encoding='UTF-8'>
Done. 400001  words loaded!


In [None]:
# Run the experiments on both corpora with the same settings.

inst_pp = [0.1, 0.2, 0.3, 0.4] # fractions of each category used as reference data: 0.1 = 10%, 0.2 = 20%, and so on
itter = 5 # each experiment is repeated 5 times and averaged to reduce variation

print('Experiments on the Closed Category corpus:')
run(closed_cat_corpus, inst_pp, itter)

print('Experiments on the Google Category corpus:')
run(googe_cat_corpus,  inst_pp, itter)



Experiments on the Closed Category corpus:
pp:  0.1
Overall results:
recall   :  0.2074161087954192
precision:  0.9377728804038097
F1:  0.34
 
pp:  0.2
Overall results:
recall   :  0.3403835627973559
precision:  0.8766773475648341
F1:  0.49
 
pp:  0.3
Overall results:
recall   :  0.5805597659045935
precision:  0.7329449647923203
F1:  0.648
 
pp:  0.4
Overall results:
recall   :  0.7322516946654877
precision:  0.6286080939288706
F1:  0.676
 
Experiments on the Google Category corpus:
