In [1]:
import numpy as np
import operator
from utils.methods import length, unit, cosine, find_vecs, sort_disct, find_sim_cos, hyper_test, find_insts
from utils.tree import Tree
from random import shuffle
from utils.corpuscont import retrieve_corpus
import numpy as np
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score, recall_score, precision_score
import random
import time

In [2]:
class Glove():
    """Glove worde embeddings class"""
    # class to load and process Glove word embeddings
    
    def __init__(self, glove_filename):
        self.loadGloveModel(glove_filename)
        
    def __getitem__(self, key):
        return self.__dict__[key]

    def __setitem__(self, key, value):
        self.__dict__[key] = value
        
    def loadGloveModel(self, gloveFile):
        f = open(gloveFile,'r')
        model = {}

        print(f)
        cnt = 0
        index2word = {}
        words = []
        for line in f:
            splitLine = line.split()
            word = splitLine[0]
            words.append(word)
            embedding = np.array([float(val) for val in splitLine[1:]])
            self[word] = embedding
            index2word[cnt] = word
            cnt += 1
            
        self.index2word = index2word 
        self.words_lst = words
        
        print("Done.",cnt," words loaded!")

model = Glove('data/glove/glove.6B.300d.txt') 
print(model.index2word[0])


<_io.TextIOWrapper name='data/glove/glove.6B.300d.txt' mode='r' encoding='UTF-8'>
Done. 400001  words loaded!
the


In [8]:
# load corpora 
dict_low = True
path1 = "datasets/google_cat_corpus.txt"
# path2 = "datasets/wordnet_cat_corpus.txt"
path3 = "datasets/closed_cat_corpus.txt"

googe_cat_corpus = retrieve_corpus(path1, dict_low)
# wordnet_cat_corpus = retrieve_corpus(path2, dict_low)
closed_cat_corpus = retrieve_corpus(path3, dict_low)

# load model 
model = Glove('data/glove/glove.6B.300d.txt') 

<_io.TextIOWrapper name='data/glove/glove.6B.300d.txt' mode='r' encoding='UTF-8'>
Done. 400001  words loaded!


In [4]:
def trainData(single_class, pp):
    # function to extract training data
    X_in, Y_in = [], []
    sum_examples = 0
    examples = int(round(len(single_class)*pp))
    # print('number of examples: '+str(examples))
    sum_examples += examples
    for i in range(examples):
        X_in.append(model[single_class[i]])
        Y_in.append(1)
    return X_in, Y_in, sum_examples        

def testData(single_class, pp):
    # function to prepare testing data   
    X_t, Y_t = [], []
    examples = int(round(len(single_class)*pp))
    for i in range(examples, len(single_class)):
        X_t.append(model[single_class[i]])
        Y_t.append(1)
    return X_t, Y_t              
            
def nagitveClassData(model, add_data, single_class):
    # function to generate negative instances
    
    words = model.words_lst
    dic = [words[i] for i in range(len(words))]
    random.shuffle(dic)
    X_a, Y_a = [], []
    cnt = 0
    while add_data > cnt:
        incldue_word = True
        if dic[cnt] in single_class:
            incldue_word = False
            break
        if incldue_word:
            # print(dic[cnt])
            X_a.append(model[dic[cnt]])
            Y_a.append(0)
            cnt += 1
    return X_a, Y_a           
                
def printScores(Y_true, Y_pred):
    target_names = []
    for i in range(0, 2):
        target_names.append('class '+str(i))
    print(classification_report(    Y_true, Y_pred, target_names = target_names))
    print('precision_score: '+str(precision_score(Y_true, Y_pred, average='macro')))
    print('recall_score: '+   str(   recall_score(Y_true, Y_pred, average='macro')))
    print('f1 macro: '+       str(       f1_score(Y_true, Y_pred, average='macro')))

def evaluate(Y_true, Y_pred):
    inst_sum = Y_true.count(1)
    cor_pred, sum_pred = 0.0, 0.0
    for t, p in zip(Y_true, Y_pred):
        if p == 1:
            sum_pred += 1.0
            if t == 1:
                cor_pred += 1.0
    prec  =  cor_pred/sum_pred           
    recall = cor_pred/inst_sum 
    f1 = 2*prec*recall/(prec+recall)
    '''
    print('')
    print('prec  : ', str(prec))
    print('recall: ', str(recall))
    print('f1    : ', str(f1))
    '''
    return prec, recall, f1
    
 

In [7]:
def run(corp, inst_pp, itter, sum_examples):
    # function to run the category completion experiments
    
    num_classes = len(corp)
    add_data = 50000
    for pp in inst_pp:
        f1_overall, recall_overall, precision_overall = 0.0, 0.0, 0.0
        for j in range(num_classes): 
            X_in, Y_in, sum_examples = trainData(corp[j], pp)
            X_t, Y_t   = testData(corp[j], pp)   
            
            X, Y = [], []
            f1_scrore_average, recall_scrore_average, precision_scrore_average = 0.0, 0.0, 0.0 
            for i in range(itter):

                # negative samples and samples for prediction
                X_a, Y_a   = nagitveClassData(model, add_data, corp[j])
                X_a_r, X_a_t = X_a[:sum_examples], X_a[sum_examples:]
                Y_a_r, Y_a_t = Y_a[:sum_examples], Y_a[sum_examples:]

                # randomize training examples    
                combined = list(zip(X_in+X_a_r, Y_in+Y_a_r))
                random.shuffle(combined)
                X[:], Y[:] = zip(*combined)

                clf = svm.SVC(kernel='linear', class_weight='balanced') 
                clf.fit(X, Y) 
                
                X_joint = X+X_t+X_a_t
                X_joint = np.asarray(X_joint)
                Y_pred = clf.predict(X_joint)
                
                Y_true = Y+Y_t+Y_a_t
                prec, recall, f1 = evaluate(Y_true, Y_pred)

                f1_scrore_average += f1
                recall_scrore_average += recall
                precision_scrore_average += prec
                
                
            '''
            # additional outputs
            print('')
            print('precision_scrore_average: '+ str(precision_scrore_average/itter))
            print('recall_scrore_average: '+ str(recall_scrore_average/itter))
            print('f1_score_average: '+ str(f1_scrore_average/itter))
            '''
            precision_overall += precision_scrore_average/itter
            recall_overall += recall_scrore_average/itter
            f1_overall += f1_scrore_average/itter


        print('')
        print('pp: ', pp)
        print('precicion_overall: '+ str(precision_overall/num_classes))
        print('recall_overall: '+ str(recall_overall/num_classes))
        print('f1 overall: '+ str(round(f1_overall/num_classes, 3)))
        print('')   

In [9]:
# run experiments 

# parameters
itter = 5 # 5 # the experiments are repeated 5 times and averaged to reduce variation 
inst_pp = [0.1, 0.2, 0.3, 0.4] # different percentage of reference data 0.1 = 10%, 0.2 = 20% on so on 
negative_samples = 500 # 100 # additional samples from the remaining words in the vocabulary for the negative class

print('Experiments on the Closed Category corpus:')
run(closed_cat_corpus, inst_pp, itter, negative_samples)

print('Experiments on the Google Category corpus:')
run(googe_cat_corpus, inst_pp, itter, negative_samples)



Experiments on the Closed Category corpus:

pp:  0.1
precicion_overall: 0.19331893987564996
recall_overall: 0.986611090059366
f1 overall: 0.236



KeyboardInterrupt: 