In [1]:
from nltk.corpus import wordnet as wn
from copy import deepcopy

import random

In [2]:
# Input files for the run. Two datasets are listed; only one pair of paths is
# active at a time, the other pair is kept commented out.
# NOTE(review): these are absolute paths on the author's machine - they need
# to be adapted before the notebook can run anywhere else.

# Trucks Wiki:
#terms_vocab = "/home/johannes/thesis_code/data_experimentation/trucks/term_vocab.txt"
#full_vocab = "/home/johannes/thesis_code/data_experimentation/vocab_concat/vocabulary.txt"

# Volvo manual
# terms_vocab: read line by line into the list of candidate terms.
terms_vocab = "/home/johannes/thesis_code/data_experimentation/volvo_data/term_vocab.txt"
# full_vocab: read line by line into the vocabulary used to filter hypernyms.
full_vocab = "/home/johannes/thesis_code/data_experimentation/volvo_data/vocabulary.txt"

In [3]:
# Load the candidate terms, one per line. The context manager guarantees the
# file handle is released even if reading the file raises.
with open(terms_vocab, 'r') as tf:
    terms_list = [w.strip('\n') for w in tf]
print(len(terms_list))

5346


In [4]:
def check_synsets(terms):
    '''
    Filters terms down to those that WordNet knows about.

    Each term is looked up with its spaces replaced by underscores. A term is
    kept, in its original spelling and original order, when the lookup
    returns a non-empty result.
    '''
    known_terms = []
    for term in terms:
        lookup_key = term.replace(' ', '_')
        if wn.synsets(lookup_key):
            known_terms.append(term)
    return known_terms

def select_synsets(terms_with_synsets):
    '''
    Returns a dictionary mapping each term to its first available
    synset with a noun POS tag.

    Terms are looked up with spaces replaced by underscores. Terms that have
    no noun synset at all are left out of the returned dictionary.
    '''
    synset_dict = {}
    for t in terms_with_synsets:
        for s in wn.synsets(t.replace(' ', '_')):
            # Compare the POS tag itself. A substring test for '.n' on the
            # synset name would also accept non-noun synsets whose lemma
            # merely contains '.n'.
            if s.pos() == 'n':
                synset_dict[t] = s
                break
    return synset_dict
        
def make_hypernym_dict(synset_dict,num_levels=2):
    '''
    Returns a dictionary of terms associated with
    its hypernym synsets, traverses num_levels levels
    in the wordnet hierarchy

    The list for each term is ordered level by level (direct hypernyms
    first). It may contain the same synset more than once when several paths
    lead to it. The first level is always collected, even when num_levels is
    smaller than 1.
    '''
    hypernym_dict = {}
    for term, synset in synset_dict.items():
        frontier = synset.hypernyms()
        # Copy before accumulating: extending the list returned by
        # hypernyms() in place would mutate an object this function
        # does not own.
        collected = list(frontier)
        cur_level = 1
        while cur_level < num_levels and frontier:
            # An empty frontier means the top of the hierarchy was reached,
            # so further levels cannot add anything.
            frontier = [parent for h in frontier for parent in h.hypernyms()]
            collected.extend(frontier)
            cur_level += 1
        hypernym_dict[term] = collected

    return hypernym_dict

def keep_vocab_hypernyms(hypernym_dict, full_vocab_list):
    '''
    Removes hypernyms that do not exist in the
    vocabulary

    Each hypernym synset is converted to plain text (the lemma part of its
    name, with underscores turned into spaces) and kept only if that text is
    in full_vocab_list. Returns a new dictionary mapping term -> list of kept
    hypernym texts. Terms for which no hypernym survives are dropped.
    Prints a progress line after every 100 processed terms.
    '''
    # Build the lookup set once: membership tests against a list scan the
    # whole vocabulary for every single hypernym.
    vocab = set(full_vocab_list)
    new_dict = {}
    for processed_terms, (term, hypernyms) in enumerate(hypernym_dict.items(), start=1):
        hypernyms_to_keep = []
        for h in hypernyms:
            # Synset names look like 'lemma.pos.nn'. Cutting off the last two
            # fields keeps lemmas that themselves contain '.n' intact.
            hypernym_text = h.name().rsplit('.', 2)[0].replace('_', ' ')
            if hypernym_text in vocab:
                hypernyms_to_keep.append(hypernym_text)
        if hypernyms_to_keep:
            new_dict[term] = hypernyms_to_keep
        if processed_terms % 100 == 0:
            print("Processed ", str(processed_terms), " terms")
    return new_dict
        
    
def sample_terms(hypernym_dict, num_samples=100, seed=None):
    '''
    Returns a random sample of num_samples terms

    hypernym_dict: mapping whose keys are the candidate terms.
    num_samples:   number of distinct terms to draw.
    seed:          optional seed. When given, a private random.Random(seed)
                   is used so the same terms are drawn on every run. When
                   None, the module-level generator is used as before.

    Raises ValueError (from random.sample) when num_samples is larger than
    the number of available terms.
    '''
    all_terms = list(hypernym_dict)
    sampler = random.sample if seed is None else random.Random(seed).sample
    return sampler(all_terms, num_samples)

def write_training_data(queries, hypernym_dict, query_file, gold_file):
    '''
    Writes a query file and a gold file in accordance with the 
    SemEval-2018 Task 9 standard

    For every query, in order, one line "<query>\\tConcept" goes to
    query_file and one line with the query's tab-separated hypernyms goes to
    gold_file, so line i of both files belongs together. Duplicate hypernyms
    are written once.

    Raises KeyError if a query has no entry in hypernym_dict.
    '''
    # The context manager closes both files even if a lookup or write fails.
    with open(query_file, 'w') as qf, open(gold_file, 'w') as gf:
        for q in queries:
            qf.write(q + '\tConcept\n')
            # dict.fromkeys removes duplicates but keeps first-seen order, so
            # the gold file is identical from run to run (set() ordering of
            # strings is not).
            unique_hypernyms = list(dict.fromkeys(hypernym_dict[q]))
            gf.write('\t'.join(unique_hypernyms) + '\n')
    
def sanity_check(hypernym_dict, full_vocab_list):
    '''
    Performs a sanity check to make sure that no 
    out-of-vocabulary terms remain

    Every key of hypernym_dict and every hypernym in its lists must be in
    full_vocab_list. The first offender is reported on stdout and the check
    stops. If there is none, "All good!" is printed. Always returns None.
    '''
    # One set build replaces a full scan of the vocabulary per lookup.
    vocab = set(full_vocab_list)
    for term, hypernyms in hypernym_dict.items():
        if term not in vocab:
            print("Found out of vocab term: ", term ," something went wrong!")
            return
        for h in hypernyms:
            if h not in vocab:
                print("Found out of vocab term: ", h , " something went wrong!")
                return
    print("All good!\n")

In [5]:
# Keep only the terms for which WordNet returns at least one synset, and
# report how many are left.
terms_with_synsets = check_synsets(terms_list)
print(len(terms_with_synsets))

534


In [6]:
# Map every term to its first noun synset, then preview the first two
# entries together with their direct hypernyms.
term_synset_dict = select_synsets(terms_with_synsets)
print_num = 2
for key, synset in term_synset_dict.items():
    print(key, synset, synset.hypernyms())
    print_num -= 1
    if print_num == 0:
        break

time signal Synset('time_signal.n.01') [Synset('signal.n.01')]
fuel gauge Synset('fuel_gauge.n.01') [Synset('indicator.n.03')]


In [7]:
# Collect the hypernyms found up to three levels above each term's synset.
hypernym_dict = make_hypernym_dict(term_synset_dict,num_levels=3)

In [8]:
# Preview the collected hypernym synsets of the first two terms.
print_num = 2
for t, hypernyms in hypernym_dict.items():
    print(t, hypernyms)
    print_num -= 1
    if print_num == 0:
        break

fuel gauge [Synset('indicator.n.03'), Synset('device.n.01'), Synset('instrumentality.n.03')]
merchantability [Synset('state.n.02'), Synset('attribute.n.02'), Synset('abstraction.n.06')]


In [9]:
# Load the full vocabulary, one entry per line. The context manager
# guarantees the file handle is released even if reading the file raises.
with open(full_vocab, 'r') as fvf:
    full_vocab_list = [w.strip('\n') for w in fvf]
print(len(full_vocab_list))

223407


In [10]:
# Convert hypernym synsets to text and drop everything that is not in the
# vocabulary; terms left without any hypernym are removed as well.
ready_hypernyms = keep_vocab_hypernyms(hypernym_dict, full_vocab_list)

Processed  100  terms
Processed  200  terms
Processed  300  terms
Processed  400  terms
Processed  500  terms


In [11]:
# Preview the vocabulary-filtered hypernyms of the first five terms.
print_num = 5
for t, kept_hypernyms in ready_hypernyms.items():
    print(t, kept_hypernyms)
    print_num -= 1
    if print_num == 0:
        break

time signal ['signal']
fuel gauge ['indicator']
merchantability ['state']
federal communications commission ['independent agency']
seal ring ['ring', 'jewelry', 'adornment']


In [12]:
# Draw 300 terms at random to serve as queries.
# NOTE(review): no random seed is set in the visible cells, so the selection
# presumably differs between runs - confirm whether that is intended.
queries = sample_terms(ready_hypernyms,300)

In [13]:
# Confirm how many queries were sampled.
print(len(queries))

300


In [14]:
# Output locations for the generated query/gold pair. Two datasets are
# listed; only one pair of paths is active at a time.
# NOTE(review): these are absolute paths on the author's machine - they need
# to be adapted before the notebook can run anywhere else.

# Trucks wiki
#query_file = "/home/johannes/thesis_code/data_experimentation/new_training_data/queries.txt"
#gold_file = "/home/johannes/thesis_code/data_experimentation/new_training_data/gold.txt"

# Volvo manual
query_file = "/home/johannes/thesis_code/data_experimentation/volvo_training_data/queries.txt"
gold_file = "/home/johannes/thesis_code/data_experimentation/volvo_training_data/gold.txt"

# Write one query line and one gold line for every sampled term.
write_training_data(queries, ready_hypernyms, query_file, gold_file)

In [15]:
# Verify that every remaining term and hypernym is part of the vocabulary.
sanity_check(ready_hypernyms, full_vocab_list)

All good!



In [16]:
def _concat_files(first_path, second_path, out_path):
    '''
    Writes the lines of first_path followed by the lines of second_path
    to out_path.
    '''
    with open(out_path, 'w') as out:
        last_line = ''
        with open(first_path, 'r') as first:
            for line in first:
                out.write(line)
                last_line = line
        # Add a line break only when the first file does not already end with
        # one. An unconditional '\n' would insert an empty record between the
        # two parts.
        if last_line and not last_line.endswith('\n'):
            out.write('\n')
        with open(second_path, 'r') as second:
            for line in second:
                out.write(line)


def concat_training_data(qf1, qf2, gf1, gf2, write_q, write_g):
    '''
    Concatenates two query files and two gold files.

    qf1, qf2: paths of the query files; their lines are written to write_q,
              qf1 first.
    gf1, gf2: paths of the gold files; their lines are written to write_g,
              gf1 first.

    Both outputs are overwritten if they exist. No blank line is inserted
    between the two parts, so the query and gold outputs stay aligned line
    by line when each input pair is aligned.
    '''
    _concat_files(qf1, qf2, write_q)
    _concat_files(gf1, gf2, write_g)
    

In [17]:
# Inputs and outputs for the concatenation step: q1/g1 are the SemEval-2018
# Task 9 (1A English) training query and gold files, q2/g2 the Volvo
# query/gold files, and write_queries/write_gold the combined outputs.
# NOTE(review): absolute paths on the author's machine - adapt before running
# elsewhere.
q1 = "/home/johannes/hypernym_discovery_data/SemEval2018-Task9/training/data/1A.english.training.data.txt"
q2 = "/home/johannes/thesis_code/data_experimentation/volvo_training_data/queries.txt"
g1 = "/home/johannes/hypernym_discovery_data/SemEval2018-Task9/training/gold/1A.english.training.gold.txt"
g2 = "/home/johannes/thesis_code/data_experimentation/volvo_training_data/gold.txt"
write_queries = "/home/johannes/thesis_code/data_experimentation/volvo_training_data/concat_queries.txt"
write_gold = "/home/johannes/thesis_code/data_experimentation/volvo_training_data/concat_gold.txt"

In [18]:
# Append the Volvo queries/gold to the SemEval training queries/gold.
concat_training_data(q1, q2, g1, g2, write_queries, write_gold)