## Exercise “Content2form”

Using the data on definitions ("defs" exercise), for each concept:
- take the available definitions,
- search WordNet for the correct synset

In [1]:
import csv
import json
import nltk 
from nltk.stem import WordNetLemmatizer
from collections import Counter
from nltk.wsd import lesk
import os

print("Libraries imported successfully ✓")

Libraries imported successfully ✓


1. Read document definitions and create a data structure

In [2]:
defs_path = f'../Esercizio1-DEFS/resource/definitions.csv'
defs_path_json = f'../Esercizio1-DEFS/resource/definitions.json'
slang_path = f'../Esercizio1-DEFS/resource/slang.txt'

if not os.path.exists(defs_path_json):
    with open(defs_path, 'r', encoding='utf-8') as file:
        reader = csv.DictReader(file)
        definitions = []
        for row in reader:
            definitions.append(row)

    with open(defs_path_json, 'w', encoding='utf-8') as f:
        json.dump(definitions, f, indent=4)


with open(defs_path_json, 'r', encoding='utf-8') as f: 
    definitions = json.load(f)


with open(slang_path, 'r', encoding='utf-8') as f:
    slang = f.read().splitlines()
    slangs = []
    for pair in slang: 
        list_pair = pair.split("=")
        slangs.append((list_pair[0].lower(), list_pair[1].lower()))

def expand_slangs(tokens: list, slangs: list):
    for i, token in enumerate(tokens):
        for slang in slangs:
            if token == slang[0]:
                tokens[i] = slang[1]
    return tokens

def expand_abbr(tokens: list): 
    for i, token in enumerate(tokens): 
        if token == "e.g." or token == "eg":
            tokens[i] = "for example"
        elif token == "i.e." or token == "ie":
            tokens[i] = "that is"
        elif token == "e.i." or token == "ei":
            tokens[i] = "for example that is"
    return tokens

2.  pre processing - stopwords removal, lemmatization, slang expansion, abbreviation expansion

In [3]:
stopwords = nltk.corpus.stopwords.words('english')

tokens_concepts = {}

for concept in definitions:  
    keys = list(concept.keys()) 
    keys.remove('Concept')
    tokens = set()
    for key in keys: # for all possible definition for the concept  
        definition = concept[key].lower() 
        if definition != '':
            def_tok = nltk.word_tokenize(definition)

            def_tok = expand_slangs(def_tok, slangs) 
            def_tok = expand_abbr(def_tok) 

            def_tok = [token.lower() for token in def_tok if token not in stopwords and token.isalpha()]
            lemmatizer = WordNetLemmatizer() 
            def_lem = [lemmatizer.lemmatize(token) for token in def_tok] 
            #tokens.update(def_lem) 
            concept[key] = def_lem  
        else: 
            del concept[key] 
    
    tokens_concepts[concept['Concept']] = list(tokens) 


3. filter definition that are too short -> create lexial material

In [4]:
for concept in definitions:
    too_short = []
    for key, value in concept.items():
        if key != 'Concept':
            if len(value) < 3:
                too_short.append(key)
    for key in too_short:
        del concept[key]


4. Find the right concept using wordnet and onomasiologic search

In [5]:
def get_top_words(definitions: list, n_top: int) -> list: 
    concept_counter = Counter()
    
    for definition in definitions:
        concept_counter.update(definition)
    
    most_frequent_words = [word for word, _ in concept_counter.most_common(n_top)]

    print("top words:", most_frequent_words, '\n')

    return list(set(most_frequent_words))

def get_synset(word: str, definitions: list): 
    synset_counter = Counter()
    for definition in definitions: 
        synset = lesk(definition, word, 'n')
        if synset:
            synset_counter.update([synset])

    synset = None
    if len(synset_counter) != 0:
        synset = synset_counter.most_common(1)[0][0]

    return synset

def create_dict_word_dictionary(most_frequent_words: list, definitions: list):
    dict_word_dictionary = {}
    for word in most_frequent_words:
        for definition in definitions: 
            if word in definition:
                if word in dict_word_dictionary:
                    dict_word_dictionary[word].append(definition)
                else:
                    dict_word_dictionary[word] = [definition]

    return dict_word_dictionary

def rank(top_words: list, important_words: list): 
    top_words_5 = top_words[:5]
    top_words_10 = top_words[5:10]
    top_words_end = top_words[10:]

    rank = len(important_words)

    count_5 = 0
    count_10 = 0
    count_end = 0

    for word in important_words:
        if word in top_words_5:
            count_5 += 1
        if word in top_words_10:
            count_10 += 1
        if word in top_words_end:
            count_end += 1

    return rank+(count_5)+(count_10*0.5)+(count_end*(-0.25))


In [6]:
def onomasiologic_search(concept: dict, n_top: int):
    
    results = []
    del concept['Concept']
    definitions = list(concept.values())

    most_frequent_words = get_top_words(definitions, n_top)
    dict_word_dictionary = create_dict_word_dictionary(most_frequent_words, definitions)

    hypernyms = []
    for word in most_frequent_words:
        synset = get_synset(word, dict_word_dictionary[word])
        if synset:
            hypernyms.extend(synset.hypernyms())
    
    hypernyms = list(set(hypernyms))
    res = []
    for hyp in hypernyms:
        hyp_def = hyp.definition() + " " + ', '.join(hyp.examples())
        
        match_words = []
        for word in most_frequent_words:
            if word in hyp_def:
                match_words.append(word) 
        
        res.append([hyp, match_words])

     # sort the list using the number of important words found
    sorted_res = sorted(res, key=lambda x: rank(most_frequent_words, x[1]), reverse=True)
    for synset, match_words in sorted_res[:5]:
        results.append((synset.name(), match_words))

    return results

In [7]:
for concept in definitions:
    print(concept['Concept'])
    result = onomasiologic_search(concept, 30)
    print(result, '\n\n\n')

Emotion
top words: ['feeling', 'human', 'feel', 'something', 'state', 'being', 'living', 'concept', 'certain', 'animal', 'sensation', 'mind', 'express', 'emotion', 'mental', 'range', 'situation', 'think', 'make', 'good', 'bad', 'arising', 'form', 'percieve', 'towards', 'others', 'sentiment', 'entity', 'throw', 'word'] 

[('feeling.n.01', ['feeling', 'emotion', 'state', 'feel']), ('body.n.01', ['animal', 'human', 'being']), ('emotional_state.n.01', ['emotion', 'state', 'good']), ('idea.n.01', ['good', 'think', 'mind']), ('living_thing.n.01', ['living', 'entity'])] 



Person
top words: ['human', 'person', 'certain', 'ability', 'single', 'living', 'homo', 'sapiens', 'individual', 'answer', 'question', 'mean', 'may', 'say', 'generic', 'describe', 'precise', 'feature', 'belonging', 'group', 'society', 'mammal', 'descending', 'ape', 'entity', 'sentient', 'see', 'touch', 'member', 'specie'] 

[('causal_agent.n.01', ['entity']), ('organism.n.01', ['ability', 'living']), ('currency.n.01', ['ap