## Exercise "Hanks"

Choose a transitive verb (at least 2 arguments)
- retrieve from a corpus n (> 200) instances (sentences) in which it is used
- perform parsing and disambiguation (palmer, nltk)
- obtaining senses 
- use WordNet (or CSI) super senses on the arguments (subj and obj in the case of 2 arguments) of the chosen verb
- combine meanings from verbs, automatically following hanks theory
- aggregate the results, calculate the frequencies, print the obtained semantic clusters (semantic types)

In [1]:
from nltk.corpus import wordnet as wn
import nltk
import csv
import spacy
from nltk.wsd import lesk
import os.path

nlp = spacy.load("en_core_web_sm")

print("Libraries imported successfully ✓")

Libraries imported successfully ✓


1. Detection of 300 sentences where the verb 'handle' is used

In [2]:
path = f'resource/medium_articles.csv'
path_corpus = f'resource/corpus.txt'

sentences = []
if not os.path.exists(path_corpus): 
    with open(path, 'r', encoding='utf-8') as f:
        reader = csv.reader(f)
        i = 0
        articles = []
        for article in reader:
            if i == 0: 
                i += 1
            elif i < 30000:
                text = article[1]
                articles.append(text)
                sentences_text = nltk.sent_tokenize(text)
                for sentence in sentences_text:
                    if 'handle' in sentence and len(sentence) < 300 and not sentence.__contains__("\n"):
                        sentences.append(sentence)
                i += 1
            else: 
                break

    print("Length possible sentences: ", len(sentences))

else: 
    with open(path_corpus, 'r', encoding='utf-8') as f:
        sentences = f.readlines()

2. Detection of subjects and objects of the verb using dependecy parser (SpaCy) 

- subj: csubj, csubjpass, nsubj, nsubjpass
- obj: dobj, pobj

In [3]:
corpus = []
for sentence in sentences: 
    doc = nlp(sentence)
    subj = ""
    subj_pos = ""
    obj = ""
    obj_pos = ""
    token_sent = []
    for token in doc: 
        token_sent.append(token.lemma_)
        if token.text == "handle" and token.pos_ == "VERB": 
            indirect_obj = False
            active = True
            children = token.children   
            for child in children:
                if child.dep_ == "pobj":
                    indirect_obj = True
                #if subj != "" and obj != "":
                #    break
                if child.dep_.__contains__("subj") and subj == "" and child.text != "that":
                    active = not child.dep_.__contains__("pass")
                    subj = child.lemma_
                    subj_pos = child.pos_
                elif (child.dep_ == "dobj" or child.dep_ == "dative") and obj == "":
                    obj = child.lemma_
                    obj_pos = child.pos_
            if subj == "" or obj == "" and not indirect_obj:
                if ("VERB" == token.head.pos_ or "AUX" == token.head.pos_) and ("comp" in token.dep_ or "conj" in token.dep_): 
                    head_children = token.head.children
                    for head_child in head_children:
                        if head_child.dep_.__contains__("subj") and subj == "":
                            subj = head_child.lemma_
                            subj_pos = head_child.pos_
                        elif (head_child.dep_ == "dobj" or head_child.dep_ == "dative") and obj == "":
                            obj = head_child.lemma_
                            obj_pos = head_child.pos_

    if subj != "" and obj != "" and not indirect_obj: # transitive verb -> if subj and obj and indirect_obj : ditransitive verb!
        corpus.append([subj, subj_pos, obj, obj_pos, sentence, token_sent, str(active)])

print("Length corpus: ", len(corpus))

Length corpus:  855


In [4]:
if not os.path.exists(path_corpus): 
    with open (path_corpus, 'w', encoding='utf-8') as f:
        for sentence in corpus:
            f.write('"'+sentence[4]+'"'+"\n")

corpus_passive = [[subj, subj_pos, obj, obj_pos, sentence, token_sent, active] for subj, subj_pos, obj, obj_pos, sentence, token_sent, active in corpus if not active]

print("Length corpus passive: ", len(corpus_passive))

Length corpus passive:  0


3. Detection of the synset for each object and subject


In [5]:
synsets = []

for subj, subj_pos, obj, obj_pos, sentence, token_sent, _ in corpus:
    synset_subj = None
    synset_obj = None
    if subj_pos == "PRON": 
        if subj.lower() != "it" and subj.lower() != "its" and subj.lower() != "itself":
            synset_subj = wn.synset('person.n.03')
        else: 
            synset_subj = wn.synset('thing.n.6') # thing (a vaguely specified concern) "several matters to attend to"; "it is none of your affair"; "things are going well"
    elif obj_pos == "PRON":
        if obj.lower() != "it" and obj.lower() != "its" and obj.lower() != "itself":
            synset_obj = wn.synset('person.n.03')
        else: 
            synset_obj = wn.synset('thing.n.6')     
    else:     
        pos = ""
        if subj_pos == "NOUN":
            pos = "n"
        elif subj_pos == "VERB":
            pos = "v"
        elif subj_pos == "ADJ":
            pos = "a"
        elif subj_pos == "ADV":
            pos = "r"
        synset_subj = lesk(token_sent, subj, pos)
        pos = ""
        if obj_pos == "NOUN":
            pos = "n"
        elif obj_pos == "VERB":
            pos = "v"
        elif obj_pos == "ADJ":
            pos = "a"
        elif obj_pos == "ADV":
            pos = "r"
        synset_obj = lesk(token_sent, obj, pos)

    if synset_subj is not None and synset_obj is not None:
        synsets.append([synset_subj, synset_obj, token_sent])

4. Detection of the supersense for each object and subject (lexname in wordnet)

In [6]:
supersenses = []
for synset_subj, synset_obj, token_sent in synsets: 
    supersense_subj = synset_subj.lexname()
    supersense_obj = synset_obj.lexname()
    supersenses.append([supersense_subj, supersense_obj, token_sent])

5. Combine meaning of verbs - Hanks theory and results aggregation with frequency of semantics of use 


In [7]:
semantic_use= {}

for supersense_subj, supersense_obj, _ in supersenses:
    if (supersense_subj, supersense_obj) not in semantic_use:
        semantic_use[(supersense_subj, supersense_obj)] = 1
    else: 
        semantic_use[(supersense_subj, supersense_obj)] += 1

semantic_use = dict(sorted(semantic_use.items(), key=lambda item: item[1], reverse=True))

for key, value in semantic_use.items():
    print(key, value)

('noun.artifact', 'noun.communication') 9
('noun.communication', 'noun.communication') 8
('noun.person', 'noun.communication') 7
('noun.group', 'noun.cognition') 7
('noun.act', 'noun.cognition') 6
('noun.group', 'noun.attribute') 6
('noun.person', 'noun.artifact') 6
('noun.communication', 'noun.cognition') 5
('noun.artifact', 'noun.act') 5
('noun.cognition', 'noun.cognition') 5
('noun.act', 'noun.communication') 5
('noun.group', 'noun.act') 5
('noun.cognition', 'noun.artifact') 4
('noun.artifact', 'noun.artifact') 4
('noun.person', 'noun.act') 4
('noun.person', 'noun.location') 4
('noun.person', 'noun.attribute') 4
('noun.act', 'noun.act') 4
('noun.body', 'noun.event') 3
('noun.artifact', 'noun.process') 3
('noun.group', 'noun.communication') 3
('noun.body', 'noun.cognition') 3
('noun.person', 'noun.cognition') 3
('noun.cognition', 'noun.communication') 3
('noun.cognition', 'noun.act') 3
('noun.artifact', 'noun.state') 3
('noun.attribute', 'noun.state') 2
('noun.artifact', 'noun.group'

6. Detection synset of verb in context and mapping with cluster semantics

In [8]:
analysis = {}
for supersense_subj, supersense_obj, token_sent in supersenses: 
    synset_context = lesk(token_sent, "handle", "v")
    if synset_context is not None:
        if synset_context.name() not in analysis:
            analysis[synset_context.name()] = {}
            analysis[synset_context.name()][(supersense_subj, supersense_obj)] = 1
        else: 
            if (supersense_subj, supersense_obj) not in analysis[synset_context.name()]:
                analysis[synset_context.name()][(supersense_subj, supersense_obj)] = 1
            else: 
                analysis[synset_context.name()][(supersense_subj, supersense_obj)] += 1
    else: 
        print("No synset found for 'handle' in sentence: ", sentence)

for synset in analysis: 
    print("\n \n \n \t \tSynset: ", synset)
    print("Definizion synset: ", wn.synset(synset).definition())
    print("Semantic use: ")
    for subj, obj in analysis[synset]: 
        print(subj, obj, "\nFrequency: ", analysis[synset][(subj, obj)])
    
    print("\n\nSummary: ")

    count_subj = {}
    count_obj = {}
    for subj, obj in analysis[synset]:
        if subj not in count_subj:
            count_subj[subj] = analysis[synset][(subj, obj)]
        else: 
            count_subj[subj] += analysis[synset][(subj, obj)]
        if obj not in count_obj:
            count_obj[obj] = analysis[synset][(subj, obj)]
        else: 
            count_obj[obj] += analysis[synset][(subj, obj)]

    print("\nSUPERSENSES SUBJECT")
    for subj in count_subj:
        print(subj, ":", count_subj[subj])
    
    print("\n\nSUPERSENSES OBJECT")
    for obj in count_obj:
        print(obj, ":", count_obj[obj])


 
 
 	 	Synset:  wield.v.02
Definizion synset:  handle effectively
Semantic use: 
noun.body noun.event 
Frequency:  2
noun.communication noun.communication 
Frequency:  3
verb.competition noun.communication 
Frequency:  1
noun.communication noun.cognition 
Frequency:  5
noun.artifact noun.process 
Frequency:  2
verb.contact noun.artifact 
Frequency:  1
noun.group noun.communication 
Frequency:  3
noun.object noun.object 
Frequency:  1
noun.communication adj.all 
Frequency:  1
noun.communication noun.location 
Frequency:  1
noun.attribute noun.state 
Frequency:  2
noun.act noun.cognition 
Frequency:  3
noun.animal noun.communication 
Frequency:  1
noun.group noun.cognition 
Frequency:  6
noun.group noun.attribute 
Frequency:  5
noun.person noun.artifact 
Frequency:  4
noun.artifact noun.artifact 
Frequency:  3
noun.person noun.act 
Frequency:  4
noun.body noun.cognition 
Frequency:  3
noun.artifact noun.communication 
Frequency:  6
adj.all noun.communication 
Frequency:  1
verb.stative