# **Mapping FrameNet frames to WordNet synsets**

Frames estratti con la funzione getFrameSetForStudent() per il cognome Grandi:
| ID | Frame |
| --- | --- |
| 2658 | Suicide_attack |
| 1633 | Board_vehicle |
| 1260 | Simple_name |
| 1871 | Access_scenario |
| 652 | Eclipse |

Dato che: 
 - Suicide_attack
 - Board_vehicle 
 - Simple_name 
 - Access_scenario 

non sono presenti in WordNet, sono stati scelti rispettivamente:
- Suicide
- Vehicle
- Name
- Access

In pratica, occorre associare un synset di WordNet al frame name, a ciascun frame element (FE) e a ciascuna lexical unit (LU) dei 5 frame del frameset.
Come contesto di disambiguazione si considerano, rispettivamente, la definizione del frame, la definizione del FE e la definizione della LU.
Come contesto di ciascun senso candidato si considerano il nome del synset, la sua glossa, i suoi esempi e i nomi, le glosse e gli esempi dei suoi iponimi e iperonimi; lo stesso vale per FE e LU.


    

In [9]:
from nltk.corpus import framenet as fn
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
import pandas as pd

In [10]:
# Each FrameNet frame id assigned to student Grandi, paired with the simplified
# name that is looked up in WordNet in place of the full frame name.
_FRAME_TABLE = [
    (2658, 'Suicide'),
    (1633, 'Vehicle'),
    (1260, 'Name'),
    (1871, 'Access'),
    (652, 'Eclipse'),
]

# Frame ids for student Grandi
frame_ids = [frame_id for frame_id, _ in _FRAME_TABLE]

# Simplified frame names (same order as frame_ids)
simplified_names = [name for _, name in _FRAME_TABLE]

## Creo manualmente le annotazioni

Per ogni frame assegno un synset a frame name, frame elements e le prime 10 lexical units

In [11]:
# Manual (gold) annotations, as a dictionary with:
# - key = frame name
# - value = dictionary with:
#           - key = word to disambiguate (simplified frame name, FE name or LU name)
#           - value = name of the WordNet synset assigned by hand,
#                     or None when no synset was assigned
annotations = {
    'Suicide_attack': {
        'Suicide': 'suicide.n.01',
        'Victim': 'victim.n.01',
        'Weapon': 'weapon.n.01',
        'Assailant': 'attacker.n.01',
        'Manner': 'manner.n.01',
        'Means': 'means.n.01',
        'Place': 'topographic_point.n.01',
        'Purpose': 'purpose.n.01',
        'Time': 'fourth_dimension.n.01',
        'Source': 'beginning.n.04',
        'Depictive': 'delineative.s.01',
        'Re-encoding': None,
        'Result': 'consequence.n.01',
        'Containing_event': None,
        'Explanation': 'explanation.n.01',
        'Particular_iteration': None,
        'Circumstances': 'circumstance.n.01',
        'Path': 'path.n.03',
        'suicide attack.n': 'suicide_bombing.n.01',
        'suicide bombing.n': 'suicide_bombing.n.01',
        'kamikaze [attack].n': 'kamikaze.n.02',
        'suicide bomber.n': 'suicide_bomber.n.01',
        'suicide attacker.n': 'suicide_bomber.n.01',
        'kamikaze [person].n': 'kamikaze.n.02'
    },
    'Board_vehicle': {
        # 'Vehicle' is both the simplified frame name and a frame element name;
        # a dictionary can hold the key only once, so a single entry covers both.
        'Vehicle': 'vehicle.n.01',
        'Traveller': 'traveler.n.01',
        'Source': 'beginning.n.04',
        'Manner': 'manner.n.01',
        'Path': 'path.n.03',
        'Duration_of_final_state': 'duration.n.01',
        'Depictive': 'delineative.s.01',
        'Time': 'fourth_dimension.n.01',
        'Cotheme': None,
        'Purpose': 'purpose.n.01',
        'Place': 'topographic_point.n.01',
        'Circumstances': 'circumstance.n.01',
        'board.v': 'board.v.01',
        'get.v': 'board.v.01',
        'embark.v': 'embark.v.01',
        'emplane.v': 'emplane.v.01',
        'entrain.v': 'entrain.v.01',
        'mount.v': 'hop_on.v.01',
        'hop.v': 'hop.v.03',
        'embarkation.n': 'boarding.n.01',
        'embarkment.n': 'boarding.n.01',
        'embus.v': None
    },
    'Simple_name': {
        'Name': 'name.n.01',
        'Term': 'term.n.01',
        'Entity': 'entity.n.01',
        'Speaker': 'speaker.n.01',
        'term.n': 'term.n.01',
        'word.n': 'word.n.01'
    },
    'Access_scenario': {
        'Access': 'access.n.06',
        'Theme': 'subject.n.01',
        'Useful_location': 'location.n.01',
        'Barrier': 'barrier.n.01'
    },
    'Eclipse': {
        'Eclipse': 'eclipse.n.01',
        'Obstruction': 'obstruction.n.01',
        'Eclipsed': 'eclipse.v.02',
        'Vantage_point': 'vantage_point.n.01',
        'Degree': 'degree.n.01',
        'Subregion': 'region.n.01',
        'hide.v': 'hide.v.01',
        'eclipse.v': 'eclipse.v.02',
        'mask.v': 'mask.v.05',
        'cover.v': 'cover.v.01',
        'obscure.v': 'obscure.v.01',
        'screen.v': 'screen.v.05',
        'veil.v': 'veil.v.01',
        'hidden.a': 'hide.v.01',
        'blot out.v': 'obscure.v.05',
        # NOTE(review): confirm that a noun synset 'obscure.n.01' exists in WordNet.
        'obscured.a': 'obscure.n.01'
    }
}

### Funzioni di utility per il preprocessing

In [12]:
# Default location of the stop-word list (one entry per line).
_STOP_WORDS_PATH = '/Users/jak/Documents/Uni/TLN/TLN/Radicioni/data/stop_words_FULL.txt'

# Stop-word sets already read from disk, keyed by file path, so that the file
# is parsed once instead of on every call.
_STOP_WORDS_CACHE = {}


# Remove stopwords from a given sequence of words
def remove_stopwords(sentence, stop_words_path=_STOP_WORDS_PATH):
    """Return the words of *sentence* that are not stop words.

    Args:
        sentence: iterable of words (strings). The comparison is exact and
            case-sensitive; no punctuation handling is done here.
        stop_words_path: text file with one stop word per line; surrounding
            whitespace on each line is ignored.

    Returns:
        A list with the words that do not appear in the stop-word file, in
        the iteration order of *sentence*.

    Raises:
        OSError: if the stop-word file cannot be opened.
    """
    stop_words = _STOP_WORDS_CACHE.get(stop_words_path)
    if stop_words is None:
        with open(stop_words_path, 'r', encoding='utf-8') as f:
            # A set gives constant-time membership tests in the filter below.
            stop_words = {line.strip() for line in f}
        _STOP_WORDS_CACHE[stop_words_path] = stop_words
    return [w for w in sentence if w not in stop_words]

# Strip punctuation characters from every word of a given list
def remove_punctuation(list):
    """Return a new list with punctuation removed from each word.

    Every character that belongs to the punctuation set is dropped and every
    underscore becomes a space. Membership is tested one character at a time,
    so the multi-character entries of the set never match on their own.
    Hyphens are kept. The result has one entry per input word, possibly an
    empty string.
    """
    punct = set(['.', ',', '!', '?', ':', ';', '(', ')', '[', ']', '{', '}', '"', "'", '``', "''", '...', '’', '“', '”']) # Keeping '-' between words
    return [
        ''.join(' ' if ch == '_' else ch for ch in word if ch not in punct)
        for word in list
    ]

# Lemmatize a given list of words
def lemmatize(list):
    """Return the set of lemmas of the given words.

    Each word is passed to WordNetLemmatizer.lemmatize with its default
    arguments; duplicates collapse because the result is a set.
    """
    to_lemma = WordNetLemmatizer().lemmatize
    return {to_lemma(token) for token in list}

# Full preprocessing of a given list of words
def preprocess(list):
    """Return the set of content words obtained from the given words.

    Pipeline: stop-word removal, punctuation removal, lemmatization, a second
    stop-word removal, and finally the removal of empty strings.
    """
    # The first stop-word pass changes the result compared with filtering only
    # after lemmatization -- presumably because the lemmatizer can turn a stop
    # word into a form that is no longer in the stop list (TODO confirm).
    tokens = remove_stopwords(list)
    tokens = lemmatize(remove_punctuation(tokens))
    # Second pass: catches words that became stop words once cleaned/lemmatized.
    return {token for token in remove_stopwords(tokens) if token}

## Estrazione del contesto dei sensi

Creo una lista di content words per un dato synset, cercando all'interno di:
- nome del synset
- definizione del synset
- esempi del synset
- definizioni di iponimi e iperonimi
- esempi di iponimi e iperonimi

In [13]:
# Get context for a given synset
def get_context(synset):
    """Return the preprocessed bag of words that describes *synset*.

    Words are collected from the synset itself and from each of its direct
    hyponyms and hypernyms: the parts of the synset name (split on '_'), the
    lowercased definition and the lowercased examples.
    """
    related = [synset]
    related.extend(synset.hyponyms())
    related.extend(synset.hypernyms())

    words = set()
    for sense in related:
        # 'suicide_bombing.n.01' -> 'suicide_bombing' -> ['suicide', 'bombing']
        lemma_part = sense.name().split('.')[0]
        words.update(lemma_part.split('_'))
        words.update(sense.definition().lower().split())
        for sentence in sense.examples():
            words.update(sentence.lower().split())
    return preprocess(words)

## Estrazione del contesto del frame

Creo una lista di content words per un dato frame, cercando all'interno di:
- Frame name
- Frame definition
- Frame Element name
- Frame Element definition
- Lexical Unit name
- Lexical Unit definition

In [14]:
# Get context for a given frame
def get_frame_context(frame):
    """Return the preprocessed bag of words that describes *frame*.

    Words come from the frame definition and name, from the name and
    definition of every frame element, and from the first lexeme name and the
    definition of every lexical unit. All text is lowercased and split on
    whitespace before preprocessing.
    """
    words = set()
    words.update(frame.definition.lower().split())
    words.update(frame.name.lower().split())
    for fe_name, element in frame.FE.items():
        words.update(element.definition.lower().split())
        words.update(fe_name.lower().split())
    for unit in frame.lexUnit.values():
        # Only the first lexeme of the lexical unit contributes its name.
        words.update(unit.lexemes[0].name.lower().split())
        words.update(unit.definition.lower().split())
    return preprocess(words)

## Estrazione del contesto del frame element

Creo una lista di content words per un dato frame element, cercando all'interno di:
- Frame Element name
- Frame Element definition

In [15]:
# Get context for a given frame element
def get_fe_context(frame_element):
    """Return the preprocessed bag of words of a frame element.

    The words are the lowercased, whitespace-split definition and name of
    *frame_element*.
    """
    definition_words = frame_element.definition.lower().split()
    name_words = frame_element.name.lower().split()
    return preprocess(set(definition_words) | set(name_words))

## Estrazione del contesto del lexical unit

Creo una lista di content words per un dato lexical unit, cercando all'interno di:
- Lexical Unit name
- Lexical Unit definition

In [16]:
# Get context for a given lexical unit
def get_lu_context(lexical_unit):
    """Return the preprocessed bag of words of a lexical unit.

    The words are the lowercased, whitespace-split name of the unit's first
    lexeme and the unit's definition.
    """
    name_words = lexical_unit.lexemes[0].name.lower().split()
    definition_words = lexical_unit.definition.lower().split()
    return preprocess(set(name_words) | set(definition_words))

## Mapping - Approccio bag of words

In [18]:
# Find best synset for a given word and context
def find_best_syn(word, context):
    """Return the synset of *word* whose context overlaps *context* the most.

    The score of a candidate is the number of words shared between its
    context (see get_context) and *context*, plus one. On ties the first
    candidate returned by wn.synsets wins. Returns None when *word* has no
    synsets.
    """
    def overlap(candidate):
        # + 1 According to the paper
        return len(get_context(candidate).intersection(context)) + 1

    # max() returns the first maximal element, matching a strict '>' scan.
    return max(wn.synsets(word), key=overlap, default=None)

## Disambiguazione dei synset di: frame names, FEs e LUs

In [26]:
# Find best synset for:
# - frame (with the simplified name)
#   - frame elements
#   - lexical units
def disambiguate():
    """Disambiguate frame names, frame elements and lexical units.

    For every id in frame_ids the frame is loaded from FrameNet and a synset
    is chosen with find_best_syn for:
      - the simplified frame name, using the whole-frame context;
      - every frame element, using the frame element context;
      - the first 10 lexical units, using the lexical unit context. Only the
        name of the first lexeme of each lexical unit is looked up in WordNet.

    Progress is printed while running.

    Returns:
        A dictionary mapping each frame name to a dictionary
        {word: chosen synset or None}. When a frame element has the same name
        as the simplified frame name, its entry overwrites the frame-name one.

    Raises:
        ValueError: if frame_ids and simplified_names differ in length.
    """
    if len(frame_ids) != len(simplified_names):
        raise ValueError('frame_ids and simplified_names must have the same length')

    total = 0
    mapping = {}
    # Pair ids and names positionally: a list.index() lookup would return the
    # first match, which is wrong if an id ever appears twice.
    for frame_id, word in zip(frame_ids, simplified_names):
        frame = fn.frame_by_id(frame_id)
        frame_result = mapping[frame.name] = {}

        # Frame
        frame_context = get_frame_context(frame)
        best_syn = find_best_syn(word, frame_context)
        frame_result[word] = best_syn
        print(f'FRAME: {frame.name}\n\tName: {word} - {best_syn}')

        # Frame elements
        for fe in frame.FE:
            fe_context = get_fe_context(frame.FE[fe])
            best_syn = find_best_syn(fe, fe_context)
            frame_result[fe] = best_syn
            print(f'\tFE: {fe} - {best_syn}')

        # Lexical units
        # Disambiguate only the first 10 lexical units -- too many to manually annotate
        lu_names = list(frame.lexUnit)[:10]
        for lu in lu_names:
            lexical_unit = frame.lexUnit[lu]
            lu_context = get_lu_context(lexical_unit)
            best_syn = find_best_syn(lexical_unit.lexemes[0].name, lu_context)
            frame_result[lu] = best_syn
            print(f'\tLU: {lu} - {best_syn}')

        # Total words to disambiguate = 1 frame name + len(frame elements) + count (max 10 lexical units)
        frame_total = len(frame.FE) + len(lu_names) + 1
        print(f'Total words for frame {frame.name}: {frame_total}')
        print('-' * 70)
        total += frame_total

    print(f'Total words for all frames: {total}')
    return mapping

result = disambiguate()

FRAME: Suicide_attack
	Name: Suicide - Synset('suicide.n.01')
	FE: Victim - Synset('victim.n.01')
	FE: Weapon - Synset('weapon.n.01')
	FE: Assailant - Synset('attacker.n.01')
	FE: Manner - Synset('manner.n.01')
	FE: Means - Synset('means.n.01')
	FE: Place - Synset('topographic_point.n.01')
	FE: Purpose - Synset('purpose.n.01')
	FE: Time - Synset('time.n.01')
	FE: Source - Synset('beginning.n.04')
	FE: Depictive - Synset('delineative.s.01')
	FE: Re-encoding - None
	FE: Result - Synset('consequence.n.01')
	FE: Containing_event - None
	FE: Explanation - Synset('explanation.n.01')
	FE: Particular_iteration - None
	FE: Circumstances - Synset('circumstance.n.01')
	FE: Path - Synset('path.n.03')
	LU: suicide attack.n - Synset('suicide.n.01')
	LU: suicide bombing.n - Synset('suicide.n.02')
	LU: kamikaze [attack].n - Synset('kamikaze.n.01')
	LU: suicide bomber.n - Synset('suicide.n.01')
	LU: suicide attacker.n - Synset('suicide.n.02')
	LU: kamikaze [person].n - Synset('kamikaze.n.01')
Total wor

## Valutare i risultati con le annotations manuali

In [36]:
# Compute the accuracy of the disambiguation using the manually annotated data
def compute_accuracy(predicted=None, gold=None):
    """Count how many disambiguated words agree with the manual annotations.

    Args:
        predicted: {frame name: {word: synset object or None}}. Synset objects
            only need a ``name()`` method. Defaults to the module-level
            ``result``.
        gold: {frame name: {word: synset name or None}}. Defaults to the
            module-level ``annotations``.

    Returns:
        A tuple ``(correct, total)``. Only words that have a gold annotation
        are counted in ``total``; frames or words without one are skipped.
        A word is correct when both sides are None or when the predicted
        synset name equals the annotated one.
    """
    if predicted is None:
        predicted = result
    if gold is None:
        gold = annotations

    total = 0
    correct = 0
    for frame_name, found_by_word in predicted.items():
        # A frame without annotations contributes nothing, like an
        # unannotated word.
        gold_by_word = gold.get(frame_name, {})
        for word, found in found_by_word.items():
            if word not in gold_by_word:
                continue
            total += 1
            # I will consider None as correct if the annotation is None
            found_name = None if found is None else found.name()
            if found_name == gold_by_word[word]:
                correct += 1
    return correct, total

# Score the automatic disambiguation against the manual annotations and print
# the accuracy as a percentage, followed by the raw counts.
# Note: the division raises ZeroDivisionError if no word was scored (total == 0).
correct, total = compute_accuracy()
print(f'Accuracy: {(correct / total) * 100}%')
print(f'Correct: {correct}')
print(f'Total: {total}') 
# The total printed here is 1 less than the "Total words for all frames" printed
# by disambiguate(), because one word is duplicated in the frame Board_vehicle
# and both occurrences share a single dictionary key:
# - Simplified frame name: Vehicle
# - Second frame element: Vehicle

Accuracy: 65.27777777777779%
Correct: 47
Total: 72


In [34]:
# Default location of the disambiguation report.
_RESULTS_PATH = '/Users/jak/Documents/Uni/TLN/TLN/Radicioni/output/disambiguation-results.csv'


# Save the result in a csv file in the output folder
def save_result(path=_RESULTS_PATH, predicted=None, gold=None):
    """Write a report comparing found synsets with the gold annotations.

    For each frame the upper-cased frame name is written on its own line,
    followed by one tab-indented line per annotated word with the found
    synset name, the gold synset name and a check/cross mark. The file keeps
    the ``.csv`` extension but the lines are not strict CSV records.

    Args:
        path: destination file; it is overwritten.
        predicted: {frame name: {word: synset object or None}}. Defaults to
            the module-level ``result``.
        gold: {frame name: {word: synset name or None}}. Defaults to the
            module-level ``annotations``.

    Words (or frames) without a gold annotation are left out, so the rows
    match the words counted by compute_accuracy.

    Raises:
        OSError: if the destination file cannot be opened for writing.
    """
    if predicted is None:
        predicted = result
    if gold is None:
        gold = annotations

    # Explicit UTF-8: the check marks are not encodable in every default
    # locale encoding.
    with open(path, 'w', newline='', encoding='utf-8') as f:
        for frame_name, found_by_word in predicted.items():
            gold_by_word = gold.get(frame_name, {})
            f.write(f'{frame_name.upper()}\n')
            for word, found in found_by_word.items():
                if word not in gold_by_word:
                    continue
                expected = gold_by_word[word]
                synset = 'None' if found is None else found.name()
                if synset == expected or (synset == 'None' and expected is None):
                    check = '✅'
                else:
                    check = '❌'
                f.write(f'\t - {word.upper()},\t Found: {synset},\t Gold: {expected},\t {check}\n')
                    
# Write the comparison report to the output file.
save_result()