# **Mapping of frames in WordNet synsets**

Frames estratti con la funzione getFrameSetForStudent() per il cognome Grandi:
| ID | Frame |
| --- | --- |
| 2658 | Suicide_attack |
| 1633 | Board_vehicle |
| 1260 | Simple_name |
| 1871 | Access_scenario |
| 652 | Eclipse |

Dato che: 
 - Suicide_attack
 - Board_vehicle 
 - Simple_name 
 - Access_scenario 

non sono presenti in WordNet, sono stati scelti rispettivamente:
- Suicide
- Vehicle
- Name
- Access

In pratica dovrò: associare dei synset al frame name, frame elements e LUs presi dai 5 del frameset.
Considerando come contesto di disambiguazione la frame name definition, frame element definition e LU definition, rispettivamente.
Come contesto dei sensi considero le varie definizioni del termine principale del frame name, la gloss, esempi e iponimi e iperonimi. E lo stesso per FE e LU


    

In [157]:
from nltk.corpus import framenet as fn
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
import pandas as pd

In [235]:
# Frame ids for student Grandi
frame_ids = [2658, 1633, 1260, 1871, 652]

# Simplified frame names
simplified_names = ['Suicide', 'Vehicle', 'Name', 'Access', 'Eclipse']

## Creo manualmente le annotazioni

In [252]:
# Create a dictionary with: 
# - key = frame 
# - value = dictionary with: 
#           - key = word (Frame name, FE, LU
#           - value = 
annotations = {
    2658: {
        'Suicide': 'suicide.n.01',
        'Victim': 'victim.n.01',
        'Weapon': 'weapon.n.01',
        'Assailant': 'attacker.n.01',
        'Manner': 'manner.n.01',
        'Means': 'means.n.01',
        'Place': 'topographic_point.n.01',
        'Purpose': 'purpose.n.01',
        'Time': 'fourth_dimension.n.01',
        'Source': 'beginning.n.04',
        'Depictive': 'delineative.s.01',
        'Re-encoding': None,
        'Result': 'consequence.n.01',
        'Containing_event': None,
        'Explanation': 'explanation.n.01',
        'Particular_iteration': None,
        'Circumstances': 'circumstance.n.01',
        'Path': 'path.n.03',
        'suicide attack.n': 'suicide_bombing.n.01',
        'suicide bombing.n': 'suicide_bombing.n.01',
        'kamikaze [attack].n': 'kamikaze.n.02',
        'suicide bomber.n': 'suicide_bomber.n.01',
        'suicide attacker.n': 'suicide_bomber.n.01',
        'kamikaze [person].n': 'kamikaze.n.02'
    },
    1633: {
        'Vehicle': 'vehicle.n.01',
        'Traveller': 'traveler.n.01',
        'Vehicle': 'vehicle.n.01',
        'Source': 'beinning.n.04',
        'Manner': 'manner.n.01',
        'Path': 'path.n.03',
        'Duration_of_final_state': 'duration.n.01',
        'Depictive': 'delineative.s.01',
        'Time': 'fourth_dimension.n.01',
        'Cotheme': None,
        'Purpose': 'purpose.n.01',
        'Place': 'topographic_point.n.01',
        'Circumstances': 'circumstance.n.01',
        'board.v': 'board.v.01',
        'get.v': 'board.v.01',
        'embark.v': 'embark.v.01',
        'emplane.v': 'emplane.v.01',
        'entrain.v': 'entrain.v.01',
        'mount.v': 'hop_on.v.01',
        'hop.v': 'hop.v.03',
        'embarkation.n': 'boarding.n.01',
        'embarkment.n': 'boarding.n.01',
        'embus.v': None
    },
    1260: {
        'Name': 'name.n.01',
        'Term': 'term.n.01',
        'Entity': 'entity.n.01',
        'Speaker': 'speaker.n.01',
        'term.n': 'term.n.01',
        'word.n': 'word.n.01'
    },
    1871: {
        'Access': 'access.n.06',
        'Theme': 'subject.n.01',
        'Useful_location': 'location.n.01',
        'Barrier': 'barrier.n.01'
    },
    652: {
        'Eclipse': 'eclipse.n.01',
        'Obstruction': 'obstruction.n.01',
        'Eclipsed': 'eclipse.v.02',
        'Vantage_point': 'vantage_point.n.01',
        'Degree': 'degree.n.01',
        'Subregion': 'region.n.01',
        'hide.v': 'hide.v.01',
        'eclipse.v': 'eclipse.v.02',
        'mask.v': 'mask.v.05',
        'cover.v': 'cover.v.01',
        'obscure.v': 'obscure.v.01',
        'screen.v': 'screen.v.05',
        'veil.v': 'veil.v.01',
        'hidden.a': 'hide.v.01',
        'blot out.v': 'obscure.v.05',
        'obscured.a': 'obscure.n.01'
    }
}

### Funzioni di utility per il preprocessing

In [95]:
# Remove stopwords and punctuation from a given sentence
def remove_stopwords(sentence):
    stop_words = []
    with open('/Users/jak/Documents/Uni/TLN/TLN/Radicioni/data/stop_words_FULL.txt', 'r') as f:
        for line in f:
            stop_words.append(line.strip())
    sentence = [w for w in sentence if w not in stop_words]
    return sentence

# Remove punctuation from a given list of words
def remove_punctuation(list):
    final_list = []
    punct = set(['.', ',', '!', '?', ':', ';', '(', ')', '[', ']', '{', '}', '"', "'", '``', "''", '...', '’', '“', '”']) # Keeping '-' between words
    for word in list:
        for c in word:
            if c in punct:
                word = word.replace(c, '')
            if c == '_':
                word = word.replace(c, ' ')
        final_list.append(word)
    return final_list

# Lemmatize a given list of words
def lemmatize(list):
    lemmatizer = WordNetLemmatizer()
    final_set = set()
    for word in list:
        final_set.add(lemmatizer.lemmatize(word))
    return final_set

# Full preprocessing of a given list of words
def preprocess(list):
    list = remove_stopwords(list) # Don't get why if I remove stopwords even here I get less words
    list = remove_punctuation(list)
    list = lemmatize(list)
    list = remove_stopwords(list)
    list = [x for x in list if x] # Remove empty strings
    return set(list)


## Estrazione del contesto dei sensi

Creo una lista di content words per un dato synset, cercando all'interno di:
- definizione del synset
- esempi del synset
- definizioni di iponimi e iperonimi
- esempi di iponimi e iperonimi

In [96]:
# Get context for a given synset
def get_context(synset):
    context = set(synset.name().split('.')[0].split('_')) 
    context.update(synset.definition().lower().split())
    for example in synset.examples():
        context.update(example.lower().split())
    for hypo in synset.hyponyms():
        context.update(hypo.name().split('.')[0].split('_'))
        context.update(hypo.definition().lower().split())
        for example in hypo.examples():
            context.update(example.lower().split())
    for hyper in synset.hypernyms():
        context.update(hyper.name().split('.')[0].split('_'))
        context.update(hyper.definition().lower().split())
        for example in hyper.examples():
            context.update(example.lower().split())
    context = preprocess(context)
    return context

context = get_context(wn.synset('eclipse.n.01'))
print(context)
print(len(context))

{'eclipse', 'total', 'action', 'celestial', 'activity', 'completely', 'body', 'obscures', 'abrupt', 'eclipsed', 'occurrence', 'obscured', 'player', 'light', 'sun', 'place', 'annoying', 'telephone', 'solar', 'ongoing', 'interrupt', 'moon', 'shining', 'hurt', 'lunar', 'partial', 'partially', 'interruption', 'earth', 'break'}
30


Estraggo il contesto dei sensi per ogni synset di una parola data

In [90]:
# TODO: da cambiare di sihuro deh
# Get context for all synsets of a given word
def get_context_all(word):
    context = []
    for synset in wn.synsets(word):
        context.append(get_context(synset))
    return context

context = get_context_all('eclipse')
print(context)


[['eclipse', 'total', 'action', 'celestial', 'activity', 'completely', 'body', 'obscures', 'abrupt', 'eclipsed', 'occurrence', 'obscured', 'player', 'light', 'sun', 'place', 'annoying', 'telephone', 'solar', 'ongoing', 'interrupt', 'moon', 'shining', 'hurt', 'lunar', 'partial', 'partially', 'interruption', 'earth', 'break'], ['overshadowed', 'happiness', 'threatening', 'day', 'dark', 'couple', 'greater', 'vision', 'brood', 'significance', 'long', 'overshadow', 'terrible', 'menacing', 'hang', 'tragedy', 'brooded'], ['house', 'intervention', 'overshadowed', 'eclipse', 'moon', 'today', 'planet', 'shadow', 'tree', 'celestial', 'tall', 'cast', 'star', 'overshadow', 'occulted', 'body', 'sun']]


## Estrazione del contesto del frame

Creo una lista di content words per un dato frame, cercando all'interno di:
- Frame name
- Frame definition
- Frame Element name
- Frame definition
- Lexical Unit name
- Lexical Unit definition

In [251]:
# Get context for a given frame
def get_frame_context(frame):
    context = set(frame.definition.lower().split())
    context.update(frame.name.lower().split())
    for fe in frame.FE:
        context.update(frame.FE[fe].definition.lower().split())
        context.update(fe.lower().split())
    for lu in frame.lexUnit:
        context.update(frame.lexUnit[lu].lexemes[0].name.lower().split())
        context.update(frame.lexUnit[lu].definition.lower().split())
    context = preprocess(context)
    return context

context = get_frame_context(fn.frame('Board_vehicle'))
print(context)
print(len(context))

{'starting-point', 'woman', 'transport', 'brandhoek', 'embarked', 'expression', 'entrain', 'describes', 'bus', 'flight', 'ahead', 'boarding', 'cotheme', 'hellespont', 'main', 'shuttle', 'mounted', 'area', 'hour', 'drunk', '0730', '19', 'airborne', 'embarkation', 'bank', 'passenger', 'delayed', 'mount', 'embarkment', 'brave', 'hitting', 'time', 'describe', 'animal', 'order', 'property', 'lawler', 'vessel', 'southward', 'duration of final state', 'cod', 'rhine', 'co-traveller', 'boarded', 'independent', 'embarking', 'handsome', 'moving', 'place', 'merchant', 'mile', 'battalion', '1834', 'gather', 'station', 'embussed', 'headed', 'event', 'december', 'ride', 'transportation', 'kate', 'horse', 'participant', 'power', 'glittered', 'path', 'aboard', 'looked', 'board', '5th', 'element', 'aircraft', 'circling', 'fn', 'jump', 'circumstance', 'entrained', 'bicycle', 'conveyance', 'sun', 'reportedly', 'bart', 'plane', 'october', 'depictive', 'motion', 'bar', 'board vehicle', 'embark', 'armour', '

## Estrazione del contesto del frame element

Creo una lista di content words per un dato frame element, cercando all'interno di:
- Frame Element name
- Frame Element definition

In [174]:
# Get context for a given frame element
def get_fe_context(frame_element):
    context = set(frame_element.definition.lower().split())
    context.update(frame_element.name.lower().split())
    context = preprocess(context)
    return context

## Estrazione del contesto del lexical unit

Creo una lista di content words per un dato lexical unit, cercando all'interno di:
- Lexical Unit name
- Lexical Unit definition

In [195]:
# Get context for a given lexical unit
def get_lu_context(lexical_unit):
    context = set(lexical_unit.lexemes[0].name.lower().split())
    context.update(lexical_unit.definition.lower().split())
    context = preprocess(context)
    return context

## Mapping dei synset con i frame

Approccio bag of words

In [185]:
# Find best synset for a given word and context
def find_best_syn(word, context):
    best = 0
    best_syn = None
    for synset in wn.synsets(word):
        syn_context = get_context(synset)
        score = len(syn_context.intersection(context)) + 1 # + 1 According to the paper
        # print(synset, score)
        if score > best:
            best = score
            best_syn = synset
    return best_syn

## Disambiguazione dei synset di: Frames, FEs e LUs

In [257]:
# Find best synset for:
# - frame (with the simplified name)
#   - frame elements
#   - lexical units

def disambiguate():
    i = 0

    for frame_id in frame_ids:
        frame = fn.frame_by_id(frame_id)
        gold_annotation = annotations[frame_id]
        print(gold_annotation.keys())
        
        # Frame
        word = simplified_names[frame_ids.index(frame_id)]
        frame_context = get_frame_context(frame)
        best_syn = find_best_syn(word, frame_context)
        print(f'FRAME: {frame.name} - {word} - {best_syn}\t GOLD: {gold_annotation}')

        # Frame elements
        for fe in frame.FE:
            fe_context = get_fe_context(frame.FE[fe])
            best_syn = find_best_syn(fe, fe_context)
            print(f'FE: {fe} - {best_syn}')

        # Lexical units
        count = 0
        for lu in frame.lexUnit:
            # Disambiguate only the first 10 lexical units -- too many to manually annotate
            if count >= 10:
                break
            count += 1
            lu_context = get_lu_context(frame.lexUnit[lu])
            best_syn = find_best_syn(frame.lexUnit[lu].lexemes[0].name, lu_context)
            print(f'LU: {lu} - {best_syn}')
        
        # number of FEs + LUs
        print(f'Total FEs + LUs + frame name: {len(frame.FE) + count + 1}')
        # Separator
        print('-' * 70)

disambiguate()


dict_keys(['Suicide', 'Victim', 'Weapon', 'Assailant', 'Manner', 'Means', 'Place', 'Purpose', 'Time', 'Source', 'Depictive', 'Re-encoding', 'Result', 'Containing_event', 'Explanation', 'Particular_iteration', 'Circumstances', 'Path', 'suicide attack.n', 'suicide bombing.n', 'kamikaze [attack].n', 'suicide bomber.n', 'suicide attacker.n', 'kamikaze [person].n'])
FRAME: Suicide_attack - Suicide - Synset('suicide.n.01')	 GOLD: {'Suicide': 'suicide.n.01', 'Victim': 'victim.n.01', 'Weapon': 'weapon.n.01', 'Assailant': 'attacker.n.01', 'Manner': 'manner.n.01', 'Means': 'means.n.01', 'Place': 'topographic_point.n.01', 'Purpose': 'purpose.n.01', 'Time': 'fourth_dimension.n.01', 'Source': 'beginning.n.04', 'Depictive': 'delineative.s.01', 'Re-encoding': None, 'Result': 'consequence.n.01', 'Containing_event': None, 'Explanation': 'explanation.n.01', 'Particular_iteration': None, 'Circumstances': 'circumstance.n.01', 'Path': 'path.n.03', 'suicide attack.n': 'suicide_bombing.n.01', 'suicide bombin

In [237]:
# Create a dictionary with: key = frame, value = list of words to annotate
def create_list():
    words = []
    final_list = {}
    i = 0
    for frame_id in frame_ids:
        words = []
        frame = fn.frame_by_id(frame_id)
        final_list[frame.name] = []
        words.append(frame.name)
        for fe in frame.FE:
            words.append(fe)
        count = 0
        for lu in frame.lexUnit:
            if count >= 10:
                break
            count += 1
            words.append(lu)
        for word in words:
            if word not in final_list[frame.name]:
                final_list[frame.name].append(word)
    # substitute the first word of each list (which is the multiword frame name) with the simplified name
    for key in final_list:
        final_list[key][0] = simplified_names[i]
        i += 1
        # final_list[key] = pd.unique(final_list[key]).tolist() # I use pd.unique in order to maintain the order
    return final_list

words = create_list()
for frame in words:
    print(frame)
    print(words[frame])
    print(len(words[frame]))

Suicide_attack
['Suicide', 'Victim', 'Weapon', 'Assailant', 'Manner', 'Means', 'Place', 'Purpose', 'Time', 'Source', 'Depictive', 'Re-encoding', 'Result', 'Containing_event', 'Explanation', 'Particular_iteration', 'Circumstances', 'Path', 'suicide attack.n', 'suicide bombing.n', 'kamikaze [attack].n', 'suicide bomber.n', 'suicide attacker.n', 'kamikaze [person].n']
24
Board_vehicle
['Vehicle', 'Traveller', 'Vehicle', 'Source', 'Manner', 'Path', 'Duration_of_final_state', 'Depictive', 'Time', 'Cotheme', 'Purpose', 'Place', 'Circumstances', 'board.v', 'get.v', 'embark.v', 'emplane.v', 'entrain.v', 'mount.v', 'hop.v', 'embarkation.n', 'embarkment.n', 'embus.v']
23
Simple_name
['Name', 'Term', 'Entity', 'Speaker', 'term.n', 'word.n']
6
Access_scenario
['Access', 'Theme', 'Useful_location', 'Barrier']
4
Eclipse
['Eclipse', 'Obstruction', 'Eclipsed', 'Vantage_point', 'Degree', 'Subregion', 'hide.v', 'eclipse.v', 'mask.v', 'cover.v', 'obscure.v', 'screen.v', 'veil.v', 'hidden.a', 'blot out.v'

In [250]:
# Create a dictionary with: 
# - key = frame 
# - value = dictionary with: 
#           - key = word (Frame name, FE, LU
#           - value = 
annotations = {
    'Suicide_attack': {
        'Suicide': 'suicide.n.01',
        'Victim': 'victim.n.01',
        'Weapon': 'weapon.n.01',
        'Assailant': 'attacker.n.01',
        'Manner': 'manner.n.01',
        'Means': 'means.n.01',
        'Place': 'topographic_point.n.01',
        'Purpose': 'purpose.n.01',
        'Time': 'fourth_dimension.n.01',
        'Source': 'beginning.n.04',
        'Depictive': 'delineative.s.01',
        'Re-encoding': None,
        'Result': 'consequence.n.01',
        'Containing_event': None,
        'Explanation': 'explanation.n.01',
        'Particular_iteration': None,
        'Circumstances': 'circumstance.n.01',
        'Path': 'path.n.03',
        'suicide attack.n': 'suicide_bombing.n.01',
        'suicide bombing.n': 'suicide_bombing.n.01',
        'kamikaze [attack].n': 'kamikaze.n.02',
        'suicide bomber.n': 'suicide_bomber.n.01',
        'suicide attacker.n': 'suicide_bomber.n.01',
        'kamikaze [person].n': 'kamikaze.n.02'
    },
    'Board_vehicle': {
        'Vehicle': 'vehicle.n.01',
        'Traveller': 'traveler.n.01',
        'Vehicle': 'vehicle.n.01',
        'Source': 'beinning.n.04',
        'Manner': 'manner.n.01',
        'Path': 'path.n.03',
        'Duration_of_final_state': 'duration.n.01',
        'Depictive': 'delineative.s.01',
        'Time': 'fourth_dimension.n.01',
        'Cotheme': None,
        'Purpose': 'purpose.n.01',
        'Place': 'topographic_point.n.01',
        'Circumstances': 'circumstance.n.01',
        'board.v': 'board.v.01',
        'get.v': 'board.v.01',
        'embark.v': 'embark.v.01',
        'emplane.v': 'emplane.v.01',
        'entrain.v': 'entrain.v.01',
        'mount.v': 'hop_on.v.01',
        'hop.v': 'hop.v.03',
        'embarkation.n': 'boarding.n.01',
        'embarkment.n': 'boarding.n.01',
        'embus.v': None
    },
    'Simple_name': {
        'Name': 'name.n.01',
        'Term': 'term.n.01',
        'Entity': 'entity.n.01',
        'Speaker': 'speaker.n.01',
        'term.n': 'term.n.01',
        'word.n': 'word.n.01'
    },
    'Access_scenario': {
        'Access': 'access.n.06',
        'Theme': 'subject.n.01',
        'Useful_location': 'location.n.01',
        'Barrier': 'barrier.n.01'
    },
    'Eclipse': {
        'Eclipse': 'eclipse.n.01',
        'Obstruction': 'obstruction.n.01',
        'Eclipsed': 'eclipse.v.02',
        'Vantage_point': 'vantage_point.n.01',
        'Degree': 'degree.n.01',
        'Subregion': 'region.n.01',
        'hide.v': 'hide.v.01',
        'eclipse.v': 'eclipse.v.02',
        'mask.v': 'mask.v.05',
        'cover.v': 'cover.v.01',
        'obscure.v': 'obscure.v.01',
        'screen.v': 'screen.v.05',
        'veil.v': 'veil.v.01',
        'hidden.a': 'hide.v.01',
        'blot out.v': 'obscure.v.05',
        'obscured.a': 'obscure.n.01'
    }
}

In [173]:
words = create_list()
disambiguate(words)

Suicide_attack ['suicide', 'victim', 'weapon', 'assailant', 'manner', 'means', 'place', 'purpose', 'time', 'source', 'depictive', 're-encoding', 'result', 'containing_event', 'explanation', 'particular_iteration', 'circumstances', 'path', 'kamikaze']
Board_vehicle ['vehicle', 'traveller', 'source', 'manner', 'path', 'duration_of_final_state', 'depictive', 'time', 'cotheme', 'purpose', 'place', 'circumstances', 'board', 'get', 'embark', 'emplane', 'entrain', 'mount', 'hop', 'embarkation', 'embarkment', 'embus']
Simple_name ['name', 'term', 'entity', 'speaker', 'word']
Access_scenario ['access', 'theme', 'useful_location', 'barrier']
Eclipse ['eclipse', 'obstruction', 'eclipsed', 'vantage_point', 'degree', 'subregion', 'hide', 'mask', 'cover', 'obscure', 'screen', 'veil', 'hidden', 'blot', 'obscured', 'block', 'screened', 'masked', 'veiled', 'covered', 'occlude', 'cloak', 'shroud', 'shrouded', 'cloaked', 'occlusion', 'occultation', 'conceal', 'concealed', 'obstruct', 'becloud', 'befog', 

In [8]:
# Save the frameset in a dictionary
# frames = {
#     '2658': 'Suicide_attack',
#     '1633': 'Board_vehicle',
#     '1260': 'Simple_name',
#     '1871': 'Access_scenario',
#     '652': 'Eclipse'
# }

# for key in frames:
#     print(fn.frame(frames[key]))