### Pre-sort samples
This notebook filters the model predictions.
It filters usages that have incomplete headwords and sorts the remaining usages by theire similarity to the nearest sense.

### Usage
Set the file paths in the second cell to the predictions of the models you want to filter.

In [1]:
import csv
from spacy.symbols import IS_PUNCT
import json
from nltk.corpus import wordnet as wn
import pandas as pd

In [2]:
historical_sample_file = "../data/outputs/model_predictions/SAMPLE_ccoha1_token-wordnet_sense_id_example[4]_embeddings.json"
modern_sample_file = "../data/outputs/model_predictions/SAMPLE_eng_news_2020_1M-sentences-wordnet_sense_id_example[4]_embeddings.json"


output_file = f"../data/outputs/annotation_phase_2/eng_combined_data.json"

In [7]:
with open(historical_sample_file, "r") as f:
    historical_sample = json.load(f)

    complete_historical = []

    for lemma in historical_sample:
        for usage in historical_sample[lemma]:
            if usage['empty_synsets'] == 0:
                complete_historical.append({
                    "lemma": lemma,
                    "sentence": usage['usage']['sentence'],
                    "target": usage['usage']['target'],
                    "highest_similarity": usage['usage']['closest'][0][1],
                    "corpus_id": f"ccoha1"
                })

complete_historical = pd.DataFrame(complete_historical)
complete_historical = complete_historical.drop_duplicates(subset=['sentence'])
complete_historical = complete_historical.sort_values(by=['highest_similarity'])

print(complete_historical.shape)
display(complete_historical.head())

historical_dict = complete_historical.to_dict(orient='records')
display(historical_dict[0])

(11, 5)


Unnamed: 0,lemma,sentence,target,highest_similarity,corpus_id
8,wage,This came barely 24 hours after Governor Fayem...,"[138, 142]",0.225902,ccoha1
9,resuscitate,Following mounting pressure from the Bretton W...,"[191, 202]",0.22819,ccoha1
3,qualification,The majority of the squad agreed to a 12.5 per...,"[115, 128]",0.258717,ccoha1
6,segment,The addition of 360 Blue to Natural Retreats' ...,"[166, 173]",0.285176,ccoha1
1,instrumental,The Red Bull junior program has played an inst...,"[42, 54]",0.288447,ccoha1


{'lemma': 'wage',
 'sentence': 'This came barely 24 hours after Governor Fayemis statewide broadcast where he appealed to wellmeaning Ekiti stakeholders to join hands in waging war against coronavirus.',
 'target': [138, 142],
 'highest_similarity': 0.22590182295190173,
 'corpus_id': 'ccoha1'}

In [9]:
with open(modern_sample_file, "r") as f:
    modern_sample = json.load(f)

    complete_modern = []

    for lemma in modern_sample:
        for usage in modern_sample[lemma]:
            if usage['empty_synsets'] == 0:
                complete_modern.append({
                    "lemma": lemma,
                    "sentence": usage['usage']['sentence'],
                    "target": usage['usage']['target'],
                    "highest_similarity": usage['usage']['closest'][0][1],
                    "corpus_id": f"leipzig_eng_news"
                })

complete_modern = pd.DataFrame(complete_modern)
complete_modern = complete_modern.drop_duplicates(subset=['sentence'])
complete_modern = complete_modern.sort_values(by=['highest_similarity'])

print(complete_modern.shape)
display(complete_modern.head())

modern_dict = complete_modern.to_dict(orient='records')
display(modern_dict[0])

(259, 5)


Unnamed: 0,lemma,sentence,target,highest_similarity,corpus_id
70,qualification,Slovan Bratislava's Champions League qualifica...,"[37, 50]",0.173049,leipzig_eng_news
68,qualification,Streichs team is currently eighth in the 18tea...,"[120, 133]",0.188805,leipzig_eng_news
67,qualification,"But they have fallen away badly, winning only ...","[144, 157]",0.192252,leipzig_eng_news
66,qualification,The bid for Aberdeens first Scottish Cup since...,"[65, 78]",0.19447,leipzig_eng_news
195,wage,Davis gained fame as a state legislator by wag...,"[43, 47]",0.194945,leipzig_eng_news


{'lemma': 'qualification',
 'sentence': "Slovan Bratislava's Champions League qualification tie against Faroe Islands side KI Klaksvik has been cancelled after a player from the Slovakian club tested positive for COVID19, European soccer's governing body UEFA said on Saturday.",
 'target': [37, 50],
 'highest_similarity': 0.17304900400543594,
 'corpus_id': 'leipzig_eng_news'}

In [6]:
# combine both datasets
combined_data = historical_dict + modern_dict
combined_data = pd.DataFrame(combined_data)
combined_data = combined_data.drop_duplicates(subset=['lemma', 'sentence'])
combined_data = combined_data.sort_values(by=['corpus_id', 'highest_similarity'])

combined_data.to_json(output_file, orient='records', indent=4)

print(combined_data.shape)
display(combined_data.head())

(4531, 5)


Unnamed: 0,lemma,sentence,target,highest_similarity,corpus_id
0,et,hugade Kipare f underrttelse om tw stycke goda...,"[73, 75]",0.034321,kubhist2a
1,et,Nsta Michaeli stundas at fra i sjelfwa Staben ...,"[46, 48]",0.052518,kubhist2a
2,et,en bestickning til Kejsaren i China den bestod...,"[88, 90]",0.069148,kubhist2a
3,tank,"Lsare mste gista sig till Skriftens innehll , ...","[157, 164]",0.072811,kubhist2a
4,patriot,Till Skellefte r lgenhet fr Fraktgods och Patz...,"[100, 109]",0.076772,kubhist2a


---

In [218]:
complete_synset_usages = []
with open(sample_file) as f:
    sample = json.load(f)

    for lemma in sample:
       # print(lemma)
        for usage in sample[lemma]: # ["unassigned_usages"]:
#            #u = usage['usage']
            if usage['empty_synsets'] == 0:
                complete_synset_usages.append({
                    "lemma": lemma,
                    "sentence": usage['usage']['sentence'],
                    "target": usage['usage']['target'],
                    "highest_similarity": usage['usage']['closest'][0][1],
                    #"corpus_id": "leipzig_eng_news",
                    "corpus_id": f"ccoha1"
                })
# sort by similarity
complete_synset_usages = sorted(complete_synset_usages, key=lambda x: x['highest_similarity'])
print(f"Number of usages with no missing synsets: {len(complete_synset_usages)}")
display(pd.DataFrame(complete_synset_usages).head(15))
# remove duplicate sentences
df = pd.DataFrame(complete_synset_usages).drop_duplicates(subset=['sentence'])
print(f"Number of usages with no missing synsets and no duplicate sentences: {len(df)}")
display(df.head(15))
# write backt to dict
complete_synset_usages = df.to_dict('records')


Number of usages with no missing synsets: 170


Unnamed: 0,lemma,sentence,target,highest_similarity,corpus_id
0,unused,"During my voyage , affrighted by the dangers w...","[93, 99]",0.223719,ccoha1
1,trample,"It were less cruel for me to have slain her , ...","[168, 175]",0.251587,ccoha1
2,strongly,When the wind blows strongly from the north an...,"[20, 28]",0.257914,ccoha1
3,strongly,When the wind blows strongly from the north an...,"[20, 28]",0.257914,ccoha1
4,warmly,"Besides , had it been warmly espoused at the f...","[22, 28]",0.267301,ccoha1
5,shirk,"to the river and washed , first for , it was i...","[282, 287]",0.267521,ccoha1
6,trample,Silence in the mean time held forth largely on...,"[128, 135]",0.269351,ccoha1
7,different,The terms of office of the Regents expire at d...,"[45, 54]",0.277505,ccoha1
8,fashioned,"But it ought to be known that , on this occasi...","[140, 149]",0.285888,ccoha1
9,different,Three parties had at different times visited P...,"[21, 30]",0.293393,ccoha1


Number of usages with no missing synsets and no duplicate sentences: 165


Unnamed: 0,lemma,sentence,target,highest_similarity,corpus_id
0,unused,"During my voyage , affrighted by the dangers w...","[93, 99]",0.223719,ccoha1
1,trample,"It were less cruel for me to have slain her , ...","[168, 175]",0.251587,ccoha1
2,strongly,When the wind blows strongly from the north an...,"[20, 28]",0.257914,ccoha1
4,warmly,"Besides , had it been warmly espoused at the f...","[22, 28]",0.267301,ccoha1
5,shirk,"to the river and washed , first for , it was i...","[282, 287]",0.267521,ccoha1
6,trample,Silence in the mean time held forth largely on...,"[128, 135]",0.269351,ccoha1
7,different,The terms of office of the Regents expire at d...,"[45, 54]",0.277505,ccoha1
8,fashioned,"But it ought to be known that , on this occasi...","[140, 149]",0.285888,ccoha1
9,different,Three parties had at different times visited P...,"[21, 30]",0.293393,ccoha1
10,immediate,"You are so direct and immediate , my beloved f...","[22, 31]",0.298968,ccoha1


In [219]:
# group by lemma
sample = {}
total_samples = 0
for usage in complete_synset_usages:
    if usage['lemma'] not in sample:
        sample[usage['lemma']] = {
            "samples" : 0,
            "unassigned_usages": [],
        }

instances_size = 0

for usage in complete_synset_usages:
    if instances_size > 900:
        break
    
    
    if sample[usage['lemma']]['samples'] < 8:
        instances_size += len(wn.synsets(usage['lemma']))
        sample[usage['lemma']]['samples'] += 1
        sample[usage['lemma']]['unassigned_usages'].append(usage)
        total_samples += 1


# remove lemmas with no samples
sample = {lemma: sample[lemma] for lemma in sample if sample[lemma]['samples'] > 0}

print(f"Number of lemmas with samples: {len(sample)}")
print(f"Number of instances: {instances_size}")
print(f"Number of samples: {total_samples}")
display(sample)

Number of lemmas with samples: 49
Number of instances: 381
Number of samples: 106


{'unused': {'samples': 2,
  'unassigned_usages': [{'lemma': 'unused',
    'sentence': 'During my voyage , affrighted by the dangers which surrounded me , and to which I was wholly unused , I heartily repented of my resolution but now , methinks , I have reason to rejoice at my perseverance .',
    'target': [93, 99],
    'highest_similarity': 0.2237192327980831,
    'corpus_id': 'ccoha1'},
   {'lemma': 'unused',
    'sentence': 'he said , with a modest air , unaccustomed as I am to public speaking , and unused as I am to receiving from ladies assurances of a character similar to There , now !',
    'target': [76, 82],
    'highest_similarity': 0.3073406431049281,
    'corpus_id': 'ccoha1'}]},
 'trample': {'samples': 8,
  'unassigned_usages': [{'lemma': 'trample',
    'sentence': 'It were less cruel for me to have slain her , after the brutal fashion of our people , than to have taught her hopes and feelings on which the cruel necessity makes me trample , as if I cared for them nothing 

In [220]:
with open(output_file, 'w') as f:
    json.dump(sample, f, indent=4)

In [221]:
with open("../data_sampling_2/data/annotation_data/sp_f3_150k_unassigned_prediction_sorted_reevaluated[700].json", 'r') as f:
    sample2 = json.load(f)
    print(f"Number of lemmas with samples: {len(sample2)}")
    display(sample2)

    for lemma in sample2:
        if lemma not in sample:
            sample[lemma] = sample2[lemma]
        else:
            sample[lemma]['unassigned_usages'].extend(sample2[lemma]['unassigned_usages'])
            sample[lemma]['samples'] += sample2[lemma]['samples']

Number of lemmas with samples: 82


{'qualification': {'samples': 15,
  'unassigned_usages': [{'lemma': 'qualification',
    'sentence': "Slovan Bratislava's Champions League qualification tie against Faroe Islands side KI Klaksvik has been cancelled after a player from the Slovakian club tested positive for COVID19, European soccer's governing body UEFA said on Saturday.",
    'target': [37, 50],
    'highest_similarity': 0.17304900400543594,
    'corpus_id': 'leipzig_eng_news'},
   {'lemma': 'qualification',
    'sentence': 'Streichs team is currently eighth in the 18team Bundesliga, two points behind Hoffenheim in the last place for European qualification.',
    'target': [120, 133],
    'highest_similarity': 0.18880463888491522,
    'corpus_id': 'leipzig_eng_news'},
   {'lemma': 'qualification',
    'sentence': 'But they have fallen away badly, winning only five times in the league in 2020 and they are in real danger of missing out on a Champions League qualification spot.',
    'target': [144, 157],
    'highest_sim

In [222]:
samples = 0
for lemma in sample:
    samples += sample[lemma]['samples']
print(f"Number of lemmas with samples: {len(sample)}")
print(f"Number of samples: {samples}")

Number of lemmas with samples: 115
Number of samples: 332


In [216]:
with open("../data_sampling_2/data/annotation_data/combined_results.json", 'w') as f:
    json.dump(sample, f, indent=4)
