### Sample Data
This notebook samples word usages from a corpus for the headwords of a dictionary.
It is used to create the sample data for the first human annotation.
Since searching every sentence for all headwords is very time-consuming, the notebook samples a number of headwords and then searches for usages of these headwords in the corpus.
It keeps all usages found for the sampled headwords (only near-duplicate sentences are dropped), so further processing, i.e., reducing the number of usages, is done in another notebook.

### Usage
Various parameters need to be set in the second and third code cells of the notebook, including:
- `input_file_headwords`: path to the dictionary file
- `input_file_sentences`: path to the corpus file
- `number_of_words`: number of headwords to be sampled (Only needs to be large enough to find enough headwords with usages to reach the desired sample size.)
- `size_of_sample`: number of headwords to find usages for (the corpus search stops once this many headwords have at least one usage)
- `max_senses`: maximum number of senses for a headword to be considered (to not inflate the human annotation with headwords with many senses)

The notebook outputs a JSON file (`../data/outputs/SAMPLED_<corpus file name>.json`) with the sampled headwords and their usages.

In [1]:
import json
import random
from random import randrange
from Levenshtein import distance as lev

In [2]:
# number of headwords drawn from the dictionary before the corpus is searched
number_of_words = 2000
# the corpus search stops once this many headwords have at least one usage
size_of_sample = 170

# a fresh seed is drawn on every run and printed so the run can be traced afterwards
seed = random.randrange(100000)
random.seed(seed)
print(f"seed: {seed}")

seed: 56266


In [3]:
## ENGLISH
#input_file_headwords = '../data/dictionaries/wordnet_sense_id.json'
#input_file_sentences = '../data/corpora/PROCESSED_eng_news_2020_1M-sentences.json'
#input_file_sentences = '../data/corpora/PROCESSED_ccoha1.json'

## SWEDISH
input_file_headwords = '../data/dictionaries/sw_dict_sense_id.json'
input_file_sentences = '../data/corpora/PROCESSED_swe_news_2022_1M-sentences.json'
#input_file_sentences = '../data/corpora/PROCESSED_kubhist2a.json'

# headwords with more senses than this are skipped when the dictionary is loaded
max_senses = 5

# the output file is named after the corpus file: drop the directories,
# then drop everything from the first dot onwards
corpus_file_name = input_file_sentences.split("/")[-1]
corpus_name = corpus_file_name.split(".")[0]
output_file = f'../data/outputs/SAMPLED_{corpus_name}.json'

# placeholder; rebound to the result dictionary once the corpus has been searched
results = []

In [4]:
# finds a sublist in a list
def find_multiword(word, sentence):
    """Return the first index at which `word` occurs as a contiguous run in `sentence`.

    Both arguments are sequences that are compared element by element.
    Returns -1 when `word` does not occur (including when it is longer than
    `sentence`); an empty `word` matches at index 0.
    """
    width = len(word)
    last_start = len(sentence) - width
    for start in range(last_start + 1):
        window = sentence[start:start + width]
        if all(part == token for part, token in zip(word, window)):
            return start
    return -1

In [5]:
# finds all positions of a word in a sentence
def find_multiple_occurences(word, sentence):
    """Return every character offset of the string `sentence` at which `word` starts.

    Overlapping occurrences are all reported; the result is an empty list when
    `word` does not occur.
    """
    positions = []
    for offset in range(len(sentence)):
        if sentence.startswith(word, offset):
            positions.append(offset)
    return positions

In [6]:
# load headwords with at most max_senses senses
headwords = []

with open(input_file_headwords, encoding='utf-8') as f:
    data = json.load(f)

for dictionary_entry in data:
    senses = dictionary_entry['entries']

    # skip words with too many senses
    if len(senses) > max_senses:
        continue

    # skip words without any sense definition (they are needed for human evaluation)
    if not any(sense['sense'] != "" for sense in senses):
        continue

    # keys containing '_' are multiword expressions and are split into their parts;
    # all other keys are split on '-'
    key = dictionary_entry['key']
    separator = '_' if '_' in key else '-'
    headwords.append(key.split(separator))

print(f"number of headwords: {len(headwords)}")

number of headwords: 39069


In [7]:
# take a random sample of headwords
#
# The draw works on a copy of `headwords`, so re-running this cell samples from
# the full list again instead of from a list already depleted by an earlier run.
remaining_headwords = list(headwords)

# never ask for more headwords than exist; random.randint would raise on an empty range
sample_count = min(number_of_words, len(remaining_headwords))
if sample_count < number_of_words:
    print(f"WARNING: only {len(remaining_headwords)} headwords available, sampling {sample_count} instead of {number_of_words}")

headword_sample = []
for _ in range(sample_count):
    # same randint/pop sequence as before, so a given seed still yields the same sample
    index = random.randint(0, len(remaining_headwords) - 1) # pick a random headword
    headword_sample.append(remaining_headwords.pop(index)) # sampling without replacement

In [8]:
# find sentences containing the headwords
with open(input_file_sentences, encoding='utf-8') as f:
    data = json.load(f)

matches = {"file": f"{input_file_sentences.split('/')[-1]}", "entries": {}}
for d in data:
    if len(matches["entries"]) >= size_of_sample: # stop when sample size is reached
        break

    # one set per sentence instead of scanning the lemma list once per headword
    sentence_lemmas = set(d['lemmatized'])

    for headword in headword_sample: # for all headwords in sample
        if headword[0] not in sentence_lemmas:
            continue
        # a multiword expression must occur as a contiguous run of lemmas
        if len(headword) > 1 and find_multiword(headword, d['lemmatized']) == -1:
            continue

        lemma = '_'.join(headword)
        usages = matches["entries"].get(lemma)
        if usages is None: # first usage of this headword
            if len(matches["entries"]) >= size_of_sample:
                # the sample filled up while processing this very sentence;
                # do not add further headwords beyond the requested size
                continue
            usages = []
            matches["entries"][lemma] = usages
        elif any(lev(usage["entry"]["sentence"], d['sentence']) < 10 for usage in usages):
            # a stored usage is nearly identical (edit distance below 10): skip the duplicate
            continue

        usages.append({
            "sentence_id": d["sentence_number"],
            "entry": {
                "sentence": d['sentence'],
                "tokenized": d["tokenized"],
                "lemmatized": d['lemmatized'],
                "pos_tag": d['pos_tag'],
            },
        })

# run metadata; "entries" and "headwords_found" are filled in when the result file is constructed
results = {"seed": seed, "headwords_searched": len(headword_sample), "headwords_found": 0, "max_senses": max_senses, "entries": {}}

In [9]:
# construct result file

def _surface_variants(tokens):
    """Return the candidate surface forms of a usage: its tokens joined by each supported separator."""
    return [separator.join(tokens) for separator in (' ', '-', '/', ', ', ' , ')]


def _locate_usage(sentence, multiword):
    """Return the character offset of `multiword` in `sentence`, or None if there is no candidate.

    A candidate occurrence starts at the beginning of the sentence or directly
    after a non-alphabetic character. The first candidate that is also followed
    by a non-alphabetic character, or that ends the sentence, is returned; if no
    candidate has such a right boundary, the first candidate is returned.
    """
    candidates = [
        i for i in range(len(sentence) - len(multiword) + 1)
        if sentence.startswith(multiword, i) and (i == 0 or not sentence[i - 1].isalpha())
    ]
    if not candidates:
        return None
    for i in candidates:
        end = i + len(multiword)
        # the end of the sentence is a valid right boundary, too
        if end == len(sentence) or not sentence[end].isalpha():
            return i
    return candidates[0]


# start from a clean slate so that re-running this cell does not duplicate usages
results["entries"] = {}
results["headwords_found"] = 0

for lemma, usages in matches['entries'].items():
    lemma_parts = lemma.split('_')

    for usage in usages:
        sentence_number = usage['sentence_id']

        # retrieve sentences
        sentence = usage['entry']['sentence']
        tokenized = usage['entry']['tokenized']
        lemmatized = usage['entry']['lemmatized']
        pos_tag = usage['entry']['pos_tag']

        # position of headword in lemmatized sentence (inclusive token span)
        lemma_index_1 = find_multiword(lemma_parts, lemmatized)
        lemma_index_2 = lemma_index_1 + len(lemma_parts) - 1
        token_span = range(lemma_index_1, lemma_index_2 + 1)

        # surface tokens and pos information of the usage
        tokens = [tokenized[i] for i in token_span]
        pos_tags = [pos_tag[i] for i in token_span]

        # find the surface form that is actually used in the sentence
        multiwords = _surface_variants(tokens)
        multiword = next((w for w in multiwords if w in sentence), None)
        if multiword is None:
            print(f"ERROR: {multiwords[0]} not in {sentence}")
            continue

        # character span of the usage in the raw sentence
        sentence_index_1 = _locate_usage(sentence, multiword)
        if sentence_index_1 is None:
            print(f"ERROR: {multiword} not in {sentence}")
            continue
        sentence_index_2 = sentence_index_1 + len(multiword)

        # sanity check: the recorded span must reproduce the surface form
        if sentence[sentence_index_1:sentence_index_2] != multiword:
            print(f"ERROR: word usage {multiword} not at index {[sentence_index_1, sentence_index_2]}\nInstead: {sentence[sentence_index_1:sentence_index_2]}\n")

        results["entries"].setdefault(lemma, []).append({
            "word_usage": multiword,
            "lemma": lemma,
            "pos": pos_tags,
            "identifier": f"{matches['file']}-{sentence_number}-{lemma_index_1}:{lemma_index_2}",
            "sentence": sentence,
            "tokenized": tokenized,
            "lemmatized": lemmatized,
            "character_index_sentence": f"{sentence_index_1}:{sentence_index_2}",
            "index_lemmatized": f"{lemma_index_1}:{lemma_index_2}"
            })

# count only headwords that ended up with at least one usage in the results
results["headwords_found"] = len(results["entries"])

In [10]:
# space-joined form of every sampled headword
hw = [' '.join(parts) for parts in headword_sample]

# lemmas that made it into the results, in insertion order
found_lemmas = list(results['entries'])

# show the first few lemmas with the number of distinct lemmas, then the number of result entries
print(f"{found_lemmas[:5]} ({len(set(found_lemmas))})")
print(len(results['entries']))

['flytt', 'förstärka', 'roman', 'ockupera', 'undersökning'] (170)
170


In [11]:
# write the results to the output file as indented UTF-8 JSON (non-ASCII characters are kept as-is)
with open(output_file, 'w', encoding='utf-8') as output_handle:
    json.dump(results, output_handle, ensure_ascii=False, indent=4)