Zunächst müssen einige Module importiert werden, die wir nutzen wollen.

In [43]:
from xml.etree import ElementTree as ET
import random
from random import *
import deepl

Jetzt machen wir die OdeNet-XML-Datei auf, parsen das XML und öffnen eine Datei, in die die Ausgabe geschrieben wird.

In [44]:
# Location of the OdeNet XML file. NOTE(review): this is an absolute,
# machine-specific path; adjust it to the local checkout.
ODENET_XML_PATH = r"C:\Users\melaniesiegel\Documents\05_Projekte\WordNet\OdeNet\odenet.git\trunk\deWordNet.xml"

# Parse inside a context manager so the input handle is closed as soon as the
# tree is built (de_wn stays bound afterwards, but refers to a closed file).
with open(ODENET_XML_PATH, "r", encoding="utf-8") as de_wn:
    tree = ET.parse(de_wn)

# Output file; intentionally left open here so that later cells can write to it.
out_lex = open("out_lex.txt", "w", encoding="utf-8")

root = tree.getroot()

# The <Lexicon> element that all lookup functions below iterate over.
lexicon = root.find('Lexicon')


Mit check_word_lemma greift man auf Lexikon-Einträge zu, bekommt die Lexikon-ID für ein Wort, den Lemma-Wert, POS und die IDs der Synsets, in denen das Wort enthalten ist.

In [45]:
def check_word_lemma(word_to_check):
    """Find the first lexical entry whose written form equals ``word_to_check``.

    Scans the ``LexicalEntry`` elements of the module-level ``lexicon`` in
    document order.

    Args:
        word_to_check: Written form of the lemma to look up (exact match).

    Returns:
        A tuple ``(entry_id, written_form, part_of_speech, senses)`` where
        ``senses`` is a list of ``[sense_id, synset_id]`` pairs, or ``None``
        when no entry matches.
    """
    for entry in lexicon.iter('LexicalEntry'):
        lemma_node = entry.find('Lemma')
        written_form = lemma_node.attrib['writtenForm']
        entry_id = entry.attrib['id']
        if written_form != word_to_check:
            continue
        part_of_speech = lemma_node.attrib['partOfSpeech']
        sense_pairs = [
            [sense_node.attrib['id'], sense_node.attrib['synset']]
            for sense_node in entry.iter('Sense')
        ]
        return (entry_id, written_form, part_of_speech, sense_pairs)
    # No entry carries this written form.
    return None

In [46]:
# Example lookup: yields (entry ID, lemma, part of speech, [[sense ID, synset ID], ...]).
check_word_lemma("Leumund")

('w24078', 'Leumund', 'n', [['w24078_5598-n', 'odenet-5598-n']])

Hier bekommt man die Lexikon-IDs für eine Liste von Wörtern.

In [47]:
def words2ids(wordlist):
    """Map words to the IDs of their lexical entries.

    Args:
        wordlist: Iterable of written forms to look up via ``check_word_lemma``.

    Returns:
        List of entry IDs, in input order, for the words that were found.
        For each word without an entry, ``"<word> NOT IN ODENET"`` is printed
        and the word is skipped.
    """
    word_id_list = []
    for word in wordlist:
        entry = check_word_lemma(word)
        # check_word_lemma returns None for unknown words. Test for that
        # explicitly instead of using a bare ``except``, which would also hide
        # KeyboardInterrupt and genuine lookup errors behind "NOT IN ODENET".
        if entry is None:
            print(word + " NOT IN ODENET")
            continue
        word_id_list.append(entry[0])
    return word_id_list

In [48]:
# Example: entry IDs for the four seasons; words without an entry are reported and skipped.
words2ids(['Frühling','Sommer','Herbst','Winter'])

['w14145', 'w44811', 'w1202374', 'w25612']

Mit check_word_id bekommt man für eine Lexikon-ID Lemma, POS, Synsets und Relationen.

In [49]:
def check_word_id(id):
    """Look up a lexical entry by its ID.

    Scans the ``LexicalEntry`` elements of the module-level ``lexicon`` and
    stops at the first entry whose ``id`` attribute matches.

    Args:
        id: Entry ID to look for (the parameter name shadows the builtin but
            is kept so existing callers keep working).

    Returns:
        A tuple ``(lemma_value, pos, senses, relations)`` where ``senses`` is
        the list of synset IDs of the entry's senses and ``relations`` is a
        list of ``(relType, target)`` pairs gathered from the ``SenseRelation``
        elements of *all* senses. ``None`` if no entry has this ID.
    """
    for lexentry in lexicon.iter('LexicalEntry'):
        if lexentry.attrib['id'] != id:
            continue
        lemma = lexentry.find('Lemma')
        lemma_value = lemma.attrib['writtenForm']
        pos = lemma.attrib['partOfSpeech']
        senses = []
        # Initialised once per entry: resetting this per sense would discard
        # the relations of every sense but the last, and an entry without
        # senses would leave it undefined.
        relations = []
        for sense in lexentry.iter('Sense'):
            senses.append(sense.attrib['synset'])
            for relation in sense.iter('SenseRelation'):
                relations.append((relation.attrib['relType'], relation.attrib['target']))
        return (lemma_value, pos, senses, relations)
    # Unknown ID: same "not found" convention as check_word_lemma.
    return None

In [50]:
# Example: yields (lemma, part of speech, synset IDs, relations) for the entry with ID 'w14145'.
check_word_id('w14145')

('Frühling', 'n', ['odenet-3067-n'], [])

Mit words_in_synset bekommt man die Wörter, die in einem Synset sind.

In [51]:
def words_in_synset(id):
    """Return the written forms of all entries that have a sense in a synset.

    Args:
        id: Synset ID that is compared with the ``synset`` attribute of every
            ``Sense`` element in the module-level ``lexicon``.

    Returns:
        List of lemma written forms in document order, one item per matching
        sense; empty if nothing matches.
    """
    return [
        entry.find('Lemma').attrib['writtenForm']
        for entry in lexicon.iter('LexicalEntry')
        for sense_node in entry.iter('Sense')
        if sense_node.attrib['synset'] == id
    ]

In [52]:
# Example: written forms of all entries that have a sense in synset 'odenet-12371-n'.
words_in_synset('odenet-12371-n')

['Gesellschaftssystem', 'Gesellschaftsformation', 'Gesellschaftsform']

Mit check_synset bekommt man alle Informationen zu einem Synset.

In [53]:
def check_synset(id):
    """Collect the information stored for one synset.

    Args:
        id: Synset ID, matched against the ``id`` attribute of the ``Synset``
            elements in the module-level ``lexicon``.

    Returns:
        A tuple ``(ili, en_definition, de_definition, relations, words)``:

        * ``ili`` - value of the ``ili`` attribute.
        * ``en_definition`` - value of the Dublin Core ``description``
          attribute, or ``[]`` if the attribute is absent.
        * ``de_definition`` - stripped text of the ``Definition`` child, or
          ``[]`` if there is no such child or it has no text.
        * ``relations`` - list of ``(relType, target)`` pairs from the
          ``SynsetRelation`` elements.
        * ``words`` - result of ``words_in_synset(id)``.

        ``None`` if no synset has this ID.
    """
    for synset in lexicon.iter('Synset'):
        if synset.attrib['id'] != id:
            continue
        ili = synset.attrib['ili']
        en_definition = synset.attrib.get(
            "{http://purl.org/dc/elements/1.1/}description", []
        )
        definition = synset.find('Definition')
        # An empty <Definition/> element has text None; treat it like a
        # missing definition instead of failing on None.strip().
        if definition is not None and definition.text is not None:
            de_definition = definition.text.strip()
        else:
            de_definition = []
        relations = [
            (relation.attrib['relType'], relation.attrib['target'])
            for relation in synset.iter('SynsetRelation')
        ]
        # The member-word lookup scans the whole lexicon, so it is only done
        # once the synset is known to exist.
        words = words_in_synset(id)
        return (ili, en_definition, de_definition, relations, words)
    return None

In [54]:
# Example: yields (ILI, description attribute, German definition, synset relations, member words).
check_synset("odenet-10251-a")

('i10885',
 'remote in manner',
 [],
 [('antonym', 'odenet-3403-a')],
 ['distanziert',
  'zugeknöpft',
  'in sich gekehrt',
  'zurückhaltend',
  'verschlossen',
  'verschwiegen',
  'unzugänglich',
  'introvertiert',
  'nach innen gekehrt',
  'unaufgeschlossen',
  'nicht mitteilsam',
  'reserviert',
  'nicht erreichbar'])

In [55]:
def _related_words(word, reltype):
    """Collect the synsets linked by one relation type to each sense of ``word``.

    Shared implementation of the ``*_word`` functions below, which differ only
    in the relation type they filter on.

    Args:
        word: Written form passed to ``check_word_lemma``.
        reltype: Relation type to keep, compared with the first item of each
            relation pair returned by ``check_synset``.

    Returns:
        List of ``(sense_id, target_synset_id, target_words)`` tuples, ordered
        by sense and then by relation.

    Raises:
        TypeError: If ``check_word_lemma`` or ``check_synset`` returns ``None``
            (unknown word or unknown synset).
    """
    senses = check_word_lemma(word)[3]
    related = []
    for sense in senses:
        sense_id, synset_id = sense[0], sense[1]
        relations = check_synset(synset_id)[3]
        for relation in relations:
            if relation[0] == reltype:
                target_synset = relation[1]
                related.append((sense_id, target_synset, words_in_synset(target_synset)))
    return related


def hypernyms_word(word):
    """Return ``(sense_id, synset_id, words)`` for every ``hypernym`` relation of ``word``'s synsets."""
    return _related_words(word, "hypernym")


def hyponyms_word(word):
    """Return ``(sense_id, synset_id, words)`` for every ``hyponym`` relation of ``word``'s synsets."""
    return _related_words(word, "hyponym")


def meronyms_word(word):
    """Return ``(sense_id, synset_id, words)`` for every ``mero_part`` relation of ``word``'s synsets."""
    return _related_words(word, "mero_part")


def antonyms_word(word):
    """Return ``(sense_id, synset_id, words)`` for every ``antonym`` relation of ``word``'s synsets."""
    return _related_words(word, "antonym")

In [56]:
# Example: (sense ID, hypernym synset ID, words of that synset) for each hypernym relation of "Notwehr".
hypernyms_word("Notwehr")

[('w5185_1043-n',
  'odenet-10660-n',
  ['Widerstand', 'Gegenwehr', 'Verteidigung', 'Defensive', 'Abwehr'])]

In [91]:
# Example: (sense ID, meronym synset ID, words of that synset) for each "mero_part" relation of "Morgenland".
meronyms_word("Morgenland")

[('w12715_2720-n',
  'odenet-14514-n',
  ['Saudi-Arabien', 'Königreich Saudi-Arabien', 'Saudisch-Arabien']),
 ('w12715_2720-n', 'odenet-14519-n', ['Syrien', 'Arabische Republik Syrien']),
 ('w12715_2720-n', 'odenet-14516-n', ['Türkei', 'Republik Türkei']),
 ('w12715_9897-n', 'odenet-9309-n', ['Ferner Osten', 'Fernost'])]

In [90]:
# Report for a single word: lemma and part of speech, the full synset record
# of every sense, then the related synsets per relation type.
myword = "Planung"
lemma_id, lemma_value, pos, senses = check_word_lemma(myword)
print("{} {} ".format(lemma_value, pos))
for sense in senses:
    synset_id = sense[1]
    print("SENSE: {}  {}".format(synset_id, check_synset(synset_id)))
for label, lookup in (
    ("HYPERNYMS", hypernyms_word),
    ("HYPONYMS", hyponyms_word),
    ("MERONYMS", meronyms_word),
    ("ANTONYMS", antonyms_word),
):
    print("{}: {}".format(label, lookup(myword)))


Planung n 
SENSE: odenet-1763-n  ('i41397', [], 'Ausarbeiten eines Plans', [('hypernym', 'odenet-7966-n')], ['Disposition', 'Planung'])
SENSE: odenet-9646-n  ('i70074', 'any of the various versions in the development of a written work', [], [('hypernym', 'odenet-20257-n'), ('hyponym', 'odenet-16223-n')], ['Planung', 'Plan', 'Schema', 'Konzeption', 'Entwurf', 'Vorlage', 'Skizze', 'Zeichnung', 'Layout', 'Grundriss'])
HYPERNYMS: [('w8644_1763-n', 'odenet-7966-n', ['Vorbereitung', 'Aufbereitung']), ('w8644_9646-n', 'odenet-20257-n', ['SMSen', '(jemandem eine) SMS schicken', 'antexten', 'simsen', 'SMS verschicken', 'ansimsen'])]
HYPONYMS: [('w8644_9646-n', 'odenet-16223-n', ['Explosionszeichnung', 'Explosivdarstellung', 'Explosionsgrafik'])]
MERONYMS: []
ANTONYMS: []
