Zunächst müssen einige Module importiert werden, die wir nutzen wollen.

In [90]:
from xml.etree import ElementTree as ET
import random
from random import *


Jetzt machen wir die OdeNet-XML-Datei auf, parsen das XML und öffnen eine Datei, in die die Ausgabe geschrieben wird.

In [91]:
# Load the OdeNet XML file (machine-specific absolute path). Parsing happens
# inside a ``with`` block so the input handle is closed as soon as the tree is
# in memory instead of staying open for the rest of the session.
with open(r"C:\Users\melaniesiegel\Documents\05_Projekte\WordNet\OdeNet\odenet.git\trunk\deWordNet.xml","r",encoding="utf-8") as de_wn:
    tree = ET.parse(de_wn)

# Output file. It is opened only after the parse succeeded, so a broken input
# file no longer truncates an existing out_lex.txt.
# NOTE(review): nothing visible here writes to or closes this handle; make
# sure the code that uses it also closes it.
out_lex = open("out_lex.txt","w",encoding="utf-8")

root = tree.getroot()

# The lookup functions in this file all search this <Lexicon> element.
# find() returns None when the root has no direct <Lexicon> child.
lexicon = root.find('Lexicon')


Mit check_word_lemma greift man auf Lexikon-Einträge zu: Für ein Wort bekommt man die Lexikon-ID, den Lemma-Wert, die Wortart (POS) sowie die Sense-IDs und die IDs der Synsets, in denen das Wort enthalten ist.

In [72]:
def check_word_lemma(word_to_check):
    """Find the first lexical entry whose written form equals a word.

    Walks the ``LexicalEntry`` elements of the module-level ``lexicon`` in
    document order and stops at the first one whose ``Lemma`` element has
    ``writtenForm == word_to_check`` (exact string comparison).

    Args:
        word_to_check: Written form to search for.

    Returns:
        A tuple ``(entry_id, written_form, pos, senses)``: the ``id``
        attribute of the entry, the matched written form, the lemma's
        ``partOfSpeech`` attribute, and a list of ``[sense_id, synset_id]``
        pairs, one per ``Sense`` element of the entry. When no entry
        matches, the function falls off the end and returns ``None``.
    """
    for entry in lexicon.iter('LexicalEntry'):
        lemma_node = entry.find('Lemma')
        written_form = lemma_node.attrib['writtenForm']
        entry_id = entry.attrib['id']
        if written_form != word_to_check:
            continue
        part_of_speech = lemma_node.attrib['partOfSpeech']
        sense_pairs = [
            [sense_node.attrib['id'], sense_node.attrib['synset']]
            for sense_node in entry.iter('Sense')
        ]
        return (entry_id, written_form, part_of_speech, sense_pairs)

In [73]:
# Example: entry id, lemma, POS and [sense id, synset id] pairs for one word.
check_word_lemma("Weihnachten")

('w44207', 'Weihnachten', 'n', [['w44207_11098-n', 'odenet-11098-n']])

Hier bekommt man die Lexikon-IDs für eine Liste von Wörtern.

In [74]:
def words2ids(wordlist):
    """Map words to the ids of their lexical entries.

    Args:
        wordlist: Iterable of written forms; each one is looked up with
            ``check_word_lemma``.

    Returns:
        A list with the entry id of every word that was found, in input
        order. Words that are not found are reported on stdout as
        ``"<word> NOT IN ODENET"`` and left out of the result.

    Errors raised inside ``check_word_lemma`` itself are no longer swallowed;
    they propagate to the caller.
    """
    word_id_list = []
    for word in wordlist:
        entry = check_word_lemma(word)
        # check_word_lemma returns None for unknown words. Testing for that
        # explicitly, instead of wrapping the tuple unpacking in a bare
        # ``except:``, keeps KeyboardInterrupt and genuine errors from being
        # misreported as "NOT IN ODENET".
        if entry is None:
            print(word + " NOT IN ODENET")
            continue
        word_id_list.append(entry[0])
    return word_id_list

In [75]:
# Example: resolve four words to their lexical-entry ids.
words2ids(['Frühling','Sommer','Herbst','Winter'])

['w14145', 'w44811', 'w1202374', 'w25612']

Mit check_word_id bekommt man für eine Lexikon-ID Lemma, POS, Synsets und Relationen.

In [76]:
def check_word_id(id):
    """Return lemma, POS, synset ids and sense relations for an entry id.

    Searches the ``LexicalEntry`` elements of the module-level ``lexicon``
    and returns as soon as the first entry with a matching ``id`` attribute
    has been processed.

    Args:
        id: Value of the ``id`` attribute of the entry to look up. The name
            shadows the builtin but is kept so keyword callers keep working.

    Returns:
        A tuple ``(lemma_value, pos, senses, relations)``: the lemma's
        ``writtenForm`` and ``partOfSpeech`` attributes, the list of synset
        ids of the entry's ``Sense`` elements, and a list of
        ``(relType, target)`` pairs taken from the ``SenseRelation``
        elements of all senses. Returns ``None`` when no entry has the
        given id.
    """
    for lexentry in lexicon.iter('LexicalEntry'):
        if lexentry.attrib['id'] != id:
            continue
        lemma = lexentry.find('Lemma')
        lemma_value = lemma.attrib['writtenForm']
        pos = lemma.attrib['partOfSpeech']
        senses = []
        # Created once per entry. It used to be re-created inside the sense
        # loop, which discarded the relations of every sense but the last and
        # left the name unbound for entries without any sense.
        relations = []
        for sense in lexentry.iter('Sense'):
            senses.append(sense.attrib['synset'])
            # iter() simply yields nothing when the sense has no relations,
            # so no separate existence check is needed.
            for relation in sense.iter('SenseRelation'):
                relations.append(
                    (relation.attrib['relType'], relation.attrib['target'])
                )
        # Returning here stops the scan at the match instead of walking the
        # rest of the lexicon.
        return (lemma_value, pos, senses, relations)
    # Unknown id: report "not found" the same way check_word_lemma does
    # rather than failing with UnboundLocalError at the return statement.
    return None

In [77]:
# Example: look up the lexical entry with id 'w14145'.
check_word_id('w14145')

('Frühling', 'n', ['odenet-3067-n'], [])

Mit words_in_synset bekommt man die Wörter, die in einem Synset sind.

In [78]:
def words_in_synset(id):
    """List the written forms of all entries that have a sense in a synset.

    Args:
        id: Synset id to match against the ``synset`` attribute of every
            ``Sense`` element in the module-level ``lexicon``.

    Returns:
        A list of ``writtenForm`` values in document order, one per matching
        sense (so an entry with two senses in the same synset appears twice).
        Empty when nothing matches.
    """
    return [
        entry.find('Lemma').attrib['writtenForm']
        for entry in lexicon.iter('LexicalEntry')
        for sense_node in entry.iter('Sense')
        if sense_node.attrib['synset'] == id
    ]

In [79]:
# Example: written forms of all entries with a sense in this synset.
words_in_synset('odenet-2754-n')

['Monitoring', 'Beaufsichtigung', 'Überwachung', 'Aufsicht']

Mit check_synset bekommt man alle Informationen zu einem Synset.

In [80]:
def check_synset(id):
    """Gather the information stored for one synset.

    The member words are collected first via ``words_in_synset``; then the
    ``Synset`` elements of the module-level ``lexicon`` are scanned and the
    first one whose ``id`` attribute equals *id* is reported.

    Args:
        id: Synset id to look up.

    Returns:
        A tuple ``(ili, en_definition, de_definition, relations, words)``:
        the ``ili`` attribute; the Dublin Core ``description`` attribute, or
        ``[]`` if absent; the stripped text of the ``Definition`` child, or
        ``[]`` if there is none; a list of ``(relType, target)`` pairs from
        the ``SynsetRelation`` elements; and the member words. Returns
        ``None`` implicitly when no synset has the given id.
    """
    member_words = words_in_synset(id)
    for node in lexicon.iter('Synset'):
        if id != node.attrib['id']:
            continue
        ili_id = node.attrib['ili']
        # attrib is a plain dict, so get() with a default matches the
        # KeyError fallback to [].
        english_gloss = node.attrib.get(
            "{http://purl.org/dc/elements/1.1/}description", []
        )
        definition_node = node.find('Definition')
        if definition_node is None:
            german_gloss = []
        else:
            german_gloss = definition_node.text.strip()
        links = [
            (rel.attrib['relType'], rel.attrib['target'])
            for rel in node.iter('SynsetRelation')
        ]
        return (ili_id, english_gloss, german_gloss, links, member_words)

In [95]:
# Example: ILI, definitions, relations and member words of one synset.
check_synset("odenet-25555-a")

('i26388',
 'to say, state, or perform again',
 [],
 [('hyponym', 'odenet-5119-v'),
  ('hyponym', 'odenet-423-a'),
  ('hyponym', 'odenet-34404-v'),
  ('hyponym', 'odenet-10538-v'),
  ('hyponym', 'odenet-11423-v'),
  ('hyponym', 'odenet-15312-n'),
  ('hyponym', 'odenet-17105-v'),
  ('hyponym', 'odenet-263-v')],
 ['wiederholt', 'mehrfach', 'mehrmalig', 'x-malig', '...malig'])

In [92]:
def _related_words(word, rel_type):
    """Collect the synsets linked to any sense of *word* by *rel_type*.

    Shared implementation of the ``*_word`` relation lookups, which differ
    only in the relation type they filter on.

    Args:
        word: Written form, resolved with ``check_word_lemma``.
        rel_type: Relation type string compared against the first element of
            each ``(relType, target)`` pair returned by ``check_synset``.

    Returns:
        A list of ``(sense_id, target_synset_id, target_words)`` tuples in
        the order the senses and relations are stored; ``target_words`` comes
        from ``words_in_synset``. Empty when the word is unknown or has no
        matching relation.
    """
    entry = check_word_lemma(word)
    if entry is None:
        # Unknown word: nothing to report (previously a TypeError from
        # unpacking None).
        return []
    related = []
    for sense_id, synset_id in entry[3]:
        synset_info = check_synset(synset_id)
        if synset_info is None:
            # The sense points at a synset id that check_synset cannot find;
            # skip it instead of crashing on the unpacking.
            continue
        for reltype, target in synset_info[3]:
            if reltype == rel_type:
                related.append((sense_id, target, words_in_synset(target)))
    return related


def hypernyms_word(word):
    """Return ``(sense_id, synset_id, words)`` for every "hypernym" relation
    of the synsets *word* belongs to."""
    return _related_words(word, "hypernym")


def hyponyms_word(word):
    """Return ``(sense_id, synset_id, words)`` for every "hyponym" relation
    of the synsets *word* belongs to."""
    return _related_words(word, "hyponym")


def meronyms_word(word):
    """Return ``(sense_id, synset_id, words)`` for every "mero_part" relation
    of the synsets *word* belongs to (no other relType is matched)."""
    return _related_words(word, "mero_part")


def holonyms_word(word):
    """Return ``(sense_id, synset_id, words)`` for every "holo_part" relation
    of the synsets *word* belongs to (no other relType is matched)."""
    return _related_words(word, "holo_part")


def antonyms_word(word):
    """Return ``(sense_id, synset_id, words)`` for every "antonym" relation
    of the synsets *word* belongs to.

    NOTE(review): only the synset-level relations returned by
    ``check_synset`` are inspected. If the data stores antonyms as
    ``SenseRelation`` elements, this will always be empty; confirm against
    the XML.
    """
    return _related_words(word, "antonym")

In [93]:
# Example: "hypernym" targets (with their words) for each sense of one word.
hypernyms_word("überlassen")

[('w9034_13770-v',
  'odenet-10390-n',
  ['Liegenschaft', 'Grundbesitz', 'Landbesitz']),
 ('w9034_15292-v', 'odenet-21765-n', ['Schulden', 'Fremdkapital'])]

In [84]:
# Example: "mero_part" targets (with their words) for each sense of one word.
meronyms_word("Morgenland")

[('w12715_2720-n',
  'odenet-14514-n',
  ['Saudi-Arabien', 'Königreich Saudi-Arabien', 'Saudisch-Arabien']),
 ('w12715_2720-n', 'odenet-14519-n', ['Syrien', 'Arabische Republik Syrien']),
 ('w12715_2720-n', 'odenet-14516-n', ['Türkei', 'Republik Türkei']),
 ('w12715_9897-n', 'odenet-9309-n', ['Ferner Osten', 'Fernost'])]

In [94]:
# Demo: print the lexicon data for one word, i.e. its lemma and POS, each of
# its synsets, and then the five relation lookups.
myword = "übertragen"
lemma_id, lemma_value, pos, senses = check_word_lemma(myword)
print("{} {} ".format(lemma_value, pos))
for sense in senses:
    # sense is a [sense_id, synset_id] pair. The explicit "\n" plus print's
    # own newline leaves a blank line between senses.
    print("SENSE: {}  {}\n".format(sense[1], check_synset(sense[1])))
for label, lookup in (
    ("HYPERNYMS", hypernyms_word),
    ("HYPONYMS", hyponyms_word),
    ("MERONYMS", meronyms_word),
    ("HOLONYMS", holonyms_word),
    ("ANTONYMS", antonyms_word),
):
    print(label + ": " + str(lookup(myword)))


übertragen v 
SENSE: odenet-437-v  ('i32904', 'move from one place to another', [], [('hyponym', 'odenet-2011-v')], ['veräußern', 'übertragen', 'transferieren'])

SENSE: odenet-1012-v  ('i30942', 'travel on water propelled by wind or by other means', [], [], ['übertragen', 'navigieren', 'routen', 'übermitteln', 'schicken', 'senden', 'leiten'])

SENSE: odenet-9659-v  ('i32904', 'move from one place to another', [], [('hyponym', 'odenet-2011-v'), ('hyponym', 'odenet-10882-v'), ('hyponym', 'odenet-13081-n'), ('hyponym', 'odenet-2065-v'), ('hyponym', 'odenet-26295-n'), ('hyponym', 'odenet-33907-v')], ['übertragen', 'überlassen', 'überantworten', 'abtreten', 'vererben', 'vermachen', 'verleihen', 'überschreiben', 'anheimstellen'])

SENSE: odenet-18701-v  ('i22219', 'communicate a disease to', [], [], ['übertragen', 'infizieren', 'anstecken'])

SENSE: odenet-20722-v  ('i32904', 'move from one place to another', [], [('hyponym', 'odenet-2011-v')], ['übertragen', 'abgegeben', 'übermittelt'])

S