# Add ILI to Ukrajinet 
This notebook is a step-by-step guide to adding ILIs to the Ukrainian Wordnet.

#### Import certain packages

In [1]:
from xml.etree import ElementTree as ET
from lxml import etree
import re
import csv
# from synset_ili_dict import ili_dict
# from synset_ili_dict_2 import ili_dict

# Module-level placeholder, presumably meant to hold the result of get_word_dict().
# NOTE(review): no function in the cells shown here declares it `global`,
# so at module level it stays None.
g_wordDict = None

#### Get a certain wordnet file and transform to a Dictionary

In [2]:
# Function for getting the local wordnet file
# In this case the file ../automatic-ukrajinet.xml

def get_wordnet_lexicon_local(wnfile):
    """Parse a local wordnet XML file and return its ``Lexicon`` element.

    Parameters
    ----------
    wnfile : str or path-like
        Path of the wordnet XML file. It is read as UTF-8; the "utf-8-sig"
        codec skips a leading byte-order mark if one is present.

    Returns
    -------
    xml.etree.ElementTree.Element or None
        The first ``Lexicon`` child of the document root, or ``None`` when
        the root has no such child.
    """
    # The context manager closes the file handle even if parsing fails.
    with open(wnfile, "r", encoding="utf-8-sig") as loc_wn:
        wntree = ET.parse(loc_wn)
    return wntree.getroot().find('Lexicon')

'''
LexEntries
===
Uses function get_wordnet_lexicon_local(wnfile)
to extract all the information in the file
and writes it to a dictionary wordDict = {}
The dict looks like this:
{
'ukrajinet-1-n': ['абера́ція', 'спотво́рення'],
'ukrajinet-2-n': ['збо́чення'],
 ...,
}
'''

def get_word_dict(wordnet):
    """Map each synset id of a wordnet file to the lemmas of its member words.

    Parameters
    ----------
    wordnet : str
        Path of the wordnet XML file, passed to get_wordnet_lexicon_local().

    Returns
    -------
    dict
        ``{synset_id: [writtenForm, ...]}``. Every ``Synset`` child of the
        lexicon gets an entry, possibly with an empty list.

    Raises
    ------
    KeyError
        If a ``Sense`` refers to a synset id that has no ``Synset`` element.
    """
    lexicon = get_wordnet_lexicon_local(wordnet)
    # Start with an empty lemma list for every declared synset.
    synset_lemmas = {node.attrib['id']: [] for node in lexicon.findall('./Synset')}
    # Then file each entry's lemma under every synset its senses point to.
    for entry in lexicon.findall('./LexicalEntry'):
        for sense in entry.findall('./Sense'):
            written_form = entry.find('Lemma').attrib['writtenForm']
            synset_lemmas[sense.attrib['synset']].append(written_form)
    return synset_lemmas

'''
The following function we do need later
'''

def words_in_synset(id, wordnet=r'automatic-ukrajinet.xml'):
    """Return the lemmas of all lexical entries that have a sense in a synset.

    Parameters
    ----------
    id : str
        Synset id to look up (compared with each ``Sense``'s ``synset``
        attribute).
    wordnet : str, optional
        Path of the wordnet XML file. The default is resolved against the
        current working directory.
        NOTE(review): combine_synsets_and_ili() reads
        '../automatic-ukrajinet.xml'; confirm which location is intended.

    Returns
    -------
    list of str
        The ``writtenForm`` of each matching entry, in document order.
        An entry appears once per matching sense.
    """
    # The whole file is parsed on every call; callers looping over many
    # synsets may prefer get_word_dict(), which parses it once.
    lexicon = get_wordnet_lexicon_local(wordnet)
    return [
        lexentry.find('Lemma').attrib['writtenForm']
        for lexentry in lexicon.iter('LexicalEntry')
        for sense in lexentry.iter('Sense')
        if sense.attrib['synset'] == id
    ]

#### Combine Synsets with ILI
1. Find ILIs in certain translation file
* The translation file is given
* The translation file contains an ILI (from the English wordnet), an English word with its definition, and the translated Ukrainian word followed by a translated definition, like so: 
    * ILI, EN, UA_WORD, UA_DEF
    * i70973, Linux: an open-source version of the UNIX operating system, Linux	, версія операційної системи UNIX з відкритим вихідним кодом
* (It's a TSV file, but for better readability the example above is shown in CSV format.)
2. Combine Synsets and ILI
* The Ukrainian words in the current Wordnet are matched with the Ukrainian words in the translation file
* If the words of the current Wordnet exist in the translation file, then we'll extract the ILI
* The information is saved in a dictionary in the following format:
    * {UA-Synset: [[ILI], [Ukrainian Definition from translation file], [English Word: Definition]], ...}
* The ```out.txt``` file saves items with more than one ILI

In [3]:
def find_ilis(uk_word, translation_path='../resources/translation_table.tsv'):
    """Collect all translation-table rows whose Ukrainian word equals ``uk_word``.

    Parameters
    ----------
    uk_word : str
        Ukrainian word; compared for exact equality with the ``UA_WORD`` column.
    translation_path : str, optional
        Path of the tab-separated translation table. Its header row must
        contain the columns ``ILI``, ``EN``, ``UA_WORD`` and ``UA_DEF``.

    Returns
    -------
    tuple of (list, list, list)
        Three parallel lists with the ``ILI``, ``UA_DEF`` and ``EN`` values of
        the matching rows, in file order. All three are empty if nothing matches.
    """
    ili = []
    definition = []
    english = []
    # newline='' is what the csv module asks for so that line breaks inside
    # quoted fields are handled by the reader itself.
    # NOTE(review): errors='ignore' silently drops undecodable bytes.
    with open(translation_path, 'r', encoding='utf-8', errors='ignore', newline='') as ili_translations:
        # The file is scanned in full on every call.
        for row in csv.DictReader(ili_translations, delimiter='\t'):
            if uk_word == row['UA_WORD']:
                ili.append(row["ILI"])
                definition.append(row["UA_DEF"])
                english.append(row["EN"])
    return ili, definition, english

def combine_synsets_and_ili(wordnet=r'../automatic-ukrajinet.xml', out_path="out.txt"):
    """Match the words of every synset against the translation table.

    For each synset, find_ilis() is called for each of its words, and the
    results of all words that have at least one match are collected.

    Parameters
    ----------
    wordnet : str, optional
        Path of the wordnet XML file passed to get_word_dict().
    out_path : str, optional
        Path of the report file (written as UTF-8 with BOM) that receives
        the synsets which were not added to the result.

    Returns
    -------
    dict
        ``{synset_id: [[ili], [ukrainian_definition], [english_entry]]}`` for
        the synsets whose first matching word has exactly one ILI.
    """
    word_dict = get_word_dict(wordnet)
    synset_ili_dict = {}
    # The context manager closes the report even if a lookup raises.
    with open(out_path, "w", encoding="utf-8-sig") as out:
        for key, wordlist in word_dict.items():
            # One [ilis, definitions, english] triple per word that has matches.
            ili_list = []
            for uk_word in wordlist:
                ili_w, uk_definition_w, english_w = find_ilis(uk_word)
                if len(ili_w) > 0:
                    ili_list.append([ili_w, uk_definition_w, english_w])
            if len(ili_list) > 0:
                # NOTE(review): only the FIRST matching word decides; matches
                # of the synset's other words are ignored here - confirm
                # that this is intended.
                if len(ili_list[0][0]) == 1:
                    synset_ili_dict[key] = ili_list[0]
                else:
                    # More than one candidate ILI: write to the report instead.
                    out.write(key + "\t" + str(wordlist) + "\t" + str(ili_list) + "\n")
    return synset_ili_dict

# Run the matching. As the last expression of the cell, the returned
# dictionary is displayed as the cell output; out.txt is written as a side effect.
combine_synsets_and_ili()

{'ukrajinet-68-n': [['i73496'],
  [" підрозділ п'єси, опери чи балету"],
  ['act: a subdivision of a play or opera or ballet']],
 'ukrajinet-184-n': [['i50441'],
  [' будівля, в якій здійснюється банківська діяльність'],
  ['bank: a building in which the business of banking is transacted']],
 'ukrajinet-268-n': [['i58289'],
  [' обертовий стрижень, який передає потужність або рух'],
  ['shaft: a revolving rod that transmits power or motion']],
 'ukrajinet-285-n': [['i58289'],
  [' обертовий стрижень, який передає потужність або рух'],
  ['shaft: a revolving rod that transmits power or motion']],
 'ukrajinet-398-n': [['i75774'],
  [' потужна кругова течія води (зазвичай результат суперечливих припливів і відпливів)'],
  ['maelstrom: a powerful circular current of water (usually the result of conflicting tides)']],
 'ukrajinet-408-n': [['i113539'],
  [' зменшення корисності в результаті тривалого використання'],
  ['wear: impairment resulting from long use']],
 'ukrajinet-494-n': [['i503

#### Anmerkungen
- Wie kommt es überhaupt zustande, dass es mehr als eine ILI für ein Wort gibt?
    - Durch die automatische Übersetzung? Sodass es manche Wörter gibt, die "gleich" übersetzt wurden

# Add unique ILIs to Ukrajinet
1. Transform the entries of the current Ukrainian Wordnet-XML file to a single line each.



In [6]:
# 1. Transform to OneLine entries

def format_wordnet_oneline(wordnet, out_path="ukrajinet_oneline.xml"):
    """Rewrite a pretty-printed wordnet XML file with one entry per line.

    Lines that open or continue a ``LexicalEntry`` or ``Synset`` lose their
    surrounding tabs and their line break, so they are joined with the next
    line. Lines that end an entry (``</LexicalEntry>``, ``</Synset>`` or a
    self-closing ``<Synset .../>``) lose only their tabs and keep the line
    break. All other lines are copied unchanged.

    Parameters
    ----------
    wordnet : str
        Path of the input XML file (read as UTF-8).
    out_path : str, optional
        Path of the output file (written as UTF-8).
    """
    # (pattern, drop_line_break) pairs, tested in this order; the first match
    # wins. The two self-closing <Synset .../> patterns must come before the
    # generic '<Synset' pattern, which also matches '<SynsetRelation' lines.
    rules = (
        (r'^\t*<LexicalEntry', True),
        (r'^\t*<Lemma', True),
        (r'^\t*<Pronunciation', True),
        (r'^\t*<Sense', True),
        (r'^\t*</Sense', True),
        (r'^\t*<SyntacticBehaviour', True),
        (r'^\t*</SyntacticBehaviour', True),
        (r'^\t*</LexicalEntry>', False),
        (r'^\t*<Synset.*partOfSpeech="[a-z]"/>', False),
        (r'^\t*<Synset.*dc:description=".*"/>', False),
        (r'^\t*<Synset', True),
        (r'^\t*<Definition', True),
        (r'^\t*</Definition', True),
        (r'^\t*<Example', True),
        (r'^\t*</Example', True),
        (r'^\t*</Synset>', False),
    )

    # Read everything before opening the output file.
    with open(wordnet, "r", encoding="utf-8") as ua_wn:
        lines = ua_wn.readlines()

    with open(out_path, "w", encoding="utf-8") as out_ukrajinet:
        for line in lines:
            for pattern, drop_line_break in rules:
                if re.match(pattern, line):
                    line = line.strip('\t')
                    if drop_line_break:
                        line = line.strip('\r').strip('\n')
                    break
            out_ukrajinet.write(line)

# Writes ukrajinet_oneline.xml into the current working directory.
format_wordnet_oneline("../automatic-ukrajinet.xml")

# WOHER KOMMT SYNSET_ILI_DICT_2

In [None]:
from synset_ili_dict_2 import ili_dict

# Find the ILIs that are not uniquely assigned (the same ILI occurs in several synsets)

def test_for_ambiguous_ili():
    """Return the ILIs that are assigned to more than one synset in ``ili_dict``.

    Returns
    -------
    list
        One item for every repeated occurrence, in iteration order of
        ``ili_dict``; an ILI shared by three synsets is listed twice.
    """
    # A set gives constant-time membership checks instead of scanning a list.
    seen_ilis = set()
    ambiguous_ilis = []
    for entry in ili_dict.values():
        ili = entry[0][0]
        if ili in seen_ilis:
            ambiguous_ilis.append(ili)
        else:
            seen_ilis.add(ili)
    return ambiguous_ilis

# Print all entries that have this ILI

def entries_with_ili(ili):
    """Print every ``ili_dict`` entry whose ILI equals ``ili``.

    Each match is printed as one tab-separated line: the synset id, the
    words returned by words_in_synset() for it, and the ``ili_dict`` entry.
    """
    for synset_id, entry in ili_dict.items():
        if not (ili == entry[0][0]):
            continue
        members = words_in_synset(synset_id)
        print("\t".join([synset_id, str(members), str(entry)]))

def give_ambiguous_ilis():
    """Print the entries of every ambiguous ILI, each group followed by a rule."""
    separator = "-----------------"
    for shared_ili in test_for_ambiguous_ili():
        entries_with_ili(shared_ili)
        print(separator)

# Enter the definitions and ILIs for the synsets with an unambiguous ILI

def add_to_nonambiguous_synsets(wordnetfile="ukrajinet_oneline.xml"):
    """Write ILI and definition into every synset whose ILI is unambiguous.

    For each ``ili_dict`` entry whose ILI is not reported by
    test_for_ambiguous_ili(), the synset's ``ili`` attribute is set with
    change_attribute_in_ss() and the definition is added with
    add_definition_to_ss().

    Parameters
    ----------
    wordnetfile : str, optional
        Path of the one-entry-per-line wordnet file to modify in place.
        The default is the file name written by format_wordnet_oneline(),
        resolved against the current working directory; it replaces a
        machine-specific absolute path that was hard-coded before.
    """
    ambiguous_ilis = set(test_for_ambiguous_ili())
    for key, entry in ili_dict.items():
        ili = entry[0][0]
        definition = entry[1][0]
        if ili not in ambiguous_ilis:
            # Each helper call rewrites the whole file.
            change_attribute_in_ss(key, 'ili', ili, wordnetfile)
            add_definition_to_ss(key, definition, wordnetfile)


# Save the version without line breaks as a pretty print
# prettyprint_wordnet("ukrajinet_oneline.xml")


def prettyprint_wordnet(wordnet, pretty_path=r'../automatic-ukrajinet.xml'):
    """Re-insert line breaks and tabs into a one-entry-per-line wordnet file.

    A line break (plus a tab for child elements) is inserted in front of
    each of the tags listed below; everything else is copied as is.

    Parameters
    ----------
    wordnet : str
        Path of the one-entry-per-line input file (read as UTF-8).
    pretty_path : str, optional
        Path of the output file (written as UTF-8). The default overwrites
        '../automatic-ukrajinet.xml'.
    """
    # (tag prefix, replacement) pairs, applied to every line in this order.
    break_before = (
        ("<Lemma", "\n\t<Lemma"),
        ("<Sense", "\n\t<Sense"),
        ("</Sense", "\n\t</Sense"),
        ("</LexicalEntry>", "\n</LexicalEntry>"),
        ("<SynsetRelation", "\n\t<SynsetRelation"),
        ("<Definition>", "\n\t<Definition>"),
        ("<Example>", "\n\t<Example>"),
        ("</Synset>", "\n</Synset>"),
        ("<SyntacticBehaviour", "\n\t<SyntacticBehaviour"),
    )

    # Read everything before opening the output file.
    with open(wordnet, "r", encoding="utf-8") as oneline_wordnet:
        lines = oneline_wordnet.readlines()

    with open(pretty_path, "w", encoding="utf-8") as pretty_wordnet:
        for line in lines:
            for tag, replacement in break_before:
                line = line.replace(tag, replacement)
            pretty_wordnet.write(line)

#### Change entries manually 
This is for changes that could be done manually 

##### Change attributes in Synsets (like ILI)
* Example usage:
    * ```change_attribute_in_ss('ukrajinet-4-n','ili','i97809',r"docs/ukrajinet_oneline.xml")```

#### Add definitions to a Synset
* Example usage:
    * ```add_definition_to_ss('ukrajinet-4-n','зношування частинок гірських порід через тертя під дією води, вітру або льоду',r"docs/ukrajinet_oneline.xml")```


In [None]:
# Change attributes in Synsets

def change_attribute_in_ss(synset, att, value, wordnetfile):
    """Set an attribute value on the line that holds a given synset.

    The file is rewritten in place. On every line that contains
    ``<Synset id="<synset>"``, each occurrence of ``att="..."`` whose current
    value is empty or purely alphanumeric is replaced by ``att="<value>"``.
    Each changed line is also printed.

    Parameters
    ----------
    synset : str
        Id of the synset to change.
    att : str
        Attribute name; it is matched literally.
    value : str
        New attribute value; it is inserted literally.
    wordnetfile : str
        Path of the wordnet file (UTF-8), expected to hold one synset per line.

    Notes
    -----
    An attribute that is not present on the line is not added, and a current
    value with other characters (for example ``-`` or spaces) is not replaced.
    """
    with open(wordnetfile, "r", encoding="utf-8") as in_wordnet:
        lines = in_wordnet.readlines()

    ss_string = '<Synset id="' + synset + '"'
    # Escape the name so regex metacharacters in it cannot alter the pattern.
    att_pattern = re.compile(re.escape(att) + '="[a-zA-Z0-9]*"')
    new_attribute = att + '="' + value + '"'

    with open(wordnetfile, "w", encoding="utf-8") as out_wordnet:
        for line in lines:
            if ss_string in line:
                # A callable replacement is taken literally, so backslashes
                # in the value are not read as group references.
                line = att_pattern.sub(lambda _match: new_attribute, line)
                print(line)
            out_wordnet.write(line)

In [None]:
# Add definition to a Synset

def add_definition_to_ss(synset, definition, wordnetfile):
    """Insert a ``<Definition>`` element into the line that holds a synset.

    The file is rewritten in place. Only lines that contain
    ``<Synset id="<synset>"`` and no ``<Definition>`` yet are changed; each
    changed line is also printed. The definition is placed before the first
    ``<Example>``, else before the first ``<SynsetRelation``, else before
    ``</Synset>``; otherwise the ``/>`` of the line is expanded to
    ``>...</Synset>``.

    Parameters
    ----------
    synset : str
        Id of the synset to change.
    definition : str
        Plain definition text; ``&``, ``<`` and ``>`` are XML-escaped.
    wordnetfile : str
        Path of the wordnet file (UTF-8), expected to hold one synset per line.
    """
    from xml.sax.saxutils import escape

    with open(wordnetfile, "r", encoding="utf-8") as ua_wn:
        lines = ua_wn.readlines()

    ss_string = '<Synset id="' + synset + '"'
    # Escape the text so that characters such as '&' keep the file well-formed.
    definition_string = "<Definition>" + escape(definition) + "</Definition>"

    with open(wordnetfile, "w", encoding="utf-8") as out_wordnet:
        for line in lines:
            if ss_string in line and "<Definition>" not in line:
                if '<Example>' in line:
                    # Only before the first example; a synset with several
                    # examples must not receive several definitions.
                    line = line.replace('<Example>', definition_string + '<Example>', 1)
                elif '<SynsetRelation' in line:
                    line = line.replace('<SynsetRelation', definition_string + '<SynsetRelation', 1)
                elif '</Synset>' in line:
                    line = line.replace('</Synset>', definition_string + '</Synset>')
                else:
                    # No children and no closing tag: expand the '/>' ending.
                    line = line.replace('/>', '>' + definition_string + '</Synset>')
                print(line)
            out_wordnet.write(line)