Notes on downloading the [Kelly](https://spraakbanken.gu.se/en/resources/kelly) lexicon XML from Språkbanken and parsing its `LexicalEntry` elements.

In [1]:
# Address of the Kelly lexicon XML file that is downloaded below.
URL = "https://svn.spraakdata.gu.se/sb-arkiv/pub/lexikon/kelly/kelly.xml"

In [2]:
import requests
from lxml import etree

In [3]:
# Download the lexicon. The timeout keeps a stalled connection from hanging the
# kernel, and raise_for_status() stops here on an HTTP error instead of handing
# an error page to the XML parser in the next cell.
req = requests.get(URL, timeout=30)
req.raise_for_status()

In [7]:
# Parse the downloaded response body (bytes) as XML.
root = etree.fromstring(req.content)

In [14]:
# Inline sample: the XML text of a single <LexicalEntry> element, parsed below so
# the parsing code can be tried out without the full download.
# NOTE(review): presumably copied from kelly.xml -- not verifiable from this notebook.
EXAMPLE = """
    <LexicalEntry xmlns:karp="http://spraakbanken.gu.se/eng/research/infrastructure/karp/karp">
      <Lemma>
        <FormRepresentation>
          <feat att="writtenForm" val="vara" />
          <feat att="formInformation" val="(vardagl. va)" />
          <feat att="partOfSpeech" val="vb" />
          <feat att="kellyPartOfSpeech" val="verb" />
          <feat att="kellyID" val="88" />
          <feat att="rawFreq" val="2624032" />
          <feat att="wpm" val="23017,26" />
          <feat att="cefr" val="1" />
          <feat att="source" val="SweWaC" />
          <feat att="gram" val="att" />
          <feat att="example" val="e.g. var så god!" />
        </FormRepresentation>
      </Lemma>
      <Sense id="kelly--vara">
        <feat att="saldoSense" val="vara..3" />
        <feat att="saldoSense" val="vara..2" />
        <feat att="saldoSense" val="vara..1" />
        <feat att="saldoSense" val="vara..5" />
      </Sense>
      <karp:saldoLink ref="vara..3" />
      <karp:saldoLink ref="vara..2" />
      <karp:saldoLink ref="vara..1" />
      <karp:saldoLink ref="vara..5" />
    </LexicalEntry>
"""

In [15]:
# Parse the sample into an element so it can be walked and passed to LexicalEntry.
eg = etree.fromstring(EXAMPLE)

In [21]:
# Check which Python type etree.fromstring returned for the sample.
type(eg)

lxml.etree._Element

In [None]:
# Whitelist of `att` names that LexicalEntry copies from <feat> elements into
# its `formrep` dict; a <feat> whose `att` is not listed here is ignored.
# Kept in alphabetical order.
KELLY_FEATS = [
    "cefr", "example", "formInformation", "gram",
    "kellyID", "kellyPartOfSpeech", "language", "languageCoding",
    "partOfSpeech", "rawFreq", "source", "wpm",
    "writtenForm",
]

In [31]:
from lxml.etree import _Element

class LexicalEntry():
    """One ``LexicalEntry`` XML element unpacked into plain Python containers.

    Attributes:
        formrep: Maps ``att`` to ``val`` for every ``feat`` found under
            ``Lemma/FormRepresentation`` whose ``att`` is listed in
            ``KELLY_FEATS``. If an ``att`` occurs more than once, the last
            value wins.
        senses: Maps each ``Sense`` element's ``id`` to the list of its
            ``saldoSense`` values, in document order.
        links: The ``ref`` of every ``karp:saldoLink`` child, in document order.

    Raises:
        KeyError: If a ``feat`` lacks ``att`` (or a selected one lacks ``val``),
            a ``Sense`` lacks ``id``, or a ``saldoLink`` lacks ``ref``.
    """

    # Namespaced tags are compared in "{namespace-uri}localname" form.
    _SALDO_LINK_TAG = "{http://spraakbanken.gu.se/eng/research/infrastructure/karp/karp}saldoLink"

    def __init__(self, data: _Element):
        """Parse ``data``, a ``LexicalEntry`` element, into the attributes above."""
        self.formrep = {}
        self.senses = {}
        self.links = []
        self._read(data)

    def _read(self, data: _Element):
        """Dispatch each direct child of ``data`` to the matching reader."""
        for child in data:
            if child.tag == "Lemma":
                self._read_lemma(child)
            elif child.tag == "Sense":
                self._read_sense(child)
            elif child.tag == self._SALDO_LINK_TAG:
                self.links.append(child.attrib["ref"])

    def _read_lemma(self, lemma: _Element):
        """Copy the whitelisted ``feat`` pairs of one ``Lemma`` into ``formrep``."""
        # The path only matches <feat> elements that are direct children of a
        # <FormRepresentation> that is itself a direct child of the lemma.
        for feat in lemma.findall("FormRepresentation/feat"):
            att = feat.attrib["att"]
            if att in KELLY_FEATS:
                self.formrep[att] = feat.attrib["val"]

    def _read_sense(self, sense: _Element):
        """Collect the ``saldoSense`` values of one ``Sense`` under its ``id``."""
        # setdefault keeps values already gathered if the same id appears again.
        saldo_senses = self.senses.setdefault(sense.attrib["id"], [])
        for feat in sense.findall("feat"):
            if feat.attrib["att"] == "saldoSense":
                saldo_senses.append(feat.attrib["val"])


In [19]:
# Explore the sample entry's structure: list the tags directly inside Lemma and
# Sense, and the ref of every saldoLink.
for child in eg:
    if child.tag in ("Lemma", "Sense"):
        # Both containers are inspected the same way: one tag per nested element.
        for schild in child:
            print(schild.tag)
    elif child.tag == "{http://spraakbanken.gu.se/eng/research/infrastructure/karp/karp}saldoLink":
        print(child.attrib["ref"])

FormRepresentation
feat
feat
feat
feat
vara..3
vara..2
vara..1
vara..5


In [33]:
# Run the sample element through the LexicalEntry parser.
le = LexicalEntry(eg)

In [34]:
# Show the form-representation features collected from the sample entry.
le.formrep

{'writtenForm': 'vara',
 'formInformation': '(vardagl. va)',
 'partOfSpeech': 'vb',
 'kellyPartOfSpeech': 'verb',
 'kellyID': '88',
 'rawFreq': '2624032',
 'wpm': '23017,26',
 'cefr': '1',
 'source': 'SweWaC',
 'gram': 'att',
 'example': 'e.g. var så god!'}

In [18]:
# Look up the first saldoLink of the sample explicitly rather than relying on a
# loop variable left over from another cell, so the displayed value is the same
# on a fresh top-to-bottom run.
eg.find("{http://spraakbanken.gu.se/eng/research/infrastructure/karp/karp}saldoLink").attrib["ref"]

'vara..3'

In [None]:
# Echo the tag of every direct child of root that is a LexicalEntry.
for child in root:
    if child.tag != "LexicalEntry":
        continue
    print(child.tag)