[Kelly](https://spraakbanken.gu.se/en/resources/kelly)

In [1]:
# Location of the Kelly lexicon XML file that is downloaded below.
URL = "https://svn.spraakdata.gu.se/sb-arkiv/pub/lexikon/kelly/kelly.xml"

In [2]:
import requests
from lxml import etree

In [3]:
# Download the lexicon. A timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() stops an HTTP error page from
# being handed to the XML parser further down.
req = requests.get(URL, timeout=60)
req.raise_for_status()

In [7]:
# Parse the raw response body into an lxml element; `root` is the document's
# top-level element.
root = etree.fromstring(req.content)

In [14]:
# A single <LexicalEntry> element used as a small, offline sample for
# developing the parsing code. Note the comma decimal separator in "wpm" and
# the namespaced karp:saldoLink children.
EXAMPLE = """
    <LexicalEntry xmlns:karp="http://spraakbanken.gu.se/eng/research/infrastructure/karp/karp">
      <Lemma>
        <FormRepresentation>
          <feat att="writtenForm" val="vara" />
          <feat att="formInformation" val="(vardagl. va)" />
          <feat att="partOfSpeech" val="vb" />
          <feat att="kellyPartOfSpeech" val="verb" />
          <feat att="kellyID" val="88" />
          <feat att="rawFreq" val="2624032" />
          <feat att="wpm" val="23017,26" />
          <feat att="cefr" val="1" />
          <feat att="source" val="SweWaC" />
          <feat att="gram" val="att" />
          <feat att="example" val="e.g. var så god!" />
        </FormRepresentation>
      </Lemma>
      <Sense id="kelly--vara">
        <feat att="saldoSense" val="vara..3" />
        <feat att="saldoSense" val="vara..2" />
        <feat att="saldoSense" val="vara..1" />
        <feat att="saldoSense" val="vara..5" />
      </Sense>
      <karp:saldoLink ref="vara..3" />
      <karp:saldoLink ref="vara..2" />
      <karp:saldoLink ref="vara..1" />
      <karp:saldoLink ref="vara..5" />
    </LexicalEntry>
"""

In [15]:
# Parse the sample entry into an element so it can be inspected without the
# full download.
eg = etree.fromstring(EXAMPLE)

In [21]:
type(eg)

lxml.etree._Element

In [None]:
# Names of the <feat att="..."> attributes that are kept when reading a
# FormRepresentation; any other feature name is ignored.
KELLY_FEATS = [
    "cefr",
    "example",
    "formInformation",
    "gram",
    "kellyID",
    "kellyPartOfSpeech",
    "language",
    "languageCoding",
    "partOfSpeech",
    "rawFreq",
    "source",
    "wpm",
    "writtenForm",
]

In [50]:
from lxml.etree import _Element

class LexicalEntry:
    """One Kelly ``<LexicalEntry>`` element unpacked into plain containers.

    Attributes:
        formrep: Maps feature name (``att``) to its raw string ``val`` for
            every ``<feat>`` under ``Lemma/FormRepresentation`` whose name
            is listed in ``KELLY_FEATS``.
        senses: Maps each ``<Sense>`` ``id`` to the list of its
            ``saldoSense`` values, in document order.
        links: ``ref`` values of the ``karp:saldoLink`` children, in
            document order.
    """

    # Fully qualified ("{namespace}local") tag of <karp:saldoLink>, which is
    # how namespaced tags appear in ``element.tag``.
    _SALDO_LINK_TAG = (
        "{http://spraakbanken.gu.se/eng/research/infrastructure/karp/karp}"
        "saldoLink"
    )

    # Numeric "cefr" feature value -> CEFR level label.
    _CEFR_LEVELS = {
        "1": "A1",
        "2": "A2",
        "3": "B1",
        "4": "B2",
        "5": "C1",
        "6": "C2",
    }

    # Two-letter "partOfSpeech" tag -> readable name, used only when the
    # entry has no "kellyPartOfSpeech" feature.
    _POS_NAMES = {
        "ab": "adverb",
        "av": "adjective",
        "in": "interj",
        "kn": "conj",
        "nl": "numeral",
        "nn": "noun",
        "pm": "proper name",
        "pn": "pronoun",
        "pp": "prep",
        "sn": "subj",
        "vb": "verb",
    }

    def __init__(self, data: "_Element"):
        """Read *data* (a ``<LexicalEntry>`` element) into this object."""
        self.formrep = {}
        self.senses = {}
        self.links = []
        self._read(data)

    def _read(self, data: "_Element"):
        """Populate ``formrep``, ``senses`` and ``links`` from *data*.

        Only direct children of *data* are inspected; unknown children are
        ignored.

        Raises:
            KeyError: If an inspected element lacks an expected attribute
                (``att``/``val`` on a ``<feat>``, ``id`` on a ``<Sense>``,
                ``ref`` on a saldoLink).
        """
        for child in data:
            if child.tag == "Lemma":
                for feat in child.findall("FormRepresentation/feat"):
                    name = feat.attrib["att"]
                    if name in KELLY_FEATS:
                        self.formrep[name] = feat.attrib["val"]
            elif child.tag == "Sense":
                # Several <Sense> elements may share an id; accumulate.
                saldo_senses = self.senses.setdefault(child.attrib["id"], [])
                for feat in child.findall("feat"):
                    if feat.attrib["att"] == "saldoSense":
                        saldo_senses.append(feat.attrib["val"])
            elif child.tag == self._SALDO_LINK_TAG:
                self.links.append(child.attrib["ref"])

    def get_cefr(self):
        """Return the CEFR label (``"A1"``..``"C2"``) for this entry.

        Returns ``None`` when the entry has no ``cefr`` feature or its value
        is not one of ``"1"``..``"6"``.
        """
        return self._CEFR_LEVELS.get(self.formrep.get("cefr"))

    def get_word(self):
        """Return the ``writtenForm`` feature, or ``None`` if absent."""
        return self.formrep.get("writtenForm")

    def get_example(self):
        """Return the ``example`` feature, or ``None`` if absent."""
        return self.formrep.get("example")

    def get_pos(self):
        """Return a readable part-of-speech name for this entry.

        The ``kellyPartOfSpeech`` feature is returned unchanged when present.
        Otherwise the ``partOfSpeech`` tag is translated; a noun whose
        ``gram`` feature is ``"en"`` or ``"ett"`` becomes ``"noun-en"`` /
        ``"noun-ett"``. Returns ``None`` when neither feature is present or
        the tag is not recognised.
        """
        if "kellyPartOfSpeech" in self.formrep:
            return self.formrep["kellyPartOfSpeech"]
        if "partOfSpeech" not in self.formrep:
            return None

        tag = self.formrep["partOfSpeech"]
        if tag == "nn":
            gram = self.formrep.get("gram")
            if gram in ("en", "ett"):
                return f"noun-{gram}"
            return "noun"
        return self._POS_NAMES.get(tag)

    def get_wpm(self):
        """Return the ``wpm`` feature as a float, or ``None`` if absent.

        The source data writes the number with a decimal comma (for example
        ``"23017,26"``), which is converted before parsing.

        Raises:
            ValueError: If the value is not a number after the conversion.
        """
        if "wpm" not in self.formrep:
            return None
        return float(self.formrep["wpm"].replace(",", "."))

In [51]:
# Build a LexicalEntry from the parsed sample element to try out the getters.
le = LexicalEntry(eg)

In [45]:
le.get_cefr()

'A1'

In [46]:
le.get_word()

'vara'

In [49]:
le.get_pos()

'verb'

In [52]:
le.get_wpm()

23017.26

In [38]:
le.formrep

{'writtenForm': 'vara',
 'formInformation': '(vardagl. va)',
 'partOfSpeech': 'vb',
 'kellyPartOfSpeech': 'verb',
 'kellyID': '88',
 'rawFreq': '2624032',
 'wpm': '23017,26',
 'cefr': '1',
 'source': 'SweWaC',
 'gram': 'att',
 'example': 'e.g. var så god!'}