# General preparations

In [1]:
from __future__ import annotations
import codecs
from collections import namedtuple
from enum import Enum
from collections import defaultdict
import re

WordRecord = namedtuple("WordRecord", ["defaultform", "partofspeech", "data"])
NounDataRecord = namedtuple("NounDataRecord", ["anim", "gender", "cases"])
VerbDataRecord = namedtuple("VerbDataRecord", ["perf", "quantity"])

PhraseWordRecord = namedtuple("PhraseWordRecord", ["word", "traits"])

class Animate(Enum):
    Animate = 0
    Inanimate = 1
    def convert(string: str) -> Animate:
        animSwitcher = {
            'anim': Animate.Animate,
            'nanim': Animate.Inanimate
        }
        return animSwitcher.get(string)
    
class Gender(Enum):
    Male = 0
    Female = 1
    Neuter = 2
    def convert(string: str) -> Gender:
        genSwitcher = {
            'M': Gender.Male,
            'F': Gender.Female,
            'N': Gender.Neuter
        }
        return genSwitcher.get(string)
    
class Tense(Enum):
    Present = 0
    Past = 1
    Future = 2
    def convert(string: str) -> Gender:
        tenseSwitcher = {
            'present': Tense.Present,
            'past': Tense.Past,
            'future': Tense.Future
        }
        return tenseSwitcher.get(string)
    
class Perfectivity(Enum):
    Perfect = 0
    Imperfect = 1
    def convert(string: str) -> Perfectivity:
        perfSwitcher = {
            'perf': Perfectivity.Perfect,
            'imperf': Perfectivity.Imperfect
        }
        return perfSwitcher.get(string)
    
class PartOfSpeech(Enum):
    Noun = 0
    Pronoun = 1
    Adjective = 2
    Verb = 3
    def convert(string: str) -> PartOfSpeech:
        posSwitcher = {
            'N': PartOfSpeech.Noun,
            'Pr': PartOfSpeech.Pronoun,
            'Adj': PartOfSpeech.Adjective,
            'V': PartOfSpeech.Verb
        }
        return posSwitcher.get(string)
    
class Phrase(Enum):
    SNP = 0
    VP = 1
    ONP = 2
    NP = 3
    S = 4
    def convert(string: str) -> Phrase:
        phraseSwitcher = {
            'SNP': Phrase.SNP,
            'VP': Phrase.VP,
            'ONP': Phrase.ONP,
            'NP': Phrase.NP,
            'S': Phrase.S
        }
        return phraseSwitcher.get(string)
    
class Quantity(Enum):
    Singular = 0
    Plural = 1
    def convert(string: str) -> Quantity:
        qttSwitcher = {
            'sing': Quantity.Singular,
            'pl': Quantity.Plural
        }
        return qttSwitcher.get(string)

class Case(Enum):
    Nominative = 0
    Genitive = 1
    Dative = 2
    Accusative = 3
    Instrumental = 4
    Prepositional = 5
    def convert(string: str) -> Case:
        caseSwitcher = {
            'nom': Case.Nominative,
            'gen': Case.Genitive,
            'dat': Case.Dative,
            'acc': Case.Accusative,
            'inst': Case.Instrumental,
            'prep': Case.Prepositional
        }
        return caseSwitcher.get(string)

class PhraseHead(Enum):
    IsHead = 0
    def convert(string: str) -> Case:
        phSwitcher = {
            'head': PhraseHead.IsHead
        }
        return phSwitcher.get(string)
    
class Person(Enum):
    P1 = 0
    P2 = 1
    P3 = 2
    def convert(string: str) -> Person:
        pSwitcher = {
            '1P': Person.P1,
            '2P': Person.P2,
            '3P': Person.P3
        }
        return pSwitcher.get(string)
    
class Transitivity(Enum):
    Transitive = 0
    Intransitive = 1
    def convert(string: str) -> Transitivity:
        trSwitcher = {
            'tr': Transitivity.Transitive,
            'intr': Transitivity.Intransitive
        }
        return trSwitcher.get(string)
    
cases = [Case.Nominative, Case.Genitive, Case.Dative, Case.Accusative, Case.Instrumental,
         Case.Prepositional]
persons = [Person.P1, Person.P2, Person.P3]
genders = [Gender.Male, Gender.Female, Gender.Neuter]
wordTraits = [Animate, Gender, Tense, Perfectivity, Quantity, Case, PhraseHead, Person, Transitivity]

# imports contents of dictionary and splits it into lines
def loadFromFile(path):
    with codecs.open(path, 'r', 'utf-8') as dct:
        recs = dct.read().splitlines()
        return recs

# Loading dictionary

## Parsing

### Custom parsers

#### Nouns

In [10]:
# returns info about animacy and gender
def animGenParser(data):
    temp = data[1:-1].split('|')
    return (Animate.convert(temp[0]), Gender.convert(temp[1]))

# simple case parser
def parseCases(caseString):
    splitStr = caseString[1:-1].split('|')
    if len(splitStr) < 6:
        return None
    return dict(zip(cases, splitStr))

# noun parser
def parseNounRec(splitRec):
    anim, gen = animGenParser(splitRec[4])
    return WordRecord(splitRec[0], PartOfSpeech.Noun, NounDataRecord(anim, gen,
                            dict([(Quantity.Singular, parseCases(splitRec[2])),
                              (Quantity.Plural, parseCases(splitRec[3]))])))

#### Adjectives

In [13]:
# case parser for male forms of adjectives (with respect to animacy)
def parseMaleCases(caseString):
    temp = caseString[1:-1].split('|')
    temp[3] = temp[3].split(';')
    resDict = dict(zip(cases, temp))
    resDict['acc'] = {(Animate.Animate, temp[3][0]), (Animate.Inanimate, temp[3][1])}
    return resDict

# case parser for adjectives (with respect to genders)
def parseAdjSingCases(caseString):
    gens = caseString[1:-1].split(' ')
    resCases = defaultdict(Gender)
    for caseStringGen in gens:
        tmp = caseStringGen.split(':')
        gend = Gender.convert(tmp[0])
        if gend == Gender.Male:
            resCases[gend] = parseMaleCases(tmp[1])
        else:
            resCases[gend] = parseCases(tmp[1])
    return resCases

# adjective parser
def parseAdjRec(splitRec):
    return WordRecord(splitRec[0], PartOfSpeech.Adjective, 
                      dict([(Quantity.Singular, parseAdjSingCases(splitRec[2])),
                       (Quantity.Plural, parseCases(splitRec[3]))]))

#### Verbs

In [4]:
def parsePastPersonalForms(formString):
    temp = formString[1:-1].split('|')
    if len(temp) == 3:
        return dict(zip(genders, temp))
    else:
        return temp[0]

def parsePersonalForms(formString):
    temp = formString[1:-1].split('|')
    return dict(zip(persons, temp))

def parseVerbForms(formString):
    tenses = formString[1:-1].split(' ')
    resForms = defaultdict(Tense)
    for formString in tenses:
        tmp = formString.split(':')
        tense = Tense.convert(tmp[0])
        if tense == Tense.Past:
            resForms[tense] = parsePastPersonalForms(tmp[1])
        else:
            resForms[tense] = parsePersonalForms(tmp[1])
    return resForms

def perfectivityParser(data):
    temp = data[1:-1].split('|')
    return Perfectivity.convert(temp[0])
        
def parseVerbRec(splitRec):
    return WordRecord(splitRec[0], PartOfSpeech.Verb,
                      VerbDataRecord(perfectivityParser(splitRec[4]), parseVerbForms(splitRec[2]),
                                     parseVerbForms(splitRec[3])))

#### Pronouns

In [5]:
def parsePronounRec(splitRec):
    return WordRecord(splitRec[0], PartOfSpeech.Pronoun,
                      {(Quantity.Singular, parseCases(splitRec[2])),
                       (Quantity.Plural, parseCases(splitRec[3]))})

### Master parser function

In [6]:
def parseRecord(rec):
    splitRec = rec.split('\t')
    if splitRec[1] == 'N':
        return parseNounRec(splitRec)
    elif splitRec[1] == 'Adj':
        return parseAdjRec(splitRec)
    elif splitRec[1] == 'V':
        return parseVerbRec(splitRec)
    elif splitRec[1] == 'Pr':
        return parsePronounRec(splitRec)

# Loading rules

## Loading & parsing phrase structure: words and their traits

In [2]:
def parseTrait(tr):
    for traitType in wordTraits:
        try:
            return traitType.convert(tr)
        except KeyError:
            continue

def parseTraits(trs):
    res = []
    if len(trs) > 0:
        for trait in trs[1:-1].split(';'):
            res.append(parseTrait(trait))
    return res

def parseRightParts(rightPart):
    res = []
    for part in rightPart.split('|'):
        splitPart = part.split(' ')
        curWords = []
        for _part in splitPart:
            rec = _part.split('-')
            traits = []
            if len(rec) > 1:
                traits = rec[1]
            curWords.append(PhraseWordRecord(PartOfSpeech.convert(rec[0]), parseTraits(traits)))
        res.append(curWords)
    return res

def getPhraseStruct(rules):
    rulesFinal = {Phrase.S: {}}
    for rule in rules:
        splitRule = rule.split(':=')
        rulesFinal[Phrase.S][Phrase.convert(splitRule[0])] = parseRightParts(splitRule[1])
    return rulesFinal

## Loading & parsing non-terminal symbols, denoting overall sentence strucure

In [3]:
def parseSentHeads(rightPart):
    res = []
    for part in rightPart:
        res.append(Phrase.convert(part))
    return res

def getSentStruct(rules):
    rulesFinal = {Phrase.S: []}
    for rule in rules:
        sentHead = rule[1:].split(':=')
        for part in sentHead[1].split('|'):
            if Phrase.convert(sentHead[0]) in rulesFinal:
                rulesFinal[Phrase.convert(sentHead[0])].append(parseSentHeads(part.split()))
            elif Phrase.convert(sentHead[0]) in rulesFinal[Phrase.S]:
                rulesFinal[Phrase.S][Phrase.convert(sentHead[0])].append(parseSentHeads(part.split()))
            else:
                rulesFinal[Phrase.S][Phrase.convert(sentHead[0])] = parseSentHeads(part.split())
    return rulesFinal

### Master parser function

In [4]:
def parseRules(rulesSet):
    sentRules = []
    phraseRules = []
    for rule in rulesSet:
        if len(rule) == 0:
            continue
        if rule[0] == '!':
            sentRules.append(rule)
        elif rule[0] != '#':
            phraseRules.append(rule)           
    return (getSentStruct(sentRules), getPhraseStruct(phraseRules))

# Test area

In [5]:
dct = loadFromFile('./rules.txt')
sent, rules = parseRules(dct)
print(sent)
for rule in rules[Phrase.S][Phrase.SNP]:
    print(rule)

{<Phrase.S: 5>: [[<Phrase.SNP: 0>, <Phrase.VP: 1>, <Phrase.ONP: 2>], [<Phrase.ONP: 2>, <Phrase.VP: 1>, <Phrase.SNP: 0>]]}
[PhraseWordRecord(word=<PartOfSpeech.Noun: 0>, traits=[<Animate.Animate: 0>, None])]
[PhraseWordRecord(word=<PartOfSpeech.Adjective: 2>, traits=[]), PhraseWordRecord(word=<PartOfSpeech.Noun: 0>, traits=[<Animate.Animate: 0>, None, None])]
[PhraseWordRecord(word=<PartOfSpeech.Pronoun: 1>, traits=[None])]
