# General preparations

In [14]:
from __future__ import annotations
import codecs
from collections import namedtuple
from enum import Enum
from collections import defaultdict
import re
import random

WordRecord = namedtuple("WordRecord", ["defaultform", "quantityData", "traits"])

PhraseWordRecord = namedtuple("PhraseWordRecord", ["partofspeech", "traits"])

class Animate(Enum):
    Animate = 0
    Inanimate = 1
    def convert(string: str) -> Animate:
        switcher = {
            'anim': Animate.Animate,
            'nanim': Animate.Inanimate
        }
        return switcher[string]
    
class Gender(Enum):
    Male = 0
    Female = 1
    Neuter = 2
    def convert(string: str) -> Gender:
        switcher = {
            'M': Gender.Male,
            'F': Gender.Female,
            'N': Gender.Neuter
        }
        return switcher[string]
    
class Tense(Enum):
    Present = 0
    Past = 1
    Future = 2
    def convert(string: str) -> Gender:
        switcher = {
            'present': Tense.Present,
            'past': Tense.Past,
            'future': Tense.Future
        }
        return switcher[string]
    
class Perfectivity(Enum):
    Perfect = 0
    Imperfect = 1
    def convert(string: str) -> Perfectivity:
        switcher = {
            'perf': Perfectivity.Perfect,
            'imperf': Perfectivity.Imperfect
        }
        return switcher[string]
    
class PartOfSpeech(Enum):
    Noun = 0
    Pronoun = 1
    Adjective = 2
    Verb = 3
    def convert(string: str) -> PartOfSpeech:
        switcher = {
            'N': PartOfSpeech.Noun,
            'Pr': PartOfSpeech.Pronoun,
            'Adj': PartOfSpeech.Adjective,
            'V': PartOfSpeech.Verb
        }
        return switcher[string]
    
class Phrase(Enum):
    SNP = 0
    VP = 1
    ONP = 2
    NP = 3
    S = 4
    def convert(string: str) -> Phrase:
        switcher = {
            'SNP': Phrase.SNP,
            'VP': Phrase.VP,
            'ONP': Phrase.ONP,
            'NP': Phrase.NP,
            'S': Phrase.S
        }
        return switcher[string]
    
class Quantity(Enum):
    Singular = 0
    Plural = 1
    def convert(string: str) -> Quantity:
        switcher = {
            'sing': Quantity.Singular,
            'pl': Quantity.Plural
        }
        return switcher[string]

class Case(Enum):
    Nominative = 0
    Genitive = 1
    Dative = 2
    Accusative = 3
    Instrumental = 4
    Prepositional = 5
    def convert(string: str) -> Case:
        switcher = {
            'nom': Case.Nominative,
            'gen': Case.Genitive,
            'dat': Case.Dative,
            'acc': Case.Accusative,
            'inst': Case.Instrumental,
            'prep': Case.Prepositional
        }
        return switcher[string]

class PhraseHead(Enum):
    IsHead = 0
    def convert(string: str) -> Case:
        switcher = {
            'head': PhraseHead.IsHead
        }
        return switcher[string]
    
class Person(Enum):
    P1 = 0
    P2 = 1
    P3 = 2
    def convert(string: str) -> Person:
        switcher = {
            '1P': Person.P1,
            '2P': Person.P2,
            '3P': Person.P3
        }
        return switcher[string]
    
class Transitivity(Enum):
    Transitive = 0
    Intransitive = 1
    def convert(string: str) -> Transitivity:
        switcher = {
            'tr': Transitivity.Transitive,
            'intr': Transitivity.Intransitive
        }
        return switcher[string]
    
cases = [Case.Nominative, Case.Genitive, Case.Dative, Case.Accusative, Case.Instrumental,
         Case.Prepositional]
persons = [Person.P1, Person.P2, Person.P3]
genders = [Gender.Male, Gender.Female, Gender.Neuter]
wordTraits = [Animate, Gender, Tense, Perfectivity, Quantity, Case, PhraseHead, Person, Transitivity]
partsOfSpeech = [PartOfSpeech.Noun, PartOfSpeech.Pronoun, PartOfSpeech.Adjective, PartOfSpeech.Verb]

# imports contents of dictionary and splits it into lines
def loadFromFile(path):
    with codecs.open(path, 'r', 'utf-8') as dct:
        recs = dct.read().splitlines()
        return recs
    
def parseTrait(tr):
    for traitType in wordTraits:
        try:
            return traitType.convert(tr)
        except KeyError:
            continue

def parseTraits(trs):
    res = []
    if len(trs) > 0:
        for trait in trs[1:-1].split(';'):
            res.append(parseTrait(trait))
    return set(res)

# Loading dictionary

## Parsing

### Custom parsers

#### Nouns

In [63]:
# simple case parser
def parseCases(caseString):
    splitStr = caseString[1:-1].split('|')
    if len(splitStr) == 1:
        return []
    if len(splitStr) < 6:
        raise ValueError("Invalid case data", splitStr)
    return dict(zip(cases, splitStr))

# noun parser
def parseNounRec(splitRec):
    try:
        return WordRecord(splitRec[0], {Quantity.Singular: parseCases(splitRec[1]),
                              Quantity.Plural: parseCases(splitRec[2])}, parseTraits(splitRec[3]))
    except ValueError:
        raise ValueError("Failed to import entry:", splitRec[0])

#### Adjectives

In [62]:
# case parser for male forms of adjectives (with respect to animacy)
def parseMaleCases(caseString):
    temp = caseString[1:-1].split('|')
    temp[3] = temp[3].split(';')
    resDict = dict(zip(cases, temp))
    resDict['acc'] = {(Animate.Animate, temp[3][0]), (Animate.Inanimate, temp[3][1])}
    return resDict

# case parser for adjectives (with respect to genders)
def parseAdjSingCases(caseString):
    gens = caseString[1:-1].split(' ')
    resCases = defaultdict(Gender)
    for caseStringGen in gens:
        tmp = caseStringGen.split(':')
        gend = Gender.convert(tmp[0])
        if gend == Gender.Male:
            resCases[gend] = parseMaleCases(tmp[1])
        else:
            resCases[gend] = parseCases(tmp[1])
    return resCases

# adjective parser
def parseAdjRec(splitRec):
    return WordRecord(splitRec[0], {Quantity.Singular: parseAdjSingCases(splitRec[1]),
                       Quantity.Plural: parseCases(splitRec[2])}, parseTraits(splitRec[3]))

#### Verbs

In [61]:
def parsePastPersonalForms(formString):
    temp = formString[1:-1].split('|')
    if len(temp) == 3:
        return dict(zip(genders, temp))
    else:
        return temp[0]

def parsePersonalForms(formString):
    temp = formString[1:-1].split('|')
    return dict(zip(persons, temp))

def parseVerbForms(formString):
    tenses = formString[1:-1].split(' ')
    resForms = defaultdict(Tense)
    for formString in tenses:
        tmp = formString.split(':')
        tense = Tense.convert(tmp[0])
        if tense == Tense.Past:
            resForms[tense] = parsePastPersonalForms(tmp[1])
        else:
            resForms[tense] = parsePersonalForms(tmp[1])
    return resForms

def perfectivityParser(data):
    temp = data[1:-1].split('|')
    return Perfectivity.convert(temp[0])
        
def parseVerbRec(splitRec):
    res = WordRecord(splitRec[0], {Quantity.Singular: parseVerbForms(splitRec[1]),
                                     Quantity.Plural: parseVerbForms(splitRec[2])}, parseTraits(splitRec[3]))
    return res

#### Pronouns

In [60]:
def parsePronounRec(splitRec):
    return WordRecord(splitRec[0], {Quantity.Singular: parseCases(splitRec[1]),
                       Quantity.Plural: parseCases(splitRec[2])}, parseTraits(splitRec[3]))

### Master parser function

In [64]:
parsers = [parseNounRec, parsePronounRec, parseAdjRec, parseVerbRec]
partOfSpeechParsers = dict(zip(partsOfSpeech, parsers))

def parseRecord(rec):
    splitRec = rec.split('\t')
    #try:
    partofspeech = PartOfSpeech.convert(splitRec[0])
    return (partofspeech, partOfSpeechParsers[partofspeech](splitRec[1:]))
#    if partofspeech == PartOfSpeech.Noun:
#        return (partofspeech, parseNounRec(splitRec))
#    elif partofspeech == PartOfSpeech.Adjective:
#        return (partofspeech, parseAdjRec(splitRec))
#    elif partofspeech == PartOfSpeech.Verb:
#        return (partofspeech, parseVerbRec(splitRec))
#    elif partofspeech == PartOfSpeech.Pronoun:
#        return (partofspeech, parsePronounRec(splitRec))
    #except KeyError:
    #    print('Entry will be skipped')
    #return None

# Loading rules

## Loading & parsing phrase structure: words and their traits

In [7]:
def parseRightParts(rightPart):
    res = []
    for part in rightPart.split('|'):
        splitPart = part.split(' ')
        curWords = []
        for _part in splitPart:
            rec = _part.split('-')
            traits = []
            if len(rec) > 1:
                traits = rec[1]
            curWords.append(PhraseWordRecord(PartOfSpeech.convert(rec[0]), parseTraits(traits)))
        res.append(curWords)
    return res

def getPhraseStruct(rules):
    rulesFinal = {}
    for rule in rules:
        splitRule = rule.split(':=')
        rulesFinal[Phrase.convert(splitRule[0])] = parseRightParts(splitRule[1])
    return rulesFinal

## Loading & parsing non-terminal symbols, denoting overall sentence strucure

In [8]:
def parseSentHeads(rightPart):
    res = []
    for part in rightPart:
        res.append(Phrase.convert(part))
    return res

def getSentStruct(rules):
    rulesFinal = {Phrase.S: []}
    for rule in rules:
        sentHead = rule[1:].split(':=')
        for part in sentHead[1].split('|'):
            if Phrase.convert(sentHead[0]) in rulesFinal:
                rulesFinal[Phrase.convert(sentHead[0])].append(parseSentHeads(part.split()))
            elif Phrase.convert(sentHead[0]) in rulesFinal[Phrase.S]:
                rulesFinal[Phrase.S][Phrase.convert(sentHead[0])].append(parseSentHeads(part.split()))
            else:
                rulesFinal[Phrase.S][Phrase.convert(sentHead[0])] = parseSentHeads(part.split())
    return rulesFinal

### Master parser function

In [9]:
def parseRules(rulesSet):
    sentRules = []
    phraseRules = []
    for rule in rulesSet:
        if len(rule) == 0:
            continue
        if rule[0] == '!':
            sentRules.append(rule)
        elif rule[0] != '#':
            phraseRules.append(rule)           
    return (getSentStruct(sentRules), getPhraseStruct(phraseRules))

# Binding the components together: sentence generation

In [85]:
def sentGenerator(dct, sentStruct, rules):
    pattern = random.choice(sentStruct[Phrase.S])
    base = []
    for elem in pattern:
        base.append(random.choice(rules[elem]))
    sent = []
    print(base)
    phraseWithWords = []
    for phrase in base:
        for _word in phrase:
            sent.append(random.choice(dct[_word.partofspeech]).defaultform)
    print(sent)
    #print(rules)

## Master "main" function

In [24]:
def main(dctPath = './dictionary.txt', rulesPath = './rules.txt'):
    contents = loadFromFile(dctPath)
    dct = defaultdict(PartOfSpeech)
    for part in partsOfSpeech:
        dct[part] = []
    for entry in contents:
        partOfSpeech, rec = parseRecord(entry)
        dct[partOfSpeech].append(rec)
    contents = loadFromFile(rulesPath)
    sentStruct, rules = parseRules(contents)
    sentGenerator(dct, sentStruct, rules)

# Test area

In [92]:
main()

[[PhraseWordRecord(partofspeech=<PartOfSpeech.Pronoun: 1>, traits={<Case.Nominative: 0>})], [PhraseWordRecord(partofspeech=<PartOfSpeech.Verb: 3>, traits={<Transitivity.Transitive: 0>})], [PhraseWordRecord(partofspeech=<PartOfSpeech.Adjective: 2>, traits={None}), PhraseWordRecord(partofspeech=<PartOfSpeech.Noun: 0>, traits={<PhraseHead.IsHead: 0>, <Case.Accusative: 3>})]]
['волк', 'серый', 'видеть', 'я']


In [132]:
recs = loadFromFile('./dictionary.txt')
print(parseRecord(recs[4]))

WordRecord(defaultform='я', partofspeech=<PartOfSpeech.Pronoun: 1>, quantityData={<Quantity.Singular: 0>: {<Case.Nominative: 0>: 'я', <Case.Genitive: 1>: 'меня', <Case.Dative: 2>: 'мне', <Case.Accusative: 3>: 'меня', <Case.Instrumental: 4>: 'мной', <Case.Prepositional: 5>: 'мне'}, <Quantity.Plural: 1>: {<Case.Nominative: 0>: 'мы', <Case.Genitive: 1>: 'нас', <Case.Dative: 2>: 'нам', <Case.Accusative: 3>: 'нас', <Case.Instrumental: 4>: 'нами', <Case.Prepositional: 5>: 'нас'}}, traits={None})
