# General preparations

In [1]:
from __future__ import annotations
import codecs
from collections import namedtuple
from enum import Enum
from collections import defaultdict
import re
import random

WordRecord = namedtuple("WordRecord", ["defaultform", "data", "traits"])

PhraseWordRecord = namedtuple("PhraseWordRecord", ["partofspeech", "traits"])

class Animate(Enum):
    Animate = 0
    Inanimate = 1
    def convert(string: str) -> Animate:
        switcher = {
            'anim': Animate.Animate,
            'nanim': Animate.Inanimate
        }
        return switcher[string]
    
class Gender(Enum):
    Male = 0
    Female = 1
    Neuter = 2
    def convert(string: str) -> Gender:
        switcher = {
            'M': Gender.Male,
            'F': Gender.Female,
            'N': Gender.Neuter
        }
        return switcher[string]
    
class Tense(Enum):
    Present = 0
    Past = 1
    Future = 2
    def convert(string: str) -> Gender:
        switcher = {
            'present': Tense.Present,
            'past': Tense.Past,
            'future': Tense.Future
        }
        return switcher[string]
    
class Perfectivity(Enum):
    Perfect = 0
    Imperfect = 1
    def convert(string: str) -> Perfectivity:
        switcher = {
            'perf': Perfectivity.Perfect,
            'imperf': Perfectivity.Imperfect
        }
        return switcher[string]
    
class PartOfSpeech(Enum):
    Noun = 0
    Pronoun = 1
    Adjective = 2
    Verb = 3
    def convert(string: str) -> PartOfSpeech:
        switcher = {
            'N': PartOfSpeech.Noun,
            'Pr': PartOfSpeech.Pronoun,
            'Adj': PartOfSpeech.Adjective,
            'V': PartOfSpeech.Verb
        }
        return switcher[string]
    
class Quantity(Enum):
    Singular = 0
    Plural = 1
    def convert(string: str) -> Quantity:
        switcher = {
            'sing': Quantity.Singular,
            'pl': Quantity.Plural
        }
        return switcher[string]

class Case(Enum):
    Nominative = 0
    Genitive = 1
    Dative = 2
    Accusative = 3
    Instrumental = 4
    Prepositional = 5
    def convert(string: str) -> Case:
        switcher = {
            'nom': Case.Nominative,
            'gen': Case.Genitive,
            'dat': Case.Dative,
            'acc': Case.Accusative,
            'inst': Case.Instrumental,
            'prep': Case.Prepositional
        }
        return switcher[string]

class PhraseHead(Enum):
    Head = 0
    Slave = 0
    def convert(string: str) -> Case:
        switcher = {
            'head': PhraseHead.Head,
            'slave': PhraseHead.Slave
        }
        return switcher[string]
    
class Person(Enum):
    P1 = 0
    P2 = 1
    P3 = 2
    def convert(string: str) -> Person:
        switcher = {
            '1P': Person.P1,
            '2P': Person.P2,
            '3P': Person.P3
        }
        return switcher[string]
    
class Transitivity(Enum):
    Transitive = 0
    Intransitive = 1
    def convert(string: str) -> Transitivity:
        switcher = {
            'tr': Transitivity.Transitive,
            'intr': Transitivity.Intransitive
        }
        return switcher[string]
    
cases = [Case.Nominative, Case.Genitive, Case.Dative, Case.Accusative, Case.Instrumental,
         Case.Prepositional]
persons = [Person.P1, Person.P2, Person.P3]
genders = [Gender.Male, Gender.Female, Gender.Neuter]
wordTraits = [Animate, Gender, Tense, Perfectivity, Quantity, Case, PhraseHead, Person, Transitivity]
partsOfSpeech = [PartOfSpeech.Noun, PartOfSpeech.Pronoun, PartOfSpeech.Adjective, PartOfSpeech.Verb]
quantities = [Quantity.Singular, Quantity.Plural]

startingNT = "S"

# imports contents of dictionary and splits it into lines
def loadFromFile(path):
    with codecs.open(path, 'r', 'utf-8') as dct:
        recs = dct.read().splitlines()
        return recs
    
def parseTrait(tr):
    for traitType in wordTraits:
        try:
            return (traitType, traitType.convert(tr))
        except KeyError:
            continue

def parseTraits(trs):
    res = dict(zip(wordTraits, [None] * len(wordTraits)))
    if len(trs) > 2:
        for trait in trs[1:-1].split(';'):
            trType, val = parseTrait(trait)
            res[trType] = val
    return res

# Loading dictionary

## Parsing

### Custom parsers

#### Nouns

In [2]:
# simple case parser
def parseCases(caseString):
    splitStr = caseString[1:-1].split('|')
    if len(splitStr) == 1:
        return []
    if len(splitStr) < 6:
        raise ValueError("Invalid case data", splitStr)
    return dict(zip(cases, splitStr))

# noun parser
def parseNounRec(splitRec):
    try:
        return WordRecord(splitRec[0], {Quantity.Singular: parseCases(splitRec[1]),
                              Quantity.Plural: parseCases(splitRec[2])}, parseTraits(splitRec[3]))
    except ValueError:
        raise ValueError("Failed to import entry:", splitRec[0])

#### Adjectives

In [3]:
# case parser for male/plural forms of adjectives (with respect to animacy)
def parseAnimCases(caseString):
    temp = caseString[1:-1].split('|')
    temp[3] = temp[3].split(';')
    resDict = dict(zip(cases, temp))
    resDict[Case.Accusative] = {(Animate.Animate, temp[3][0]), (Animate.Inanimate, temp[3][1])}
    return resDict

# case parser for adjectives (with respect to genders)
def parseAdjSingCases(caseString):
    gens = caseString[1:-1].split(' ')
    resCases = defaultdict(Gender)
    for caseStringGen in gens:
        tmp = caseStringGen.split(':')
        gend = Gender.convert(tmp[0])
        if gend == Gender.Male:
            resCases[gend] = parseAnimCases(tmp[1])
        else:
            resCases[gend] = parseCases(tmp[1])
    return resCases

# adjective parser
def parseAdjRec(splitRec):
    return WordRecord(splitRec[0], {Quantity.Singular: parseAdjSingCases(splitRec[1]),
                       Quantity.Plural: parseAnimCases(splitRec[2])}, parseTraits(splitRec[3]))

#### Verbs

In [4]:
def parsePastPersonalForms(formString):
    temp = formString[1:-1].split('|')
    if len(temp) == 3:
        return dict(zip(genders, temp))
    else:
        return temp[0]

def parsePersonalForms(formString):
    temp = formString[1:-1].split('|')
    return dict(zip(persons, temp))

def parseVerbForms(formString):
    tenses = formString[1:-1].split(' ')
    resForms = defaultdict(Tense)
    for formString in tenses:
        tmp = formString.split(':')
        tense = Tense.convert(tmp[0])
        if tense == Tense.Past:
            resForms[tense] = parsePastPersonalForms(tmp[1])
        else:
            resForms[tense] = parsePersonalForms(tmp[1])
    return resForms

def perfectivityParser(data):
    temp = data[1:-1].split('|')
    return Perfectivity.convert(temp[0])
        
def parseVerbRec(splitRec):
    res = WordRecord(splitRec[0], {Quantity.Singular: parseVerbForms(splitRec[1]),
                                     Quantity.Plural: parseVerbForms(splitRec[2])}, parseTraits(splitRec[3]))
    return res

#### Pronouns

In [5]:
def parsePronounRec(splitRec):
    return WordRecord(splitRec[0], {Quantity.Singular: parseCases(splitRec[1]),
                       Quantity.Plural: parseCases(splitRec[2])}, parseTraits(splitRec[3]))

### Master parser function

In [6]:
parsers = [parseNounRec, parsePronounRec, parseAdjRec, parseVerbRec]
partOfSpeechParsers = dict(zip(partsOfSpeech, parsers))

def parseRecord(rec):
    splitRec = rec.split('\t')
    #try:
    partofspeech = PartOfSpeech.convert(splitRec[0])
    return (partofspeech, partOfSpeechParsers[partofspeech](splitRec[1:]))
    #except KeyError:
    #    print('Entry will be skipped')
    #return None

# Loading rules

## Loading & parsing phrase structure: words and their traits

In [7]:
def parseRightParts(rightPart):
    res = []
    for part in rightPart.split('|'):
        splitPart = part.split(' ')
        curWords = []
        for _part in splitPart:
            rec = _part.split('-')
            traits = []
            if len(rec) > 1:
                traits = rec[1]
            curWords.append(PhraseWordRecord(PartOfSpeech.convert(rec[0]), parseTraits(traits)))
        res.append(curWords)
    return res

def getPhraseStruct(rules):
    rulesFinal = {}
    for rule in rules:
        splitRule = rule.split(':=')
        rulesFinal[splitRule[0]] = parseRightParts(splitRule[1])
    return rulesFinal

## Loading & parsing non-terminal symbols, denoting overall sentence strucure

In [8]:
def parseSentHeads(rightPart):
    res = []
    for part in rightPart:
        res.append(part)
    return res

def getSentStruct(rules):
    rulesFinal = {startingNT: []}
    for rule in rules:
        sentHead = rule[1:].split(':=')
        for part in sentHead[1].split('|'):
            if sentHead[0] in rulesFinal:
                rulesFinal[sentHead[0]].append(parseSentHeads(part.split()))
            elif sentHead[0] in rulesFinal[startingNT]:
                rulesFinal[startingNT][sentHead[0]].append(parseSentHeads(part.split()))
            else:
                rulesFinal[startingNT][sentHead[0]] = parseSentHeads(part.split())
    return rulesFinal

### Master parser function

In [9]:
def parseRules(rulesSet):
    sentRules = []
    phraseRules = []
    for rule in rulesSet:
        if len(rule) == 0:
            continue
        if rule[0] == '!':
            sentRules.append(rule)
        elif rule[0] != '#':
            phraseRules.append(rule)           
    return (getSentStruct(sentRules), getPhraseStruct(phraseRules))

# Binding the components together: sentence generation

### Adjective

In [11]:
adjectiveTraits = [Animate, Gender, Quantity, Case, PhraseHead]
#trs = adjectiveTraits & set(word.traits.keys())

def caseGetter(word, traits):
    res = None
    if traits[Case] == Case.Accusative:
        if traits[Animate] == Animate.Animate:
            if traits[Gender] == Gender.Male:
                res = word.data[0]

def adjAgreement(phrase):
    headTraits = None
    if PhraseHead.Head == phrase[0].traits[PhraseHead]:
        raise ValueError('Adj can\'t be head of phrase')
    for word in phrase:
        if PhraseHead.Head in word.traits:
            headTraits = word.traits
    res = caseGetter(word, adjectiveTraits & headTraits)
    return res

## Noun

In [67]:
nounTraits = [Animate, Gender, Quantity, Case, PhraseHead]

def updateTraits(rule, word):
    for tr in nounTraits:
        if word.traits[tr] == None:
            word.traits[tr] = rule.traits[tr]
    
def nounForm(rule, word):
    updateTraits(rule, word)
    if word.traits[Quantity] == None:
        word.traits[Quantity] = random.choice(quantities)
    if word.traits[Case] == None:
            word.traits[Case] = Case.Nominative
    return word.data[word.traits[Quantity]][word.traits[Case]]

<enum 'Gender'> Gender.Male
<enum 'PhraseHead'> None
<enum 'Case'> None
<enum 'Animate'> Animate.Animate
<enum 'Quantity'> Quantity.Plural

## Pronoun

In [None]:
nounTraits = [Animate, Gender, Quantity, Case, PhraseHead, Person]

def updateTraits(rule, word):
    for tr in nounTraits:
        if word.traits[tr] == None:
            word.traits[tr] = rule.traits[tr]
    
def nounForm(rule, word):
    updateTraits(rule, word)
    if word.traits[Quantity] == None:
        word.traits[Quantity] = random.choice(quantities)
    if word.traits[Case] == None:
            word.traits[Case] = Case.Nominative
    return word.data[word.traits[Quantity]][word.traits[Case]]

In [32]:
agreement = [nounAgreement]#, parsePronounRec, parseAdjRec, parseVerbRec]
agreementSetters = dict(zip(partsOfSpeech, agreement))

def traitModifier(traits, word):
    return 0

def agreementSetter(rule, phrase):
    #for word in phrase:
    #    if not isinstance(phrase[0], list):
    #        #agreementSetters[rule[1].partofspeech](word)
            nounForm(rule[1], phrase[1])
    
#    print('<>>', type(rule))
#    print('<>', rule)
    #print(phrase)
#    agreementSetters[rule.partofspeech(phrase)]
#    headTraits = set()
#    for elem in rule:
#        if PhraseHead.IsHead in elem.traits:
#            headTraits = elem.traits
#            break
    #for i in range(0, headInd):
        
            

WordRecord(defaultform='серый', data={<Quantity.Singular: 0>: defaultdict(<enum 'Gender'>, {<Gender.Male: 0>: {<Case.Nominative: 0>: 'серый', <Case.Genitive: 1>: 'серого', <Case.Dative: 2>: 'серому', <Case.Accusative: 3>: {(<Animate.Inanimate: 1>, 'серый'), (<Animate.Animate: 0>, 'серого')}, <Case.Instrumental: 4>: 'серым', <Case.Prepositional: 5>: 'сером'}, <Gender.Female: 1>: {<Case.Nominative: 0>: 'серая', <Case.Genitive: 1>: 'серую', <Case.Dative: 2>: 'серой', <Case.Accusative: 3>: 'серую', <Case.Instrumental: 4>: 'серой', <Case.Prepositional: 5>: 'серой'}}), <Quantity.Plural: 1>: {<Case.Nominative: 0>: 'серые', <Case.Genitive: 1>: 'серых', <Case.Dative: 2>: 'серым', <Case.Accusative: 3>: {(<Animate.Animate: 0>, 'серых'), (<Animate.Inanimate: 1>, 'серые')}, <Case.Instrumental: 4>: 'серыми', <Case.Prepositional: 5>: 'серых'}}, traits={<enum 'Animate'>: None, <enum 'Gender'>: None, <enum 'Tense'>: None, <enum 'Perfectivity'>: None, <enum 'Quantity'>: None, <enum 'Case'>: None, <enum 'PhraseHead'>: None, <enum 'Person'>: None, <enum 'Transitivity'>: None})

In [14]:
def sentGenerator(dct, sentStruct, rules):
    pattern = random.choice(sentStruct[startingNT])
    base = []
    for elem in pattern:
        base.append(random.choice(rules[elem]))
    sent = []
    for phrase in base:
        sent.append([])
        for _word in phrase:
            sent[len(sent)-1].append(random.choice(dct[_word.partofspeech]))
    agreementSetter(base[0], sent[0])
    #for i in range(0, len(sent)):
        #for j in range(0, len(sent[i])):
            #print(base[i][j], sent[i][j].defaultform)
            #print(agreementSetter(base, sent))
    #print(sent)
    #print(rules)

## Master "main" function

In [26]:
def main(dctPath = './dictionary.txt', rulesPath = './rules.txt'):
    contents = loadFromFile(dctPath)
    dct = defaultdict(PartOfSpeech)
    for part in partsOfSpeech:
        dct[part] = []
    for entry in contents:
        partOfSpeech, rec = parseRecord(entry)
        dct[partOfSpeech].append(rec)
    contents = loadFromFile(rulesPath)
    sentStruct, rules = parseRules(contents)
    sentGenerator(dct, sentStruct, rules)

# Test area

In [66]:
main()

волки


In [87]:
print(type(set({'a': 2, 'b': 3}.keys())))

<class 'set'>
