In [269]:
from yargy import Parser, rule, and_, not_, or_
from yargy.interpretation import fact
from yargy.predicates import gram, dictionary, type as t1, eq, gte, lte, dictionary, is_capitalized
from yargy.relations import gnc_relation
from yargy.pipelines import morph_pipeline

# Russian month names, each in its dictionary (nominative singular) form.
# 'май' replaces the inflected genitive 'мая' so that all twelve entries are
# lemmas. NOTE(review): this assumes the set is matched against normalized
# token forms (as yargy's `dictionary` predicate is documented to do) —
# confirm against the yargy version in use.
MONTHS = {
    'январь',
    'февраль',
    'март',
    'апрель',
    'май',
    'июнь',
    'июль',
    'август',
    'сентябрь',
    'октябрь',
    'ноябрь',
    'декабрь',
}
# Predicate over the month-name set defined above.
MONTH_NAME = dictionary(MONTHS)

# Numeric range predicates; both bounds are inclusive (gte / lte).
MONTH = and_(gte(1), lte(12))    # month number
DAY = and_(gte(1), lte(31))      # day of month
YEAR = and_(gte(1920), lte(2024))  # accepted years are hard-coded to 1920..2024


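# Sample sentences used to check the grammar (the leading numbers are
# presumably positions in news.txt — verify):
# 2990:2995 - Сергей Довлатов родился в 1941  ("Sergei Dovlatov was born in 1941")
# 542 - Иосиф Кобзон родился в городе Часов Яр  ("Iosif Kobzon was born in the town of Chasiv Yar")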


# Build the corpus: the third tab-separated column of the first 560 lines of
# news.txt, concatenated in file order into `full_text`.
#
# The context manager guarantees the handle is closed (the bare open() never
# closed it). The encoding is given explicitly so decoding the Cyrillic text
# does not depend on the platform default.
# NOTE(review): assumes news.txt is UTF-8 encoded — confirm.
with open("news.txt", encoding="utf-8") as news_file:
    news_lines = news_file.readlines()[:560]

# A line with fewer than three tab-separated columns raises IndexError,
# exactly as before; str.join avoids rebuilding the string on every line.
full_text = ''.join(line.split('\t')[2] for line in news_lines)



# Result record for one person. Note that the fact itself is registered under
# the name 'Entry', which is what appears in the printed output.
Person = fact('Entry', ['name', 'birthDate', 'birthPlace'])

# Two-part personal name.
Name = fact('Name', ['firstName', 'lastName'])

# Single-attribute facts used by the date and place rules.
Date = fact(
    'Date',
    ['birthDate'],
)

Place = fact(
    'Place',
    ['birthPlace'],
)


# Token predicates built from morphological grammemes.
# First name: tagged 'Name' and not tagged 'Abbr'.
FIRST = and_(gram('Name'), not_(gram('Abbr')))

# Surname: tagged 'Surn' and not tagged 'Abbr'.
LAST = and_(gram('Surn'), not_(gram('Abbr')))

# Place-name word: a capitalized noun that is not tagged 'Abbr'.
PLACE = and_(
    gram('NOUN'),
    not_(gram('Abbr')),
    is_capitalized(),
)


# One relation object shared by both name parts, so the first and last name
# are matched against the same relation (gnc: presumably gender/number/case
# agreement, per the yargy naming — verify).
gnc = gnc_relation()

# Each matcher stores its token in the corresponding Name attribute.
first_name_matcher = FIRST.interpretation(Name.firstName).match(gnc)
last_name_matcher = LAST.interpretation(Name.lastName).match(gnc)

# Prepositions that may precede a year or a place.
in_preposition_pipeline = morph_pipeline([
    'в',
    'из',
])

# Phrases meaning "was born".
was_born_pipeline = morph_pipeline([
    'родился',
    'был рожден',
    'появился на свет',
])

# Words for "year" that may follow the year number.
year_born_pipeline = morph_pipeline([
    'г',
    'год',
])

# Generic locality words that introduce a place name.
born_city_pipeline = morph_pipeline([
    'город',
    'столица',
    'поселение',
    'регион',
    'территория',
    'страна',
])

# A full name is a first name and a surname in either order.
# Single-token alternatives (first name only / surname only) are disabled.
NAME_RULE = rule(
    or_(
        rule(
            first_name_matcher,
            last_name_matcher,
        ).interpretation(Name),
        rule(
            last_name_matcher,
            first_name_matcher,
        ).interpretation(Name),
    )
).interpretation(Name)

# Accepted date shapes; every alternative is stored in Date.birthDate.
BORN_DATE_RULE = or_(
    # <day> <month name> <year>
    rule(
        DAY,
        MONTH_NAME,
        YEAR,
    ).interpretation(Date.birthDate),
    # <day>.<month>.<year>
    rule(
        DAY, '.', MONTH, '.', YEAR,
    ).interpretation(Date.birthDate),
    # <year>-<month>-<day>
    rule(
        YEAR, '-', MONTH, '-', DAY,
    ).interpretation(Date.birthDate),
    # [preposition] <year> [year word] [.]
    rule(
        in_preposition_pipeline.optional(),
        YEAR,
        year_born_pipeline.optional(),
        eq('.').optional(),
    ).interpretation(Date.birthDate),
).interpretation(Date.birthDate)

# [preposition] <locality word> <one or more PLACE tokens>
BORN_CITY_RULE = rule(
    in_preposition_pipeline.optional(),
    born_city_pipeline,
    PLACE.repeatable(),
).interpretation(Place.birthPlace)

# Filler between the meaningful parts of a sentence: zero or more tokens
# tagged VERB, NOUN, ADJF, PREP or ADJS.
ANY_WORD_RULE = rule(
    or_(
        gram('VERB'),
        gram('NOUN'),
        gram('ADJF'),
        gram('PREP'),
        gram('ADJS'),
    )
).repeatable().optional()

# Interpreted sub-rules reused by every alternative below: each one stores its
# match in the corresponding attribute of the Person fact.
_NAME = NAME_RULE.interpretation(Person.name)
_BIRTH_DATE = BORN_DATE_RULE.interpretation(Person.birthDate)
_BIRTH_PLACE = BORN_CITY_RULE.interpretation(Person.birthPlace)

# Top-level grammar: the sentence shapes from which a Person is extracted.
# Two alternatives were listed twice verbatim ("born ... name ... date" and
# "name ... born ... date"); the exact duplicates are removed, and the order
# of the remaining alternatives is unchanged.
PERSON = or_(
    # name ... place
    rule(_NAME, ANY_WORD_RULE, _BIRTH_PLACE),
    # name ... [born] ... [date] place
    rule(
        _NAME,
        ANY_WORD_RULE,
        was_born_pipeline.optional(),
        ANY_WORD_RULE,
        _BIRTH_DATE.optional(),
        _BIRTH_PLACE,
    ),
    # born ... name ... date
    rule(was_born_pipeline, ANY_WORD_RULE, _NAME, ANY_WORD_RULE, _BIRTH_DATE),
    # name ... born ... date
    rule(_NAME, ANY_WORD_RULE, was_born_pipeline, ANY_WORD_RULE, _BIRTH_DATE),
    # date born ... name
    rule(_BIRTH_DATE, was_born_pipeline, ANY_WORD_RULE, _NAME),
    # born ... date ... name
    rule(was_born_pipeline, ANY_WORD_RULE, _BIRTH_DATE, ANY_WORD_RULE, _NAME),
    # bare name: birthDate and birthPlace stay None
    rule(_NAME),
).interpretation(
    Person
)

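# Sample sentences the grammar is meant to cover:
# Иосиф Кобзон родился в городе Часов Яр  ("Iosif Kobzon was born in the town of Chasiv Yar")
# Эстерхази родился в венгерской столице в 1950 году  ("Esterházy was born in the Hungarian capital in 1950")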

# Run the PERSON grammar over the corpus and collect the fact extracted from
# every match, in the order the parser yields them.
parser = Parser(PERSON)
entries = [found.fact for found in parser.findall(full_text)]
print(entries)



[Entry(name=Name(firstName='Матс', lastName='Сундин'), birthDate=None, birthPlace=None), Entry(name=Name(firstName='Владимир', lastName='Филиппов'), birthDate=None, birthPlace=None), Entry(name=Name(firstName='Ильгизу', lastName='Фахриеву'), birthDate=None, birthPlace=None), Entry(name=Name(firstName='Михаилом', lastName='Прохоровым'), birthDate=None, birthPlace=None), Entry(name=Name(firstName='Бена', lastName='Николсона'), birthDate=None, birthPlace=None), Entry(name=Name(firstName='Анри', lastName='Матисса'), birthDate=None, birthPlace=None), Entry(name=Name(firstName='Пабло', lastName='Пикассо'), birthDate=None, birthPlace=None), Entry(name=Name(firstName='Жоржа', lastName='Брака'), birthDate=None, birthPlace=None), Entry(name=Name(firstName='Владимиром', lastName='Путиным'), birthDate=None, birthPlace=None), Entry(name=Name(firstName='Леонида', lastName='Брежнева'), birthDate=None, birthPlace=None), Entry(name=Name(firstName='Юрий', lastName='Андропов'), birthDate=None, birthPlace