In [1]:
import json
import os
import re
import csv
import dateparser
import datetime
from sklearn.metrics import classification_report
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import pickle
from joblib import dump, load
import stanza
from tokenize_uk import tokenize_uk
import requests
from bs4 import BeautifulSoup

In [1293]:
with open('../../data/articles/test/zelen.json') as f:
    test_data = json.load(f)
with open('../../data/articles/train/train_it_2.json') as f:
    train_data = json.load(f)

In [3]:
nlp = stanza.Pipeline('uk', processors='tokenize,pos,lemma,depparse')

2020-06-09 18:37:33 INFO: Loading these models for language: uk (Ukrainian):
| Processor | Package |
-----------------------
| tokenize  | iu      |
| pos       | iu      |
| lemma     | iu      |
| depparse  | iu      |

2020-06-09 18:37:33 INFO: Use device: cpu
2020-06-09 18:37:33 INFO: Loading: tokenize
2020-06-09 18:37:33 INFO: Loading: pos
2020-06-09 18:37:34 INFO: Loading: lemma
2020-06-09 18:37:34 INFO: Loading: depparse
2020-06-09 18:37:35 INFO: Done loading processors!


In [4]:
def get_classifier():
    pipe = Pipeline([
        ('dict_vect', DictVectorizer()),
        ('lrc', LogisticRegression(random_state=42, multi_class='multinomial',
                                   max_iter=800, solver='sag', n_jobs=-1))])

    return pipe

In [5]:
def make_annotattion_sources(source_path, target_path):
    data_files = os.listdir(source_articles_path)
    file_count = 0
    for file in data_files:
        with open(os.path.join(source_path, file)) as f:
            art_count = 0
            cont = json.load(f)
            for i, article in enumerate(cont):
                res = []
                res.append(article['url'] + '\n')
                res.append('===\n')
                res.append(f'{article["title"]}\t|\tREL_{art_count}\n')
                res.append('===\n')

                sents = tokenize_uk.tokenize_sents(article['content'])
                for i, sent in enumerate(sents):
                    res.append(f'{sent}\t|\tREL_{art_count}_{i}\n')
                res.append(f'\nREL_{art_count}\n')
                res.append('END\n\n')
                
                with open(os.path.join(target_path, f'to_annotate_{file_count}.txt'), 'a') as f:
                    f.writelines(res)
                    
                    if art_count == 99:
                        file_count += 1
                        art_count = 0
                    else:
                        art_count += 1

In [10]:
''' Parse annotated data

algorythm of parsing annotated data
1) if all the article marked as IS_NO_EVENT and there are no other marks inside -
mark every sentense as NO_EVENT
2) if a block contains more than one sentence - split into multiple sentences
by annotated split object (DATE annotation at the sentence beginning)
'''


def parse_annotated_articles(annotated_articles_path):
    files = os.listdir(annotated_articles_path)
    lines = []
    raw_lines = []
    for file in files:
        with open(os.path.join(annotated_articles_path, file)) as f:
            reader = csv.reader(f, dialect='excel-tab')
            count = 0
            miltiline_events_cache = []
            is_all_non_event = None
            all_non_events = []
            start_mark = None

            for i, row in enumerate(reader):
                if row:
                    first_column = row[0]
                    next_column = row[1]

                    if first_column != '===':

                        if next_column == 'O':
                            fst_col_split = first_column.split(' | ')

                            # The line was required to be annotated annotation
                            if len(fst_col_split) > 1:
                                # The line was annotated, annotation is set on the next line
                                if not fst_col_split[1]:
                                    mark_row = reader.__next__()
                                    is_event = mark_row[1] == 'IS_EVENT'
                                    if not start_mark:
                                        start_mark = mark_row[0]
                                    if is_all_non_event is not None:
                                        is_all_non_event = not is_event
                                    line = (fst_col_split[0], is_event)
                                    if not len(miltiline_events_cache):
                                        lines.append(line)
                                        if is_all_non_event:
                                            all_non_events.append(line)
                                    else:
                                        miltiline_events_cache.append(line)
                                        is_all_non_event = False
                                # Not annotated
                                else:
                                    if is_all_non_event:
                                        all_non_events.append(
                                            (fst_col_split[0], False))
                                    mark = re.findall(
                                        'REL_\\d+', fst_col_split[1])
                                    if mark:
                                        start_mark = mark[0]
                                if len(miltiline_events_cache):
                                    # We already have some sentences in the cache
                                    # Updating their status
                                    ee = [miltiline_events_cache[i:i + 2]
                                          for i in range(0, len(miltiline_events_cache), 2)]
                                    for e in ee:
                                        r = []
                                        for x in e:
                                            a, b = x
                                            if a not in r:
                                                r.append(a.strip())
                                        t = ' '.join(r)
                                        lines.append((t, is_event))
                                    miltiline_events_cache = []
                            else:
                                event_text = fst_col_split[0].strip()
                                # Append multi-event text if we already have something in the cache
                                if event_text and len(miltiline_events_cache):
                                    miltiline_events_cache.append(
                                        (event_text, next_column))
                                    is_all_non_event = False
                        elif not (row[0].startswith('REL_') or row == 'END'):
                            line = (row[0], next_column)
                            miltiline_events_cache.append(line)
                            is_all_non_event = False
                    # We reached the end
                    if first_column == start_mark:
                        if is_all_non_event and next_column == 'IS_NOT_EVENT':
                            lines += all_non_events
                        all_non_events = []
                count += 1
    return lines

In [73]:
def print_tree(doc):
    print(*[f'id: {word.id}\tword: {word.text}\tPOS:{word.upos}\t\thead: {sent.words[word.head-1].text if word.head > 0 else "root"}\tdeprel: {word.deprel}' for sent in doc.sentences for word in sent.words], sep='\n')


def get_all_docs(annotated_texts):
    docs = []
    for i, (text, is_event) in enumerate(annotated_texts):
        try:
            doc = nlp(text)
            docs.append((doc, is_event))
        except Exception as e:
            print(f'Failed to create nlp from text that starts with: {text[:50]} {e}')
        if i % 500 == 0:
            print('-->', i)
    return docs
        

def parse_features(feats_string):
    res = {}
    feats = feats_string.split('|')
    for feat in feats:
        k, v = feat.split('=')
        res[k] = v
    return res


def find_dates(string, is_future=False):
    valid_months = ['січня', 'січ.','лютого','лют.','березня','берез.','квітня','квіт.',
                'травня','трав.','червня','черв.','липня','лип.','серпня','серп.',
               'вересня','верес.','жовтня','жовт.','листопада','листоп.','грудня','груд.']
    mon_regex_str = '|'.join(valid_months).replace('.', '\.')
    regex = '(\s\d{4}\s|(\d+ (' + mon_regex_str + ')(\s\d{4})?)|\d{2,4}-\d{2}-\d{2,4}|\d{2}.\d{2}.\d{2,4}|\d{2}\/\d{2}\/\d{2,4})'
    matches = re.findall(regex, string, re.IGNORECASE)
    dates = []
    for match in matches:
        date = match[0].strip()
        if len(date) == 4:
            curr_year = datetime.datetime.now().year
            # fragile thing, it may predict date if it's actually some other 4-digit stuff
            if int(date) <= curr_year or is_future:
                dates.append(date)
        else:
            dates.append(date)
    return dates


def get_doc_core_members(doc):
    res = []
    
    adv_final = ['вперше', 'нарешті', 'врешті', 'вчора', 'сьогодні', 'позавчора']

    for sent in doc.sentences:
        spo = {}
        pred = None
        subj = None
        obj = None
        
        num_words = len(sent.words)
        
        root = next((word
                     for word in sent.words if word.deprel == 'root'),
                    None)
        if not doc.text.strip() or num_words == 2 and sent.words[num_words - 1].upos == 'PUNCT':
            continue
        # FIXME: iterate only once
        if root:
            root_conj = next((word for word in sent.words if word.deprel ==
                              'conj' and word.head == int(root.id)), None)
            root_mod = next((word for word in sent.words if word.deprel ==
                              'advmod' and word.upos == 'PART' and word.head == int(root.id)), None)

            subj = next((word for word in sent.words if word.deprel ==
                         'nsubj' and word.head == int(root.id)), None)
            obj = next((word for word in sent.words if word.deprel ==
                        'obj' and word.head == int(root_conj.id if root_conj else root.id)),
                       None)
            c_conj = next((word
                     for word in sent.words if word.upos == 'CCONJ' and sent.words[int(word.id) - 2].upos == 'PUNCT'),
                    None)
            root_adv_final = next((word for word in sent.words if word.deprel ==
                        'advmod' and word.upos == 'ADV' and word.head == int(root.id) \
                and word.lemma in adv_final),
                       None)
            root_xcomp = next((word for word in sent.words if word.deprel ==
                              'xcomp' and word.head == int(root.id)), None)
            root_xcomp_noun = next((word for word in sent.words if word.deprel == 'xcomp:sp' \
                               and word.upos == 'NOUN' \
                              and word.head == int(root.id)),
                                    None)

            spo['subj'] = subj
            spo['root'] = root
            spo['root-conj'] = root_conj
            spo['obj'] = obj
            spo['root_mod'] = root_mod
            spo['c_conj'] = c_conj
            spo['root_adv_final'] = root_adv_final
            spo['root_xcomp'] = root_xcomp
            spo['root_xcomp_noun'] = root_xcomp_noun
            if subj:
                subj_conj = next((word for word in sent.words if word.deprel ==
                             'conj' and (word.upos == 'NOUN' or word.upos == 'PRON') \
                                  and word.head == int(subj.id)), None)
                spo['subj-conj'] = subj_conj
                if subj_conj:
                    subj_conj_verb = next((word for word in sent.words if word.upos ==
                        'VERB' and word.head == int(subj_conj.id)),
                       None)
                    spo['subj-conj-verb'] = subj_conj_verb

        res.append((sent.text, spo))
    return res



def get_features(doc):
    features = []
    
    predicate_special = ['допустити', 'думати', 'припустити', 'відреагувати', 'пояснити',
                     'сказати', 'заявити', 'повідомити', 'повідомляти', 'розповісти',
                      'розповідати', 'рекомендувати', 'порекомендувати', 'мати', 'стати', 'почати']

    spos = get_doc_core_members(doc)
#     print(spos)
    for sent_text, spo in spos:
        feat = {}
        if spo:
            root = spo['root']
            root_conj = spo.get('root-conj')
            root_adv_final = spo.get('root_adv_final')
            root_xcomp = spo.get('root_xcomp')
            root_xcomp_noun = spo.get('root_xcomp_noun')
            subj = spo.get('subj')
            subj_conj = spo.get('subj-conj')
            obj = spo.get('obj')

            dates = find_dates(doc.text, True)

            if root.feats:
                pred_features = parse_features(root.feats)
            else:
                pred_features = {}

            feat['subj'] = 'SUBJ' if subj else 'NONE'
            feat['has-date'] = len(dates) > 0
#             feat['c_conj'] = spo.get('c_conj') is not None
#             # TODO: mb remove these
#             feat['root_adv_final'] = root_adv_final is not None
#             feat['pred-special'] = root.lemma in predicate_special
            if pred_features.get('Tense') == 'Past':
                feat['root_xcomp'] = root_xcomp is not None
                if root_xcomp:
                    feat['root_xcomp_pos'] = root_xcomp.upos

            if subj:
#                 subj_features = parse_features(subj.feats)
#                 feat['subj-animacy'] = subj_features.get('Animacy') or 'NONE'
                feat['subj-pos'] = subj.upos
#             if subj_conj:
#                 feat['subj-conj'] = 'SUBJ_CONJ'
#                 subj_conj_features = parse_features(subj_conj.feats)
#                 feat['subj-conj-animacy'] = subj_conj_features.get('Animacy') or 'NONE'

#             if root_conj and root_conj.upos == 'VERB':
#                 pred_conj_features = parse_features(root_conj.feats)
#                 feat['pred-conj-tense'] = pred_conj_features.get(
#                     'Tense') or 'NONE'

            feat['pred'] = root.lemma
            feat['pred-pos'] = root.upos
            feat['obj'] = 'OBJ' if obj else 'NONE'
            if obj:
                feat['obj-pos'] = obj.upos
#             if root.upos != 'VERB':
#                 print(root.text, root.upos, root.feats, doc.text)
            if root.upos == 'VERB':
                feat['pred-tense'] = pred_features.get('Tense') or 'NONE'
                feat['pred-aspect'] = pred_features.get('Aspect') or 'NONE'
            if root.upos == 'NOUN' or root.upos == 'PROPN':
                feat['pred-anim'] = pred_features.get('Animacy') or 'NONE'
                feat['pred-abbr'] = pred_features.get('Abbr') or 'NONE'
            
            features.append(feat)
    return features



def get_data(docs):
    def _get_spo_shape(s, p, o):
        ids = [p.id]
        if s:
            ids.append(s.id)
        if o:
            ids.append(o.id)
        indexes = [str(y) for x, y in sorted([(x, i) for i, x in enumerate(ids)])]
        
        return '_'.join(indexes)

    features, labels = [], []

    for doc, is_event in docs:
        feats = get_features(doc)
        for feat in feats:
            features.append(feat)
            labels.append(is_event if feat else False)

    return features, labels

In [8]:
# source_articles_path = '../../data/articles/source_normalized'
# annotation_source_path = '../../data/articles/for_annotation'
# make_annotattion_sources(source_articles_path, annotation_source_path)

In [11]:
annotated_articles_path = '../../data/articles/annotated'
annotated_parsed = parse_annotated_articles(annotated_articles_path)
annotated_parsed = list(set(annotated_parsed))
truish = [(x, y) for x, y in annotated_parsed if y]
falsish = [(x, y) for x, y in annotated_parsed if not y]
print('All:', len(annotated_parsed))
print('Is event:', len(truish))
print('Is not event:', len(falsish))

All: 2720
Is event: 1502
Is not event: 1218


In [2497]:
# with open('../../data/articles/annotated_parsed.json', 'w') as f:
#     json.dump(annotated_parsed, f, ensure_ascii=False)

In [20]:
all_docs = get_all_docs(annotated_parsed)

--> 0
--> 500
--> 1000
--> 1500
Failed to create nlp from text that starts with: Батькам нікуди дітей дівати. Кернес не ввів карант 
--> 2000
--> 2500


In [62]:
# X, y = get_data(all_docs[300:600])

In [55]:
clf = get_classifier()

In [74]:
X, y = get_data(all_docs)

In [75]:
data_train, data_test, target_train, target_test = train_test_split(X, y, test_size=0.3,
                                                                    random_state=42, shuffle = True, stratify = y)

In [76]:
clf.fit(data_train, target_train)

Pipeline(memory=None,
         steps=[('dict_vect',
                 DictVectorizer(dtype=<class 'numpy.float64'>, separator='=',
                                sort=True, sparse=True)),
                ('lrc',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=800,
                                    multi_class='multinomial', n_jobs=-1,
                                    penalty='l2', random_state=42, solver='sag',
                                    tol=0.0001, verbose=0, warm_start=False))],
         verbose=False)

In [77]:
print(classification_report(target_test, clf.predict(data_test)))

              precision    recall  f1-score   support

       False       0.77      0.70      0.73       395
        True       0.77      0.82      0.80       483

    accuracy                           0.77       878
   macro avg       0.77      0.76      0.76       878
weighted avg       0.77      0.77      0.77       878



In [82]:
def predict_is_event(sent, clf):
    doc = nlp(sent)
    features = get_features(doc)
    print(features)
    return clf.predict(features)[0]
    

is_ev = predict_is_event('ЗМІ: ДБР дозволили примусовий привід Порошенка', clf)
print(is_ev)

dump(clf, './classifier.joblib')
# clf2 = load('./classifier.joblib') 

[{'subj': 'NONE', 'has-date': False, 'pred': 'ЗМІ', 'pred-pos': 'NOUN', 'obj': 'NONE', 'pred-anim': 'Inan', 'pred-abbr': 'Yes'}]
False


['./classifier.joblib']

In [80]:
s = 'ЗМІ: ДБР дозволили примусовий привід Порошенка'
# find path to 'буде' and check it's tense
s = 'Політична партія Рух нових сил Михайла Саакашвілі буде під номером 22 у виборчому бюлетені на позачергових парламентських виборах.'
s = 'В Україні буде сухо і тепло, але з "ложечкою дьогтю"'
s = 'З 7 квітня державний кордон України можна буде перетнути лише автотранспортом і лише у 19 пунктах пропуску.'
doc = nlp(s)

In [81]:
def get_head(tokens, word, root, deprel):
    head = int(word.head)
    is_found = False
    while not is_found or head != 0:
        if tokens[head - 2].deprel == deprel and int(tokens[head - 2].head) == root:
            print('&&&', tokens[head - 2], word)
            is_found = True
            return tokens[head - 2], word

        head = int(tokens[head - 2].id)
    return is_found

for sent in doc.sentences:
    root = next((word
                     for word in sent.words if word.deprel == 'root' and word.upos == 'VERB'),
                    None)
#     xc = [word for word in sent.words if get_head(sent.words, word, int(root.id), 'xcomp:sp') \
#                                and word.upos == 'NOUN']

#     print('>>', xc)

#     for word in sent.words:
#         print(word)