In [20]:
import json
import os
import re
import csv
import dateparser
import datetime
from sklearn.metrics import classification_report
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import eli5
from eli5.sklearn.utils import get_feature_names
import pickle
import numpy as np
from joblib import dump, load
import stanza
from tokenize_uk import tokenize_uk
import requests
from bs4 import BeautifulSoup

In [3]:
nlp = stanza.Pipeline('uk', processors='tokenize,pos,lemma,depparse')

2020-06-12 13:03:19 INFO: Loading these models for language: uk (Ukrainian):
| Processor | Package |
-----------------------
| tokenize  | iu      |
| pos       | iu      |
| lemma     | iu      |
| depparse  | iu      |

2020-06-12 13:03:19 INFO: Use device: cpu
2020-06-12 13:03:19 INFO: Loading: tokenize
2020-06-12 13:03:19 INFO: Loading: pos
2020-06-12 13:03:20 INFO: Loading: lemma
2020-06-12 13:03:20 INFO: Loading: depparse
2020-06-12 13:03:21 INFO: Done loading processors!


In [4]:
def get_classifier():
    pipe = Pipeline([
        ('dict_vect', DictVectorizer()),
        ('lrc', LogisticRegression(random_state=42, multi_class='multinomial',
                                   max_iter=800, solver='sag', n_jobs=-1))])

    return pipe

In [4]:
def make_annotattion_sources(source_path, target_path):
    data_files = os.listdir(source_articles_path)
    file_count = 0
    for file in data_files:
        with open(os.path.join(source_path, file)) as f:
            art_count = 0
            cont = json.load(f)
            for i, article in enumerate(cont):
                res = []
                res.append(article['url'] + '\n')
                res.append('===\n')
                res.append(f'{article["title"]}\t|\tREL_{art_count}\n')
                res.append('===\n')

                sents = tokenize_uk.tokenize_sents(article['content'])
                for i, sent in enumerate(sents):
                    res.append(f'{sent}\t|\tREL_{art_count}_{i}\n')
                res.append(f'\nREL_{art_count}\n')
                res.append('END\n\n')
                
                with open(os.path.join(target_path, f'to_annotate_{file_count}.txt'), 'a') as f:
                    f.writelines(res)
                    
                    if art_count == 99:
                        file_count += 1
                        art_count = 0
                    else:
                        art_count += 1

In [281]:
''' Parse annotated data

algorythm of parsing annotated data
1) if all the article marked as IS_NO_EVENT and there are no other marks inside -
mark every sentense as NO_EVENT
2) if a block contains more than one sentence - split into multiple sentences
by annotated split object (DATE annotation at the sentence beginning)
'''


def trim_indirect_speech(text):
    stops = ['Курс НБУ: ']
    
    text_trimmed = text

    if ', -' in text_trimmed:
        text_trimmed = re.sub('(\"|»)?\,\s-\s.*', '', text_trimmed)
    text_trimmed = re.sub('(^\w+(-\w+)?((\s+\w+(-\w+)?){1})?):\s', '', text_trimmed)
    for stop in stops:
        text_trimmed = text_trimmed.replace(stop, '')
    return text_trimmed


def parse_annotated_articles(annotated_articles_path):
    files = os.listdir(annotated_articles_path)
    lines = []
    raw_lines = []
    for file in files:
        with open(os.path.join(annotated_articles_path, file)) as f:
            reader = csv.reader(f, dialect='excel-tab')
            count = 0
            miltiline_events_cache = []
            is_all_non_event = None
            all_non_events = []
            start_mark = None

            for i, row in enumerate(reader):
                if row:
                    first_column = row[0]
                    next_column = row[1]

                    if first_column != '===':

                        if next_column == 'O':
                            fst_col_split = first_column.split(' | ')

                            # The line was required to be annotated annotation
                            if len(fst_col_split) > 1:
                                # The line was annotated, annotation is set on the next line
                                if not fst_col_split[1]:
                                    mark_row = reader.__next__()
                                    is_event = mark_row[1] == 'IS_EVENT'
                                    if not start_mark:
                                        start_mark = mark_row[0]
                                    if is_all_non_event is not None:
                                        is_all_non_event = not is_event
                                    line = (fst_col_split[0], is_event)
                                    if not len(miltiline_events_cache):
                                        lines.append(line)
                                        if is_all_non_event:
                                            all_non_events.append(line)
                                    else:
                                        miltiline_events_cache.append(line)
                                        is_all_non_event = False
                                # Not annotated
                                else:
                                    if is_all_non_event:
                                        all_non_events.append(
                                            (fst_col_split[0], False))
                                    mark = re.findall(
                                        'REL_\\d+', fst_col_split[1])
                                    if mark:
                                        start_mark = mark[0]
                                if len(miltiline_events_cache):
                                    # We already have some sentences in the cache
                                    # Updating their status
                                    ee = [miltiline_events_cache[i:i + 2]
                                          for i in range(0, len(miltiline_events_cache), 2)]
                                    for e in ee:
                                        r = []
                                        for x in e:
                                            a, b = x
                                            if a not in r:
                                                r.append(a.strip())
                                        t = ' '.join(r)
                                        lines.append((t, is_event))
                                    miltiline_events_cache = []
                            else:
                                event_text = fst_col_split[0].strip()
                                # Append multi-event text if we already have something in the cache
                                if event_text and len(miltiline_events_cache):
                                    miltiline_events_cache.append(
                                        (event_text, next_column))
                                    is_all_non_event = False
                        elif not (row[0].startswith('REL_') or row == 'END'):
                            line = (row[0], next_column)
                            miltiline_events_cache.append(line)
                            is_all_non_event = False
                    # We reached the end
                    if first_column == start_mark:
                        if is_all_non_event and next_column == 'IS_NOT_EVENT':
                            lines += all_non_events
                        all_non_events = []
                count += 1
    return lines

In [344]:
def get_all_docs(annotated_texts):
    docs = []
    for i, (text, is_event) in enumerate(annotated_texts):
        try:
            doc = nlp(text)
            docs.append((doc, is_event))
        except Exception as e:
            print(f'Failed to create nlp from text that starts with: {text[:50]} {e}')
        if i % 500 == 0:
            print('-->', i)
    return docs
        

def parse_features(feats_string):
    res = {}
    feats = feats_string.split('|')
    for feat in feats:
        k, v = feat.split('=')
        res[k] = v
    return res


def find_dates(string, is_future=False):
    valid_months = ['січня', 'січ.','лютого','лют.','березня','берез.','квітня','квіт.',
                'травня','трав.','червня','черв.','липня','лип.','серпня','серп.',
               'вересня','верес.','жовтня','жовт.','листопада','листоп.','грудня','груд.']
    mon_regex_str = '|'.join(valid_months).replace('.', '\.')
    regex = '(\s\d{4}\s|(\d+ (' + mon_regex_str + ')(\s\d{4})?)|\d{2,4}-\d{2}-\d{2,4}|\d{2}.\d{2}.\d{2,4}|\d{2}\/\d{2}\/\d{2,4})'
    matches = re.findall(regex, string, re.IGNORECASE)
    dates = []
    for match in matches:
        date = match[0].strip()
        if len(date) == 4:
            curr_year = datetime.datetime.now().year
            # fragile thing, it may predict date if it's actually some other 4-digit stuff
            if int(date) <= curr_year or is_future:
                dates.append(date)
        else:
            dates.append(date)
    return dates


def get_doc_core_members(doc):
    res = []
    
    adv_final = ['вперше', 'нарешті', 'врешті', 'вчора', 'сьогодні', 'позавчора']
    
    
    def get_token_children(token, tree):
        return [x for x in tree if x.head == int(token.id)]
    
    def get_token_window(token_id, tokens):
        lefts, rights = [], []
        if token_id > 3:
            lefts.append(tokens[token_id - 4])
        if token_id > 2:
            lefts.append(tokens[token_id - 3])
        if token_id > 1:
            lefts.append(tokens[token_id - 2])
        neigb_right = token_id + 1
        while neigb_right < len(tokens) - token_id:
            rights.append(tokens[neigb_right])
            neigb_right += 1
        return lefts, rights
    

    def get_root_ccomp_verb(root_id, tree):
        for word in tree.words:
            if word.deprel == 'ccomp' and word.head == root.id:
                if word.upos == 'VERB':
                    return word
                for child in get_token_children(word, tree.words):
                    if child.upos == 'VERB':
                        return child

    for sent in doc.sentences:
        spo = {}
        pred = None
        subj = None
        obj = None
        
        num_words = len(sent.words)
        
        root = next((word
                     for word in sent.words if word.deprel == 'root'),
                    None)
        if not doc.text.strip() or num_words == 2 and sent.words[num_words - 1].upos == 'PUNCT' or root.upos == 'SYM':
            continue
        # FIXME: iterate only once
        if root:
            root_conj = next((word for word in sent.words if word.deprel ==
                              'conj' and word.head == int(root.id)), None)
            root_mod = next((word for word in sent.words if word.deprel ==
                              'advmod' and word.upos == 'PART' and word.head == int(root.id)), None)

            subj = next((word for word in sent.words if word.deprel ==
                         'nsubj' and word.head == int(root.id)), None)
            obj = next((word for word in sent.words if word.deprel ==
                        'obj' and word.head == int(root_conj.id if root_conj else root.id)),
                       None)
            c_conj = next((word
                     for word in sent.words if word.upos == 'CCONJ' and sent.words[int(word.id) - 2].upos == 'PUNCT'),
                    None)
            root_adv_final = next((word for word in sent.words if word.deprel ==
                        'advmod' and word.upos == 'ADV' and word.head == int(root.id) \
                and word.lemma.lower() in adv_final),
                       None)
            root_xcomp = next((word for word in sent.words if word.deprel ==
                              'xcomp' and word.head == int(root.id)), None)
            root_ccomp = get_root_ccomp_verb(int(root.id), sent)
            root_xcomp_noun = next((word for word in sent.words if word.deprel == 'xcomp:sp' \
                               and word.upos == 'NOUN' \
                              and word.head == int(root.id)),
                                    None)
            root_window = get_token_window(int(root.id), sent.words)
            

            spo['subj'] = subj
            spo['root'] = root
            spo['root-conj'] = root_conj
            spo['obj'] = obj
            spo['root_mod'] = root_mod
            spo['c_conj'] = c_conj
            spo['root_adv_final'] = root_adv_final
            spo['root_xcomp'] = root_xcomp
            spo['root_ccomp'] = root_ccomp
            spo['root_xcomp_noun'] = root_xcomp_noun
            spo['root_window'] = root_window
            spo['all_verbs'] = [x for x in sent.words if x.upos == 'VERB']
            if subj:
                subj_conj = next((word for word in sent.words if word.deprel ==
                             'conj' and (word.upos == 'NOUN' or word.upos == 'PRON') \
                                  and word.head == int(subj.id)), None)
                spo['subj-conj'] = subj_conj
                if subj_conj:
                    subj_conj_verb = next((word for word in sent.words if word.upos ==
                        'VERB' and word.head == int(subj_conj.id)),
                       None)
                    spo['subj-conj-verb'] = subj_conj_verb
                
                subj_window = get_token_window(int(subj.id), sent.words)
                spo['subj_window'] = subj_window
            if obj:
                obj_window = get_token_window(int(obj.id), sent.words)
                spo['obj_window'] = obj_window

        res.append((sent.text, spo, num_words))
    return res



def get_features(doc):
    features = []
    
    predicate_special = ['допустити', 'думати', 'припустити', 'відреагувати', 'пояснити',
                     'сказати', 'заявити', 'повідомити', 'повідомляти', 'розповісти',
                      'розповідати', 'рекомендувати', 'порекомендувати', 'мати', 'стати', 'почати',
                        'назвати']
    
    def _get_verbs_past(verbs):
        past = 0
        for verb in verbs:
            feats = parse_features(verb.feats)
            if feats.get('Tense') == 'Past':
                past += 1
        return past


    spos = get_doc_core_members(doc)
    for sent_text, spo, num_words in spos:
        feat = {}
        if spo:
            root = spo['root']
            root_conj = spo.get('root-conj')
            root_adv_final = spo.get('root_adv_final')
            root_xcomp = spo.get('root_xcomp')
            root_ccomp = spo.get('root_ccomp')
            root_xcomp_noun = spo.get('root_xcomp_noun')
            root_lefts, root_rights = spo['root_window']
            subj = spo.get('subj')
            subj_conj = spo.get('subj-conj')
            subj_window = spo.get('subj_window')
            obj_window = spo.get('obj_window')
            obj = spo.get('obj')
            all_verbs = spo['all_verbs']

            dates = find_dates(doc.text, True)

            if root.feats:
                pred_features = parse_features(root.feats)
            else:
                pred_features = {}

            pos_shape = root.upos
            if subj:
                pos_shape += f'_{subj.upos}'
            if obj:
                pos_shape += f'_{obj.upos}'
            
            feat['pos-shape'] = pos_shape
            feat['subj'] = 'SUBJ' if subj else 'NONE'
            feat['has-date'] = len(dates) > 0
            
            if len(root_lefts) - 1 > 0:
                feat['root-w-1'] = root_lefts[len(root_lefts) - 1].upos
            if len(root_lefts) - 2 > 0:
                feat['root-w-2'] = root_lefts[len(root_lefts) - 2].upos
            if len(root_lefts) - 3 > 0:
                feat['root-w-3'] = root_lefts[len(root_lefts) - 3].upos
            if len(root_rights) - 1 > 0:
                feat['root-w+1'] = root_rights[len(root_rights) - 1].upos
            if len(root_rights) - 2 > 0:
                feat['root-w+2'] = root_rights[len(root_rights) - 2].upos
            if len(root_rights) - 3 > 0:
                feat['root-w+3'] = root_rights[len(root_rights) - 3].upos
                
            if subj_window:
                subj_lefts, subj_rights = subj_window
                
                if len(subj_lefts) - 1 > 0:
                    feat['subj-w-1'] = subj_lefts[len(subj_lefts) - 1].upos
                if len(subj_lefts) - 2 > 0:
                    feat['subj-w-2'] = subj_lefts[len(subj_lefts) - 2].upos
                if len(subj_lefts) - 3 > 0:
                    feat['subj-w-3'] = subj_lefts[len(subj_lefts) - 3].upos
                if len(subj_rights) - 1 > 0:
                    feat['subj-w+1'] = subj_rights[len(subj_rights) - 1].upos
                if len(subj_rights) - 2 > 0:
                    feat['subj-w+2'] = subj_rights[len(subj_rights) - 2].upos
                if len(subj_rights) - 3 > 0:
                    feat['subj-w+3'] = subj_rights[len(subj_rights) - 3].upos
                    
            if obj_window:
                obj_lefts, obj_rights = obj_window
                
                if len(obj_lefts) - 1 > 0:
                    feat['obj-w-1'] = obj_lefts[len(obj_lefts) - 1].upos
                if len(obj_lefts) - 2 > 0:
                    feat['obj-w-2'] = obj_lefts[len(obj_lefts) - 2].upos
                if len(obj_lefts) - 3 > 0:
                    feat['obj-w-3'] = obj_lefts[len(obj_lefts) - 3].upos
                if len(obj_rights) - 1 > 0:
                    feat['obj-w+1'] = obj_rights[len(obj_rights) - 1].upos
                if len(obj_rights) - 2 > 0:
                    feat['obj-w+2'] = obj_rights[len(obj_rights) - 2].upos
                if len(obj_rights) - 3 > 0:
                    feat['obj-w+3'] = obj_rights[len(obj_rights) - 3].upos
            
                
            feat['is_question'] = sent_text.endswith('?')

            if pred_features.get('Tense') == 'Past':
                feat['root_xcomp'] = root_xcomp is not None
                if root_xcomp:
                    feat['root_xcomp_pos'] = root_xcomp.upos
                if root_ccomp:
                    root_ccomp_features = parse_features(root_ccomp.feats)
                    feat['root_ccomp_tense'] = root_ccomp_features.get('Tense') or 'NONE'
                    feat['root_ccomp_aspect'] = root_ccomp_features.get('Aspect') or 'NONE'
                    if root_ccomp_features.get('Tense') != 'Past':
                        feat['pred-special'] = root.lemma.lower() in predicate_special
            if root_conj:
                feat['root_conj_special'] = root_conj.lemma.lower() in predicate_special

            feat['all_verb_past'] = _get_verbs_past(all_verbs) == len(all_verbs)

            if subj:
                subj_features = parse_features(subj.feats)
                feat['subj-animacy'] = subj_features.get('Animacy') or 'NONE'
                feat['subj-pos'] = subj.upos
            else:
                feat['subj-animacy'] = 'NONE'
                feat['subj-pos'] = 'NONE'

            feat['obj'] = 'OBJ' if obj else 'NONE'

            if root.upos == 'VERB':
                feat['pred-tense'] = pred_features.get('Tense') or 'NONE'
                feat['pred-aspect'] = pred_features.get('Aspect') or 'NONE'
            if root.upos == 'NOUN' or root.upos == 'PROPN':
                feat['pred-anim'] = pred_features.get('Animacy') or 'NONE'
                feat['pred-abbr'] = pred_features.get('Abbr') or 'NONE'
            
            features.append(feat)
    return features



def get_data(docs):
    def _get_spo_shape(s, p, o):
        ids = [p.id]
        if s:
            ids.append(s.id)
        if o:
            ids.append(o.id)
        indexes = [str(y) for x, y in sorted([(x, i) for i, x in enumerate(ids)])]
        
        return '_'.join(indexes)

    features, labels = [], []

    for doc, is_event in docs:
        feats = get_features(doc)
        for feat in feats:
            features.append(feat)
            labels.append(is_event if feat else False)

    return features, labels

In [8]:
# source_articles_path = '../../data/articles/source_normalized'
# annotation_source_path = '../../data/articles/for_annotation'
# make_annotattion_sources(source_articles_path, annotation_source_path)

In [282]:
annotated_articles_path = '../../data/articles/annotated'
annotated_parsed = parse_annotated_articles(annotated_articles_path)
annotated_parsed = list(set(annotated_parsed))
annotated_parsed = [(trim_indirect_speech(x), y) for x, y in annotated_parsed]
truish = [(x, y) for x, y in annotated_parsed if y]
falsish = [(x, y) for x, y in annotated_parsed if not y]
print('All:', len(annotated_parsed))
print('Is event:', len(truish))
print('Is not event:', len(falsish))

All: 3189
Is event: 1767
Is not event: 1422


In [285]:
# with open('../../data/articles/annotated_parsed.json', 'w') as f:
#     json.dump(annotated_parsed, f, ensure_ascii=False)

In [286]:
all_docs = get_all_docs(annotated_parsed)

--> 0
--> 500
--> 1000
--> 1500
--> 2000
--> 2500
Failed to create nlp from text that starts with: Батькам нікуди дітей дівати. Кернес не ввів карант 
--> 3000


In [9]:
clf = get_classifier()

In [345]:
X, y = get_data(all_docs)

In [346]:
data_train, data_test, target_train, target_test = train_test_split(X, y, test_size=0.3,
                                                                    random_state=42, shuffle = True, stratify = y)

In [347]:
clf.fit(data_train, target_train)



Pipeline(memory=None,
         steps=[('dict_vect',
                 DictVectorizer(dtype=<class 'numpy.float64'>, separator='=',
                                sort=True, sparse=True)),
                ('lrc',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=800,
                                    multi_class='multinomial', n_jobs=-1,
                                    penalty='l2', random_state=42, solver='sag',
                                    tol=0.0001, verbose=0, warm_start=False))],
         verbose=False)

In [348]:
print(classification_report(target_test, clf.predict(data_test)))

              precision    recall  f1-score   support

       False       0.72      0.68      0.70       458
        True       0.75      0.79      0.77       561

    accuracy                           0.74      1019
   macro avg       0.74      0.73      0.73      1019
weighted avg       0.74      0.74      0.74      1019



In [363]:
clfr = clf.get_params()['lrc']
vec = clf.get_params()['dict_vect']
feature_names = get_feature_names(clfr, vec)

eli5.explain_weights(clfr, top=30, feature_names=feature_names)

Weight?,Feature
+0.981,obj-w+1=PRON
+0.817,pos-shape=ADJ_PROPN
+0.588,obj-w+2=DET
+0.574,root-w+1=X
+0.538,obj-w-2=NUM
+0.525,subj-w+1=X
+0.525,subj-w-1=PROPN
+0.514,pred-tense=NONE
+0.511,obj-w+3=X
+0.480,pos-shape=VERB_ADJ_X


In [354]:
dump(clf, './classifier.joblib')

['./classifier.joblib']

In [None]:
def is_contain_string(string, sub):
    overlap_len = int(len(string) * 0.8)
    if len(sub) < overlap_len:
        return False
    return string.find(sub) == 0


def is_term_in_title(title_doc, term_doc):
    for sent in title_doc.sentences:
        for word in sent.words:
            for term_word in term_doc.sentences[0].words:
                term = term_word.lemma.lower()
                w = word.lemma.lower()
                is_in = term == w or is_contain_string(w, term) or is_contain_string(term, w)
                if is_in:
                    return True
    return False


def is_root_in_past(doc):
    past = 0
    for sent in doc.sentences:
        root = next((word for word in sent.words if word.deprel == 'root'), None)
        root_feats = parse_features(root.feats)
        if root_feats.get('Tense') == 'Past':
            past += 1
    return past == len(doc.sentences)


def predict_is_event(title, snippet, term, clf):
    title = trim_indirect_speech(title)
    if snippet:
        snippet = trim_indirect_speech(snippet)
    title_lang = langdetect.detect(title)
    snippet_lang = langdetect.detect(snippet) if snippet else title_lang

    if title_lang != 'uk' and snippet_lang != 'uk':
        return False
    title_doc = nlp(title)
    is_root_past = is_root_in_past(title_doc)
    if snippet:
        snippet_doc = nlp(snippet)
        if is_root_past:
            is_root_past = is_root_in_past(title_doc)
    if not is_root_past:
        return False
    term_doc = nlp(term)

    is_term_in = is_term_in_title(title_doc, term_doc)
    if snippet:
        is_term_in = is_term_in or is_term_in_title(snippet_doc, term_doc)
    if not is_term_in:
        return False
    title_features = get_features(title_doc)
    is_title_ev = clf.predict(title_features)[0]
    if snippet:
        snippet_features = get_features(snippet_doc)
        is_snippet_ev = clf.predict(snippet_features)[0]

    is_ev = is_title_ev

    if snippet:
        is_ev = is_title_ev or is_snippet_ev

    return is_ev