In [1315]:
import json
import os
import re
import csv
from sklearn.metrics import classification_report
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import stanza
from tokenize_uk import tokenize_uk
import requests
from html.parser import HTMLParser
from bs4 import BeautifulSoup

In [1293]:
# Load the article corpora. The encoding is passed explicitly because the
# articles are Ukrainian-language text and the platform default encoding is
# not guaranteed to be UTF-8.
with open('../../data/articles/test/zelen.json', encoding='utf-8') as f:
    test_data = json.load(f)
with open('../../data/articles/train/train_it_2.json', encoding='utf-8') as f:
    train_data = json.load(f)

In [None]:
# Shared Ukrainian stanza pipeline used by the SPO-extraction helpers below.
nlp = stanza.Pipeline(lang='uk', processors='tokenize,pos,lemma,depparse')

In [1331]:
def get_classifier():
    """Return an unfitted pipeline: ``DictVectorizer`` followed by ``LogisticRegression``.

    The step names are ``'dict_vect'`` and ``'lrc'``.
    """
    vectorizer = DictVectorizer()
    model = LogisticRegression(
        random_state=42,
        multi_class='multinomial',
        max_iter=100,
        solver='sag',
        n_jobs=-1,
    )
    steps = [('dict_vect', vectorizer), ('lrc', model)]
    return Pipeline(steps)

In [1357]:
def get_spo_old(search_obj):
    """Unpack the first ``{raw_text: spo}`` entry of ``search_obj``.

    Returns ``(raw_text, subj, pred, obj)``. Truthy SPO parts are lower-cased;
    a falsy ``spo`` yields three ``None`` values.
    """
    raw_text, spo = list(search_obj.items())[0]
    if not spo:
        return raw_text, None, None, None
    lowered = [part.lower() if part else part for part in spo]
    subj, pred, obj = lowered
    return raw_text, subj, pred, obj


def get_subj_pred_obj_from_text(text):
    """Extract a (subject, predicate, object) lemma triple for each sentence of ``text``.

    ``text`` is run through the module-level ``nlp`` pipeline. For every
    sentence the predicate is the first word whose ``deprel`` is ``'root'`` and
    whose ``upos`` is ``'VERB'``; the subject / object are the first words
    attached to that predicate with ``deprel`` ``'nsubj'`` / ``'obj'``.

    Returns:
        A list with one ``(sentence_text, spo)`` tuple per sentence, where
        ``spo`` is ``(subj_lemma_or_None, pred_lemma, obj_lemma_or_None)`` or
        ``None`` when the sentence has no verbal root. If processing fails the
        error is reported on stdout and whatever was collected so far (possibly
        an empty list) is returned.
    """
    res = []
    try:
        doc = nlp(text)
        for sent in doc.sentences:
            root = next((word for word in sent.words
                         if word.deprel == 'root' and word.upos == 'VERB'), None)
            if root is None:
                res.append((sent.text, None))
                continue

            # Dependents reference their head by id; normalise it to int once.
            root_id = int(root.id)
            subj = next((word.lemma for word in sent.words
                         if word.deprel == 'nsubj' and word.head == root_id), None)
            obj = next((word.lemma for word in sent.words
                        if word.deprel == 'obj' and word.head == root_id), None)
            res.append((sent.text, (subj, root.lemma, obj)))
    except Exception as e:
        # Best-effort by design: report and carry on. The preview goes through
        # str() so that a non-string input cannot raise again inside the handler,
        # and the cause is included so failures can be diagnosed.
        preview = str(text)[:50]
        print(f'Failed to create nlp from text that starts with: {preview} '
              f'({type(e).__name__}: {e})')
    return res


def get_spo(text):
    """Return ``(raw_text, subj, pred, obj)`` for the first parsed sentence of ``text``.

    Falls back to ``(text, None, None, None)`` when nothing was parsed; truthy
    SPO parts are lower-cased.
    """
    parsed = get_subj_pred_obj_from_text(text)
    # TODO: handle multiple sentences; perhaps take the one containing most from SPO
    if parsed:
        raw_text, spo = parsed[0]
    else:
        raw_text, spo = text, None

    if not spo:
        return raw_text, None, None, None

    lowered = [part.lower() if part else part for part in spo]
    subj, pred, obj = lowered
    return raw_text, subj, pred, obj


def search_by_token(token, article):
    """Check ``token`` against the subject/object of the article title, then of each content sentence.

    Returns the first truthy result of ``get_is_match``; otherwise the last
    (falsy) result obtained.
    """
    _, title_subj, _, title_obj = get_spo(article['title'][0])
    match = get_is_match(token, [title_subj, title_obj])
    if match:
        return match

    for sentence in article['content']:
        _, sent_subj, _, sent_obj = get_spo(sentence)
        match = get_is_match(token, [sent_subj, sent_obj])
        if match:
            return match
    return match


def search_relevant_articles(search_term, corpus):
    """Return the articles of ``corpus`` that match ``search_term``.

    The search term is tokenised with the module-level ``nlp`` pipeline (first
    sentence only). An article matches when ``search_by_token`` succeeds for at
    least one of the tokens. Previously the per-token result was overwritten on
    every iteration, so only the last token of a multi-word term counted.

    Args:
        search_term: query string.
        corpus: iterable of article dicts with ``'url'``, ``'date'``,
            ``'title'`` and ``'content'`` keys.

    Returns:
        A list of ``{'url', 'date', 'title'}`` dicts, one per matching article,
        where ``'title'`` is the text returned by ``get_spo`` for the title.
    """
    sentences = nlp(search_term).sentences
    if not sentences:
        # Nothing to search for (e.g. an empty query).
        return []
    search_tokens = sentences[0].words

    res = []
    for article in corpus:
        if not any(search_by_token(token, article) for token in search_tokens):
            continue
        # Parse the title only for matching articles; the parse is the expensive part.
        title, _, _, _ = get_spo(article['title'][0])
        res.append(
            {'url': article['url'], 'date': article['date'], 'title': title})
    return res

In [1359]:
# def get_data_old(corpus):
#     features, labels = [], []
    
#     for article in corpus:
#         feat = {}
#         title_obj = article['title'][0]
#         title, t_subj, _, t_obj = get_spo(title_obj)
#         feat['title'] = title
#         # TODO:
#         # 1) separate feature extractors
#         # 2) check Mariana's suggestions in PR
#         # 3) annotate more data
#         feat['t_subj'] = t_subj or 'NONE'
#         features.append(feat)
#         labels.append(article['relevant'])
        
#     return features, labels


def get_data(titles):
    """Turn ``(title, is_event)`` pairs into feature dicts and a parallel label list.

    Each feature dict holds the text returned by ``get_spo`` under ``'title'``
    and the extracted subject (or ``'NONE'``) under ``'t_subj'``. Progress is
    printed every 200 items.
    """
    features = []
    labels = []

    for index, (raw_title, is_event) in enumerate(titles):
        # 1) separate feature extractors
        # 2) check Mariana's suggestions in PR
        sent_text, subject, _, _ = get_spo(raw_title)
        features.append({'title': sent_text, 't_subj': subject or 'NONE'})
        labels.append(is_event)

        if index % 200 == 0:
            print('-->', index)

    return features, labels

In [1256]:
# def get_subj_pred_obj(corpus, res_file):
#     current_content = []
#     with open(res_file, 'w', encoding='utf-8') as f:
#         json.dump(current_content, f)

#     for i, art in enumerate(corpus):
#         title_with_spo = get_subj_pred_obj_text(art['title'])
#         content_with_spo = get_subj_pred_obj_text(art['content'])
#         res = {
#             'url': art['url'],
#             'date': art['date'],
#             'title': title_with_spo,
#             'content': content_with_spo,
#             'relevant': art['relevant']
#         }
#         with open(res_file, 'r', encoding='utf-8') as f:
#             current_content = json.load(f)
#         current_content.append(res)

#         with open(res_file, 'w', encoding='utf-8') as f:
#             json.dump(current_content, f, ensure_ascii=False)
#         print('>>>', i)
#     return current_content

In [None]:
# get_subj_pred_obj(test_data, '../../data/articles/train/train_it_2.json')

In [26]:
# X, y = get_data(train_data)

In [15]:
# print(X[:3])
# print(y[:3])

[{'title': 'Опублікований фільм Рік президента Зеленського'}, {'title': 'У Зеленського "не дійшли руки" до всіх обіцянок'}, {'title': 'Зеленський анонсував скорочення правоохоронних органів'}]
[True, False, False]


In [603]:
def make_annotattion_sources(source_path, target_path):
    """Write plain-text annotation batches from the JSON article files in ``source_path``.

    Every file in ``source_path`` is loaded as a JSON list of articles (dicts
    with ``'url'``, ``'title'`` and ``'content'``). Each article is appended to
    ``to_annotate_<k>.txt`` in ``target_path`` as: the URL, a ``===`` line, the
    title tagged ``REL_<n>``, a ``===`` line, one line per sentence tagged
    ``REL_<n>_<i>``, a closing ``REL_<n>`` line and ``END``.

    ``<n>`` runs from 0 to 99 within one output file; after 100 articles the
    next output file is started. The counter is kept across source files so the
    ``REL_<n>`` tags stay unique inside an output file (it used to restart at 0
    for every source file).

    Note: output files are opened in append mode, so re-running adds to any
    existing ``to_annotate_*.txt`` files.
    """
    file_count = 0
    art_count = 0
    # Use the ``source_path`` argument (not a notebook global) and a sorted
    # listing so the batches are reproducible.
    for file_name in sorted(os.listdir(source_path)):
        with open(os.path.join(source_path, file_name), encoding='utf-8') as src:
            articles = json.load(src)

        for article in articles:
            res = [
                article['url'] + '\n',
                '===\n',
                f'{article["title"]}\t|\tREL_{art_count}\n',
                '===\n',
            ]

            sents = tokenize_uk.tokenize_sents(article['content'])
            res.extend(f'{sent}\t|\tREL_{art_count}_{i}\n'
                       for i, sent in enumerate(sents))
            res.append(f'\nREL_{art_count}\n')
            res.append('END\n\n')

            target_file = os.path.join(target_path, f'to_annotate_{file_count}.txt')
            with open(target_file, 'a', encoding='utf-8') as dst:
                dst.writelines(res)

            # 100 articles per output file.
            if art_count == 99:
                file_count += 1
                art_count = 0
            else:
                art_count += 1

In [604]:
# Generate the to-annotate text batches from the normalized source articles.
source_articles_path = '../../data/articles/source_normalized'
annotation_source_path = '../../data/articles/for_annotation'
make_annotattion_sources(source_path=source_articles_path,
                         target_path=annotation_source_path)

In [1372]:
def parse_annotated_articles(path):
    """Parse the tab-separated annotation files in ``path`` into ``(text, label)`` tuples.

    Each non-empty row is read as ``(first_column, next_column)``:

    * Rows whose first column is ``'==='`` carry no content.
    * Rows labelled ``'O'`` whose first column contains ``' | '`` are sentence
      rows. If the part after ``' | '`` is empty, the following row is consumed
      as the annotation mark and the label is ``mark_row[1] == 'IS_EVENT'``.
      Otherwise the part after ``' | '`` is searched for a ``REL_<n>`` marker,
      which becomes the article's start mark.
    * Rows with any other label (not starting with ``REL_`` and not ``END``)
      are cached as fragments of a multi-line event; the cache is joined in
      pairs and flushed when the next sentence row is processed.
    * When a row's first column equals the start mark, the collected
      "all non-event" sentences are added if the row is labelled
      ``'IS_NOT_EVENT'``, then the collection is reset.

    Files are read from ``path`` itself (the argument, not a notebook global),
    in sorted order; sub-directories are skipped.

    Returns:
        list of ``(text, label)`` tuples in reading order.
    """
    lines = []
    for file in sorted(os.listdir(path)):
        file_path = os.path.join(path, file)
        if not os.path.isfile(file_path):
            # e.g. notebook checkpoint folders
            continue
        with open(file_path, encoding='utf-8', newline='') as f:
            reader = csv.reader(f, dialect='excel-tab')
            # Per-file parser state.
            miltiline_events_cache = []
            is_all_non_event = None
            all_non_events = []
            start_mark = None

            for row in reader:
                if not row:
                    continue
                first_column = row[0]
                next_column = row[1]

                if first_column != '===':

                    if next_column == 'O':
                        fst_col_split = first_column.split(' | ')

                        # The line is a sentence row (text followed by ' | ').
                        if len(fst_col_split) > 1:
                            # The line was annotated, annotation is set on the next line
                            if not fst_col_split[1]:
                                mark_row = next(reader)
                                is_event = mark_row[1] == 'IS_EVENT'
                                if not start_mark:
                                    start_mark = mark_row[0]
                                if is_all_non_event is not None:
                                    is_all_non_event = not is_event
                                line = (fst_col_split[0], is_event)
                                if not len(miltiline_events_cache):
                                    lines.append(line)
                                    if is_all_non_event:
                                        all_non_events.append(line)
                                else:
                                    miltiline_events_cache.append(line)
                                    is_all_non_event = False
                            # Not annotated
                            else:
                                if is_all_non_event:
                                    all_non_events.append((fst_col_split[0], False))
                                mark = re.findall(r'REL_\d+', fst_col_split[1])
                                if mark:
                                    start_mark = mark[0]
                            if len(miltiline_events_cache):
                                # We already have some sentences in the cache
                                # Updating their status
                                # NOTE(review): this uses the most recently read
                                # `is_event`; it is unbound if no annotated row
                                # has been seen yet - confirm that cannot happen.
                                pairs = [miltiline_events_cache[k:k + 2]
                                         for k in range(0, len(miltiline_events_cache), 2)]
                                for pair in pairs:
                                    texts = []
                                    for text, _ in pair:
                                        if text not in texts:
                                            texts.append(text.strip())
                                    lines.append((' '.join(texts), is_event))
                                miltiline_events_cache = []
                        else:
                            event_text = fst_col_split[0].strip()
                            # Append multi-event text if we already have something in the cache
                            if event_text and len(miltiline_events_cache):
                                miltiline_events_cache.append((event_text, next_column))
                                is_all_non_event = False
                    # Compare the first cell (not the whole row list) with 'END'.
                    elif not (row[0].startswith('REL_') or row[0] == 'END'):
                        miltiline_events_cache.append((row[0], next_column))
                        is_all_non_event = False
                # We reached the end
                if first_column == start_mark:
                    if is_all_non_event and next_column == 'IS_NOT_EVENT':
                        lines += all_non_events
                    all_non_events = []
    return lines

In [1375]:
# Parse the annotated files and report the total / positive / negative counts.
annotated_articles_path = '../../data/articles/annotated'
annotated_parsed = parse_annotated_articles(annotated_articles_path)
truish = [pair for pair in annotated_parsed if pair[1]]
falsish = [pair for pair in annotated_parsed if not pair[1]]
print(len(annotated_parsed), len(truish), len(falsish), sep='\n')

1412
587
825


In [1366]:
# Build feature dicts and labels from the parsed annotations.
# `annotated_labeled` is not defined in any visible cell (stale kernel state);
# the recorded run sizes (1412 rows, 424-row test split) match `annotated_parsed`.
X, y = get_data(annotated_parsed)

--> 0
Failed to create nlp from text that starts with: Батькам нікуди дітей дівати. Кернес не ввів карант
--> 200
--> 400
--> 600
--> 800
--> 1000
--> 1200
--> 1400


In [1371]:
# NOTE(review): `data_test` is created by the train_test_split cell further down,
# so on a fresh kernel this cell only works after that cell has been run.
print(len(data_test))

424


In [1367]:
# 70/30 stratified split with a fixed seed.
data_train, data_test, target_train, target_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    shuffle=True,
    stratify=y,
)

In [1338]:
# Fresh, unfitted DictVectorizer + LogisticRegression pipeline (see get_classifier).
clf = get_classifier()

In [1368]:
# Fit the pipeline on the training split; the fitted pipeline is the cell's displayed value.
clf.fit(data_train, target_train)

Pipeline(memory=None,
         steps=[('dict_vect',
                 DictVectorizer(dtype=<class 'numpy.float64'>, separator='=',
                                sort=True, sparse=True)),
                ('lrc',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='multinomial', n_jobs=-1,
                                    penalty='l2', random_state=42, solver='sag',
                                    tol=0.0001, verbose=0, warm_start=False))],
         verbose=False)

In [1369]:
# Evaluate on the held-out split.
test_predictions = clf.predict(data_test)
print(classification_report(target_test, test_predictions))

              precision    recall  f1-score   support

       False       0.68      0.95      0.79       248
        True       0.84      0.36      0.51       176

    accuracy                           0.71       424
   macro avg       0.76      0.66      0.65       424
weighted avg       0.75      0.71      0.67       424



In [601]:
# TODO: fix glued sentences:
# Григорій СуркісСуд зобов'язав ПриватБанк виплатити офшорним компаніям Суркісів депозити
# на $250 млн Апеляційний суд Києва відхилив скаргу юристів державного ПриватБанку на рішення
# Печерського райсуду про фактичне стягнення з банку $250 млн, які зберігалися на депозитах шести компаній,
#пов'язаних із бізнесменами Григорієм та Ігорем Суркісами в кіпрській філії ПриватБанку до націоналізації. | REL_87_0

In [1250]:
# algorithm for parsing annotated data
# 1) if the whole article is marked as IS_NOT_EVENT and there are no other marks inside - mark every sentence as a non-event
# 2) if a block contains more than one sentence - split it into multiple sentences by the annotated split object (DATE annotation at the sentence beginning)

In [1313]:
# with open('../../data/articles/annotated/parsed.json', 'w') as f:
#     json.dump(annotated_parsed, f, ensure_ascii=False)

In [1353]:
# Scratch check of the "first parsed item or fallback" unpacking.
a = [('Зеленський просить Мінфін почати переговори з МВФ про допомогу через коронавірус', ('Зеленський', 'просити', 'мінфін'))]
if a:
    b, c = a[0]
else:
    b, c = 'fdfdf', None
print(b, c)

Зеленський просить Мінфін почати переговори з МВФ про допомогу через коронавірус ('Зеленський', 'просити', 'мінфін')
