In [1315]:
import json
import os
import re
import csv
from sklearn.metrics import classification_report
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import stanza
from tokenize_uk import tokenize_uk
import requests
from bs4 import BeautifulSoup

In [1293]:
# Load the test and train article JSON files.
# The encoding is given explicitly so that reading does not depend on the
# platform's locale default.
with open('../../data/articles/test/zelen.json', encoding='utf-8') as f:
    test_data = json.load(f)
with open('../../data/articles/train/train_it_2.json', encoding='utf-8') as f:
    train_data = json.load(f)

In [None]:
# Stanza pipeline for the 'uk' language code with tokenization, POS tagging,
# lemmatization and dependency parsing. The module-level name `nlp` is what
# get_subj_pred_obj_from_text() calls.
nlp = stanza.Pipeline('uk', processors='tokenize,pos,lemma,depparse')

In [1331]:
def get_classifier():
    """Build a new, unfitted classification pipeline.

    The pipeline has two steps: ``'dict_vect'`` (a ``DictVectorizer`` that
    turns feature dicts into vectors) followed by ``'lrc'`` (a
    ``LogisticRegression`` configured below).
    """
    vectorizer = DictVectorizer()
    regression = LogisticRegression(
        random_state=42,
        multi_class='multinomial',
        max_iter=100,
        solver='sag',
        n_jobs=-1,
    )
    steps = [('dict_vect', vectorizer), ('lrc', regression)]
    return Pipeline(steps)

In [603]:
def make_annotattion_sources(source_path, target_path):
    """Convert JSON article dumps into text files prepared for annotation.

    Every file in ``source_path`` is loaded as a JSON list of articles, each a
    dict with ``url``, ``title`` and ``content`` keys. For each article the
    following lines are appended to ``to_annotate_<file_count>.txt`` inside
    ``target_path``: the URL, a ``===`` separator, the title, another ``===``
    separator, one line per sentence of the content, a closing ``REL_<n>``
    line and ``END``. The title and every sentence line end with a
    tab-pipe-tab separator followed by a ``REL_...`` placeholder.

    Output files are opened in append mode, so running the function again adds
    to files that already exist. After the article numbered 99 is written the
    output file index is advanced and numbering restarts at 0; numbering also
    restarts at 0 for every source file.

    Args:
        source_path: directory containing the JSON article files.
        target_path: directory that receives the ``to_annotate_*.txt`` files.
    """
    file_count = 0
    # List the directory that was passed in, not a notebook-level global.
    for file in os.listdir(source_path):
        # Read the whole dump first so the source handle is closed before
        # any output file is opened.
        with open(os.path.join(source_path, file), encoding='utf-8') as source_file:
            cont = json.load(source_file)

        art_count = 0
        for article in cont:
            res = [
                article['url'] + '\n',
                '===\n',
                f'{article["title"]}\t|\tREL_{art_count}\n',
                '===\n',
            ]

            sents = tokenize_uk.tokenize_sents(article['content'])
            for sent_index, sent in enumerate(sents):
                res.append(f'{sent}\t|\tREL_{art_count}_{sent_index}\n')
            res.append(f'\nREL_{art_count}\n')
            res.append('END\n\n')

            target_file = os.path.join(
                target_path, f'to_annotate_{file_count}.txt')
            with open(target_file, 'a', encoding='utf-8') as out:
                out.writelines(res)

            # Start a new output file after 100 articles.
            if art_count == 99:
                file_count += 1
                art_count = 0
            else:
                art_count += 1

In [1611]:
''' Parse annotated data

Algorithm for parsing annotated data:
1) if the whole article is marked as IS_NOT_EVENT and there are no other marks inside,
mark every sentence as a non-event
2) if a block contains more than one sentence, split it into multiple sentences
by the annotated split object (a DATE annotation at the sentence beginning)
'''


def parse_annotated_articles(annotated_articles_path):
    """Collect ``(text, label)`` pairs from every annotated file in a directory.

    Each file is read with ``csv.reader`` using the ``excel-tab`` dialect.
    Empty rows are skipped; every other row is expected to have at least two
    columns, because ``row[1]`` is read unconditionally.

    Args:
        annotated_articles_path: directory whose files are all parsed.

    Returns:
        A list of tuples accumulated over all files. The first element is the
        sentence text (or several cached texts joined with a space); the
        second is the boolean ``mark_row[1] == 'IS_EVENT'`` of the most
        recently read annotation row, or ``False`` for lines collected as
        non-events.
    """
    files = os.listdir(annotated_articles_path)
    lines = []
    # NOTE(review): raw_lines is never used after this assignment.
    raw_lines = []
    for file in files:
        with open(os.path.join(annotated_articles_path, file)) as f:
            reader = csv.reader(f, dialect='excel-tab')
            # NOTE(review): count is incremented for every row but never read.
            count = 0
            # Rows waiting to be merged into multi-sentence entries.
            miltiline_events_cache = []
            # Tri-state flag: None until some branch below assigns it.
            is_all_non_event = None
            # Candidate non-event lines, flushed into `lines` at the end mark.
            all_non_events = []
            # First-column value that identifies the closing row of the article.
            start_mark = None

            for i, row in enumerate(reader):
                if row:
                    first_column = row[0]
                    next_column = row[1]

                    # '===' separator rows carry no text to parse.
                    if first_column != '===':

                        if next_column == 'O':
                            fst_col_split = first_column.split(' | ')

                            # The first column contains the ' | ' separator:
                            # the line was one that required annotation
                            if len(fst_col_split) > 1:
                                # The line was annotated, annotation is set on the next line
                                if not fst_col_split[1]:
                                    # Consume the following row as the annotation row.
                                    mark_row = reader.__next__()
                                    is_event = mark_row[1] == 'IS_EVENT'
                                    if not start_mark:
                                        start_mark = mark_row[0]
                                    # NOTE(review): the flag starts as None, so this
                                    # update is skipped until another branch has set
                                    # it to False - confirm this is intended.
                                    if is_all_non_event is not None:
                                        is_all_non_event = not is_event
                                    line = (fst_col_split[0], is_event)
                                    if not len(miltiline_events_cache):
                                        lines.append(line)
                                        if is_all_non_event:
                                            all_non_events.append(line)
                                    else:
                                        miltiline_events_cache.append(line)
                                        is_all_non_event = False
                                # Not annotated
                                else:
                                    if is_all_non_event:
                                        all_non_events.append(
                                            (fst_col_split[0], False))
                                    # Remember the REL_<n> marker left after the separator.
                                    mark = re.findall(
                                        'REL_\\d+', fst_col_split[1])
                                    if mark:
                                        start_mark = mark[0]
                                if len(miltiline_events_cache):
                                    # We already have some sentences in the cache
                                    # Updating their status
                                    # Split the cache into consecutive chunks of two entries.
                                    ee = [miltiline_events_cache[i:i + 2]
                                          for i in range(0, len(miltiline_events_cache), 2)]
                                    for e in ee:
                                        r = []
                                        for x in e:
                                            a, b = x
                                            # NOTE(review): membership is tested on the raw
                                            # text while the stripped text is stored.
                                            if a not in r:
                                                r.append(a.strip())
                                        t = ' '.join(r)
                                        # NOTE(review): is_event is whatever the last
                                        # annotation row set; it is unbound if no
                                        # annotation row has been read yet.
                                        lines.append((t, is_event))
                                    miltiline_events_cache = []
                            else:
                                event_text = fst_col_split[0].strip()
                                # Append multi-event text if we already have something in the cache
                                if event_text and len(miltiline_events_cache):
                                    miltiline_events_cache.append(
                                        (event_text, next_column))
                                    is_all_non_event = False
                        # Second column is not 'O': cache the row with its label.
                        # NOTE(review): `row` is a list, so `row == 'END'` is always
                        # False; presumably row[0] was meant - confirm.
                        elif not (row[0].startswith('REL_') or row == 'END'):
                            line = (row[0], next_column)
                            miltiline_events_cache.append(line)
                            is_all_non_event = False
                    # We reached the end: this row's first column equals the start mark
                    if first_column == start_mark:
                        if is_all_non_event and next_column == 'IS_NOT_EVENT':
                            lines += all_non_events
                        all_non_events = []
                count += 1
    return lines

In [1612]:
def parse_features(feats_string):
    """Parse a ``'Key=Value|Key=Value'`` feature string into a dict.

    Args:
        feats_string: features joined with ``|``, each of the form
            ``name=value``. ``None`` and the empty string are accepted and
            give an empty dict (callers pass a word's ``feats`` attribute,
            which may be unset - TODO confirm against the parser's output).

    Returns:
        A dict mapping feature names to values. Entries without ``=`` are
        skipped; only the first ``=`` of an entry separates name from value.
    """
    res = {}
    if not feats_string:
        return res
    for feat in feats_string.split('|'):
        # partition() never raises, unlike unpacking the result of split('=').
        key, separator, value = feat.partition('=')
        if separator:
            res[key] = value
    return res


def get_subj_pred_obj_from_text(text):
    """Extract a subject / predicate / object summary for each sentence.

    ``text`` is run through the module-level ``nlp`` pipeline. The result is a
    list with one ``(sentence_text, spo)`` tuple per sentence. ``spo`` is an
    empty dict when the sentence has no word that is both the dependency
    ``root`` and a ``VERB``; otherwise it holds:

    * ``'subj'``      - lemma of the root's ``nsubj`` dependent, or None
    * ``'root'``      - the root word object
    * ``'root-conj'`` - the first ``conj`` dependent of the root, or None
    * ``'obj'``       - lemma of the ``obj`` dependent of the root's conjunct
      when there is one, otherwise of the root; None if absent
    * ``'subj-conj'`` - lemma of the subject's ``conj`` dependent or None;
      the key exists only when a subject was found

    Any exception is reported with ``print`` and the entries collected so far
    are returned.
    """
    def first_dependent(words, relation, head_word):
        # First word attached to head_word through the given relation.
        for candidate in words:
            if candidate.deprel == relation and candidate.head == int(head_word.id):
                return candidate
        return None

    collected = []
    try:
        for sentence in nlp(text).sentences:
            words = sentence.words
            summary = {}

            # Only a verbal root counts as a predicate.
            verb_root = None
            for candidate in words:
                if candidate.deprel == 'root' and candidate.upos == 'VERB':
                    verb_root = candidate
                    break

            if verb_root:
                root_conj = first_dependent(words, 'conj', verb_root)
                subject = first_dependent(words, 'nsubj', verb_root)
                # The object is looked up on the conjunct when one exists.
                object_head = root_conj if root_conj else verb_root
                direct_object = first_dependent(words, 'obj', object_head)

                summary['subj'] = subject.lemma if subject else None
                summary['root'] = verb_root
                summary['root-conj'] = root_conj
                summary['obj'] = (
                    direct_object.lemma if direct_object is not None else None)
                if subject:
                    subject_conj = first_dependent(words, 'conj', subject)
                    summary['subj-conj'] = (
                        subject_conj.lemma if subject_conj is not None else None)

            collected.append((sentence.text, summary))
    except Exception as e:
        print(
            f'Failed to create nlp from text that starts with: {text[:50]} {e}')
    return collected


def get_spo(text):
    """Return ``(sentence_text, spo)`` for the first sentence of ``text``.

    When ``get_subj_pred_obj_from_text`` yields nothing, the original text is
    returned together with an empty dict.
    """
    spos = get_subj_pred_obj_from_text(text)
    if not spos:
        return text, {}
    # TODO: handle multiple sentences; perhaps take the one containing most from SPO
    first_text, first_spo = spos[0]
    return first_text, first_spo


def get_data(titles):
    """Turn ``(text, label)`` pairs into feature dicts and a parallel label list.

    For every pair, ``get_spo`` is called on the text. When it returns a
    non-empty summary the feature dict gets:

    * ``'subj'``            - ``<subj>_<subj-conj>`` when a subject conjunct
      exists, otherwise the subject or ``'NONE'``
    * ``'pred'``            - root lemma, suffixed with ``_<conj lemma>`` when
      the root has a conjunct
    * ``'pred-conj-tense'`` - ``Tense`` of the root conjunct or ``'NONE'``
      (only when a conjunct exists)
    * ``'obj'``             - object or ``'NONE'``
    * ``'pred-tense'``      - ``Tense`` of the root or ``'NONE'``

    When the summary is empty the feature dict stays empty. Progress is
    printed every 500 items.

    Returns:
        ``(features, labels)`` - two lists of equal length.
    """
    features = []
    labels = []

    for index, (text, label) in enumerate(titles):
        _, spo = get_spo(text)
        row = {}

        if spo:
            root = spo['root']
            root_conj = spo.get('root-conj')
            root_feats = parse_features(root.feats)
            subj_conj = spo.get('subj-conj')

            if subj_conj:
                row['subj'] = f'{spo["subj"]}_{subj_conj}'
            else:
                row['subj'] = spo.get('subj') or 'NONE'

            if root_conj:
                row['pred'] = f'{root.lemma}_{root_conj.lemma}'
                conj_feats = parse_features(root_conj.feats)
                row['pred-conj-tense'] = conj_feats.get('Tense') or 'NONE'
            else:
                row['pred'] = root.lemma

            row['obj'] = spo.get('obj') or 'NONE'
            row['pred-tense'] = root_feats.get('Tense') or 'NONE'

        features.append(row)
        labels.append(label)

        if index % 500 == 0:
            print('-->', index)

    return features, labels

In [604]:
# Input directory with the article dumps and output directory for the
# generated annotation files.
source_articles_path = '../../data/articles/source_normalized'
annotation_source_path = '../../data/articles/for_annotation'
# The output files are opened in append mode, so re-running this cell adds
# the same articles to the existing files again.
make_annotattion_sources(source_articles_path, annotation_source_path)

In [1614]:
annotated_articles_path = '../../data/articles/annotated'
annotated_parsed = parse_annotated_articles(annotated_articles_path)
# Drop duplicate (text, label) pairs. The result is sorted because set
# iteration order for strings changes between interpreter runs (hash
# randomization); without a stable order the seeded train/test split further
# down would differ after every kernel restart.
annotated_parsed = sorted(set(annotated_parsed))
truish = [(x, y) for x, y in annotated_parsed if y]
falsish = [(x, y) for x, y in annotated_parsed if not y]
print('All:', len(annotated_parsed))
print('Is event:', len(truish))
print('Is not event:', len(falsish))

All: 2801
Is event: 1330
Is not event: 1471


In [1615]:
# Feature dicts (X) and labels (y) for every parsed line. get_data() runs the
# `nlp` pipeline once per item and prints a progress marker every 500 items.
X, y = get_data(annotated_parsed)

--> 0
--> 500
--> 1000
Failed to create nlp from text that starts with: Батькам нікуди дітей дівати. Кернес не ввів карант 
Failed to create nlp from text that starts with: Там за півдоби зареєстрували понад 80 нових випадк 
--> 1500
--> 2000
--> 2500


In [1616]:
# Shuffled 70/30 train/test split, stratified on the label, with a fixed seed.
data_train, data_test, target_train, target_test = train_test_split(X, y, test_size=0.3,
                                                                    random_state=42, shuffle = True, stratify = y)

In [1338]:
# New, unfitted DictVectorizer + LogisticRegression pipeline.
clf = get_classifier()

In [1617]:
# Fit both pipeline steps on the training split.
clf.fit(data_train, target_train)

Pipeline(memory=None,
         steps=[('dict_vect',
                 DictVectorizer(dtype=<class 'numpy.float64'>, separator='=',
                                sort=True, sparse=True)),
                ('lrc',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='multinomial', n_jobs=-1,
                                    penalty='l2', random_state=42, solver='sag',
                                    tol=0.0001, verbose=0, warm_start=False))],
         verbose=False)

In [1618]:
# Per-class precision / recall / F1 of the fitted pipeline on the test split.
print(classification_report(target_test, clf.predict(data_test)))

              precision    recall  f1-score   support

       False       0.76      0.77      0.77       442
        True       0.74      0.74      0.74       399

    accuracy                           0.75       841
   macro avg       0.75      0.75      0.75       841
weighted avg       0.75      0.75      0.75       841

