In [46]:
import json
from sklearn.metrics import classification_report
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import stanza
import requests
from html.parser import HTMLParser
from bs4 import BeautifulSoup

In [None]:
# The corpora hold Ukrainian text and get_subj_pred_obj writes its output as UTF-8
# with ensure_ascii=False, so decode explicitly instead of relying on the
# platform's default encoding.
# NOTE(review): train_it_2.json is the file written by the get_subj_pred_obj(...)
# cell further down, so on a fresh kernel it must already exist from an earlier run.
with open('../../data/articles/test/zelen.json', encoding='utf-8') as f:
    test_data = json.load(f)
with open('../../data/articles/train/train_it_2.json', encoding='utf-8') as f:
    train_data = json.load(f)

In [None]:
# Ukrainian Stanza pipeline shared by the cells below. The dependency parser
# (depparse) provides the deprel/head fields that the subject/predicate/object
# extraction reads; pos and lemma provide upos and lemma.
nlp = stanza.Pipeline(
    lang='uk',
    processors='tokenize,pos,lemma,depparse',
)

In [None]:
def get_classifier():
    """Build the unfitted classification pipeline.

    Returns:
        A scikit-learn ``Pipeline`` with two steps:
        ``dict_vect`` -- a ``DictVectorizer`` that encodes feature dicts, and
        ``lrc`` -- a ``LogisticRegression`` using the SAG solver.
    """
    vectorizer = DictVectorizer()
    model = LogisticRegression(
        solver='sag',
        multi_class='multinomial',
        max_iter=100,
        random_state=42,  # fixed seed so repeated fits are reproducible
        n_jobs=-1,
    )
    return Pipeline(steps=[('dict_vect', vectorizer), ('lrc', model)])

In [None]:
def get_spo(search_obj):
    """Flatten a ``{text: triple}`` mapping into ``(text, subj, pred, obj)``.

    Args:
        search_obj: dict whose first entry maps the raw text to either a 3-item
            (subject, predicate, object) sequence or a falsy value meaning
            "no triple".

    Returns:
        Tuple ``(raw_text, subj, pred, obj)``. Truthy triple members are
        lower-cased, falsy ones are passed through unchanged; all three are
        ``None`` when there is no triple.

    Raises:
        IndexError: if ``search_obj`` is empty.
    """
    raw_text, triple = list(search_obj.items())[0]
    if not triple:
        return raw_text, None, None, None

    subj, pred, obj = [part.lower() if part else part for part in triple]
    return raw_text, subj, pred, obj


def search_by_token(token, article):
    """Check whether ``token`` matches a subject or object anywhere in ``article``.

    The title's first sentence is checked first; the content sentences are only
    scanned when the title does not match, and scanning stops at the first hit.

    Args:
        token: search token handed to ``get_is_match``.
        article: dict with ``title`` and ``content`` lists of ``{text: triple}``
            mappings (the shape ``get_spo`` unpacks).

    Returns:
        The result of the last ``get_is_match`` call made: truthy on the first
        match, otherwise the falsy result of the final comparison.
    """
    # NOTE(review): get_is_match is not defined in the visible cells -- confirm
    # it is available on a fresh kernel.
    _, title_subj, _, title_obj = get_spo(article['title'][0])
    match = get_is_match(token, [title_subj, title_obj])
    if match:
        return match

    for sentence in article['content']:
        _, sent_subj, _, sent_obj = get_spo(sentence)
        match = get_is_match(token, [sent_subj, sent_obj])
        if match:
            return match
    return match


def search_relevant_articles(search_term, corpus):
    """Return the articles in which any token of ``search_term`` is found.

    The query is tokenized with the module-level ``nlp`` pipeline; only the
    first sentence of the query is used. An article is relevant when
    ``search_by_token`` succeeds for at least one query token.

    The previous implementation overwrote the match flag on every token, so
    for multi-word queries only the last token decided the outcome.

    Args:
        search_term: query string.
        corpus: iterable of article dicts providing ``url``, ``date`` and
            ``title`` (plus whatever ``search_by_token`` reads).

    Returns:
        List of ``{'url', 'date', 'title'}`` dicts in corpus order; empty when
        the query yields no tokens.
    """
    sentences = nlp(search_term).sentences
    if not sentences:
        # Empty/whitespace query: nothing to look for.
        return []
    search_tokens = sentences[0].words

    res = []
    for article in corpus:
        # any() short-circuits on the first matching token.
        if any(search_by_token(token, article) for token in search_tokens):
            title, _, _, _ = get_spo(article['title'][0])
            res.append(
                {'url': article['url'], 'date': article['date'], 'title': title})
    return res

In [25]:
def get_data(corpus):
    """Turn annotated articles into parallel feature and label lists.

    Args:
        corpus: iterable of article dicts with a ``title`` list of
            ``{text: triple}`` mappings and a ``relevant`` label.

    Returns:
        ``(features, labels)`` where each feature dict holds the raw title
        (``title``) and the title's subject (``t_subj``, ``'NONE'`` when absent),
        and ``labels`` holds the matching ``relevant`` values.
    """
    # STOP POINT
    # TODO:
    # 1) separate feature extractors
    # 2) check Mariana's suggestions in PR
    # 3) scrape more data
    # 4) annotate more data
    features = []
    labels = []
    for article in corpus:
        title, title_subj, _, _ = get_spo(article['title'][0])
        features.append({'title': title, 't_subj': title_subj or 'NONE'})
        labels.append(article['relevant'])
    return features, labels

In [None]:
def get_subj_pred_obj_text(text):
    """Extract a (subject, predicate, object) lemma triple per sentence of ``text``.

    ``text`` is parsed with the module-level ``nlp`` pipeline. For each sentence
    the predicate is the root word when it is a verb; subject and object are the
    first ``nsubj`` / ``obj`` dependents attached directly to that root.

    Args:
        text: raw text to analyse.

    Returns:
        List with one single-entry dict per sentence, mapping the sentence text
        to ``(subj_lemma_or_None, pred_lemma, obj_lemma_or_None)``, or to
        ``None`` when the sentence has no verbal root. On a parsing error the
        failure is printed and the sentences collected so far (possibly none)
        are returned.
    """
    res = []
    try:
        doc = nlp(text)
        for sent in doc.sentences:
            pred = next(((word.id, word.lemma) for word in sent.words
                         if word.deprel == 'root' and word.upos == 'VERB'),
                        None)
            if pred is None:
                res.append({sent.text: None})
                continue

            pred_id = int(pred[0])
            subj = next((word.lemma for word in sent.words
                         if word.deprel == 'nsubj' and word.head == pred_id),
                        None)
            obj = next((word.lemma for word in sent.words
                        if word.deprel == 'obj' and word.head == pred_id),
                       None)
            res.append({sent.text: (subj, pred[1], obj)})
    except Exception as exc:
        # Best-effort: report and carry on with the next text. Catching Exception
        # (not a bare except) lets KeyboardInterrupt/SystemExit stop a long run,
        # and str(text) keeps the handler itself from failing on non-string input.
        print('Failed to create nlp from text that starts with: '
              + str(text)[:50] + ' (' + repr(exc) + ')')
    return res


def get_subj_pred_obj(corpus, res_file):
    """Annotate every article with subject/predicate/object triples and save them.

    ``res_file`` is overwritten with an empty JSON list up front and then
    rewritten after each article, so partial progress survives an interrupted
    run. Results are kept in memory between checkpoints instead of being
    re-read and re-parsed from disk on every iteration.

    Args:
        corpus: iterable of article dicts with ``url``, ``date``, ``title``,
            ``content`` and ``relevant``; ``title`` and ``content`` are passed
            to ``get_subj_pred_obj_text``.
        res_file: path of the JSON file to (over)write.

    Returns:
        List of annotated article dicts, identical to what is stored in
        ``res_file`` except that triples remain tuples in memory (JSON stores
        them as arrays).
    """
    results = []

    def _checkpoint():
        # UTF-8 with ensure_ascii=False keeps non-ASCII text readable in the file.
        with open(res_file, 'w', encoding='utf-8') as f:
            json.dump(results, f, ensure_ascii=False)

    _checkpoint()  # start from an empty result file
    for i, art in enumerate(corpus):
        results.append({
            'url': art['url'],
            'date': art['date'],
            'title': get_subj_pred_obj_text(art['title']),
            'content': get_subj_pred_obj_text(art['content']),
            'relevant': art['relevant']
        })
        _checkpoint()
        print('>>>', i)
    return results

In [None]:
# Annotate the raw articles with subject/predicate/object triples and save them.
# NOTE(review): this rewrites train_it_2.json from scratch -- the same file the
# loading cell above reads into train_data -- so on a fresh kernel that cell
# depends on output from an earlier run of this one. Confirm the intended order.
get_subj_pred_obj(test_data, '../../data/articles/train/train_it_2.json')

In [26]:
# Build the feature dicts (X) and relevance labels (y) from the annotated corpus.
X, y = get_data(train_data)

In [15]:
# Peek at the first three feature dicts and their labels.
# NOTE(review): the recorded output below shows only a 'title' key although
# get_data also sets 't_subj' -- the output looks stale; re-run to refresh.
print(X[:3], y[:3], sep='\n')

[{'title': 'Опублікований фільм Рік президента Зеленського'}, {'title': 'У Зеленського "не дійшли руки" до всіх обіцянок'}, {'title': 'Зеленський анонсував скорочення правоохоронних органів'}]
[True, False, False]


In [27]:
# Hold out 30% of the samples for evaluation; random_state pins the split.
data_train, data_test, target_train, target_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [17]:
# Fresh, unfitted DictVectorizer + LogisticRegression pipeline.
clf = get_classifier()

In [28]:
# Fit the vectorizer and the classifier on the training split only.
clf.fit(data_train, target_train)

Pipeline(memory=None,
         steps=[('dict_vect',
                 DictVectorizer(dtype=<class 'numpy.float64'>, separator='=',
                                sort=True, sparse=True)),
                ('lrc',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='multinomial', n_jobs=-1,
                                    penalty='l2', random_state=42, solver='sag',
                                    tol=0.0001, verbose=0, warm_start=False))],
         verbose=False)

In [29]:
# Per-class precision/recall/F1 on the held-out split.
# NOTE(review): the recorded report covers only 36 samples, so the scores are coarse.
print(classification_report(target_test, clf.predict(data_test)))

              precision    recall  f1-score   support

       False       0.79      0.73      0.76        15
        True       0.82      0.86      0.84        21

    accuracy                           0.81        36
   macro avg       0.80      0.80      0.80        36
weighted avg       0.80      0.81      0.80        36

