In [554]:
import json
import os
import sys

import pandas as pd
import stanza

In [None]:
# Build the Stanza pipeline for language code 'uk' once; the annotation and search
# functions below read it through the module-level name `nlp`.
nlp = stanza.Pipeline('uk', processors='tokenize,pos,lemma,depparse')

In [None]:
# Load the raw article corpus. The encoding is given explicitly because the
# articles contain Cyrillic text and the platform default is not always UTF-8.
with open('../../data/articles/articles.json', encoding='utf-8') as f:
    corpus = json.load(f)

In [None]:
# Sanity check: number of articles loaded into `corpus`.
print(len(corpus))

In [None]:
def get_data_to_annotate(src_dir, src_file_name, map_file_name):
    """Export every article of a JSON file as a plain-text file for annotation.

    For each article in ``src_dir/src_file_name`` a file
    ``<src_dir>_ann/<stem>_<index>_.txt`` is written containing the title, a
    newline and the content. A JSON list of ``{index: url}`` mappings is written
    to ``<src_dir>_ann/<map_file_name>``.

    Args:
        src_dir: Directory holding the source JSON file; the output directory
            is this path with ``'_ann'`` appended.
        src_file_name: Name of the JSON file (a list of dicts with ``title``,
            ``content`` and ``url`` keys).
        map_file_name: Name of the index-to-URL map file to create.
    """
    src_file_path = os.path.join(src_dir, src_file_name)
    with open(src_file_path, encoding='utf-8') as f:
        arts = json.load(f)

    # Resolve and create the output directory once, before the loop, so that the
    # map file can be written even when the source file holds no articles.
    ann_dir_path = src_dir + '_ann'
    os.makedirs(ann_dir_path, exist_ok=True)
    filename = os.path.splitext(src_file_name)[0]

    maps = []
    for i, art in enumerate(arts):
        content_to_ann = art['title'] + '\n' + art['content']
        ann_file_path = os.path.join(ann_dir_path, filename + f'_{i}_.txt')
        with open(ann_file_path, 'w', encoding='utf-8') as fa:
            fa.write(content_to_ann)
        maps.append({i: art['url']})

    with open(os.path.join(ann_dir_path, map_file_name), 'w', encoding='utf-8') as fm:
        json.dump(maps, fm)


def get_subj_pred_obj_text_1(text):
    """Extract a (subject, predicate, object) lemma triple for each sentence of ``text``.

    The text is parsed with the module-level ``nlp`` pipeline. For every
    sentence the predicate is the first word whose dependency relation is
    ``root`` and whose UPOS tag is ``VERB``; subject and object are the first
    ``nsubj`` / ``obj`` words headed by that predicate.

    Args:
        text: Raw text to analyse.

    Returns:
        A list with one single-key dict per sentence, ``{sentence_text: triple}``,
        where ``triple`` is ``(subj_lemma_or_None, pred_lemma, obj_lemma_or_None)``
        or ``None`` when the sentence has no verbal root. If processing fails,
        a message is printed and whatever was collected so far (possibly an
        empty list) is returned.
    """
    res = []
    try:
        doc = nlp(text)
        for sent in doc.sentences:
            words = sent.words
            pred = next(
                (w for w in words if w.deprel == 'root' and w.upos == 'VERB'),
                None,
            )
            if pred is None:
                res.append({sent.text: None})
                continue

            head_id = int(pred.id)
            subj = next(
                (w.lemma for w in words if w.deprel == 'nsubj' and w.head == head_id),
                None,
            )
            obj = next(
                (w.lemma for w in words if w.deprel == 'obj' and w.head == head_id),
                None,
            )
            res.append({sent.text: (subj, pred.lemma, obj)})
    except Exception:
        # Best-effort by design: one bad article must not stop a long batch run.
        # `Exception` (rather than a bare except) lets Ctrl-C / kernel interrupt
        # through, and str() keeps the handler itself from failing on non-string input.
        print('Failed to create nlp from text that starts with: ' + str(text)[:50])
    return res


def get_subj_pred_obj_1(corpus, res_file):
    """Annotate every article with SPO triples, checkpointing to ``res_file``.

    Args:
        corpus: Iterable of article dicts with ``url``, ``date``, ``title`` and
            ``content`` keys.
        res_file: Path of the JSON file to (over)write. It is rewritten after
            every article so that an interrupted run keeps its progress.

    Returns:
        The list of annotated articles, in the same JSON-normalised form as the
        content of ``res_file`` (triples are lists, not tuples).
    """
    current_content = []
    # Start from an empty list on disk so the file is valid JSON even if the
    # corpus is empty or the first article fails.
    with open(res_file, 'w', encoding='utf-8') as f:
        json.dump(current_content, f)

    for i, art in enumerate(corpus):
        current_content.append({
            'url': art['url'],
            'date': art['date'],
            'title': get_subj_pred_obj_text_1(art['title']),
            'content': get_subj_pred_obj_text_1(art['content']),
        })
        # Checkpoint. The accumulated list is already in memory, so the file
        # does not need to be read back before being rewritten.
        with open(res_file, 'w', encoding='utf-8') as f:
            json.dump(current_content, f, ensure_ascii=False)
        print('>>>', i)

    # Round-trip once so the returned value matches what was written to disk.
    return json.loads(json.dumps(current_content, ensure_ascii=False))
        

def get_train_data(corpus, filename):
    """Run the SPO annotation over a shallow copy of ``corpus``.

    Args:
        corpus: Sliceable sequence of article dicts.
        filename: Path of the JSON file the annotated articles are written to.

    Returns:
        Whatever ``get_subj_pred_obj_1`` returns for the copied sequence.
    """
    return get_subj_pred_obj_1(corpus[:], filename)

In [None]:
# Annotate the articles from index 401 onward and checkpoint them to corona_spo_it_1.json.
# NOTE(review): the 401 offset is not explained in this notebook — presumably the earlier
# articles were handled in a previous run; confirm before re-running.
corpus_with_subj_pred_obj = get_subj_pred_obj_1(corpus[401:], '../../data/articles/test/corona_spo_it_1.json')

In [None]:
# Sanity check: size of `corpus`.
print(len(corpus))

In [None]:
# Load the previously annotated articles; explicit UTF-8 because the file holds Cyrillic text.
with open('../../data/articles/articles_with_spo_it1.json', encoding='utf-8') as f:
    dev_arts = json.load(f)

In [None]:
# URLs of the articles in dev_arts.
test_urls = [x['url'] for x in dev_arts]

In [None]:
# Lower-cased substrings that mark an article as coronavirus-related.
corona = 'коронавірус'
covid = 'covid'
sars_cov_2 = 'sars-cov-2'
pandem = 'пандем'
# Only referenced by the commented-out Zelensky filter below.
zelen = 'зеленськ'
pres_u = 'президент україни'
# Articles whose title contains this marker are excluded.
story = 'сюжет'

corona_terms = (corona, covid, sars_cov_2, pandem)


def _is_corona_candidate(art):
    """Return True if any corona term occurs in the title or content and the title has no story marker."""
    # Lower-case each field once instead of once per keyword test.
    title_lc = art['title'].lower()
    content_lc = art['content'].lower()
    if story in title_lc:
        return False
    return any(term in title_lc or term in content_lc for term in corona_terms)


test_arts_corona = [art for art in corpus if _is_corona_candidate(art)]
# test_arts_zelen_all = [x for x in corpus if (zelen in x['title'].lower() or zelen in x['content'].lower() \
#                                          or pres_u in x['title'].lower() or pres_u in x['content'].lower())]
# test_arts_zelen = [x for x in test_arts_zelen_all if story not in x['title'].lower()]



In [None]:
def get_test_arts_for_mark(arts):
    """Prepare articles for manual labelling.

    Sets the ``'relevant'`` key of every article dict to ``None`` (overwriting
    any existing value). The dicts are modified in place and the same list
    object is returned.
    """
    for article in arts:
        article['relevant'] = None
    return arts

In [None]:
print(len(test_arts_corona))
# test_arts_zelen is only built by code that is currently commented out, so guard
# the lookup instead of raising NameError on a fresh kernel.
if 'test_arts_zelen' in globals():
    print(len(test_arts_zelen))

In [None]:
# Add an empty 'relevant' field to every corona candidate so it can be labelled by hand.
test_arts_corona = get_test_arts_for_mark(test_arts_corona)
# test_arts_zelen = get_test_arts_for_mark(test_arts_zelen)

In [None]:
# Write each candidate set that actually exists in this kernel.
# The variable is looked up BEFORE the file is opened: opening with 'w' truncates
# the target, so a NameError after the open would wipe an already-labelled file.
for _topic in ('corona', 'zelen', 'benya'):
    _arts = globals().get(f'test_arts_{_topic}')
    if _arts is None:
        print(f'Skipping {_topic}.json: test_arts_{_topic} is not defined')
        continue
    # ensure_ascii=False emits raw Cyrillic, so the file must be opened as UTF-8.
    with open(f'../../data/articles/test/{_topic}.json', 'w', encoding='utf-8') as f:
        json.dump(_arts, f, ensure_ascii=False)

In [None]:
# NOTE(review): this reads zelen.json, yet the following cells append corona candidates
# to `content` and save it to corona.json — confirm which file is the intended input.
with open('../../data/articles/test/zelen.json', encoding='utf-8') as f:
    content = json.load(f)

In [None]:
# URLs already present in `content`, kept as a set for constant-time membership tests.
ex_urls = {x['url'] for x in content}
# Top up `content` with candidates 200..249, skipping any URL that is already there
# (including one added earlier in this same loop).
for art in test_arts_corona[200:250]:
    if art['url'] not in ex_urls:
        content.append(art)
        ex_urls.add(art['url'])

In [None]:
# Articles whose 'relevant' label is truthy, and how many there are.
truish = list(filter(lambda art: art['relevant'], content))
print(len(truish))

In [None]:
# Persist the merged set; ensure_ascii=False emits raw Cyrillic, so write as UTF-8 explicitly.
with open('../../data/articles/test/corona.json', 'w', encoding='utf-8') as f:
    json.dump(content, f, ensure_ascii=False)

In [None]:
# Sanity check: number of articles in `content`.
print(len(content))

In [None]:
# Load the Zelensky set (explicit UTF-8 for the Cyrillic text) and annotate it with SPO triples.
with open('../../data/articles/test/zelen.json', encoding='utf-8') as f:
    content = json.load(f)
train_data = get_train_data(content, '../../data/articles/test/zelen_spo_it_1.json')

In [None]:
# Concatenate the two annotated sets into a single file.
with open('../../data/articles/test/zelen_spo_it_1.json', encoding='utf-8') as zf:
    z_content = json.load(zf)
with open('../../data/articles/test/corona_spo_it_1.json', encoding='utf-8') as cf:
    c_content = json.load(cf)
# Build the merged list before opening the output, so a failure here cannot leave
# a truncated spo_it_1.json behind.
res = z_content + c_content
with open('../../data/articles/test/spo_it_1.json', 'w', encoding='utf-8') as f:
    json.dump(res, f, ensure_ascii=False)

In [None]:
# For each labelled set, write a compact list of {url: relevant} mappings.
for _topic in ('corona', 'zelen'):
    with open(f'../../data/articles/test/{_topic}.json', encoding='utf-8') as f:
        _labelled = json.load(f)
    expected = [{x['url']: x['relevant']} for x in _labelled]
    # The input is closed before the output is opened; no nested handles sharing one name.
    with open(f'../../data/articles/test/{_topic}_expected.json', 'w', encoding='utf-8') as f:
        json.dump(expected, f)

In [555]:
def get_spo(search_obj):
    """Unpack one annotated-sentence dict into ``(text, subj, pred, obj)``.

    Args:
        search_obj: Dict whose first item maps the sentence text to a
            three-element (subject, predicate, object) sequence, or to a falsy
            value when no triple was extracted.

    Returns:
        ``(raw_text, subj, pred, obj)`` with every truthy triple member
        lower-cased; all three are ``None`` when the triple is missing.
    """
    raw_text, spo = list(search_obj.items())[0]
    triple = spo if spo else [None, None, None]
    subj, pred, obj = (part.lower() if part else part for part in triple)
    return raw_text, subj, pred, obj


def get_is_match(token, spo):
    """Tell whether the token's lemma equals one of the given SPO parts.

    The comparison is case-insensitive: the SPO parts produced by ``get_spo``
    are lower-cased, while a lemma may keep its capital letter. A token without
    a lemma never matches, so a missing subject/object (``None``) cannot be
    "found" by accident.

    Args:
        token: Object with a ``lemma`` attribute (string or ``None``).
        spo: Iterable of strings and/or ``None`` values to compare against.

    Returns:
        ``True`` if the lemma matches any string in ``spo``, else ``False``.
    """
    lemma = token.lemma
    if not lemma:
        return False
    needle = lemma.lower()
    return any(isinstance(part, str) and part.lower() == needle for part in spo)
    
    
def search_by_token(token, article):
    """Check whether ``token`` is the subject or object of the title or of any content sentence.

    Only the first annotated title sentence is inspected, followed by every
    content sentence; the search stops at the first match.

    Args:
        token: Object with a ``lemma`` attribute.
        article: Annotated article whose ``'title'`` and ``'content'`` values
            are lists of ``{sentence_text: triple}`` dicts.

    Returns:
        ``True`` if a subject or object matches the token, else ``False``.
    """
    # The title list can be empty (annotation returns [] when parsing fails),
    # so slice instead of indexing [0].
    candidates = list(article['title'][:1]) + list(article['content'])
    for sent in candidates:
        _, subj, _, obj = get_spo(sent)
        if get_is_match(token, [subj, obj]):
            return True
    return False




def search_relevant_articles(search_term, corpus):
    """Return the articles in which any word of ``search_term`` occurs as a subject or object.

    The search term is parsed with the module-level ``nlp`` pipeline and every
    resulting word is tried against each article via ``search_by_token``. An
    article is selected as soon as one word matches.

    Args:
        search_term: Query string (one or more words).
        corpus: Iterable of annotated articles with ``url``, ``date``,
            ``title`` and ``content`` keys.

    Returns:
        A list of ``{'url', 'date', 'title'}`` dicts for the matching articles,
        where ``title`` is the text of the first title sentence (``None`` if
        the article has no annotated title sentence).
    """
    res = []
    doc = nlp(search_term)
    search_tokens = [word for sent in doc.sentences for word in sent.words]
    if not search_tokens:
        return res

    for article in corpus:
        # Previously each token overwrote the result of the one before, so only
        # the last word of a multi-word query decided the outcome.
        if not any(search_by_token(token, article) for token in search_tokens):
            continue
        title = get_spo(article['title'][0])[0] if article['title'] else None
        res.append({'url': article['url'], 'date': article['date'], 'title': title})
    return res
           


def validate_result(result, test_data):
    """Compute recall and precision of a search result against labelled data.

    Args:
        result: List of dicts with a ``'url'`` key (the retrieved articles).
        test_data: List of dicts with ``'url'`` and ``'relevant'`` keys; entries
            with a truthy ``'relevant'`` form the ground truth.

    Returns:
        ``{'recall': float, 'precision': float}``, both rounded to two decimals.
        A metric whose denominator is zero (nothing relevant, or nothing
        retrieved) is reported as ``0.0`` instead of raising ZeroDivisionError.
    """
    test_true = [x['url'] for x in test_data if x['relevant']]
    actual_urls = [x['url'] for x in result]

    # Sets are used only for membership tests; counts still run over the lists.
    relevant_set = set(test_true)
    retrieved_set = set(actual_urls)

    true_positives = sum(1 for url in actual_urls if url in relevant_set)
    false_positives = len(actual_urls) - true_positives
    false_negatives = sum(1 for url in test_true if url not in retrieved_set)

    def _ratio(numerator, denominator):
        """Rounded quotient, or 0.0 when the denominator is zero."""
        return round(numerator / denominator, 2) if denominator else 0.0

    recall = _ratio(true_positives, true_positives + false_negatives)
    precision = _ratio(true_positives, true_positives + false_positives)

    return {'recall': recall, 'precision': precision}

In [None]:
# with open('../../data/articles/train/train_spo_it_1.json') as f:
#     corpus = json.load(f)

In [None]:
# Query terms for the two evaluation topics.
search_corona = 'коронавірус'
search_zelen = 'зеленський'

# NOTE(review): search_relevant_articles reads article['title'][0] as a {sentence: triple}
# dict, so `corpus` presumably has to hold SPO-annotated articles (e.g. from the
# commented-out load cell above) rather than the raw articles.json loaded at the top —
# confirm before running on a fresh kernel.
corona_results = search_relevant_articles(search_corona, corpus)
zelen_results = search_relevant_articles(search_zelen, corpus)

In [None]:
# Number of articles returned for each query.
print(len(corona_results))
print(len(zelen_results))

In [556]:
# Score the Zelensky search against its labelled set (explicit UTF-8 for the Cyrillic text).
with open('../../data/articles/test/zelen.json', encoding='utf-8') as f:
    test_data = json.load(f)
res = validate_result(zelen_results, test_data)
print(res)

{'recall': 0.92, 'precision': 0.73}


In [557]:
# Score the coronavirus search against its labelled set (explicit UTF-8 for the Cyrillic text).
with open('../../data/articles/test/corona.json', encoding='utf-8') as f:
    test_data = json.load(f)
res = validate_result(corona_results, test_data)
print(res)

{'recall': 0.1, 'precision': 0.44}


In [559]:
# Load the full article dump (explicit UTF-8 for the Cyrillic text) and report its size.
with open('../../data/articles/all_articles.json', encoding='utf-8') as f:
    all_arts = json.load(f)
    print(len(all_arts))

30370
