In [15]:
# Install the third-party packages this notebook imports (rouge, razdel, pymorphy2).
# NOTE(review): versions are not pinned; the recorded run resolved rouge 1.0.0, razdel 0.5.0 and pymorphy2 0.8.
!pip install rouge razdel pymorphy2

Collecting rouge
  Downloading https://files.pythonhosted.org/packages/43/cc/e18e33be20971ff73a056ebdb023476b5a545e744e3fc22acd8c758f1e0d/rouge-1.0.0-py3-none-any.whl
Collecting razdel
  Downloading https://files.pythonhosted.org/packages/15/2c/664223a3924aa6e70479f7d37220b3a658765b9cfe760b4af7ffdc50d38f/razdel-0.5.0-py3-none-any.whl
Collecting pymorphy2
[?25l  Downloading https://files.pythonhosted.org/packages/a3/33/fff9675c68b5f6c63ec8c6e6ff57827dda28a1fa5b2c2d727dffff92dd47/pymorphy2-0.8-py2.py3-none-any.whl (46kB)
[K     |████████████████████████████████| 51kB 2.8MB/s 
Collecting dawg-python>=0.7
  Downloading https://files.pythonhosted.org/packages/6a/84/ff1ce2071d4c650ec85745766c0047ccc3b5036f1d03559fd46bb38b5eeb/DAWG_Python-0.7.2-py2.py3-none-any.whl
Collecting pymorphy2-dicts<3.0,>=2.4
[?25l  Downloading https://files.pythonhosted.org/packages/02/51/2465fd4f72328ab50877b54777764d928da8cb15b74e2680fc1bd8cb3173/pymorphy2_dicts-2.4.393442.3710985-py2.py3-none-any.whl (7.1MB)


In [0]:
import razdel
import re
import copy
import random

from tqdm.notebook import tqdm
from nltk.translate.bleu_score import corpus_bleu
from rouge import Rouge

Utils: score calculation

In [0]:
def calc_scores(references, predictions, metric="all"):
    '''
    Print a preview of the last reference/prediction pair and the requested metrics.

    Args:
        references: sequence of reference headline strings.
        predictions: sequence of predicted headline strings, aligned with ``references``.
        metric: "bleu", "rouge" or "all". Any other value prints only the
            count and the preview of the last pair.

    Returns:
        None. Everything is reported via ``print``.
    '''
    print("Count:", len(predictions))
    if len(references) == 0 or len(predictions) == 0:
        # Nothing to preview or score; indexing [-1] below would raise IndexError.
        return

    print("Last true headline:", references[-1])
    print("Last predicted headline:", predictions[-1])

    if metric in ("bleu", "all"):
        # NOTE(review): raw strings (not token lists) are handed to corpus_bleu, so
        # n-grams are presumably built over characters -- confirm this is intended.
        print("\nBLEU: ", corpus_bleu([[r] for r in references], predictions))
    if metric in ("rouge", "all"):
        rouge = Rouge()
        scores = rouge.get_scores(predictions, references, avg=True)
        # Use a distinct loop name so the ``metric`` argument is not overwritten.
        scores_string = "".join(
            "\n" + str(rouge_name) + ":" + str(value)
            for rouge_name, value in scores.items()
        )
        print("ROUGE: ", scores_string, "\n")

The first model is FSDE, a.k.a. the First Sentence Dumb Extractor.

Our hypothesis is that in many news articles the first sentence of the text already contains the most important information. From a stylistic point of view, repeating the first sentence as the title looks silly, but as a baseline method we want to check what scores it can achieve.

In [0]:
def calc_FSDE_score(data, lower=True):
    '''
    Score the first-sentence baseline: each headline is predicted as the
    first sentence of the corresponding text.

    Args:
        data: table indexable as ``data[['Text', 'Head_title']].values``,
            yielding (text, title) pairs.
        lower: lower-case both the reference titles and the predictions.

    Returns:
        None. The metrics are printed by ``calc_scores``.
    '''
    references = []
    predictions = []

    for text, title in tqdm(data[['Text', 'Head_title']].values):
        references.append(title.lower() if lower else title)

        # Only the first sentence is used, so take just that one instead of
        # collecting and lower-casing every sentence of the text.
        first_sentence = next(iter(razdel.sentenize(text)), None)
        prediction = "" if first_sentence is None else first_sentence.text
        predictions.append(prediction.lower() if lower else prediction)
    calc_scores(references, predictions)

In [19]:
# Evaluate the first-sentence baseline.
# NOTE(review): `cleaned_dataset` is not defined in the cells visible here; it must already exist in the kernel.
calc_FSDE_score(cleaned_dataset)

HBox(children=(FloatProgress(value=0.0, max=133773.0), HTML(value='')))


Count: 133773
Last true headline: эпидемия бедности
Last predicted headline: в наше-то время мыслимо ли представить, чтобы в одной из богатейших стран мира дети недоедали, а их семьи жестко экономили на всем чем можно?

BLEU:  0.18306681409532496
ROUGE:  
rouge-1:{'f': 0.16207236690169588, 'p': 0.1277506276502493, 'r': 0.25285626683840967}
rouge-2:{'f': 0.06587346819565139, 'p': 0.05227357298541585, 'r': 0.10050212340946274}
rouge-l:{'f': 0.15003518774424648, 'p': 0.11868430806959729, 'r': 0.2314076211846482} 



Self-designed TextRank

In [0]:
from itertools import combinations
import networkx as nx
import numpy as np
import pymorphy2

In [0]:
def unique_words_similarity(words1, words2):
    '''
    Sentence similarity based on the overlap of unique words.

    Computes ``|A & B| / (log10|A| + log10|B|)`` where ``A`` and ``B`` are the
    sets of unique items in ``words1`` and ``words2``.

    Args:
        words1: iterable of tokens of the first sentence.
        words2: iterable of tokens of the second sentence.

    Returns:
        The similarity as a float. 0.0 is returned when either sentence has no
        tokens, and also when the normaliser is zero (both sentences consist of
        exactly one unique token), which would otherwise produce inf/nan.
    '''
    words1 = set(words1)
    words2 = set(words2)
    if not len(words1) or not len(words2):
        return 0.0
    denominator = np.log10(len(words1)) + np.log10(len(words2))
    if denominator == 0:
        # log10(1) + log10(1): avoid dividing by zero.
        return 0.0
    return len(words1.intersection(words2)) / denominator

def gen_text_rank_summary(text, calc_similarity=unique_words_similarity, summary_part=0.2, lower=True, morph=None):
    '''
    Pick the single most central sentence of ``text`` with TextRank.

    Sentences are graph nodes, every pair of sentences is connected by an edge
    weighted with ``calc_similarity`` of their token lists, and the nodes are
    ranked with PageRank.

    Args:
        text: raw text to summarise.
        calc_similarity: callable taking two token lists and returning the edge weight.
        summary_part: currently has no effect -- only the top-ranked sentence is
            returned. The parameter is kept so existing callers keep working.
        lower: lower-case tokens before comparing them and lower-case the result.
        morph: optional analyzer; when given, every token is replaced by
            ``morph.parse(token)[0].normal_form`` before similarity is computed.

    Returns:
        The selected sentence as a string ("" when the text has no sentences).
    '''
    # Split the text into sentences
    sentences = [sentence.text for sentence in razdel.sentenize(text)]
    n_sentences = len(sentences)

    # With fewer than two sentences there are no pairs, so the graph below would
    # stay empty and nothing would be ranked. Return the only sentence (if any)
    # instead of an empty summary.
    if n_sentences < 2:
        predicted_summary = " ".join(sentences)
        return predicted_summary.lower() if lower else predicted_summary

    # Tokenize the sentences
    sentences_words = [
        [token.text.lower() if lower else token.text for token in razdel.tokenize(sentence)]
        for sentence in sentences
    ]

    # Lemmatize the tokens if an analyzer was supplied
    if morph is not None:
        sentences_words = [[morph.parse(word)[0].normal_form for word in words] for words in sentences_words]

    # Compute the similarity of every pair of sentences
    pairs = combinations(range(n_sentences), 2)
    scores = [(i, j, calc_similarity(sentences_words[i], sentences_words[j])) for i, j in pairs]

    # Build a graph whose edge weights are the pairwise similarities
    g = nx.Graph()
    g.add_weighted_edges_from(scores)

    # Rank the sentences with PageRank
    pr = nx.pagerank(g)
    result = [(i, pr[i], s) for i, s in enumerate(sentences) if i in pr]
    result.sort(key=lambda x: x[1], reverse=True)

    # Keep only the best-ranked sentence (headline-sized summary)
    result = result[:1]

    # Restore the original sentence order
    result.sort(key=lambda x: x[0])

    # Assemble the summary text
    predicted_summary = " ".join([sentence for i, proba, sentence in result])
    predicted_summary = predicted_summary.lower() if lower else predicted_summary
    return predicted_summary

def calc_text_rank_score(records, calc_similarity=unique_words_similarity, summary_part=0.1, lower=True, morph=None):
    '''
    Score TextRank headline prediction over a whole dataset.

    Args:
        records: table indexable as ``records[['Text', 'Head_title']].values``,
            yielding (text, title) pairs.
        calc_similarity: sentence-similarity callable forwarded to ``gen_text_rank_summary``.
        summary_part: forwarded to ``gen_text_rank_summary``.
        lower: lower-case the reference titles; also forwarded to ``gen_text_rank_summary``.
        morph: optional analyzer forwarded to ``gen_text_rank_summary``.

    Returns:
        None. The metrics are printed by ``calc_scores``.
    '''
    references = []
    predictions = []

    for text, title in tqdm(records[['Text', 'Head_title']].values):
        references.append(title.lower() if lower else title)

        predicted_title = gen_text_rank_summary(text, calc_similarity, summary_part, lower, morph=morph)
        predictions.append(predicted_title)

    calc_scores(references, predictions)

In [22]:
%%time
# Evaluate the TextRank baseline on raw (non-lemmatized) tokens; `morph` is left at its default of None.
calc_text_rank_score(cleaned_dataset)

HBox(children=(FloatProgress(value=0.0, max=133773.0), HTML(value='')))


Count: 133773
Last true headline: эпидемия бедности
Last predicted headline: при этом схема ваучеров на питание сопряжена с изнурительным хождением по инстанциям, а британские благотворительные "банки еды" не в состоянии поддержать в этой беспрецедентной ситуации всех страдальцев.

BLEU:  0.10876123392394214
ROUGE:  
rouge-1:{'f': 0.0961651612014209, 'p': 0.069697386121575, 'r': 0.1925407766925539}
rouge-2:{'f': 0.02343784135065573, 'p': 0.01727150528781293, 'r': 0.04389035150053941}
rouge-l:{'f': 0.08839101886483362, 'p': 0.06429160318423158, 'r': 0.1737626437592151} 

CPU times: user 1h 8min 5s, sys: 47.1 s, total: 1h 8min 52s
Wall time: 1h 8min 55s


The second version lemmatizes the words with pymorphy2's MorphAnalyzer before computing sentence similarity.

In [0]:
# Same TextRank evaluation, but tokens are lemmatized with pymorphy2 before similarity is computed.
# NOTE(review): no output is recorded for this cell; the non-lemmatized run above took over an hour.
calc_text_rank_score(cleaned_dataset, morph=pymorphy2.MorphAnalyzer())

Oracle summary

In [0]:
def build_oracle_summary_greedy(text, gold_summary, calc_score, lower=True, max_sentences=30, max_summary_sentences=2):
    '''
    Greedily build an oracle extractive summary.

    Starting from an empty selection, each round tries adding every not yet
    selected sentence, scores the resulting summary against ``gold_summary``
    and keeps the best addition. The search stops as soon as no addition
    strictly improves the score, or when the sentence limit is reached.

    Args:
        text: source text to extract sentences from.
        gold_summary: reference summary to score candidates against.
        calc_score: callable ``(candidate_summary, gold_summary) -> score``.
        lower: lower-case both the sentences and ``gold_summary``.
        max_sentences: only the first ``max_sentences`` sentences are candidates.
        max_summary_sentences: maximum number of sentences in the oracle summary.

    Returns:
        Tuple ``(oracle_summary, oracle_summary_sentences)``: the selected
        sentences joined with spaces in text order, and the set of their indices.
    '''
    gold_summary = gold_summary.lower() if lower else gold_summary
    # Split the text into sentences
    sentences = [sentence.text.lower() if lower else sentence.text for sentence in razdel.sentenize(text)][:max_sentences]
    n_sentences = len(sentences)
    oracle_summary_sentences = set()
    score = -1.0
    for _ in range(min(n_sentences, max_summary_sentences)):
        # Candidates of this round only; earlier rounds are not carried over.
        candidates = []
        for i in range(n_sentences):
            if i in oracle_summary_sentences:
                continue
            # Extend the current summary with one more sentence
            candidate_sentences = oracle_summary_sentences | {i}
            candidate_summary = " ".join(sentences[index] for index in sorted(candidate_sentences))
            # Score the extended summary
            candidates.append((calc_score(candidate_summary, gold_summary), candidate_sentences))
        # Compare by score only, so ties never fall through to comparing the
        # index sets; the first best candidate wins.
        best_summary_score, best_summary_sentences = max(candidates, key=lambda candidate: candidate[0])
        # Continue only if adding a sentence strictly improved the score
        if best_summary_score <= score:
            break
        oracle_summary_sentences = best_summary_sentences
        score = best_summary_score
    oracle_summary = " ".join(sentences[index] for index in sorted(oracle_summary_sentences))
    return oracle_summary, oracle_summary_sentences


def calc_single_score(pred_summary, gold_summary, rouge):
    '''
    Return the ROUGE-2 F value for one predicted/reference summary pair.

    ``rouge`` must provide ``get_scores(hyps, refs, avg=True)`` returning a
    mapping that contains ``['rouge-2']['f']``.
    '''
    averaged = rouge.get_scores([pred_summary], [gold_summary], avg=True)
    rouge_2 = averaged['rouge-2']
    return rouge_2['f']


def calc_oracle_score(records, lower=True):
    '''
    Score the greedy oracle summaries over a whole dataset.

    For every (text, title) pair an oracle summary is built with
    ``build_oracle_summary_greedy``, using the ROUGE-2 F value from
    ``calc_single_score`` as the selection criterion.

    Args:
        records: table indexable as ``records[['Text', 'Head_title']].values``,
            yielding (text, title) pairs.
        lower: lower-case the reference titles and the oracle summaries.

    Returns:
        None. The metrics are printed by ``calc_scores``.
    '''
    references = []
    predictions = []
    rouge = Rouge()

    def score_against_gold(candidate, gold):
        # Reuse one Rouge instance for every candidate.
        return calc_single_score(candidate, gold, rouge)

    for text, title in tqdm(records[['Text', 'Head_title']].values):
        title = title.lower() if lower else title
        references.append(title)
        # Forward ``lower`` so references and predictions share the same casing.
        predicted_summary, _ = build_oracle_summary_greedy(
            text, title, calc_score=score_against_gold, lower=lower
        )
        predictions.append(predicted_summary)

    calc_scores(references, predictions)

In [25]:
%%time
# Evaluate the greedy oracle, which selects sentences using the true headline.
calc_oracle_score(cleaned_dataset)

HBox(children=(FloatProgress(value=0.0, max=133773.0), HTML(value='')))


Count: 133773
Last true headline: эпидемия бедности
Last predicted headline: в наше-то время мыслимо ли представить, чтобы в одной из богатейших стран мира дети недоедали, а их семьи жестко экономили на всем чем можно?

BLEU:  0.19906226621154677
ROUGE:  
rouge-1:{'f': 0.20859158264629868, 'p': 0.16832612865043137, 'r': 0.3243579397562898}
rouge-2:{'f': 0.10575615567007783, 'p': 0.08643241167731687, 'r': 0.16497572116128686}
rouge-l:{'f': 0.19693814105018825, 'p': 0.1595544994641524, 'r': 0.3019438025349718} 

CPU times: user 27min 37s, sys: 12.3 s, total: 27min 49s
Wall time: 27min 50s
