In [None]:
!pip install rouge razdel pymorphy2

In [None]:
import razdel
import re
import copy
import random

from tqdm.notebook import tqdm
from nltk.translate.bleu_score import corpus_bleu
from rouge import Rouge

Utils: score calculation

In [None]:
def calc_scores(references, predictions, metric="all"):
    '''
    Print corpus-level quality metrics for predicted headlines.

    Args:
        references: sequence of gold headline strings.
        predictions: sequence of predicted headline strings, aligned with
            ``references`` by position.
        metric: ``"bleu"``, ``"rouge"`` or ``"all"`` (default). Any other
            value prints only the summary lines and computes no metric.

    Returns:
        None. Everything is reported through ``print``.

    NOTE(review): the raw strings are handed to ``corpus_bleu`` as-is, so BLEU
    is presumably computed over characters rather than word tokens - confirm
    that this is intended.
    '''
    print("Count:", len(predictions))
    # An empty run has no "last" headline and nothing to score; indexing
    # with [-1] below would raise IndexError.
    if len(references) == 0 or len(predictions) == 0:
        return
    print("Last true headline:", references[-1])
    print("Last predicted headline:", predictions[-1])

    if metric in ("bleu", "all"):
        print("\nBLEU: ", corpus_bleu([[r] for r in references], predictions))
    if metric in ("rouge", "all"):
        rouge = Rouge()
        scores = rouge.get_scores(predictions, references, avg=True)
        # Separate loop names so the ``metric`` argument is not shadowed.
        scores_string = "".join(
            "\n" + str(score_name) + ":" + str(score_value)
            for score_name, score_value in scores.items()
        )
        print("ROUGE: ", scores_string, "\n")

The first model is FSDE, a.k.a. the First Sentence Dumb Extractor.

Our hypothesis is that in many news articles the first sentence of the text already contains the most important information. From a stylistic point of view, repeating the first sentence as the title looks silly, but we want to check what metrics this approach achieves as a baseline.

In [None]:
def calc_FSDE_score(data, lower=True):
    '''
    Score the "first sentence" baseline: the predicted headline of every
    article is simply the first sentence of its text.

    Args:
        data: table indexable as ``data[['Text', 'Head_title']].values``,
            yielding (text, title) rows.
        lower: when True, both the title and the prediction are lower-cased.

    Returns:
        None. Metrics are printed by ``calc_scores``.
    '''
    references = []
    predictions = []

    for text, title in tqdm(data[['Text', 'Head_title']].values):
        references.append(title.lower() if lower else title)

        # Only the first sentence is used, so stop after it instead of
        # collecting and lower-casing every sentence of the article.
        first_sentence = next(iter(razdel.sentenize(text)), None)
        # A text without any sentence yields an empty prediction.
        prediction = "" if first_sentence is None else first_sentence.text
        predictions.append(prediction.lower() if lower else prediction)
    calc_scores(references, predictions)

In [None]:
# Evaluate the first-sentence baseline (FSDE) and print its BLEU/ROUGE scores.
# NOTE(review): `cleaned_dataset` is not defined in the cells visible here -
# presumably built earlier in the notebook; it must provide the 'Text' and
# 'Head_title' columns read by calc_FSDE_score.
calc_FSDE_score(cleaned_dataset)

Self-designed TextRank

In [None]:
from itertools import combinations
import networkx as nx
import numpy as np
import pymorphy2

In [None]:
def unique_words_similarity(words1, words2):
    '''
    Sentence similarity based on the overlap of unique words.

    The number of shared unique words is divided by
    ``log10(|words1|) + log10(|words2|)``, where the sizes are counted over
    unique words.

    Args:
        words1: iterable of (hashable) tokens of the first sentence.
        words2: iterable of (hashable) tokens of the second sentence.

    Returns:
        The similarity value; ``0.0`` when either sentence has no words or
        when the normalising denominator is zero.
    '''
    words1 = set(words1)
    words2 = set(words2)
    if not words1 or not words2:
        return 0.0
    denominator = np.log10(len(words1)) + np.log10(len(words2))
    # Two one-word sentences give log10(1) + log10(1) == 0. Dividing by it
    # would produce inf/nan, so such a pair is treated as unrelated.
    if denominator == 0:
        return 0.0
    return len(words1 & words2) / denominator

def gen_text_rank_summary(text, calc_similarity=unique_words_similarity, summary_part=0.2, lower=True, morph=None):
    '''
    Build an extractive summary of ``text`` with TextRank.

    Sentences are the nodes of a graph whose edge weights are pairwise
    sentence similarities; PageRank scores the nodes and the best-scoring
    sentence is returned.

    Args:
        text: the document to summarise.
        calc_similarity: callable taking the token lists of two sentences and
            returning the weight of the edge between them.
        summary_part: kept for interface compatibility. It is currently NOT
            used: exactly one (top-ranked) sentence is always returned.
        lower: when True, tokens are lower-cased before comparison and the
            returned summary is lower-cased.
        morph: optional analyzer with ``parse(word)[0].normal_form`` (for
            example ``pymorphy2.MorphAnalyzer()``) used to lemmatize tokens.

    Returns:
        The summary string; an empty string when ``text`` has no sentences.
    '''
    # Split the text into sentences
    sentences = [sentence.text for sentence in razdel.sentenize(text)]
    n_sentences = len(sentences)

    # With fewer than two sentences there are no pairs, hence no edges and an
    # empty PageRank result. Return the only sentence (or "") directly instead
    # of silently producing an empty summary.
    if n_sentences < 2:
        predicted_summary = " ".join(sentences)
        return predicted_summary.lower() if lower else predicted_summary

    # Tokenize the sentences
    sentences_words = [
        [token.text.lower() if lower else token.text for token in razdel.tokenize(sentence)]
        for sentence in sentences
    ]

    # Lemmatize the words if an analyzer was supplied
    if morph is not None:
        sentences_words = [[morph.parse(word)[0].normal_form for word in words] for words in sentences_words]

    # Compute the similarity of every pair of sentences
    pairs = combinations(range(n_sentences), 2)
    scores = [(i, j, calc_similarity(sentences_words[i], sentences_words[j])) for i, j in pairs]

    # Build a graph whose edge weights are the sentence similarities
    g = nx.Graph()
    g.add_weighted_edges_from(scores)

    # Run PageRank and order sentences by descending score
    pr = nx.pagerank(g)
    result = [(i, pr[i], s) for i, s in enumerate(sentences) if i in pr]
    result.sort(key=lambda x: x[1], reverse=True)

    # Keep the single best sentence (see the note on ``summary_part`` above)
    result = result[:1]

    # Restore the original order of the selected sentences
    result.sort(key=lambda x: x[0])

    # Assemble the summary text
    predicted_summary = " ".join([sentence for i, proba, sentence in result])
    predicted_summary = predicted_summary.lower() if lower else predicted_summary
    return predicted_summary

def calc_text_rank_score(records, calc_similarity=unique_words_similarity, summary_part=0.1, lower=True, morph=None):
    '''
    Score TextRank-generated headlines against the true headlines.

    Args:
        records: table indexable as ``records[['Text', 'Head_title']].values``,
            yielding (text, title) rows.
        calc_similarity: sentence-similarity callable forwarded to
            ``gen_text_rank_summary``.
        summary_part: forwarded to ``gen_text_rank_summary``.
        lower: when True, titles are lower-cased; also forwarded to
            ``gen_text_rank_summary``.
        morph: optional lemmatizer forwarded to ``gen_text_rank_summary``.

    Returns:
        None. Metrics are printed by ``calc_scores``.
    '''
    references = []
    predictions = []

    for text, title in tqdm(records[['Text', 'Head_title']].values):
        references.append(title.lower() if lower else title)

        predicted_title = gen_text_rank_summary(text, calc_similarity, summary_part, lower, morph=morph)
        predictions.append(predicted_title)

    calc_scores(references, predictions)

In [None]:
%%time
# TextRank baseline on plain tokens: `morph` keeps its default of None, so no
# lemmatization is applied.
# NOTE(review): `cleaned_dataset` is not defined in the cells visible here -
# presumably built earlier in the notebook.
calc_text_rank_score(cleaned_dataset)

The second version lemmatizes the words with pymorphy2's MorphAnalyzer before comparing sentences.

In [None]:
# Same TextRank evaluation, but tokens are lemmatized with a pymorphy2
# MorphAnalyzer before sentence similarities are computed.
calc_text_rank_score(cleaned_dataset, morph=pymorphy2.MorphAnalyzer())

In [None]:
Oracle summary

In [None]:
def build_oracle_summary_greedy(text, gold_summary, calc_score, lower=True, max_sentences=30, max_summary_sentences=2):
    '''
    Greedily build an oracle extractive summary.

    Starting from an empty selection, each round tries adding every sentence
    that is not selected yet, scores the resulting summary against
    ``gold_summary`` and keeps the best addition. The search stops as soon as
    a round brings no strict improvement.

    Args:
        text: the document to extract sentences from.
        gold_summary: the reference summary to score against.
        calc_score: callable ``calc_score(candidate_summary, gold_summary)``
            returning a comparable score, higher is better.
        lower: when True, the sentences and ``gold_summary`` are lower-cased.
        max_sentences: only this many leading sentences of ``text`` are
            considered.
        max_summary_sentences: upper bound on the number of selected
            sentences (default 2, the limit that used to be hard-coded).

    Returns:
        Tuple ``(oracle_summary, oracle_summary_sentences)``: the selected
        sentences joined with spaces in document order, and the set of their
        indices.
    '''
    gold_summary = gold_summary.lower() if lower else gold_summary
    # Split the text into sentences
    sentences = [sentence.text.lower() if lower else sentence.text for sentence in razdel.sentenize(text)][:max_sentences]
    n_sentences = len(sentences)
    oracle_summary_sentences = set()
    score = -1.0
    for _ in range(min(n_sentences, max_summary_sentences)):
        # Candidates are rebuilt every round so that stale candidates from
        # earlier rounds never take part in the comparison.
        candidates = []
        for i in range(n_sentences):
            if i in oracle_summary_sentences:
                continue
            # Add one more sentence to the summary selected so far
            current_summary_sentences = oracle_summary_sentences | {i}
            current_summary = " ".join(sentences[index] for index in sorted(current_summary_sentences))
            # Score the extended summary
            current_score = calc_score(current_summary, gold_summary)
            candidates.append((current_score, current_summary_sentences))
        if not candidates:
            break
        # Compare by score only; on ties the earliest candidate wins (sets
        # are never compared with each other).
        best_summary_score, best_summary_sentences = max(candidates, key=lambda candidate: candidate[0])
        # Continue only if adding a sentence strictly improved the score
        if best_summary_score <= score:
            break
        oracle_summary_sentences = best_summary_sentences
        score = best_summary_score
    oracle_summary = " ".join(sentences[index] for index in sorted(oracle_summary_sentences))
    return oracle_summary, oracle_summary_sentences


def calc_single_score(pred_summary, gold_summary, rouge):
    '''
    Return the ROUGE-2 F value of one predicted summary against one reference.

    ``rouge`` must provide ``get_scores(hyps, refs, avg=True)`` whose result
    is indexable as ``result['rouge-2']['f']``.
    '''
    averaged_scores = rouge.get_scores([pred_summary], [gold_summary], avg=True)
    rouge_2_scores = averaged_scores['rouge-2']
    return rouge_2_scores['f']


def calc_oracle_score(records, nrows=30000, lower=True):
    '''
    Score greedy oracle summaries against the true headlines.

    For each of the first ``nrows`` rows an oracle summary is built with
    ``build_oracle_summary_greedy`` using ROUGE-2 F as the selection score.

    Args:
        records: table indexable as ``records[['Text', 'Head_title']].values``,
            yielding (text, title) rows.
        nrows: number of leading rows to evaluate.
        lower: when True, titles and oracle summaries are lower-cased; when
            False, both keep their original case.

    Returns:
        None. Metrics are printed by ``calc_scores``.
    '''
    references = []
    predictions = []
    rouge = Rouge()

    # The scorer does not depend on the row, so it is created once.
    def score_candidate(candidate_summary, gold_summary):
        return calc_single_score(candidate_summary, gold_summary, rouge)

    for text, title in tqdm(records[['Text', 'Head_title']].values[:nrows]):
        title = title.lower() if lower else title
        references.append(title)
        # Forward ``lower`` so predictions follow the same casing as the
        # references (it used to be silently forced to True).
        predicted_summary, _ = build_oracle_summary_greedy(text, title, calc_score=score_candidate, lower=lower)
        predictions.append(predicted_summary)

    calc_scores(references, predictions)

In [None]:
%%time
# Evaluate the greedy oracle summaries with the default settings
# (nrows=30000, lower=True).
# NOTE(review): `cleaned_dataset` is not defined in the cells visible here -
# presumably built earlier in the notebook.
calc_oracle_score(cleaned_dataset)