In [1]:
import math
import numpy as np
import warnings
from tqdm import tqdm
from nltk.corpus import gutenberg
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktTrainer
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
es = Elasticsearch(['https://sunho:Dunkel6eit!!@i-o-optimized-deployment-84c1c6.es.us-east-1.aws.found.io:9243'], timeout=30)

In [3]:
def create_training_data_and_labels(es, index_name):
    """ Create training data/labels given the Elasticsearch client. """
    doc_ids, training_data, training_labels = [], [], []
    res = es.search(index=index_name, size=20000)
    hits = res['hits']['hits']
    for hit in tqdm(hits, position=0, desc='creating training data/labels'):
        doc_id = hit['_id']
        text = hit['_source']['text']
        doc_ids.append(doc_id)
        training_data.append(text)
    return doc_ids, np.array(training_data)

def create_and_train_tokenizer(train_file_ids):
    text = ""
    for file_id in train_file_ids:
        text += gutenberg.raw(file_id)

    trainer = PunktTrainer()
    trainer.INCLUDE_ALL_COLLOCS = True
    trainer.train(text)
    
    tokenizer = PunktSentenceTokenizer(trainer.get_params())
    return tokenizer

In [4]:
doc_ids, training_data = create_training_data_and_labels(es, 'duc-2001')

creating training data/labels: 100%|██████████| 308/308 [00:00<00:00, 225295.72it/s]


In [5]:
tokenizer = create_and_train_tokenizer(train_file_ids=gutenberg.fileids())
vectorizer = TfidfVectorizer()

In [6]:
def kl_score(p, q, lambda_param=0.1):
    """ Compute KL similarity score. """
    return sum([p[i] * math.log(p[i] + lambda_param / (q[i] + lambda_param * len(p) ) ) for i in range(len(p))])

def KL_similarization(training_data, vectorizer, tokenizer):
    """ Extract KL summaries given the training dataset. """
    summaries = {}

    for doc_index, document in enumerate(tqdm(training_data, position=0, desc='KL summarization')):
        best_sentences = []
        vectorizer = TfidfVectorizer(stop_words='english')
        doc_vectors = vectorizer.fit_transform([document]).toarray()[0]
        for _ in range(3):
            sentences = tokenizer.tokenize(document)
            sentence_score_map = {}
            for sentence_order, sentence in enumerate(sentences):
                if sentence in best_sentences:
                    continue
                candidates = [summary for summary in best_sentences]
                candidates.append(sentence)
                candidate_as_sentences = ' '.join(candidate for candidate in candidates)
                candidate_vector = vectorizer.transform([candidate_as_sentences]).toarray()[0]
                sentence_score_map[sentence_order] = kl_score(doc_vectors, candidate_vector)
            if len(sentence_score_map) != 0:
                top_candidate_order, top_candidate_score = sorted(sentence_score_map.items(), key=lambda x: x[1])[0]
                top_candidate_sentence = sentences[top_candidate_order]
                best_sentences.append(top_candidate_sentence)
        
        doc_id = doc_ids[doc_index]
        summary = ' '.join(best_sentences)
        summaries[doc_id] = summary
    return summaries

def LDA_summarization(training_data, vectorizer, tokenizer):
    """ Extract LDA summaries given the training dataset. """
    summaries = {}
    lda = LatentDirichletAllocation(n_components=20, max_iter=5, learning_method='online', learning_offset=50, random_state=0)

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        for doc_index, document in enumerate(tqdm(training_data, position=0, desc='LDA summarization')):
            best_sentences = []
            vectorizer = TfidfVectorizer(stop_words='english')
            doc_vectors = vectorizer.fit_transform([document]).todense()
            doc_topics_distribution = lda.fit_transform(doc_vectors).flatten()
            for _ in range(3):
                sentences = tokenizer.tokenize(document)
                sentence_score_map = {}
                for sentence_order, sentence in enumerate(sentences):
                    if sentence in best_sentences:
                        continue
                    candidates = [summary for summary in best_sentences]
                    candidates.append(sentence)
                    candidate_as_sentences = ' '.join(candidate for candidate in candidates)
                    candidate_vector = vectorizer.transform([candidate_as_sentences]).todense()
                    sentence_topics_distribution = lda.transform(candidate_vector).flatten()
                    sentence_score_map[sentence_order] = kl_score(doc_topics_distribution, sentence_topics_distribution)
                if len(sentence_score_map) == 0:
                    continue
                top_candidate_order, top_candidate_score = sorted(sentence_score_map.items(), key=lambda x: x[1])[0]
                top_candidate_sentence = sentences[top_candidate_order]
                best_sentences.append(top_candidate_sentence)
            
            doc_id = doc_ids[doc_index]
            summary = ' '.join(best_sentences)
            summaries[doc_id] = summary
    return summaries

In [7]:
kl_summaries = KL_similarization(training_data, vectorizer, tokenizer)

KL summarization: 100%|██████████| 308/308 [00:49<00:00,  6.22it/s]


In [8]:
lda_summaries = LDA_summarization(training_data, vectorizer, tokenizer)

LDA summarization: 100%|██████████| 308/308 [00:50<00:00,  6.06it/s]


In [9]:
def update_summaries(index_name, kl_summaries, lda_summaries):
    summaries_zipped = zip(kl_summaries.items(), lda_summaries.items())
    for (doc_id, kl_summary), (_, lda_summary) in tqdm(summaries_zipped, position=0, desc=f'updating {index_name} summaries'):
        yield {
            '_index': index_name,
            '_op_type': 'update',
            '_id': doc_id,
            'doc': {
                'kl_summary': kl_summary,
                'lda_summary': lda_summary
            }
        }

In [10]:
bulk(es, update_summaries('duc-2001', kl_summaries, lda_summaries))

updating duc-2001 summaries: 308it [00:00, 22794.73it/s]


(308, [])

In [11]:
from pyrouge.rouge import Rouge155
from pprint import pprint

In [12]:
def evaluation(summaries):
    res = es.search(index='duc-2001')
    hits = res['hits']['hits']
    count, precision, recall, f_score = 0, 0, 0, 0
    for hit in hits:
        summary_id = hit['_id']
        summary_text = hit['_source']['gold_summary']

        ref_texts = {summary_id: summaries[summary_id]}

        rouge = Rouge155(n_words=100)
        score_dict = rouge.score_summary(summary_text, ref_texts)
        precision += score_dict['rouge_1_precision']
        recall += score_dict['rouge_1_recall']
        f_score += score_dict['rouge_1_f_score']
        count += 1
    precision, recall, f_score = precision/count, recall/count, f_score/count
    print(f'rouge_1_precision: {precision}, rouge_1_recall: {recall}, rouge_1_f_score: {f_score}')

In [13]:
print('KL summaries evaluation')
evaluation(kl_summaries)

KL summaries evaluation
rouge_1_precision: 0.791975, rouge_1_recall: 0.795583, rouge_1_f_score: 0.7936829999999999


In [14]:
print('LDA summaries evaluation')
evaluation(lda_summaries)

LDA summaries evaluation
rouge_1_precision: 0.773509, rouge_1_recall: 0.7840779999999999, rouge_1_f_score: 0.7786629999999999


In [15]:
from pyrouge.rouge import Rouge155
from pprint import pprint

ref_texts = {'A': "Poor nations pressurise developed countries into granting trade subsidies."}
summary_text = "Poor nations demand trade subsidies from developed nations."


rouge = Rouge155(n_words=100)
score = rouge.score_summary(summary_text, ref_texts)
pprint(score)

{'rouge_1_f_score': 0.77586,
 'rouge_1_f_score_cb': 0.77586,
 'rouge_1_f_score_ce': 0.77586,
 'rouge_1_precision': 0.88235,
 'rouge_1_precision_cb': 0.88235,
 'rouge_1_precision_ce': 0.88235,
 'rouge_1_recall': 0.69231,
 'rouge_1_recall_cb': 0.69231,
 'rouge_1_recall_ce': 0.69231,
 'rouge_2_f_score': 0.57894,
 'rouge_2_f_score_cb': 0.57894,
 'rouge_2_f_score_ce': 0.57894,
 'rouge_2_precision': 0.66,
 'rouge_2_precision_cb': 0.66,
 'rouge_2_precision_ce': 0.66,
 'rouge_2_recall': 0.51562,
 'rouge_2_recall_cb': 0.51562,
 'rouge_2_recall_ce': 0.51562,
 'rouge_3_f_score': 0.5,
 'rouge_3_f_score_cb': 0.5,
 'rouge_3_f_score_ce': 0.5,
 'rouge_3_precision': 0.57143,
 'rouge_3_precision_cb': 0.57143,
 'rouge_3_precision_ce': 0.57143,
 'rouge_3_recall': 0.44444,
 'rouge_3_recall_cb': 0.44444,
 'rouge_3_recall_ce': 0.44444,
 'rouge_4_f_score': 0.45455,
 'rouge_4_f_score_cb': 0.45455,
 'rouge_4_f_score_ce': 0.45455,
 'rouge_4_precision': 0.52083,
 'rouge_4_precision_cb': 0.52083,
 'rouge_4_precisi