In [1]:
import pandas as pd
import text_summarizer
import rouge
from tqdm import tqdm
from concurrent.futures import ProcessPoolExecutor

In [2]:
# Show what is currently in the local data directory.
# NOTE(review): this runs before the cell that creates .data, so on a fresh
# checkout the directory may not exist yet — consider moving it after the download.
!ls .data

test.txt.src  test.txt.tgt


In [3]:
# Make sure the data directory exists, then fetch the cnn-dm test sources and
# the tagged target summaries. wget's -nc flag leaves a file alone when it is
# already present locally.
!mkdir -p .data
!wget -nc -O .data/test.txt.tgt https://raw.githubusercontent.com/lambdaofgod/project_data/master/summarization/cnn-dm/test.txt.tgt.tagged
!wget -nc -O .data/test.txt.src https://raw.githubusercontent.com/lambdaofgod/project_data/master/summarization/cnn-dm/test.txt.src

File ‘.data/test.txt.tgt’ already there; not retrieving.
File ‘.data/test.txt.src’ already there; not retrieving.


In [4]:
def map_parallel(f, iter):
    """Lazily apply ``f`` to every element of ``iter`` using worker processes.

    Parameters
    ----------
    f : callable
        Function applied to each item. It is sent to separate processes, so it
        must be picklable (e.g. a module-level function).
    iter : iterable
        Items to process. The name shadows the built-in ``iter``; it is kept
        unchanged so existing keyword callers keep working.

    Yields
    ------
    object
        ``f(item)`` for each item, in the same order as the input.
    """
    # The context manager shuts the pool down once all results have been
    # consumed (or the generator is closed/garbage-collected). Previously the
    # executor was created and never released, leaking worker processes on
    # every call.
    with ProcessPoolExecutor() as executor:
        yield from executor.map(f, iter)


def load_texts(path):
    """Read a text file into a ``pd.Series`` with one element per line.

    Parameters
    ----------
    path : str or path-like
        Location of the text file to read.

    Returns
    -------
    pd.Series
        The file's lines in order; each element keeps its trailing newline,
        exactly as returned by ``readlines()``.
    """
    # Use a context manager so the file handle is closed deterministically
    # instead of being left open until garbage collection.
    with open(path, 'r') as text_file:
        return pd.Series(text_file.readlines())


def maybe_summarize_texts(summarization_method, texts):
    """Summarize every text in parallel and collect the results into a Series.

    ``summarization_method`` is applied to each element of ``texts`` through
    ``map_parallel``; a tqdm progress bar sized to ``len(texts)`` tracks the
    results as they are consumed. Returns a ``pd.Series`` of the results.
    """
    pending_results = map_parallel(summarization_method, texts)
    with_progress = tqdm(pending_results, total=len(texts))
    collected = list(with_progress)
    return pd.Series(collected)


def flatten_rouge_dict(rouge_dict):
    """Collapse a two-level dict into a single level.

    Every inner entry ``rouge_dict[outer][inner]`` is stored under the key
    ``outer + '-' + inner`` in the returned dict.
    """
    flat = {}
    for metric_name, stats in rouge_dict.items():
        for stat_name, value in stats.items():
            flat[metric_name + '-' + stat_name] = value
    return flat


def get_rouge_df(summaries, references):
    """Score summaries against references with ROUGE and return a DataFrame.

    Entries whose summary is the empty string are dropped (together with the
    matching reference) before scoring. Each remaining pair becomes one row,
    with the score dict flattened by ``flatten_rouge_dict``.
    """
    has_summary = summaries != ''
    kept_summaries = summaries[has_summary]
    kept_references = references[has_summary]

    scorer = rouge.Rouge()
    rows = []
    for pair_scores in scorer.get_scores(kept_summaries, kept_references):
        rows.append(flatten_rouge_dict(pair_scores))
    return pd.DataFrame(rows)

In [5]:
# Read the source texts and their reference summaries, one entry per file line.
input_texts, reference_summaries = (
    load_texts(data_path)
    for data_path in ('.data/test.txt.src', '.data/test.txt.tgt')
)

In [6]:
# Whitespace-delimited token counts for each input text and each reference summary.
input_lengths = input_texts.str.split().map(len)
reference_summaries_lengths = reference_summaries.str.split().map(len)

lengths_df = pd.DataFrame(
    {
        'input': input_lengths,
        'summary': reference_summaries_lengths,
    }
)

In [7]:
# Summary statistics of the token counts for inputs and reference summaries.
lengths_df.describe()

Unnamed: 0,input,summary
count,11490.0,11490.0
mean,778.267885,66.074151
std,399.743713,26.906939
min,59.0,11.0
25%,475.0,49.0
50%,701.0,62.0
75%,998.0,76.0
max,2380.0,738.0


In [8]:
def target_summary_length(text, summary_length_ratio=0.2):
    """Return the target summary size for ``text``, in whitespace-separated words.

    The word count of ``text`` is multiplied by ``summary_length_ratio`` and
    truncated to an ``int``.
    """
    word_count = len(text.split())
    scaled = word_count * summary_length_ratio
    return int(scaled)

In [9]:
# Module-level summarizer instance used by summarize_with_cbow.
# NOTE(review): the meaning/units of length_limit=3 are defined by
# text_summarizer.CentroidBOWSummarizer — confirm against its documentation.
cbow_summarizer = text_summarizer.CentroidBOWSummarizer(length_limit=3)


def summarize_with_cbow(text):
    """Summarize ``text`` with the module-level ``cbow_summarizer``.

    The summarizer's ``limit`` is set to ``target_summary_length(text)``.

    Returns
    -------
    str
        The summary, or ``''`` if summarization raised an exception
        (``get_rouge_df`` filters out empty summaries before scoring).
    """
    try:
        summary = cbow_summarizer.summarize(text, limit=target_summary_length(text))
    except Exception:
        # Deliberate best-effort: a failing document yields an empty summary.
        # Catch Exception rather than using a bare ``except`` so that
        # KeyboardInterrupt / SystemExit still stop the (worker) process.
        summary = ''
    return summary

In [10]:
%%time
# Load the 'glove-wiki-gigaword-50' embedding model through text_summarizer's
# gensim loader; the trailing ';' suppresses the cell's output repr.
embeddings = text_summarizer.centroid_word_embeddings.load_gensim_embedding_model('glove-wiki-gigaword-50');

CPU times: user 18.8 s, sys: 318 ms, total: 19.1 s
Wall time: 20 s


In [11]:
# Module-level summarizer instance built on the loaded embeddings; used by
# summarize_with_embeddings.
# NOTE(review): the meaning/units of length_limit=3 are defined by
# text_summarizer.CentroidWordEmbeddingsSummarizer — confirm against its documentation.
embedding_summarizer = text_summarizer.CentroidWordEmbeddingsSummarizer(embeddings, length_limit=3)


def summarize_with_embeddings(text):
    """Summarize ``text`` with the module-level ``embedding_summarizer``.

    The summarizer's ``limit`` is set to ``target_summary_length(text)``.

    Returns
    -------
    str
        The summary, or ``''`` if summarization raised an exception
        (``get_rouge_df`` filters out empty summaries before scoring).
    """
    try:
        summary = embedding_summarizer.summarize(text, limit=target_summary_length(text))
    except Exception:
        # Deliberate best-effort: a failing document yields an empty summary.
        # Catch Exception rather than using a bare ``except`` so that
        # KeyboardInterrupt / SystemExit still stop the (worker) process.
        summary = ''
    return summary

In [12]:
# Summarize every input text with the CBOW summarizer; texts whose
# summarization raised come back as '' (see summarize_with_cbow).
cbow_summaries = maybe_summarize_texts(summarize_with_cbow, input_texts)

100%|██████████| 11490/11490 [02:46<00:00, 68.94it/s]


In [13]:
%%time

# ROUGE scores of the CBOW summaries against the reference summaries
# (empty summaries are excluded inside get_rouge_df).
cbow_rouge_df = get_rouge_df(cbow_summaries, reference_summaries)

CPU times: user 3min 2s, sys: 504 ms, total: 3min 2s
Wall time: 3min 3s


In [14]:
import summa

def summarize_with_textrank(text):
    """Summarize ``text`` with ``summa.summarizer.summarize``.

    The ``words`` argument is set to ``target_summary_length(text)``.

    Returns
    -------
    str
        The summary, or ``''`` if summarization raised an exception
        (``get_rouge_df`` filters out empty summaries before scoring).
    """
    try:
        summary = summa.summarizer.summarize(text, words=target_summary_length(text))
    except Exception:
        # Deliberate best-effort: a failing document yields an empty summary.
        # Catch Exception rather than using a bare ``except`` so that
        # KeyboardInterrupt / SystemExit still stop the (worker) process.
        summary = ''
    return summary

In [15]:
%%time

# Summarize every input text with summa's summarizer; texts whose
# summarization raised come back as '' (see summarize_with_textrank).
textrank_summaries = maybe_summarize_texts(summarize_with_textrank, input_texts);

100%|██████████| 11490/11490 [03:11<00:00, 60.12it/s]

CPU times: user 8.69 s, sys: 1.34 s, total: 10 s
Wall time: 3min 13s





In [16]:
%%time

# ROUGE scores of the TextRank summaries against the reference summaries
# (empty summaries are excluded inside get_rouge_df); the bare name on the
# last line displays the resulting frame.
textrank_scores_df = get_rouge_df(textrank_summaries, reference_summaries)
textrank_scores_df

CPU times: user 2min 39s, sys: 295 ms, total: 2min 40s
Wall time: 2min 40s


In [17]:
# Column-wise mean of the TextRank score columns whose name contains '-r'.
textrank_scores_df.filter(like='-r').mean()

rouge-1-r    0.505383
rouge-2-r    0.201163
rouge-l-r    0.462364
dtype: float64

In [18]:
# Column-wise mean of the CBOW score columns whose name contains '-r'.
cbow_rouge_df.filter(like='-r').mean()

rouge-1-r    0.475737
rouge-2-r    0.175338
rouge-l-r    0.433820
dtype: float64

In [19]:
# Summarize every input text with the word-embedding summarizer; texts whose
# summarization raised come back as '' (see summarize_with_embeddings).
embedding_summaries = maybe_summarize_texts(summarize_with_embeddings, input_texts)

  uu = np.average(np.square(u), weights=w)
  uu = np.average(np.square(u), weights=w)
  uu = np.average(np.square(u), weights=w)
  uu = np.average(np.square(u), weights=w)
  uv = np.average(u * v, weights=w)
  vv = np.average(np.square(v), weights=w)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  vv = np.average(np.square(v), weights=w)
  uv = np.average(u * v, weights=w)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  vv = np.average(np.square(v), weights=w)
  uv = np.average(u * v, weights=w)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  vv = np.average(np.square(v), weights=w)
  uv = np.average(u * v, weights=w)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
100%|██████████| 11490/11490 [04:12<00:00, 45.44it/s]


In [20]:
# ROUGE scores of the embedding-based summaries against the reference
# summaries (empty summaries are excluded inside get_rouge_df).
embedding_scores_df = get_rouge_df(embedding_summaries, reference_summaries)

In [21]:
# Column-wise mean of the embedding score columns whose name contains '-r'.
embedding_scores_df.filter(like='-r').mean()

rouge-1-r    0.469270
rouge-2-r    0.157695
rouge-l-r    0.425080
dtype: float64