<a href="https://colab.research.google.com/github/katrina906/CS6120-Summarization-Project/blob/main/compare_extractive_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# only using False versions of metrics because always choose the same models for both and thus all results are the same

In [1]:
%%capture 
!pip install rouge-score
!pip install import-ipynb
!pip install fasttext
!pip install compress-fasttext
!pip install gensim==3.8.3

In [2]:
import os
import pandas as pd
import numpy as np
import pickle
import string
import re
import sys
import seaborn as sns
import matplotlib.pyplot as plt
import itertools
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer
from collections import Counter, OrderedDict
from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances
import networkx as nx
from rouge_score import rouge_scorer
import gensim
import fasttext
from gensim.models import FastText
import compress_fasttext
import nltk
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords  
import matplotlib.pyplot as plt
import numpy as np
import import_ipynb
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))  
import bokeh
from bokeh.layouts import gridplot, column, row
from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook
from bokeh.models import Div
from bokeh.models import Span
from bokeh.models import ColumnDataSource, FactorRange
from bokeh.transform import dodge
from math import pi

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# load in functions from extract_summarization notebook
%cd "drive/MyDrive/Colab Notebooks"
import extractive_summarization
%cd ..

/content/drive/MyDrive/Colab Notebooks
importing Jupyter notebook from extractive_summarization.ipynb
/content/drive/My Drive


### Load best models (one per algorithm, per metric)

In [310]:
model_dict = {}
eval_dict = {}
config_dict = {}
for model in ['lsa', 'textrank', 'baseline']:
  with open('/content/drive/MyDrive/data/trained_model_' + model + '.pkl', 'rb') as f:
    load = pickle.load(f)
    eval_dict[model] = load[1]
    model_dict[model] = load[2]
    config_dict[model] = load[3]

In [9]:
config_dict
# lsa always no normalization, stemming, all 3 grams
# lsa and textrank both always bow binary
# never embeddings used

{'baseline': {('fmeasure', False): "('baseline', 'num_words_gt')",
  ('fmeasure', True): "('baseline', 'num_words_gt')",
  ('precision', False): "('baseline', 'num_words_lt')",
  ('precision', True): "('baseline', 'num_words_lt')",
  ('recall', False): "('baseline', 'num_sentences')",
  ('recall', True): "('baseline', 'num_sentences')"},
 'lsa': {('fmeasure',
   False): "('lsa', 'nostop', 'stem', 'bow', 'binary', 'no_normalization', 'all', 'num_sentences')",
  ('fmeasure',
   True): "('lsa', 'nostop', 'stem', 'bow', 'binary', 'no_normalization', 'all', 'num_sentences')",
  ('precision',
   False): "('lsa', 'stopwords', 'stem', 'bow', 'counts', 'no_normalization', 'trigram', 'num_words_lt')",
  ('precision',
   True): "('lsa', 'stopwords', 'stem', 'bow', 'binary', 'no_normalization', 'all', 'num_words_lt')",
  ('recall',
   False): "('lsa', 'nostop', 'stem', 'bow', 'binary', 'no_normalization', 'all', 'num_sentences')",
  ('recall',
   True): "('lsa', 'nostop', 'stem', 'bow', 'binary', 

## Calculate P-Value with Paired Bootstrap Test

For best configuration for each evaluation metric, compare the 3 model types: which model is the best and what is the p-value?
1. Calculate difference in stat performance (recall etc.)
2. Generate N bootstrapped samples of data 
3. Train on bootstrapped data
4. Calculate difference in performance on bootstraped data
5. Count percent of replicate diffs that are >= 2 * original diff = p-value  
  - Null hypothesis is that there is no difference and the true effect size is original diff (data happens to be biased towards one model)
  - If see a lot of replicated diffs >= 2 * original diff, then null is true and there is no difference between the models

In [None]:
def paired_bootstrap(evals, models, configs, pvalue_dict, model1, model2, metric, save_every_cnt = 10, filename = '', restart = True):
  embeddings = extractive_summarization.load_embeddings()

  # which model is better and by how much
  if evals[model1][metric]['mean'] > evals[model2][metric]['mean']:
    better_model = model1
    other_model = model2
  else:
    better_model = model2
    other_model = model1
  diff =  evals[better_model][metric]['mean'] - evals[other_model][metric]['mean']
  print(better_model, diff)

  # allow start partway through 50 bootstrap samples 
  if not restart:
    with open('/content/drive/MyDrive/data/' + filename + '_' + model1 + '_' + model2 + '_' + str(metric) + '.pkl','rb') as f:
      results_so_far = pickle.load(f) 
    gt_diff = results_so_far[0]
    lt_diff = results_so_far[1]
    start = results_so_far[2]
  else:
    gt_diff = 0
    lt_diff = 0
    start = 0

  for i in range(start+1, 51):
    print('BS', i)
    # generate bootstrap samples 
    bs_sample = {}
    bs_sample[model1] = models[model1][metric].sample(n = len(models[model1][metric]), replace = True)
    bs_sample[model2] = models[model2][metric].sample(n = len(models[model2][metric]), replace = True)

    # retrain both models on bootstrap samples with the current config
    bs_results = {}
    for m in [model1, model2]:
      config = tuple(config_dict[m][metric].strip('(').strip(')').replace("'", "").split(', '))
      if 'baseline' in config:
        tfidf, feature_array = extractive_summarization.corpus_tfidf(bs_sample[m])
      else:
        tfidf = ''
        feature_array = ''
      eval_results, _ = extractive_summarization.train_config_loop(bs_sample[m], tfidf, feature_array, embeddings, stop_words, 
                                                                    [config], eval_only = True)
      bs_results[m] = eval_results[str(config)][metric]['mean']
      
    # find difference in relevant stat
    diff_bs = bs_results[better_model] - bs_results[other_model]
    print(diff_bs)
    if diff_bs >= 2*diff:
      gt_diff += 1
    else:
      lt_diff += 1

    # save bootstrap samples every save_every_cnt in case of connection issue, timeout etc.
    if ((i % save_every_cnt) == 0 or (i == 50)) and filename != '':
      with open('/content/drive/MyDrive/data/' + filename + '_' + model1 + '_' + model2 + '_' + str(metric) + '.pkl', 'wb') as f:
        pickle.dump([gt_diff, lt_diff, i], f)
        print('saving!', i)
    
  # calculate p value
  pvalue = gt_diff / (gt_diff + lt_diff)
  pvalue_dict[metric] = (better_model, pvalue)

  return pvalue_dict

In [None]:
for models in [('baseline', 'lsa'), ('baseline', 'textrank'), ('lsa', 'textrank')]: 
  if os.path.exists('/content/drive/MyDrive/data/pvalue_' + models[0] + '_' + models[1] + '.pkl'):
    # allow loading from pvalue dict with only some of the metrics
    with open('/content/drive/MyDrive/data/pvalue_' + models[0] + '_' + models[1] + '.pkl', 'rb') as f:
      pvalue_dict = pickle.load(f)
  else:
    pvalue_dict = {}
  for metric in [('fmeasure', False), ('precision', False), ('recall', False)]:
    if str(metric) not in pvalue_dict.keys():
      pvalue_dict = paired_bootstrap(eval_dict, model_dict, config_dict, pvalue_dict, models[0], models[1], metric,
                                     filename = 'bootstrap_loop', restart = True)
    with open('/content/drive/MyDrive/data/pvalue_' + models[0] + '_' + models[1] + '.pkl', 'wb') as f:
      pickle.dump(pvalue_dict, f)

In [None]:
# open completed pvalue dicts
for models in [('baseline', 'lsa'), ('baseline', 'textrank'), ('lsa', 'textrank')]: 
  with open('/content/drive/MyDrive/data/pvalue_' + models[0] + '_' + models[1] + '.pkl', 'rb') as f:
    pvalue_dict = pickle.load(f)
    print(models, '\n', pvalue_dict)

('baseline', 'lsa') 
 {('fmeasure', False): ('baseline', 0.0), ('precision', False): ('baseline', 0.0), ('recall', False): ('lsa', 0.0)}
('baseline', 'textrank') 
 {('fmeasure', False): ('textrank', 0.0), ('recall', False): ('textrank', 0.0), ('precision', True): ('textrank', 0.1)}
('lsa', 'textrank') 
 {('precision', True): ('textrank', 0.0), ('recall', False): ('textrank', 0.0), ('fmeasure', False): ('textrank', 0.0)}


### Compare algorithm performance

__Best Configs for Respective Best Metric__

In [97]:
output_notebook()

In [313]:
# calculate mean of best metric for each model 
mean_lst = {'lsa':[], 'textrank':[], 'baseline':[]}
for metric in [('fmeasure', False), ('precision', False), ('recall', False)]:
  for model in ['lsa', 'textrank', 'baseline']:
    mean_lst[model].append(eval_dict[model][metric]['mean'])

yvalues = [[i for i in mean_lst[k]] for (k,v) in mean_lst.items()]
yvalues = [item for sublist in yvalues for item in sublist]

xvalues = [[k for i in range(len(mean_lst[k]))] for (k,v) in mean_lst.items()]
xvalues = [item for sublist in xvalues for item in sublist]

mean_lst['metrics'] = ['F-Measure', 'Precision', 'Recall']
source = ColumnDataSource(data=mean_lst)

In [314]:
# side by side metric values for each metric across models (bar plot)
p = figure(x_range=['F-Measure', 'Precision', 'Recall'], y_range=(0, 0.5), plot_height=400, title="TextRank Uniformally Outperforms other Models",
           toolbar_location=None, tools="",  background_fill_color="#fafafa")

p.vbar(x=dodge('metrics', -0.25, range=p.x_range), top='baseline', width=0.2, source=source, legend_label="TF-IDF")
p.vbar(x=dodge('metrics',  0.0,  range=p.x_range), top='lsa', width=0.2, source=source, color = 'darkorange', legend_label="LSA")
p.vbar(x=dodge('metrics',  0.25, range=p.x_range), top='textrank', width=0.2, source=source, color = 'forestgreen', legend_label="TextRank")

# formatting
p.grid.grid_line_color="white"
p.xaxis.axis_label_text_font_size = '12pt'
p.yaxis.axis_label_text_font_size = '12pt'
p.title.text_font_size = '14pt'
p.xaxis.major_label_text_font_size = '12pt'
p.yaxis.major_label_text_font_size = '12pt'
p.legend.location = "top_left"
p.yaxis.minor_tick_line_color = None

show(p)

# note fmeasure and recall models the same for LSA
# note these are the best models for each - not comparing precision vs recall for the same model
# LSA always worst, then tf-idf, then textrank is the best

# precision for TF-IDF, textrank is the only difference that isn't significant. pvalue = 0.10

__Each of Best 9 models for all 3 metrics__

In [315]:
# calculate mean of all metrics for each model 

for model in ['textrank', 'lsa', 'baseline']:
  for metric in [('fmeasure', False), ('precision', False), ('recall', False)]:
    model_dict[model][metric]['precision'] = model_dict[model][metric].rouge.map(lambda row: row['rouge1'].precision)
    model_dict[model][metric]['recall'] = model_dict[model][metric].rouge.map(lambda row: row['rouge1'].recall)
    model_dict[model][metric]['fmeasure'] = model_dict[model][metric].rouge.map(lambda row: row['rouge1'].fmeasure)

mean_lst = {}
for model in ['lsa', 'textrank', 'baseline']:
  for stat in ['fmeasure', 'precision', 'recall']:
    mean_lst[model + '-' + stat] = []
for model in ['lsa', 'textrank', 'baseline']:
  for metric in [('fmeasure', False), ('precision', False), ('recall', False)]:
    for stat in ['fmeasure', 'precision', 'recall']:
      mean_lst[model + '-' + metric[0]].append(model_dict[model][metric][stat].mean())

xvalues = [[k for i in range(len(mean_lst[k]))] for (k,v) in mean_lst.items()]
xvalues = [item for sublist in xvalues for item in sublist]

yvalues = [[i for i in mean_lst[k]] for (k,v) in mean_lst.items()]
yvalues = [item for sublist in yvalues for item in sublist]

mean_lst['metrics'] = ['F-Measure', 'Precision', 'Recall']
source = ColumnDataSource(data=mean_lst)

In [316]:
# side by side metric values for each metric across models (bar plot)
p = figure(x_range=['F-Measure', 'Precision', 'Recall'], y_range=(0, 0.5), plot_height=400, plot_width = 800, title="TextRank Uniformally Outperforms other Models",
           toolbar_location=None, tools="",  background_fill_color="#fafafa")

d = 0.1
w = 0.09
p.vbar(x=dodge('metrics', -4*d, range=p.x_range), top='baseline-fmeasure', width=w, source=source, legend_label="TF-IDF: F-Measure", color = 'deepskyblue')
p.vbar(x=dodge('metrics',  -3*d,  range=p.x_range), top='baseline-precision', width=w, source=source, legend_label="TF-IDF: Precision")
p.vbar(x=dodge('metrics',  -2*d, range=p.x_range), top='baseline-recall', width=w, source=source, legend_label="TF-IDF: Recall", color = 'mediumblue')

p.vbar(x=dodge('metrics', -1*d+0.015, range=p.x_range), top='lsa-fmeasure', width=w, source=source, legend_label="LSA: F-Measure", color = 'sandybrown')
p.vbar(x=dodge('metrics',  0,  range=p.x_range), top='lsa-precision', width=w, source=source, legend_label="LSA: Precision", color = 'darkorange')
p.vbar(x=dodge('metrics',  d, range=p.x_range), top='lsa-recall', width=w, source=source, legend_label="LSA: Recall", color = 'orangered')

p.vbar(x=dodge('metrics', 2*d+0.015, range=p.x_range), top='textrank-fmeasure', width=w, source=source, legend_label="TextRank: F-Measure", color = 'limegreen')
p.vbar(x=dodge('metrics',  3*d,  range=p.x_range), top='textrank-precision', width=w, source=source, legend_label="TextRank: Precision", color = 'forestgreen')
p.vbar(x=dodge('metrics',  4*d, range=p.x_range), top='textrank-recall', width=w, source=source, legend_label="TextRank: Recall", color = 'green')

# formatting
p.grid.grid_line_color="white"
p.xaxis.axis_label_text_font_size = '12pt'
p.yaxis.axis_label_text_font_size = '12pt'
p.title.text_font_size = '14pt'
p.xaxis.major_label_text_font_size = '12pt'
p.yaxis.major_label_text_font_size = '12pt'
p.add_layout(p.legend[0], 'right')
p.yaxis.minor_tick_line_color = None

show(p)

# note fmeasure and recall models the same for LSA
# Fmesaure similar to baseline for all textrank models. recall does significantly better than others and its equivalents do almost as well as other models. 
  # could do better in precision, but recall more important - human content curator will take all the info and revise the summary. Want more info to work with. 

## Compare Predicted Summaries across Algorithms and Metrics


__Length of Predicted Summaries__

In [216]:
# length of summary by metric within model
summary_len_metric = {'lsa':{}, 'textrank':{}, 'baseline':{}}
for model in ['lsa', 'textrank', 'baseline']:
  for metric in [('fmeasure', False), ('precision', False), ('recall', False)]:
    df = model_dict[model][metric]
    df['summary_num_words'] = df.predicted_summary.map(lambda row: len(''.join(row).split(' ')))
    df['summary_num_sentences'] = df.predicted_summary.map(lambda row: len(row))
    summary_len_metric[model][metric] = [df.summary_num_sentences.mean(), df.summary_num_words.mean()]

In [317]:
def barplot(top, color, yrange, ylabel, title, noy = False):
  metrics = ['F-Measure', 'Precision', 'Recall']
  
  p = figure(x_range=metrics, y_range = yrange, plot_height=350, plot_width = 300, title=title, toolbar_location=None, tools="")
  p.vbar(x=metrics, top=top, width=0.9, color = color)

  # formatting
  p.grid.grid_line_color="white"
  p.xaxis.axis_label_text_font_size = '12pt'
  p.yaxis.axis_label_text_font_size = '12pt'
  p.title.text_font_size = '13pt'
  p.xaxis.major_label_text_font_size = '12pt'
  p.yaxis.major_label_text_font_size = '12pt'
  p.yaxis.axis_label = ylabel
  p.yaxis.minor_tick_line_color = None
  p.xaxis.major_label_orientation = pi/4

  if noy:
    p.yaxis.major_label_text_font_size = '0pt'

  return p

In [297]:
p1 = barplot([i[0] for i in list(summary_len_metric['baseline'].values())], '#1F77B4', (0,5.2), ylabel = '# Summary Sentences', title = 'TF-IDF')
p2 = barplot([i[0] for i in list(summary_len_metric['lsa'].values())], 'darkorange', (0,5.2), ylabel = '', title = 'LSA', noy = True)
p3 = barplot([i[0] for i in list(summary_len_metric['textrank'].values())], 'forestgreen', (0,5.2), ylabel = '', title = 'TextRank', noy = True)

suptitle = Div(text = """
<html>
<head>
<style>
h2
</style>
</head>
<body>
<h2>Precision Favors Short Summaries</h2>
</body>
</html>
""")

show(column(suptitle,gridplot([[p1, p2, p3]])))

# all 3 models get very similar sized summaries
# precision favors shorter summaries. maximizing overlap with predicted summary, so extra non relevant info hurts 
# fmeasure balances between recall and precision

In [298]:
p1 = barplot([i[1] for i in list(summary_len_metric['baseline'].values())], '#1F77B4', (0,125), ylabel = '# Summary Words', title = 'TF-IDF')
p2 = barplot([i[1] for i in list(summary_len_metric['lsa'].values())], 'darkorange', (0,125), ylabel = '', title = 'LSA', noy = True)
p3 = barplot([i[1] for i in list(summary_len_metric['textrank'].values())], 'forestgreen', (0,125), ylabel = '', title = 'TextRank', noy = True)

suptitle = Div(text = """
<html>
<head>
<style>
h2
</style>
</head>
<body>
<h2>Precision Favors Short Summaries</h2>
</body>
</html>
""")

show(column(suptitle,gridplot([[p1, p2, p3]])))

# textrank recall longest in terms of number of words - again good thing to provide more context to human content curators
  # (as long as still reasonable precision - from above, 1 in every 5 words is in the true summary)

__Relative Length Predicted vs Label Summary__

In [299]:
# relative length of actual vs predicted summary per model 
relative_len_metric = {'lsa':{}, 'textrank':{}, 'baseline':{}}
for model in ['lsa', 'textrank', 'baseline']:
  for metric in [('fmeasure', False), ('precision', False), ('recall', False)]:
    df = model_dict[model][metric]
    df['article_num_words'] = df.summary.map(lambda row: len(row.split(' ')))
    df['diff'] = df.summary_num_words - df.article_num_words 
    relative_len_metric[model][metric] = df['diff'].mean()

In [300]:
p1 = barplot([i for i in list(relative_len_metric['baseline'].values())], '#1F77B4', (-25,75), ylabel = '# Summary Words', title = 'TF-IDF')
p2 = barplot([i for i in list(relative_len_metric['lsa'].values())], 'darkorange', (-25,75), ylabel = '', title = 'LSA', noy = True)
p3 = barplot([i for i in list(relative_len_metric['textrank'].values())], 'forestgreen', (-25,75), ylabel = '', title = 'TextRank', noy = True)

suptitle = Div(text = """
<html>
<head>
<style>
h3
</style>
</head>
<body>
<h3>Recall Summaries Longer; Precision Summaries Shorter than Gold Standard Summary</h3>
</body>
</html>
""")

show(column(suptitle,gridplot([[p1, p2, p3]])))

# getting longer than actual summary is ok because using article sentences, which we know are longer and will have some unnecessary information in them
# human labeler will narrow down from too much text to correct summary

__Length of Predicted Extracted Sentences__

In [301]:
# length of sentences: words per sentence for each model
sentence_len_metric = {'lsa':{}, 'textrank':{}, 'baseline':{}}
for model in ['lsa', 'textrank', 'baseline']:
  for metric in [('fmeasure', False), ('precision', False), ('recall', False)]:
    df = model_dict[model][metric]
    df['article_num_words'] = df.predicted_summary.map(lambda row: np.mean([len(i.split(' ')) for i in row])) 
    sentence_len_metric[model][metric] = df['article_num_words'].mean()

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [304]:
p1 = barplot([i for i in list(sentence_len_metric['baseline'].values())], '#1F77B4', (0,30), ylabel = '# Summary Words', title = 'TF-IDF')
p2 = barplot([i for i in list(sentence_len_metric['lsa'].values())], 'darkorange', (0,30), ylabel = '', title = 'LSA', noy = True)
p3 = barplot([i for i in list(sentence_len_metric['textrank'].values())], 'forestgreen', (0,30), ylabel = '', title = 'TextRank', noy = True)

suptitle = Div(text = """
<html>
<head>
<style>
h3
</style>
</head>
<body>
<h3>Extracted Sentences of Uniform Length across Models</h3>
</body>
</html>
""")

show(column(suptitle,gridplot([[p1, p2, p3]])))

# summary sentences pulled of a similar length

In [None]:
model_dict['tf-idf'][('recall', False)]

In [311]:
model_dict['textrank'][('recall', False)]

Unnamed: 0,sentences,sentences_cleaned,summary,rouge,predicted_summary
4970565f6984d6a1daa9aebd2431a839ff5ced51.story,[Few sports can compare to football in terms o...,[few sport can compar to footbal in term of it...,"Football is played all over the planet, meanin...","{'rouge1': (0.1232876712328767, 0.339622641509...",[The cistern stores rainwater that can be used...
24aa26911c68d37efc01c9ab1bb63ad47f03f893.story,[Hours before the George Zimmerman not-guilty ...,[hour befor the georg zimmerman notguilti verd...,LZ Granderson: Before verdict I'd discussed wi...,"{'rouge1': (0.20422535211267606, 0.40845070422...","[If, during this 16-month ordeal, that thought..."
42b09b870f0cb9bfc68f94dc9eafa8dbd4b52671.story,[A storm has killed at least 16 people on the ...,[a storm ha kill at least 16 peopl on the ital...,"Italian PM Enrico Letta declares emergency, al...","{'rouge1': (0.07766990291262135, 0.22857142857...",[But Vargiu said a lot of ministry staff and e...
d315c6c1a7c2d2ad0b27e1f4e22bda9639fd7968.story,"[Furious winds, giant waves and lack of sleep ...",[furiou wind giant wave and lack of sleep have...,Dilip Donde is the first Indian sailor ever to...,"{'rouge1': (0.16129032258064516, 0.42372881355...",[It is wonderful to sail on a good sunny day w...
eb7568f849dda432085cf663f0db1a4e41e53f14.story,"[In the style capital of the world, AC Milan i...",[in the style capit of the world ac milan is l...,"AC Milan looking to build a new 48,000-seater ...","{'rouge1': (0.25, 0.47058823529411764, 0.32653...","[The famed football club, seven-time champions..."
...,...,...,...,...,...
4d29db2f2f9a4337d381e3de5a3214d5fe2cd07b.story,[NASA is sending astronaut Scott Kelly to live...,[nasa is send astronaut scott kelli to live on...,Astronaut Scott Kelly to live in space one yea...,"{'rouge1': (0.10144927536231885, 0.58333333333...",[NASA is sending astronaut Scott Kelly to live...
26539d71bda602b769710c797bdd61cc6b276438.story,"[Saying that ""it's time to burn the beret and ...",[say that it time to burn the beret and buri t...,Monica Lewinsky speaks out on affair that led ...,"{'rouge1': (0.1728395061728395, 0.583333333333...","[Saying that ""it's time to burn the beret and ..."
dff5fd52c46068123ce0f253b19c2581eaf650f2.story,[Monza may be a long way from his birthplace i...,[monza may be a long way from hi birthplac in ...,F1 driver Paul di Resta tells CNN about his dr...,"{'rouge1': (0.18125, 0.46774193548387094, 0.26...","[""When I was growing up, the Ferrari dominatio..."
6e713e225194640ecb9fd39079334420d3a87d02.story,"[Last week, in an unusually public display of ...",[last week in an unusu public display of campa...,Tensions have already emerged among Hillary Cl...,"{'rouge1': (0.08152173913043478, 0.57692307692...","[As Swaab, lead author of the ""too much talent..."
