In [1]:
import spacy
import os
from spacy_readability import Readability
import numpy as np
import scipy.stats as st
from collections import defaultdict
import logging
import textacy
import pickle

logger = logging.getLogger()
logger.setLevel(logging.INFO)

Load Spacy

In [2]:
nlp = spacy.load('en_core_web_sm', disable=['ner'])
read = Readability()
nlp.add_pipe(read, last=True)

In [3]:
exp_dir = 'data/exp2'

Helpers

In [4]:
def conf_interval(a):
    return np.mean(a), st.t.interval(0.95, len(a)-1, loc=np.mean(a), scale=st.sem(a))

Stats

In [5]:
def extract_ngrams(doc, n=1):
    return textacy.extract.ngrams(doc, n, filter_stops=True, filter_punct=True, filter_nums=False)

class NGrams:
    def __init__(self, corpus=None, ns=[1, 2, 3]):
        self.ns = ns
        self.ngrams = defaultdict(list)
        self.unique = defaultdict(list)
        if corpus is not None:
            self.load_corpus(corpus)
            
    def load_corpus(self, corpus):
        for doc in corpus:
            self.add_doc(doc)
            
    def add_doc(self, doc):
        for n in self.ns:
            ngrams = list(str(s) for s in extract_ngrams(doc, n))
            self.ngrams[n].append(ngrams)
            if len(ngrams) == 0:
                continue
            unique = set(ngrams)
            self.unique[n].append(len(unique)/len(ngrams))
    
class NGramStats:
    def __init__(self, corpus=None, ns=[1, 2, 3]):
        self.ngrams = NGrams(corpus, ns)
        self.overlaps = {}
        self.raw = defaultdict(list)
           
    def add_doc(self, doc):
        self.ngrams.add_doc(doc)
           
    def calc_overlaps(self, other_name, other_ngrams):
        assert len(self.ngrams.ns) == len(other_ngrams.ns)
        
        if other_name not in self.overlaps:
            self.overlaps[other_name] = defaultdict(list)

        for n in self.ngrams.ns:
            overlaps = self.calc_overlap(self.ngrams.ngrams[n], other_ngrams.ngrams[n])
            self.overlaps[other_name][n] = overlaps
    
    def calc_overlap(self, ngram, other_ngram):
        
        assert len(ngram) == len(other_ngram)
        
        perc_overlaps = []

        for i in range(len(ngram)):
            this = ngram[i]
            other = other_ngram[i]
            
            common = set(this) & set(other)
            
            if len(this) == 0:
                perc_overlaps.append(0)
                continue
                
            if common == 0:
                perc_overlaps.append(0)
                continue
                
            overlaps = 0
            for j in this:
                if str(j) in common:
                    overlaps += 1
            perc_overlaps.append(overlaps / len(this))

        return perc_overlaps
    
    def stats(self):
        results = {'overlaps': {}, 'distinct': {}}
        
        for n in self.ngrams.ns:
            results['distinct'][n] = conf_interval(self.ngrams.unique[n])
            
        for name in self.overlaps:
            results['overlaps'][name] = dict()
            for n in self.overlaps[name]:
                conf = conf_interval(self.overlaps[name][n])
                results['overlaps'][name][n] = conf
        return results

In [6]:
class TextStats:
    def __init__(self, text_path):
        
        self.general = GeneralStats()
        self.readability = ReadabilityStats()
        self.ngrams = NGramStats()
        
        self.load_corpus(text_path)
        
    def load_corpus(self, text_path, max_len=-1):
        self.corpus = []
        
        with open(text_path,'r') as f:
            self.texts = f.read().splitlines()[:max_len]
            i = 0
            for doc in nlp.pipe(self.texts):
                if i % 1000 == 0:
                    logger.info('{} docs loaded'.format(i))
                self.corpus.append(doc)
                self.readability.add_doc(doc)
                self.general.add_doc(doc)
                self.ngrams.add_doc(doc)
                i+=1
                
    def stats(self):
        return {
            'readability': self.readability.stats(),
            'general': self.general.stats(),
            'ngrams': self.ngrams.stats()
        }
                
class GeneralStats:
    def __init__(self, corpus=None):
        self.metrics = [
            'n_sents', 
            'n_words', 
        ]
        self.raw = defaultdict(list)
        if corpus is not None:
            self.add_corpus(corpus)
        
    def add_corpus(self, corpus):
        for doc in corpus:
            self.add_doc(doc)
                    
    def add_doc(self, doc):
        self.raw['n_sents'].append(len(list(doc.sents)))
        self.raw['n_words'].append(len(doc))

    def stats(self):
        results = dict()
        for metric in self.metrics:
            results[metric] = conf_interval(self.raw[metric])
        
        return results
    
class ReadabilityStats:
    def __init__(self, corpus=None):
        self.metrics = [
            'flesch_kincaid_grade_level', 
            'flesch_kincaid_reading_ease', 
            'dale_chall', 
            'coleman_liau_index', 
            'automated_readability_index']
        
        self.raw = defaultdict(list)
        if corpus is not None:
            self.add_corpus(corpus)
        
    def add_corpus(self, corpus):
        for doc in corpus:
            self.add_doc(doc)
                    
    def add_doc(self, doc):
        for metric in self.metrics:
            if not getattr(doc._, metric):
                print('{} not found'.format(metric))
            self.raw[metric].append(getattr(doc._, metric))
        
    def stats(self):
        results = dict()
        
        for metric in self.metrics:
            results[metric] = conf_interval(self.raw[metric])
        
        return results

In [7]:
class ExperimentResult:
    def __init__(self, res_path):
        self.res_path = res_path
        self.name = os.path.basename(res_path)
        # Load Hypothesis
        self.text_stats = TextStats(os.path.join(self.res_path, 'hyp.txt'))
        
        # Load Rouge
        # Todo

In [8]:
class GoldSummaries:
    def __init__(self, res_path):
        self.res_path = res_path
        
        # Load Gold
        self.text_stats = TextStats(os.path.join(self.res_path, 'tar.txt'))

In [9]:
class SourceArticles:
    def __init__(self, res_path):
        self.res_path = res_path
        
        # Load Gold
        self.text_stats = TextStats(os.path.join(self.res_path, 'src.txt'))

In [19]:
class Experiment:
    def __init__(self, exp_path):
        self.exp_path = exp_path
        self.load_experiment()
        
    def load_experiment(self):
        self.results = []
        logging.info('Loading source articles')
        self.source = self.source = SourceArticles(self.exp_path)
        
        self.gold = None
        
        for d in os.listdir(self.exp_path):
            res_path = os.path.join(self.exp_path, d)
            if os.path.isdir(res_path):
                if self.gold is None:
                    logging.info('Loading gold summaries')
                    self.gold = GoldSummaries(res_path)
                    self.gold.text_stats.ngrams.calc_overlaps('source', self.source.text_stats.ngrams.ngrams)
                logging.info('Loading {}'.format(res_path))
                r = ExperimentResult(res_path)
                r.text_stats.ngrams.calc_overlaps('source', self.source.text_stats.ngrams.ngrams)
                self.results.append(r)
                
    def stats(self):
        agg = {}
        
        agg['gold'] = self.gold.text_stats.stats()
        
        for r in self.results:
            agg[r.name] = r.text_stats.stats()
            
        return agg
    
    def save(self, path):
        with open(path, 'wb') as handle:
            pickle.dump(self, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [20]:
class LatexWriter:
    def __init__(self, stats, order=None):
        self.order = order
        self.stats = stats
        
    def ngram_distinct(self):
        def write_row(model, label):
            s = self.stats[model]['ngrams']['distinct']
            
            return '{} & {:0.2f} & {:0.2f} & {:0.2f} \\\\'.format(label, 
                                                    s[1][0], 
                                                    s[2][0], 
                                                    s[3][0])
        for model, label in self.order:
            print(write_row(model, label))
            
    def ngram_similarity(self):
        def write_row(model, label):
            s = self.stats[model]['ngrams']['overlaps']['source']
            
            return '{} & {:0.2f} & {:0.2f} & {:0.2f} \\\\'.format(label, 
                                                    s[1][0], 
                                                    s[2][0], 
                                                    s[3][0])
        for model, label in self.order:
            print(write_row(model, label))
            
    def length(self):
        def write_row(model, label):
            s = self.stats[model]['general']
            
            return '{} & {:0.2f} \\\\'.format(label, 
                                                    s['n_words'][0])
        for model, label in self.order:
            print(write_row(model, label))
        
    def readability(self):
        def write_row(model, label):
            s = self.stats[model]['readability']
            
            return '{} & {:0.2f} & {:0.2f} & {:0.2f} & {:0.2f} \\\\'.format(label, 
                                                    s['flesch_kincaid_grade_level'][0], 
                                                    s['flesch_kincaid_reading_ease'][0], 
                                                    s['dale_chall'][0], 
                                                    s['automated_readability_index'][0])
        for model, label in self.order:
            print(write_row(model, label))
            



In [12]:
exp2 = Experiment('data/exp2')

INFO:root:Loading source articles
INFO:root:0 docs loaded
INFO:root:1000 docs loaded
INFO:root:2000 docs loaded
INFO:root:3000 docs loaded
INFO:root:4000 docs loaded
INFO:root:5000 docs loaded
INFO:root:6000 docs loaded
INFO:root:7000 docs loaded
INFO:root:8000 docs loaded
INFO:root:9000 docs loaded
INFO:root:10000 docs loaded
INFO:root:11000 docs loaded
INFO:root:Loading gold summaries
INFO:root:0 docs loaded
INFO:root:1000 docs loaded
INFO:root:2000 docs loaded
INFO:root:3000 docs loaded
INFO:root:4000 docs loaded
INFO:root:5000 docs loaded
INFO:root:6000 docs loaded
INFO:root:7000 docs loaded
INFO:root:8000 docs loaded
INFO:root:9000 docs loaded
INFO:root:10000 docs loaded
INFO:root:11000 docs loaded
INFO:root:Loading data/exp2/xsum-entities-encoder-segments-encoder
INFO:root:0 docs loaded
INFO:root:1000 docs loaded
INFO:root:2000 docs loaded
INFO:root:3000 docs loaded
INFO:root:4000 docs loaded
INFO:root:5000 docs loaded
INFO:root:6000 docs loaded
INFO:root:7000 docs loaded
INFO:ro

In [13]:
exp2.save('exp2.pkl')

In [21]:
import gc
import pickle 
def load_pickle_gc(f):
    output = open(f, 'rb')

    # disable garbage collector
    gc.disable()

    mydict = pickle.load(output)

    # enable garbage collector again
    gc.enable()
    output.close()
    return mydict
exp2 = load_pickle_gc('exp2.pkl')

In [22]:
ltx_writer = LatexWriter(exp2.stats(), 
                         order=[('gold', 'Gold summaries'),
                                ('xsum-vanilla', 'MASS reproduced'),
                                ('xsum-entities-encoder', 'NER-Enc'),
                                ('xsum-segments-encoder', 'SEG-Enc'),
                                ('xsum-entities-encoder-segments-encoder', 'NER-Enc, SEG-Enc')])



  lower_bound = self.a * scale + loc
  upper_bound = self.b * scale + loc


In [37]:
exp2.results[3].text_stats.ngrams.ngrams.unique[3]

[1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0

In [23]:
ltx_writer.readability()

Gold summaries & 10.96 & 54.01 & 10.07 & 11.48 \\
MASS reproduced & 9.66 & 60.67 & 9.53 & 9.53 \\
NER-Enc & 9.73 & 60.27 & 9.56 & 9.74 \\
SEG-Enc & 9.74 & 60.01 & 9.61 & 9.71 \\
NER-Enc, SEG-Enc & 9.88 & 59.54 & 9.64 & 9.87 \\


In [24]:
ltx_writer.ngram_distinct()

Gold summaries & 0.99 & 1.00 & 1.00 \\
MASS reproduced & 0.97 & 1.00 & 1.00 \\
NER-Enc & 0.97 & 1.00 & 1.00 \\
SEG-Enc & 0.97 & 1.00 & 1.00 \\
NER-Enc, SEG-Enc & 0.97 & 1.00 & 1.00 \\


In [25]:
ltx_writer.ngram_similarity()

Gold summaries & 0.50 & 0.16 & 0.04 \\
MASS reproduced & 0.63 & 0.28 & 0.08 \\
NER-Enc & 0.62 & 0.28 & 0.08 \\
SEG-Enc & 0.63 & 0.29 & 0.08 \\
NER-Enc, SEG-Enc & 0.62 & 0.27 & 0.07 \\


In [26]:
ltx_writer.length()

Gold summaries & 24.34 \\
MASS reproduced & 22.70 \\
NER-Enc & 22.98 \\
SEG-Enc & 22.72 \\
NER-Enc, SEG-Enc & 23.06 \\
