In [2]:
# read data 
import json
import pandas as pd
    
def lines2df(lines):
    ex_dicts = []
    for line in lines:
        ex_dict = json.loads(line.strip())
        ex_dicts.append(ex_dict)
    df = pd.DataFrame.from_records(ex_dicts, columns=list(ex_dicts[0].keys()))
    return df

lines = [l for l in open('/home/ubuntu/efs/lei/emerald_new/test.jsonl', 'r').readlines()]

emerald_df = lines2df(lines)

## Process Data

In [3]:
def concat_paragraph(sections):
    return '\n'.join(j for i in sections for j in i)
emerald_df['fulltext'] = emerald_df.sections.apply(concat_paragraph)
emerald_df['abstracttext'] = emerald_df.abstract_sections.apply(concat_paragraph)

In [4]:
def find_section_title_like(section_names, section_text, cuewords, include_title=True):
    text = []
    for sn, st in zip(section_names, section_text):
        sn = sn.lower()
        for cueword in cuewords:
            if cueword in sn:
                if include_title:
                    text.append(sn)
                text.append('\n'.join(st))
                break
    return '\n'.join(text)

def find_section_title_not_like(section_names, section_text, cuewords, include_title=True):
    text = []
    for sn, st in zip(section_names, section_text):
        sn = sn.lower()
        positive = False
        for cueword in cuewords:
            if cueword in sn:
                positive = True
                break
        if not positive:
            if include_title:
                text.append(sn)
            text.append('\n'.join(st))
    return '\n'.join(text)


emerald_df['src-full'] = emerald_df['fulltext']
emerald_df['src-intro'] = emerald_df.apply(lambda row: find_section_title_like(row['section_names'], row['sections'], ['intro', 'purpose']), axis=1)
emerald_df['src-design'] = emerald_df.apply(lambda row: find_section_title_like(row['section_names'], row['sections'], ['design', 'method', 'approach']), axis=1)
emerald_df['src-result'] = emerald_df.apply(lambda row: find_section_title_like(row['section_names'], row['sections'], ['result', 'find', 'discuss', 'analy']), axis=1)
emerald_df['src-conclu'] = emerald_df.apply(lambda row: find_section_title_like(row['section_names'], row['sections'], ['conclu', 'future']), axis=1)
emerald_df['src-related'] = emerald_df.apply(lambda row: find_section_title_like(row['section_names'], row['sections'], ['related work', 'literat', 'background']), axis=1)

emerald_df['src-IC'] = emerald_df['src-intro'] + '\n' + emerald_df['src-conclu']

# ('Purpose', 'Design/methodology/approach', 'Findings', 'Originality/value')
emerald_df['tgt-full'] = emerald_df['abstracttext']
emerald_df['tgt-intro'] = emerald_df.apply(lambda row: find_section_title_like(row['abstract_sections_names'], row['abstract_sections'],
                                                                                         ['Purpose'.lower()], False), axis=1)
emerald_df['tgt-design'] = emerald_df.apply(lambda row: find_section_title_like(row['abstract_sections_names'], row['abstract_sections'],
                                                                                   ['Design/methodology/approach'.lower()], False), axis=1)
emerald_df['tgt-find'] = emerald_df.apply(lambda row: find_section_title_like(row['abstract_sections_names'], row['abstract_sections'],
                                                                                     ['Findings'.lower()], False), axis=1)
emerald_df['tgt-origin'] = emerald_df.apply(lambda row: find_section_title_like(row['abstract_sections_names'], row['abstract_sections'],
                                                                                        ['Originality/value'.lower()], False),axis=1)

In [5]:
import numpy as np
from nltk.stem.porter import PorterStemmer
import spacy
from pysbd.utils import PySBDFactory

# spacy_nlp = spacy.load('en_core_web_sm')
spacy_nlp = spacy.blank('en')
spacy_nlp.add_pipe(PySBDFactory(spacy_nlp))
from functools import partial

src2tgt_name_map = [
    ('src-full', 'tgt-full'),
    ('src-intro', 'tgt-intro'),
    ('src-design', 'tgt-design'),
    ('src-result', 'tgt-find'),
    ('src-IC', 'tgt-origin'),
]

tgt_sent_map = {'tgt-full': 10, 'tgt-intro': 2, 'tgt-design': 2, 'tgt-find': 3, 'tgt-origin': 2}
tgt_word_map = {'tgt-full': 271, 'tgt-intro': 54, 'tgt-design': 52, 'tgt-find': 68, 'tgt-origin': 47}

## Determine #word/#sent with dev set

In [41]:
lines = [l for l in open('/home/ubuntu/efs/emerald/dev.jsonl', 'r').readlines()]
dev_df = lines2df(lines)

dev_df['fulltext'] = dev_df.sections.apply(concat_paragraph)
dev_df['abstracttext'] = dev_df.abstract_sections.apply(concat_paragraph)

dev_df['tgt-full'] = dev_df['abstracttext']
dev_df['tgt-intro'] = dev_df.apply(lambda row: find_section_title_like(row['abstract_sections_names'], row['abstract_sections'], ['Purpose'.lower()], False), axis=1)
dev_df['tgt-design'] = dev_df.apply(lambda row: find_section_title_like(row['abstract_sections_names'], row['abstract_sections'], ['Design/methodology/approach'.lower()], False), axis=1)
dev_df['tgt-find'] = dev_df.apply(lambda row: find_section_title_like(row['abstract_sections_names'], row['abstract_sections'], ['Findings'.lower()], False), axis=1)
dev_df['tgt-origin'] = dev_df.apply(lambda row: find_section_title_like(row['abstract_sections_names'], row['abstract_sections'], ['Originality/value'.lower()], False),axis=1)

tgt_col_names = ['tgt-full', 'tgt-intro', 'tgt-design', 'tgt-find', 'tgt-origin']

for tgt_col_name in tgt_col_names:
    tgts = [[[w.text for w in sent] for sent in spacy_nlp(r[tgt_col_name]).sents] for _, r in list(emerald_df.iterrows())]
    num_sent = [len(sents) for sents in tgts]
    len_word = [np.sum([len(sent) for sent in sents]) for sents in tgts]
    print('%s, #dp=%d, #avgsent=%.2f, #avgword=%.2f' % (tgt_col_name, len(tgts), np.mean(num_sent), np.mean(len_word)))
#     for sents in tgts:
#         for sent in sents:
#             print(len(sent), ' '.join(sent))
#         print('*' * 20)
    print('*' * 50)

tgt-full, #dp=6000, #avgsent=10.22, #avgword=271.06
**************************************************
tgt-intro, #dp=6000, #avgsent=1.83, #avgword=53.56
**************************************************
tgt-design, #dp=6000, #avgsent=2.13, #avgword=52.16
**************************************************
tgt-find, #dp=6000, #avgsent=2.60, #avgword=68.48
**************************************************
tgt-origin, #dp=6000, #avgsent=1.79, #avgword=47.01
**************************************************


## Example

In [6]:
emerald_df.columns

Index(['title', 'keywords', 'url', 'section_names', 'sections',
       'abstract_sections_names', 'abstract_sections', 'references',
       'appendix', 'journal', 'id', 'category', 'fulltext', 'abstracttext',
       'src-full', 'src-intro', 'src-design', 'src-result', 'src-conclu',
       'src-related', 'src-IC', 'tgt-full', 'tgt-intro', 'tgt-design',
       'tgt-find', 'tgt-origin'],
      dtype='object')

In [10]:
line = jsonl[0]
# line

In [12]:
text = emerald_df.fulltext[0]
# text

In [13]:
abst  = emerald_df.fullabs[0]
# abst

## Start Summarizing

### Summarize LSA

In [16]:
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals

import sumy
from sumy.parsers.html import HtmlParser
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer as Summarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words


def summ_lsa(text, tgt_word):
    LANGUAGE = "english"
    SENTENCES_COUNT = 20
    parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    
    c = 0
    s = []
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        if c < tgt_word:
            s.append(str(sentence))
            c+= len(str(sentence).split(' '))
        else:
            break
    return " ".join(s)

# summ_lsa(text)
for src_sec, tgt_sec in src2tgt_name_map:
    tgt_word = tgt_word_map[tgt_sec]
    summ_lsa_fn = partial(summ_lsa, tgt_word=tgt_word)
    emerald_df['lsa_summ.' + src_sec]= emerald_df[src_sec].apply(summ_lsa_fn)


  warn(message % (words_count, sentences_count))
  warn(message % (words_count, sentences_count))
  warn(message % (words_count, sentences_count))
  warn(message % (words_count, sentences_count))
  warn(message % (words_count, sentences_count))
  warn(message % (words_count, sentences_count))
  warn(message % (words_count, sentences_count))
  warn(message % (words_count, sentences_count))
  warn(message % (words_count, sentences_count))
  warn(message % (words_count, sentences_count))
  warn(message % (words_count, sentences_count))
  warn(message % (words_count, sentences_count))


## Summarize TextRank

In [17]:
from summa import summarizer
def summ_textrank(text, tgt_word):
    return summarizer.summarize(text, words=tgt_word)

In [18]:
for src_sec, tgt_sec in src2tgt_name_map:
    summ_textrank_fn = partial(summ_textrank, tgt_word=tgt_word_map[tgt_sec])
    emerald_df['textrank_summ.' + src_sec]= emerald_df[src_sec].apply(summ_textrank_fn)

In [19]:
emerald_df['textrank_summ.src-full'][1]

'On the other hand, FMS focuses on implementing certain dimensions, such as labor, machine, operations and material handling flexibilities, which allows the manufacturing system to absorb problems with little or no impact to overall system performance or throughput (Gunasekaran et al., 2008; Harrison, 1997; Hazen et al., 2017; Hormozi, 2001).\nDriven by the lack of consensus regarding the relationship between lean and flexibility, and their impact on operational performance metrics, this paper examines the mediating effect of LM on the relationship between FMS dimensions and operational performance metrics.\nSeveral empirical studies have considered the impact of flexibility dimensions on certain operational performance metrics (Inman et al., 2011; Oke, 2013; Purvis et al., 2014; Wei et al., 2017).\nwe consulted with facility managers who were involved in LM and FMS implementation to determine which lean and flexibility dimensions have been implemented and what operational performance 

## Summarize LexRank

In [20]:
from sumy.summarizers.lex_rank import LexRankSummarizer as Summarizer

def summ_lexrank(text, tgt_word):
    LANGUAGE = "english"
    SENTENCES_COUNT = 20
    parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    
    c = 0
    s = []
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        if c < tgt_word:
            s.append(str(sentence))
            c+= len(str(sentence).split(' '))
        else:
            break
    return " ".join(s)
# summ_lsa(text)
# emerald_df['lexrank_summ']= emerald_df.fulltext.apply(summ_traditional)

for src_sec, tgt_sec in src2tgt_name_map:
    summ_lexrank_fn = partial(summ_lexrank, tgt_word=tgt_word_map[tgt_sec])
    emerald_df['lexrank_summ.' + src_sec]= emerald_df[src_sec].apply(summ_lexrank_fn)

## Summarize SumBasics

In [21]:
from sumy.summarizers.sum_basic import SumBasicSummarizer as Summarizer
# summ_lsa(text)
def summ_sumbasic(text, tgt_word):
    LANGUAGE = "english"
    SENTENCES_COUNT = 20
    parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    
    c = 0
    s = []
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        if c < tgt_word:
            s.append(str(sentence))
            c+= len(str(sentence).split(' '))
        else:
            break
    return " ".join(s)
# emerald_df['sumbasic_summ']= emerald_df.fulltext.apply(summ_sumbasic)

for src_sec, tgt_sec in src2tgt_name_map:
    summ_sumbasic_fn = partial(summ_sumbasic, tgt_word=tgt_word_map[tgt_sec])
    emerald_df['sumbasic_summ.' + src_sec]= emerald_df[src_sec].apply(summ_sumbasic_fn)

In [22]:
emerald_df.columns

Index(['title', 'keywords', 'url', 'section_names', 'sections',
       'abstract_sections_names', 'abstract_sections', 'references',
       'appendix', 'journal', 'id', 'category', 'fulltext', 'abstracttext',
       'src-full', 'src-intro', 'src-design', 'src-result', 'src-conclu',
       'src-related', 'src-IC', 'tgt-full', 'tgt-intro', 'tgt-design',
       'tgt-find', 'tgt-origin', 'leadK.src-full', 'leadK.src-intro',
       'leadK.src-design', 'leadK.src-result', 'leadK.src-IC',
       'tailK.src-full', 'tailK.src-intro', 'tailK.src-design',
       'tailK.src-result', 'tailK.src-IC', 'lsa_summ.src-full',
       'lsa_summ.src-intro', 'lsa_summ.src-design', 'lsa_summ.src-result',
       'lsa_summ.src-IC', 'textrank_summ.src-full', 'textrank_summ.src-intro',
       'textrank_summ.src-design', 'textrank_summ.src-result',
       'textrank_summ.src-IC', 'lexrank_summ.src-full',
       'lexrank_summ.src-intro', 'lexrank_summ.src-design',
       'lexrank_summ.src-result', 'lexrank_summ.src-

### LEAD-k

In [13]:
def summ_lead_tail_K(text, num_tgt_sent, is_tail=False):
    tgt_sents = []
    sents = spacy_nlp(text).sents
    if is_tail:
        sents = list(sents)[::-1]
    for i, sent in enumerate(sents):
        if len(tgt_sents) > num_tgt_sent:
            break
        sent = ' '.join([w.text for w in sent])
        tgt_sents.append(sent)
        
    return ' '.join(tgt_sents)

# for src_sec, tgt_sec in src2tgt_name_map[1:]:
#     print(src_sec, tgt_sec)
#     summ_lead_K_fn = partial(summ_lead_tail_K, num_tgt_sent=tgt_sent_map[tgt_sec], is_tail=False)
#     emerald_df['leadK.' + src_sec] = emerald_df[src_sec].apply(summ_lead_K_fn)

for src_sec, tgt_sec in src2tgt_name_map:
    print(src_sec, tgt_sec)
    summ_tail_K_fn = partial(summ_lead_tail_K, num_tgt_sent=tgt_sent_map[tgt_sec], is_tail=True)
    emerald_df['tailK.' + src_sec] = emerald_df[src_sec].apply(summ_tail_K_fn)

src-full tgt-full
src-intro tgt-intro
src-design tgt-design
src-result tgt-find
src-IC tgt-origin


In [14]:
emerald_df.columns

Index(['title', 'keywords', 'url', 'section_names', 'sections',
       'abstract_sections_names', 'abstract_sections', 'references',
       'appendix', 'journal', 'id', 'category', 'fulltext', 'abstracttext',
       'src-full', 'src-intro', 'src-design', 'src-result', 'src-conclu',
       'src-related', 'src-IC', 'tgt-full', 'tgt-intro', 'tgt-design',
       'tgt-find', 'tgt-origin', 'leadK.src-full', 'leadK.src-intro',
       'leadK.src-design', 'leadK.src-result', 'leadK.src-IC',
       'tailK.src-full', 'tailK.src-intro', 'tailK.src-design',
       'tailK.src-result', 'tailK.src-IC'],
      dtype='object')

### Dump output

In [25]:
### check summary length
model_names = ['lsa_summ', 'textrank_summ', 'lexrank_summ', 'sumbasic_summ', 'leadK', 'tailK']
# model_names = ['leadK', 'tailK']
for model_name in model_names:
    for src_sec, tgt_sec in src2tgt_name_map:
        hyps = emerald_df[model_name + '.' + src_sec].tolist()
        num_words = [len(hyp.split()) for hyp in hyps]
        
        print('%s.%s.%s.txt' % (model_name, src_sec, tgt_sec), '#word=%d' % np.mean(num_words))


lsa_summ.src-full.tgt-full.txt #word=291
lsa_summ.src-intro.tgt-intro.txt #word=61
lsa_summ.src-design.tgt-design.txt #word=45
lsa_summ.src-result.tgt-find.txt #word=63
lsa_summ.src-IC.tgt-origin.txt #word=60
textrank_summ.src-full.tgt-full.txt #word=271
textrank_summ.src-intro.tgt-intro.txt #word=46
textrank_summ.src-design.tgt-design.txt #word=34
textrank_summ.src-result.tgt-find.txt #word=49
textrank_summ.src-IC.tgt-origin.txt #word=43
lexrank_summ.src-full.tgt-full.txt #word=293
lexrank_summ.src-intro.tgt-intro.txt #word=60
lexrank_summ.src-design.tgt-design.txt #word=46
lexrank_summ.src-result.tgt-find.txt #word=63
lexrank_summ.src-IC.tgt-origin.txt #word=60
sumbasic_summ.src-full.tgt-full.txt #word=281
sumbasic_summ.src-intro.tgt-intro.txt #word=60
sumbasic_summ.src-design.tgt-design.txt #word=44
sumbasic_summ.src-result.tgt-find.txt #word=60
sumbasic_summ.src-IC.tgt-origin.txt #word=59
leadK.src-full.tgt-full.txt #word=391
leadK.src-intro.tgt-intro.txt #word=59
leadK.src-design.

In [26]:
def write_output(output_list, output_path):
    with open(output_path,'w') as f:
        for i, l in enumerate(output_list):
            f.write(l+'\n')
    print('Write %d lines into %s' % (i + 1, output_path))

output_dir = '/home/ubuntu/efs/rum20/long_summ/result/rui/'

model_names = ['lsa_summ', 'textrank_summ', 'lexrank_summ', 'sumbasic_summ', 'leadK', 'tailK']
for model_name in model_names:
    for src_sec, tgt_sec in src2tgt_name_map:
        hyps = emerald_df[model_name + '.' + src_sec].tolist()
        
        new_hyps = []
        for i in range(len(hyps)):
            l = hyps[i].strip()
            if l == "\n" or len(l.strip())==0:
                l = '__NONE__'
            l = l.replace('\n',' ')
            new_hyps.append(l)

        output_path = output_dir + '%s.%s.%s.txt' % (model_name, src_sec, tgt_sec)
        write_output(new_hyps, output_path)

        print('Dumped to %s, #result=%d' % (output_path, len(new_hyps)))
    

Write 6000 lines into /home/ubuntu/efs/rum20/long_summ/result/rui/lsa_summ.src-full.tgt-full.txt
Dumped to /home/ubuntu/efs/rum20/long_summ/result/rui/lsa_summ.src-full.tgt-full.txt, #result=6000
Write 6000 lines into /home/ubuntu/efs/rum20/long_summ/result/rui/lsa_summ.src-intro.tgt-intro.txt
Dumped to /home/ubuntu/efs/rum20/long_summ/result/rui/lsa_summ.src-intro.tgt-intro.txt, #result=6000
Write 6000 lines into /home/ubuntu/efs/rum20/long_summ/result/rui/lsa_summ.src-design.tgt-design.txt
Dumped to /home/ubuntu/efs/rum20/long_summ/result/rui/lsa_summ.src-design.tgt-design.txt, #result=6000
Write 6000 lines into /home/ubuntu/efs/rum20/long_summ/result/rui/lsa_summ.src-result.tgt-find.txt
Dumped to /home/ubuntu/efs/rum20/long_summ/result/rui/lsa_summ.src-result.tgt-find.txt, #result=6000
Write 6000 lines into /home/ubuntu/efs/rum20/long_summ/result/rui/lsa_summ.src-IC.tgt-origin.txt
Dumped to /home/ubuntu/efs/rum20/long_summ/result/rui/lsa_summ.src-IC.tgt-origin.txt, #result=6000
Writ

### Evaluate