In [1]:
import json
import pandas as pd
import numpy as np

def lines2df(lines):
    ex_dicts = []
    for line in lines:
        ex_dict = json.loads(line.strip())
        ex_dicts.append(ex_dict)
    df = pd.DataFrame.from_records(ex_dicts, columns=list(ex_dicts[0].keys()))
    return df

# lines = [l for l in open('/home/ubuntu/efs/lei/emerald_new/emerald.jsonl', 'r').readlines()]
lines = [l for l in open('/home/ubuntu/efs/lei/emerald_new/test.jsonl', 'r').readlines()]

emerald_df = lines2df(lines)

In [2]:
def concat_paragraph(sections):
    return '\n'.join(j for i in sections for j in i)
emerald_df['fulltext'] = emerald_df.sections.apply(concat_paragraph)
emerald_df['abstracttext'] = emerald_df.abstract_sections.apply(concat_paragraph)


In [3]:
def find_section_title_like(section_names, section_text, cuewords, return_fulltext_if_not_found=False):
    text = []
    for sn, st in zip(section_names, section_text):
        sn = sn.lower()
        for cueword in cuewords:
            if cueword in sn:
                text.append(' '.join(st))
                break
    if return_fulltext_if_not_found and len(text) == 0:
        for st in section_text:
            text.append(' '.join(st))

    return ' '.join(text)

emerald_df['source_introduction'] = emerald_df.apply(lambda row: find_section_title_like(row['section_names'], row['sections'], ['intro', 'purpose']), axis=1)
emerald_df['source_design'] = emerald_df.apply(lambda row: find_section_title_like(row['section_names'], row['sections'], ['design', 'method', 'approach']), axis=1)
emerald_df['source_result'] = emerald_df.apply(lambda row: find_section_title_like(row['section_names'], row['sections'], ['result', 'find', 'discuss', 'analy']), axis=1)
emerald_df['source_conclusion'] = emerald_df.apply(lambda row: find_section_title_like(row['section_names'], row['sections'], ['conclu', 'future']), axis=1)
emerald_df['source_related'] = emerald_df.apply(lambda row: find_section_title_like(row['section_names'], row['sections'], ['related work', 'literat', 'background']), axis=1)

emerald_df['source_introduction_orfulltext'] = emerald_df.apply(lambda row: find_section_title_like(row['section_names'], row['sections'], ['intro', 'purpose'], return_fulltext_if_not_found=True), axis=1)
emerald_df['source_design_orfulltext'] = emerald_df.apply(lambda row: find_section_title_like(row['section_names'], row['sections'], ['design', 'method', 'approach'], return_fulltext_if_not_found=True), axis=1)
emerald_df['source_result_orfulltext'] = emerald_df.apply(lambda row: find_section_title_like(row['section_names'], row['sections'], ['result', 'find', 'discuss', 'analy'], return_fulltext_if_not_found=True), axis=1)
emerald_df['source_conclusion_orfulltext'] = emerald_df.apply(lambda row: find_section_title_like(row['section_names'], row['sections'], ['conclu', 'future'], return_fulltext_if_not_found=True), axis=1)
emerald_df['source_related_orfulltext'] = emerald_df.apply(lambda row: find_section_title_like(row['section_names'], row['sections'], ['related work', 'literat', 'background'], return_fulltext_if_not_found=True), axis=1)

In [4]:
print(len(emerald_df[emerald_df.source_introduction.str.len() > 0]))
print(np.mean([len(ex.source_introduction.split()) for _, ex in emerald_df[emerald_df.source_introduction.str.len() > 0].iterrows()]))

print(len(emerald_df[emerald_df.source_design.str.len() > 0]))
print(np.mean([len(ex.source_design.split()) for _, ex in emerald_df[emerald_df.source_design.str.len() > 0].iterrows()]))

print(len(emerald_df[emerald_df.source_result.str.len() > 0]))
print(np.mean([len(ex.source_result.split()) for _, ex in emerald_df[emerald_df.source_result.str.len() > 0].iterrows()]))

print(len(emerald_df[emerald_df.source_conclusion.str.len() > 0]))
print(np.mean([len(ex.source_conclusion.split()) for _, ex in emerald_df[emerald_df.source_conclusion.str.len() > 0].iterrows()]))

print(len(emerald_df[emerald_df.source_related.str.len() > 0]))
print(np.mean([len(ex.source_related.split()) for _, ex in emerald_df[emerald_df.source_related.str.len() > 0].iterrows()]))

5045
726.209514370664
3984
966.1051706827309
4359
1986.139022711631
4727
627.7319653056907
2274
1469.5540897097626


In [5]:
print(len(emerald_df[emerald_df.source_introduction_orfulltext.str.len() > 0]))
print(np.mean([len(ex.source_introduction_orfulltext.split()) for _, ex in emerald_df[emerald_df.source_introduction_orfulltext.str.len() > 0].iterrows()]))

print(len(emerald_df[emerald_df.source_design_orfulltext.str.len() > 0]))
print(np.mean([len(ex.source_design_orfulltext.split()) for _, ex in emerald_df[emerald_df.source_design_orfulltext.str.len() > 0].iterrows()]))

print(len(emerald_df[emerald_df.source_result_orfulltext.str.len() > 0]))
print(np.mean([len(ex.source_result_orfulltext.split()) for _, ex in emerald_df[emerald_df.source_result_orfulltext.str.len() > 0].iterrows()]))

print(len(emerald_df[emerald_df.source_conclusion_orfulltext.str.len() > 0]))
print(np.mean([len(ex.source_conclusion_orfulltext.split()) for _, ex in emerald_df[emerald_df.source_conclusion_orfulltext.str.len() > 0].iterrows()]))

print(len(emerald_df[emerald_df.source_related_orfulltext.str.len() > 0]))
print(np.mean([len(ex.source_related_orfulltext.split()) for _, ex in emerald_df[emerald_df.source_related_orfulltext.str.len() > 0].iterrows()]))

6000
1275.7016666666666
6000
2293.447833333333
6000
2715.8456666666666
6000
1512.175
6000
3810.058666666667


In [6]:
# ('Purpose', 'Design/methodology/approach', 'Findings', 'Originality/value')
emerald_df['target_introduction'] = emerald_df.apply(lambda row: find_section_title_like(row['abstract_sections_names'], row['abstract_sections'],
                                                                                         ['Purpose'.lower()]), axis=1)
emerald_df['target_design'] = emerald_df.apply(lambda row: find_section_title_like(row['abstract_sections_names'], row['abstract_sections'],
                                                                                   ['Design/methodology/approach'.lower()]), axis=1)
emerald_df['target_findings'] = emerald_df.apply(lambda row: find_section_title_like(row['abstract_sections_names'], row['abstract_sections'],
                                                                                     ['Findings'.lower()]), axis=1)
emerald_df['target_originality'] = emerald_df.apply(lambda row: find_section_title_like(row['abstract_sections_names'], row['abstract_sections'],
                                                                                        ['Originality/value'.lower()]),axis=1)


print(len(emerald_df[emerald_df.target_introduction.str.len() > 0]))
print(len(emerald_df[emerald_df.target_design.str.len() > 0]))
print(len(emerald_df[emerald_df.target_findings.str.len() > 0]))
print(len(emerald_df[emerald_df.target_originality.str.len() > 0]))

6000
6000
6000
6000


In [19]:
emerald_df.head(3)

Unnamed: 0,title,keywords,url,section_names,sections,abstract_sections_names,abstract_sections,references,appendix,journal,...,abstracttext,source_introduction,source_design,source_result,source_conclusion,source_related,target_introduction,target_design,target_findings,target_originality
0,Qualitative entrepreneurship authorship: antec...,"[Research work, Entrepreneurialism, Qualitativ...",https://www.emerald.com/insight/content/doi/10...,"[1. Introduction, 2. Writing qualitatively in ...",[[The increasing institutional pressure to pub...,"[Purpose, Design/methodology/approach, Finding...",[[- The purpose of this paper is to report on ...,"[Aldrich, H.E. (1992), ""Methods in our madness...",Appendix 1. Development of sample and breakdow...,International Journal of Entrepreneurial Behav...,...,- The purpose of this paper is to report on a ...,The increasing institutional pressure to publi...,Our aim was to unearth data from like-minded c...,"Reading, and re-reading, the questions and ans...",Figure 3 pulls together the various patterned ...,,- The purpose of this paper is to report on a ...,- Scholars who had published qualitative paper...,- Entrepreneurship scholars perceive their qua...,- Although there is a vigorous debate within t...
1,The mediating effect of lean management on the...,"[Manufacturing industry, Automotive industry, ...",https://www.emerald.com/insight/content/doi/10...,"[1. Introduction, 2. Literature review and hyp...",[[Today's manufacturing companies are far more...,"[Purpose, Design/methodology/approach, Finding...",[[The purpose of this paper is to identify the...,"[Agarwal, A., Shankar, R. and Tiwari, M.K. (20...",Appendix\nTable AI\nTable AII,Journal of Manufacturing Technology Management,...,The purpose of this paper is to identify the m...,Today's manufacturing companies are far more a...,The implementation of LM and FMS in US automot...,4.1 Measurement model assessment Four tests we...,"Ambiguity, uncertainty and complexity are just...","LM, also known as the Toyota Production System...",The purpose of this paper is to identify the m...,A survey questionnaire was developed based on ...,Lean and FMS are multi-dimensional philosophie...,This research empirically develops a framework...
2,Logics of pre-merger decision-making processes...,"[Hospitals, Universities, Acquisitions and mer...",https://www.emerald.com/insight/content/doi/10...,"[Introduction, Methodology, Case description, ...","[[In the last 20 years, there has been a drama...","[Purpose, Design/methodology/approach, Finding...",[[- The purpose of this paper is to examine ho...,"[Alexander, J., Halpern, M. and Lee, S. (1996)...",Corresponding author\nSoki Choi can be contact...,Journal of Health Organization and Management,...,- The purpose of this paper is to examine how ...,"In the last 20 years, there has been a dramati...",Study design We chose an in-depth case study a...,Empirical patterns In the chain of pre-merger ...,,,- The purpose of this paper is to examine how ...,- Based on extensive document analysis and 35 ...,"- Spanning nearly a decade, the pre-merger pro...",- This is the first systematic in-depth study ...


In [25]:
print(emerald_df.iloc[0].section_names)
print(emerald_df.iloc[0].sections)

['1. Introduction', '2. Writing qualitatively in entrepreneurship: keeping the discipline open', '3. Methodological approaches and dilemmas', '4. Analysis and findings', '5. Value, implications and concluding remarks']
[['The increasing institutional pressure to publish in top ranked three and four star journals is a pressing concern for scholars in both the UK and elsewhere. This is the perennial problem of "publish or perish" (Finn, 1999; Lussier, 2010). Failure to publish can prevent academics from getting a faculty position, tenure and promotion. A track record in publishing can and does increase salary potential. Anecdotally, academics are being told not to conduct research that is not capable of being published in top-flight journals, and not to submit manuscripts to lower ranked journals. As many top journals have a rejection rate of 90 per cent or above, this poses a very real dilemma for scholarship. As such advice becomes institutionalised, it is shared by senior members of t

In [7]:
# copy https://github.com/memray/bigsum
import numpy as np
from nltk.stem.porter import PorterStemmer
import spacy
from helper import rouge

stemmer = PorterStemmer()

def lcs(X, Y):
    # find the length of the strings
    m = len(X)
    n = len(Y)

    # declaring the array for storing the dp values
    L = [[None] * (n + 1) for i in range(m + 1)]

    """Following steps build L[m + 1][n + 1] in bottom up fashion 
    Note: L[i][j] contains length of LCS of X[0..i-1] 
    and Y[0..j-1]"""
    for i in range(m + 1):
        for j in range(n + 1):
            if i == 0 or j == 0:
                L[i][j] = 0
            elif X[i - 1] == Y[j - 1]:
                L[i][j] = L[i - 1][j - 1] + 1
            else:
                L[i][j] = max(L[i - 1][j], L[i][j - 1])

                # L[m][n] contains the length of LCS of X[0..n-1] & Y[0..m-1]
    return L[m][n]

def oracle_extract(sents, summary_sents, match_idx_acc, lower=True, stemming=True):
    match_idx = []
    match_title = []
    match_scores = []

    # lowercase
    if lower:
        sents = [[w.lower() for w in sent] for sent in sents]
        summary_sents = [[w.lower() for w in sent] for sent in summary_sents]
    # stemming
    if stemming:
        sents = [[stemmer.stem(w) for w in sent] for sent in sents]
        summary_sents = [[stemmer.stem(w) for w in sent] for sent in summary_sents]

    for summary_id, summary_sent in enumerate(summary_sents):
        # rarely happens, number of summaries is larger than sents
        if summary_id >= len(sents):
            break

        match_score = [lcs(sent, summary_sent) for sent in sents]

        # remove the previously selected sents by setting their scores to 0
        for match_id in (match_idx + match_idx_acc):
            match_score[match_id] = 0

        match_id = np.argmax(match_score)
        match_idx.append(match_id)
        match_scores.append(match_score[match_id])

    return match_idx, match_scores


def eval_rouge(sents, summary_sents, extract_sent_idx, number_to_cutoff=3, stopwords_removal=False, stemming=True, logger=None):
    rouge_ = rouge.Rouge(stopwords_removal=stopwords_removal, stemming=stemming)
    
    extracted_sents = [sents[idx] for idx in extract_sent_idx if idx < len(sents)]
    hypothesis = ' '.join(' '.join(i) for i in extracted_sents)
    reference = ' '.join(' '.join(i) for i in summary_sents)
    
#     print('*' * 50)
#     print(extracted_sents)
#     print(summary_sents)
#     print('*' * 50)

    if hypothesis == None or reference == None or len(hypothesis.strip()) == 0 or len(reference.strip()) == 0:
        fscores = {k: 0.0 for k in metric_keys}
    else:
        scores = rouge_.get_scores(hypothesis, reference)
        fscores = {k: v['f'] for k, v in scores[0].items()}

    return fscores

In [8]:
from multiprocessing import Pool

# spacy_nlp = spacy.load('en_core_web_sm')
from pysbd.utils import PySBDFactory
spacy_nlp = spacy.blank('en')
spacy_nlp.add_pipe(PySBDFactory(spacy_nlp))

def oracle_score(section_text, target_introduction, target_design, target_findings, target_originality):
    source_sents = [[w.text for w in sent] for sent in spacy_nlp(section_text).sents]
#     print(len(source_sents))
#     print(source_sents[0])
    
    extract_sent_idx_acc = []
    for abstract_title, abstract_text in zip(('Purpose', 'Design/methodology/approach', 'Findings', 'Originality/value'),
                                             (target_introduction, target_design, target_findings, target_originality)):
        target_sents = [[w.text for w in sent] for sent in spacy_nlp(abstract_text).sents]

        extract_sent_idx, _ = oracle_extract(source_sents, target_sents, extract_sent_idx_acc)
        extract_sent_idx_acc = extract_sent_idx_acc + extract_sent_idx

    score = eval_rouge(source_sents,
                       ' '.join((target_introduction, target_design, target_findings, target_originality)),
                       extract_sent_idx_acc)
    return score


In [9]:
import sys
print(sys.getrecursionlimit())
sys.setrecursionlimit(5000 * 1000 + 10)
print(sys.getrecursionlimit())

3000
5000010


In [None]:
from multiprocessing import Pool
from tqdm import tqdm

p = Pool(4)

tasks = []
for src_col in ('source_introduction_orfulltext', 
#                 'source_design_orfulltext', 
#                 'source_result_orfulltext', 
#                 'source_conclusion_orfulltext',
#                 'source_related_orfulltext'
               ):
    for target_col in ('target_introduction', 'target_design', 'target_findings', 'target_originality'):
        task_name = '_TO_'.join((src_col, target_col))
        tasks.append((src_col, target_col, task_name))

def func(a):
    src_col, target_col, task_name = a
    task_scores = []
    for i in range(emerald_df.shape[0]):
        src_text = emerald_df[src_col][i]
        if not src_text.strip():
            src_text = emerald_df['fulltext'][i]
        r = oracle_score(src_text, emerald_df[target_col][i])
        task_scores.append(r)
    return task_name, task_scores

scores = {}
r = p.imap_unordered(func, tasks, chunksize=1)
for task_name, task_scores in tqdm(r, total=len(tasks)):
    scores[task_name] = task_scores

scores = {}
r = p.imap_unordered(func, tasks, chunksize=1)
for task_name, task_scores in tqdm(r, total=len(tasks)):
    scores[task_name] = task_scores


Process ForkPoolWorker-2:
Process ForkPoolWorker-3:
Process ForkPoolWorker-1:
Process ForkPoolWorker-4:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/ubuntu/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/home/ubuntu/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/home/ubuntu/anaconda3/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/home/ubuntu/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/home/ubuntu/anaconda3/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/home/ubuntu/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/home/ubuntu/anaconda3/l

In [82]:
pd.DataFrame.from_records(scores, columns=list(scores[0].keys())).describe()

Unnamed: 0,rouge-1,rouge-2,rouge-l
count,6000.0,6000.0,6000.0
mean,0.370458,0.173134,0.212374
std,0.087278,0.097217,0.091241
min,0.101266,0.0,0.047201
25%,0.310545,0.104137,0.147651
50%,0.361446,0.152727,0.193983
75%,0.42,0.221802,0.254866
max,0.772152,0.703863,0.738875


In [80]:
scores

[]