In [1]:
import json
import pandas as pd

def lines2df(lines):
    """Parse JSON-lines text into a DataFrame with one row per record.

    Args:
        lines: iterable of strings, each holding one JSON object. Blank or
            whitespace-only lines (for example a trailing empty line) are
            skipped instead of making ``json.loads`` fail.

    Returns:
        A ``pd.DataFrame`` whose columns follow the key order of the first
        record. An empty DataFrame is returned when no record is present.
    """
    stripped_lines = (line.strip() for line in lines)
    ex_dicts = [json.loads(line) for line in stripped_lines if line]

    # Without any record there is no first element to take the column order from.
    if not ex_dicts:
        return pd.DataFrame()

    return pd.DataFrame.from_records(ex_dicts, columns=list(ex_dicts[0].keys()))

# Alternative input (full corpus): '/home/ubuntu/efs/lei/emerald_new/emerald.jsonl'
# Read the test split; the context manager closes the file handle once the
# lines have been read.
with open('/home/ubuntu/efs/lei/emerald_new/test.jsonl', 'r') as jsonl_file:
    lines = jsonl_file.readlines()

emerald_df = lines2df(lines)

In [2]:
def concat_paragraph(sections):
    """Flatten ``sections`` (an iterable of iterables of strings) into one newline-joined string."""
    paragraphs = []
    for section in sections:
        paragraphs.extend(section)
    return '\n'.join(paragraphs)
# Add flat-text versions of the article body and of the structured abstract.
for _text_col, _sections_col in (('fulltext', 'sections'), ('abstracttext', 'abstract_sections')):
    emerald_df[_text_col] = emerald_df[_sections_col].apply(concat_paragraph)

In [3]:
def find_section_title_like(section_names, section_text, cuewords):
    """Collect the text of every section whose title contains one of the cue words.

    Args:
        section_names: section titles; each is lower-cased before matching.
        section_text: per-section lists of strings, parallel to ``section_names``.
        cuewords: substrings to look for in the lower-cased title (callers
            pass them already lower-cased).

    Returns:
        The matching sections' strings joined by single spaces, in document
        order; an empty string when no title matches.
    """
    matched = []
    for title, paragraphs in zip(section_names, section_text):
        lowered = title.lower()
        # One hit is enough; a section is added at most once.
        if any(cue in lowered for cue in cuewords):
            matched.append(' '.join(paragraphs))
    return ' '.join(matched)

# Each new column gathers the body sections whose title contains any of the
# listed cue words.
_SOURCE_CUEWORDS = (
    ('source_introduction', ['intro', 'purpose']),
    ('source_design', ['design', 'method', 'approach']),
    ('source_result', ['result', 'find', 'discuss', 'analy']),
    ('source_conclusion', ['conclu', 'future']),
    ('source_related', ['related work', 'literat', 'background']),
)
for _column, _cues in _SOURCE_CUEWORDS:
    emerald_df[_column] = emerald_df.apply(
        lambda row: find_section_title_like(row['section_names'], row['sections'], _cues),
        axis=1,
    )

In [4]:
# Structured-abstract headings: ('Purpose', 'Design/methodology/approach', 'Findings', 'Originality/value')
# Each heading gets its own target column holding the matching abstract text.
_TARGET_HEADINGS = (
    ('target_introduction', 'Purpose'),
    ('target_design', 'Design/methodology/approach'),
    ('target_findings', 'Findings'),
    ('target_originality', 'Originality/value'),
)
for _column, _heading in _TARGET_HEADINGS:
    _cue = _heading.lower()
    emerald_df[_column] = emerald_df.apply(
        lambda row: find_section_title_like(row['abstract_sections_names'], row['abstract_sections'], [_cue]),
        axis=1,
    )

In [5]:
# Keep only the columns the oracle computation reads; the flat-text and
# source_* columns built above are dropped here.
_kept_columns = [
    'section_names',
    'sections',
    'target_introduction',
    'target_design',
    'target_findings',
    'target_originality',
]
emerald_df = emerald_df[_kept_columns]
emerald_df.head(3)

Unnamed: 0,section_names,sections,target_introduction,target_design,target_findings,target_originality
0,"[1. Introduction, 2. Writing qualitatively in ...",[[The increasing institutional pressure to pub...,- The purpose of this paper is to report on a ...,- Scholars who had published qualitative paper...,- Entrepreneurship scholars perceive their qua...,- Although there is a vigorous debate within t...
1,"[1. Introduction, 2. Literature review and hyp...",[[Today's manufacturing companies are far more...,The purpose of this paper is to identify the m...,A survey questionnaire was developed based on ...,Lean and FMS are multi-dimensional philosophie...,This research empirically develops a framework...
2,"[Introduction, Methodology, Case description, ...","[[In the last 20 years, there has been a drama...",- The purpose of this paper is to examine how ...,- Based on extensive document analysis and 35 ...,"- Spanning nearly a decade, the pre-merger pro...",- This is the first systematic in-depth study ...


In [6]:
# Copied from https://github.com/memray/bigsum
import numpy as np
from nltk.stem.porter import PorterStemmer
import spacy
from helper import rouge

# Shared Porter stemmer instance; oracle_extract uses it to stem tokens before matching.
stemmer = PorterStemmer()

def lcs(X, Y):
    """Return the length of the longest common subsequence of ``X`` and ``Y``.

    Args:
        X: first sequence (e.g. a list of tokens or a string).
        Y: second sequence; must support ``len`` and integer indexing.

    Returns:
        int: the LCS length; 0 when either sequence is empty.

    Only the length is needed, so the bottom-up table is reduced to two
    rolling rows (memory proportional to ``len(Y)`` instead of
    ``len(X) * len(Y)``). The recurrence itself is unchanged.
    """
    n = len(Y)

    # prev[j] is the LCS length of the prefix of X processed so far and Y[:j].
    prev = [0] * (n + 1)
    for x_item in X:
        curr = [0] * (n + 1)
        for j in range(1, n + 1):
            if x_item == Y[j - 1]:
                curr[j] = prev[j - 1] + 1
            else:
                curr[j] = max(prev[j], curr[j - 1])
        prev = curr

    # After the last row, prev[n] is the LCS length of all of X and all of Y.
    return prev[n]

def oracle_extract(sents, summary_sents, match_idx_acc, lower=True, stemming=True):
    """Greedily pick, for each summary sentence, the best-matching unused source sentence.

    Matching uses the LCS length between the (optionally lower-cased and
    stemmed) token lists.

    Args:
        sents: source sentences, each a list of token strings.
        summary_sents: summary sentences, each a list of token strings.
        match_idx_acc: indices of source sentences already selected by earlier
            calls; they are never selected again.
        lower: lower-case all tokens before matching.
        stemming: stem all tokens with the module-level ``stemmer`` before matching.

    Returns:
        (match_idx, match_scores): the selected source-sentence indices (plain
        ints, in summary order) and the LCS score of each selection. Selection
        stops early once every source sentence has been used, so the lists can
        be shorter than ``summary_sents``.
    """
    # lowercase
    if lower:
        sents = [[w.lower() for w in sent] for sent in sents]
        summary_sents = [[w.lower() for w in sent] for sent in summary_sents]
    # stemming
    if stemming:
        sents = [[stemmer.stem(w) for w in sent] for sent in sents]
        summary_sents = [[stemmer.stem(w) for w in sent] for sent in summary_sents]

    match_idx = []
    match_scores = []
    # Sentences that may no longer be chosen: picked by earlier calls or by this one.
    taken = set(match_idx_acc)

    for summary_sent in summary_sents:
        # Really exclude used sentences. Merely zeroing their score let argmax
        # return them again whenever all remaining scores were 0 as well.
        candidates = [i for i in range(len(sents)) if i not in taken]
        if not candidates:
            # More summary sentences than unused source sentences.
            break

        candidate_scores = [lcs(sents[i], summary_sent) for i in candidates]
        # argmax returns the first maximum, i.e. the earliest sentence on ties.
        best = int(np.argmax(candidate_scores))
        match_id = candidates[best]

        taken.add(match_id)
        match_idx.append(match_id)
        match_scores.append(candidate_scores[best])

    return match_idx, match_scores


def eval_rouge(sents, summary_sents, extract_sent_idx, number_to_cutoff=3, stopwords_removal=False, stemming=True, logger=None):
    """Score the extracted sentences against the reference summary with ROUGE.

    Args:
        sents: source sentences, each a list of token strings.
        summary_sents: the reference summary as a single string.
        extract_sent_idx: indices into ``sents`` of the extracted sentences;
            indices beyond the end of ``sents`` are ignored.
        number_to_cutoff: currently unused (the cutoff logic is disabled).
        stopwords_removal: forwarded to ``rouge.Rouge``.
        stemming: forwarded to ``rouge.Rouge``.
        logger: currently unused.

    Returns:
        dict mapping each ROUGE metric name to its F-score. When the hypothesis
        or the reference is empty, all of 'rouge-1', 'rouge-2' and 'rouge-l'
        are 0.0.
    """
    extracted_sents = [sents[idx] for idx in extract_sent_idx if idx < len(sents)]
    hypothesis = ' '.join(' '.join(tokens) for tokens in extracted_sents)
    reference = summary_sents

    if reference is None or len(hypothesis.strip()) == 0 or len(reference.strip()) == 0:
        # Nothing to compare. The metric names are spelled out here because the
        # previously referenced `metric_keys` was never defined (NameError).
        return {key: 0.0 for key in ('rouge-1', 'rouge-2', 'rouge-l')}

    # Build the scorer only when there is something to score.
    rouge_ = rouge.Rouge(stopwords_removal=stopwords_removal, stemming=stemming)
    scores = rouge_.get_scores(hypothesis, reference)
    return {k: v['f'] for k, v in scores[0].items()}

In [7]:
from multiprocessing import Pool

# Alternative pipeline (disabled): the pretrained small English model.
# spacy_nlp = spacy.load('en_core_web_sm')
from pysbd.utils import PySBDFactory
# Start from a blank English pipeline and add the PySBD component to it.
# oracle_score iterates over `doc.sents`, so sentence boundaries presumably come
# from this component — NOTE(review): confirm against the installed spaCy/pysbd versions.
spacy_nlp = spacy.blank('en')
spacy_nlp.add_pipe(PySBDFactory(spacy_nlp))

def oracle_score(section_titles, section_text, target_introduction, target_design, target_findings, target_originality):
    """Build the oracle extraction for one article and score it.

    The body is split into tokenized sentences, each tagged with the title of
    its section. For every abstract part (in the order Purpose, Design,
    Findings, Originality) sentences are chosen with ``oracle_extract``, and
    sentences chosen for an earlier part are passed along as already taken.

    Args:
        section_titles: titles of the body sections.
        section_text: per-section lists of strings, parallel to ``section_titles``.
        target_introduction, target_design, target_findings, target_originality:
            the four abstract parts as strings.

    Returns:
        (score, title_distribution): the ``eval_rouge`` result for all chosen
        sentences against the space-joined abstract parts, and a dict mapping
        each abstract heading to the section titles of the sentences chosen for it.
    """
    tokenized_source = []
    sentence_titles = []
    for heading, paragraphs in zip(section_titles, section_text):
        section_doc = spacy_nlp(' '.join(paragraphs))
        for span in section_doc.sents:
            tokenized_source.append([token.text for token in span])
            sentence_titles.append(heading)

    abstract_labels = ('Purpose', 'Design/methodology/approach', 'Findings', 'Originality/value')
    abstract_parts = (target_introduction, target_design, target_findings, target_originality)

    title_distribution = {}
    picked_so_far = []
    for label, part_text in zip(abstract_labels, abstract_parts):
        part_sents = [[token.text for token in span] for span in spacy_nlp(part_text).sents]

        picked, _ = oracle_extract(tokenized_source, part_sents, picked_so_far)
        picked_so_far = picked_so_far + picked
        title_distribution[label] = [sentence_titles[i] for i in picked]

    score = eval_rouge(tokenized_source, ' '.join(abstract_parts), picked_so_far)
    return score, title_distribution

# p = Pool(4)
# scores = list(tqdm(p.imap(lambda x: oracle_score(x[0], x[1]), zip(emerald_df['fulltext'], emerald_df['abstracttext']))))

In [8]:
from multiprocessing import Pool
from tqdm import tqdm
import sys
# Raise the interpreter's recursion limit before the pool run.
# NOTE(review): the reason is not visible in this notebook — confirm it is still needed.
sys.setrecursionlimit(25000)

def func(i):
    """Run ``oracle_score`` on row ``i`` of the global ``emerald_df`` (pool worker entry point)."""
    argument_columns = (
        'section_names',
        'sections',
        'target_introduction',
        'target_design',
        'target_findings',
        'target_originality',
    )
    return oracle_score(*[emerald_df[name][i] for name in argument_columns])

scores = []
title_distributions = []

limit = emerald_df.shape[0]

# The context manager shuts the worker processes down when the block exits,
# so repeated runs of this cell no longer leave 20 idle workers behind.
# All results are consumed inside the block, before the pool is torn down.
# Results arrive in completion order (imap_unordered), so position k in
# `scores` / `title_distributions` is NOT row k of emerald_df.
with Pool(20) as p:
    r = p.imap_unordered(func, list(range(limit)), chunksize=10)
    for score, dist in tqdm(r, total=limit):
        scores.append(score)
        title_distributions.append(dist)


100%|██████████| 6000/6000 [15:51<00:00,  6.31it/s] 


In [9]:
import json
# Persist the results. `with` closes (and thereby flushes) each file, so the
# JSON is completely written even while the kernel keeps running.
with open('oracle_score_testset_unordered.json', 'w') as scores_file:
    json.dump(scores, scores_file)
with open('oracle_match_distributions_testset_unordered.json', 'w') as distributions_file:
    json.dump(title_distributions, distributions_file)

In [10]:
from pprint import pprint
# Inspect one result. Results were collected with imap_unordered, so index 0 is
# the first task that finished, not necessarily row 0 of emerald_df.
pprint(title_distributions[0])

{'Design/methodology/approach': ['ComProCom'],
 'Findings': ['Conclusion', 'Conclusion', 'Introduction'],
 'Originality/value': ['Competence standards in emerging fields: innovation '
                       'management and management of social co-operative '
                       'enterprises'],
 'Purpose': ['Introduction']}


In [11]:
# Summary statistics of the per-article ROUGE F-scores, one column per metric key.
pd.DataFrame.from_records(scores, columns=list(scores[0].keys())).describe()

Unnamed: 0,rouge-1,rouge-2,rouge-l
count,6000.0,6000.0,6000.0
mean,0.418821,0.21169,0.286222
std,0.109211,0.116855,0.136582
min,0.083333,0.0,0.031511
25%,0.343527,0.126885,0.184215
50%,0.412167,0.190038,0.264639
75%,0.485932,0.27004,0.364435
max,0.878378,0.793249,0.875505


In [26]:
# Top-20 section titles (numbering stripped, lower-cased) per abstract part,
# shown as (title, count) pairs.
match_section_dist_tmp = pd.DataFrame.from_records(title_distributions, columns=list(title_distributions[0].keys()))
top_title_counts = {}
for col in ('Purpose', 'Design/methodology/approach', 'Findings', 'Originality/value'):
    # explode() turns an empty list into NaN; drop those rows so the string
    # clean-up cannot fail on a float.
    titles = match_section_dist_tmp[col].explode().dropna()
    t = titles.str.strip('0123456789. ').str.lower().value_counts().head(20)
    # Wrapping in a Series lets columns with fewer than 20 distinct titles be
    # padded with NaN instead of raising a length-mismatch error.
    top_title_counts[col] = pd.Series(list(zip(t.index, t)))
match_section_dist = pd.DataFrame(top_title_counts)
match_section_dist.head(20)

Unnamed: 0,Purpose,Design/methodology/approach,Findings,Originality/value
0,"(introduction, 4788)","(introduction, 2653)","(introduction, 2061)","(introduction, 2861)"
1,"(__no_title__, 586)","(methodology, 1188)","(discussion, 1301)","(discussion, 720)"
2,"(discussion, 394)","(method, 595)","(conclusion, 1183)","(conclusion, 685)"
3,"(conclusion, 364)","(research methodology, 415)","(results, 1177)","(conclusions, 465)"
4,"(literature review, 270)","(methods, 402)","(conclusions, 837)","(__no_title__, 369)"
5,"(conclusions, 236)","(__no_title__, 389)","(__no_title__, 424)","(literature review, 353)"
6,"(results, 171)","(results, 372)","(findings, 337)","(results, 158)"
7,"(methodology, 144)","(conclusion, 306)","(literature review, 294)","(methodology, 133)"
8,"(background, 79)","(discussion, 263)","(results and discussion, 287)","(discussion and conclusions, 110)"
9,"(findings, 67)","(literature review, 239)","(discussion and conclusion, 207)","(discussion and conclusion, 105)"


In [29]:
# Top-20 section titles (numbering stripped, lower-cased) per abstract part,
# shown as (title, share of all matched sentences) pairs.
match_section_dist_tmp = pd.DataFrame.from_records(title_distributions, columns=list(title_distributions[0].keys()))
top_title_shares = {}
for col in ('Purpose', 'Design/methodology/approach', 'Findings', 'Originality/value'):
    # explode() turns an empty list into NaN; drop those rows so they neither
    # break the string clean-up nor count towards the denominator.
    explode = match_section_dist_tmp[col].explode().dropna()
    t = explode.str.strip('0123456789. ').str.lower().value_counts(normalize=True).head(20)
    # Fixed-point formatting: slicing str(value) garbles very small shares that
    # Python prints in scientific notation (e.g. '9.1e-05'[:5]).
    pairs = [(title, '{:.3f}'.format(share)) for title, share in zip(t.index, t)]
    # Wrapping in a Series lets columns with fewer than 20 distinct titles be
    # padded with NaN instead of raising a length-mismatch error.
    top_title_shares[col] = pd.Series(pairs)
match_section_dist = pd.DataFrame(top_title_shares)
match_section_dist.head(20)

Unnamed: 0,Purpose,Design/methodology/approach,Findings,Originality/value
0,"(introduction, 0.436)","(introduction, 0.207)","(introduction, 0.132)","(introduction, 0.266)"
1,"(__no_title__, 0.053)","(methodology, 0.093)","(discussion, 0.083)","(discussion, 0.067)"
2,"(discussion, 0.035)","(method, 0.046)","(conclusion, 0.076)","(conclusion, 0.063)"
3,"(conclusion, 0.033)","(research methodology, 0.032)","(results, 0.075)","(conclusions, 0.043)"
4,"(literature review, 0.024)","(methods, 0.031)","(conclusions, 0.053)","(__no_title__, 0.034)"
5,"(conclusions, 0.021)","(__no_title__, 0.030)","(__no_title__, 0.027)","(literature review, 0.032)"
6,"(results, 0.015)","(results, 0.029)","(findings, 0.021)","(results, 0.014)"
7,"(methodology, 0.013)","(conclusion, 0.023)","(literature review, 0.018)","(methodology, 0.012)"
8,"(background, 0.007)","(discussion, 0.020)","(results and discussion, 0.018)","(discussion and conclusions, 0.010)"
9,"(findings, 0.006)","(literature review, 0.018)","(discussion and conclusion, 0.013)","(discussion and conclusion, 0.009)"


# Results from the previous run

In [28]:
# Raw (un-normalized) section titles per abstract part.
# NOTE: this rebinds `match_section_dist`, which the cells above used for the
# top-20 summary table; the two cells below read this raw frame.
match_section_dist = pd.DataFrame.from_records(title_distributions, columns=list(title_distributions[0].keys()))
match_section_dist['Purpose'].explode().value_counts().head(20)

Introduction                  2672
1. Introduction               1816
__NO_TITLE__                   586
1 Introduction                 297
Discussion                     292
Conclusion                     216
Literature review              133
2. Literature review           120
Results                        117
Conclusions                    102
Methodology                     93
Background                      53
Findings                        51
6. Conclusion                   48
5. Discussion                   42
5. Conclusion                   40
5. Conclusions                  36
6. Conclusions                  36
Method                          35
Discussion and conclusions      35
Name: Purpose, dtype: int64

In [31]:
# Most frequent raw section titles matched to 'Findings' sentences. Requires
# `match_section_dist` to be the raw frame built from `title_distributions`.
match_section_dist['Findings'].explode().value_counts().head(20)

Introduction                 1119
Discussion                    996
Results                       834
1. Introduction               798
Conclusion                    693
__NO_TITLE__                  424
Conclusions                   405
Findings                      254
4. Results                    189
5. Conclusion                 167
Literature review             155
Results and discussion        151
5. Discussion                 147
1 Introduction                144
5. Conclusions                121
6. Conclusion                 121
2. Literature review          114
Discussion and conclusion     108
6. Conclusions                103
Methodology                    93
Name: Findings, dtype: int64

In [32]:
# Most frequent raw section titles matched to 'Originality/value' sentences. Requires
# `match_section_dist` to be the raw frame built from `title_distributions`.
match_section_dist['Originality/value'].explode().value_counts().head(20)

Introduction                  1501
1. Introduction               1147
Discussion                     548
Conclusion                     395
__NO_TITLE__                   369
Conclusions                    237
1 Introduction                 211
2. Literature review           165
Literature review              157
Results                         97
6. Conclusion                   84
5. Conclusion                   84
5. Discussion                   79
Methodology                     77
5. Conclusions                  76
6. Conclusions                  59
Findings                        57
Discussion and conclusion       57
Discussion and conclusions      54
Background                      46
Name: Originality/value, dtype: int64

In [82]:
#### 
# NOTE(review): the execution count (82) and statistics that differ from the
# earlier describe() output suggest `scores` came from a different run here —
# confirm which configuration produced these numbers.
pd.DataFrame.from_records(scores, columns=list(scores[0].keys())).describe()

Unnamed: 0,rouge-1,rouge-2,rouge-l
count,6000.0,6000.0,6000.0
mean,0.370458,0.173134,0.212374
std,0.087278,0.097217,0.091241
min,0.101266,0.0,0.047201
25%,0.310545,0.104137,0.147651
50%,0.361446,0.152727,0.193983
75%,0.42,0.221802,0.254866
max,0.772152,0.703863,0.738875
