In [1]:
import json
import pandas as pd

def lines2df(lines):
    """Parse JSON-lines text into a DataFrame with one row per JSON object.

    Args:
        lines: iterable of strings, each holding one JSON object.
            Blank / whitespace-only lines are skipped.

    Returns:
        pandas.DataFrame whose columns are the keys of the first parsed
        record, in that order. An empty DataFrame when no record is found.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    ex_dicts = []
    for line in lines:
        line = line.strip()
        if not line:
            # e.g. a trailing empty line; json.loads('') would raise
            continue
        ex_dicts.append(json.loads(line))

    if not ex_dicts:
        # Nothing parsed: ex_dicts[0] below would raise IndexError
        return pd.DataFrame()

    # Column order follows the key order of the first record
    return pd.DataFrame.from_records(ex_dicts, columns=list(ex_dicts[0].keys()))

# Alternative input file: '/home/ubuntu/efs/lei/emerald_new/emerald.jsonl'
# The context manager closes the file handle as soon as the lines are read.
with open('/home/ubuntu/efs/lei/emerald_new/test.jsonl', 'r') as f:
    lines = f.readlines()

emerald_df = lines2df(lines)

In [2]:
def concat_paragraph(sections):
    """Flatten nested sections into a single newline-joined string.

    Args:
        sections: iterable of sections, each an iterable of paragraph strings.

    Returns:
        All paragraphs, in order, joined with a newline character.
    """
    paragraphs = []
    for section in sections:
        paragraphs.extend(section)
    return '\n'.join(paragraphs)
# Derive one flat text column from each nested (sections -> paragraphs) column.
for flat_col, nested_col in (('fulltext', 'sections'), ('abstracttext', 'abstract_sections')):
    emerald_df[flat_col] = emerald_df[nested_col].apply(concat_paragraph)

In [3]:
def find_section_title_like(section_names, section_text, cuewords):
    """Collect the text of every section whose title contains a cue word.

    Args:
        section_names: section titles; matched case-insensitively on the
            title side (titles are lowercased, cue words are used as given).
        section_text: per-section lists of paragraph strings, aligned with
            section_names.
        cuewords: substrings to look for in the lowercased title.

    Returns:
        Paragraphs of all matching sections joined with single spaces;
        an empty string when no title matches.
    """
    matched = []
    for title, paragraphs in zip(section_names, section_text):
        lowered = title.lower()
        # A section is taken at most once, however many cue words it matches
        if any(cue in lowered for cue in cuewords):
            matched.append(' '.join(paragraphs))
    return ' '.join(matched)

# Cue words (matched against lowercased section titles) for each derived source column.
source_cuewords = {
    'source_introduction': ['intro', 'purpose'],
    'source_design': ['design', 'method', 'approach'],
    'source_result': ['result', 'find', 'discuss', 'analy'],
    'source_conclusion': ['conclu', 'future'],
    'source_related': ['related work', 'literat', 'background'],
}

for source_col, cues in source_cuewords.items():
    # Bind `cues` as a default argument so each lambda keeps its own cue list
    emerald_df[source_col] = emerald_df.apply(
        lambda row, cues=cues: find_section_title_like(row['section_names'], row['sections'], cues),
        axis=1)

In [4]:
# Abstract headings: ('Purpose', 'Design/methodology/approach', 'Findings', 'Originality/value')
target_headings = {
    'target_introduction': 'Purpose',
    'target_design': 'Design/methodology/approach',
    'target_findings': 'Findings',
    'target_originality': 'Originality/value',
}

for target_col, heading in target_headings.items():
    # The lowercased heading is the single cue word for this abstract part
    heading_cues = [heading.lower()]
    emerald_df[target_col] = emerald_df.apply(
        lambda row, cues=heading_cues: find_section_title_like(
            row['abstract_sections_names'], row['abstract_sections'], cues),
        axis=1)

In [5]:
# Keep only the raw sections, the flattened body text and the derived
# source_* / target_* columns; every other column is dropped from here on.
kept_columns = [
    'section_names', 'sections', 'fulltext',
    'source_introduction', 'source_design', 'source_result', 'source_conclusion', 'source_related',
    'target_introduction', 'target_design', 'target_findings', 'target_originality',
]
emerald_df = emerald_df[kept_columns]
emerald_df.head(3)

Unnamed: 0,section_names,sections,fulltext,source_introduction,source_design,source_result,source_conclusion,source_related,target_introduction,target_design,target_findings,target_originality
0,"[1. Introduction, 2. Writing qualitatively in ...",[[The increasing institutional pressure to pub...,The increasing institutional pressure to publi...,The increasing institutional pressure to publi...,Our aim was to unearth data from like-minded c...,"Reading, and re-reading, the questions and ans...",Figure 3 pulls together the various patterned ...,,- The purpose of this paper is to report on a ...,- Scholars who had published qualitative paper...,- Entrepreneurship scholars perceive their qua...,- Although there is a vigorous debate within t...
1,"[1. Introduction, 2. Literature review and hyp...",[[Today's manufacturing companies are far more...,Today's manufacturing companies are far more a...,Today's manufacturing companies are far more a...,The implementation of LM and FMS in US automot...,4.1 Measurement model assessment Four tests we...,"Ambiguity, uncertainty and complexity are just...","LM, also known as the Toyota Production System...",The purpose of this paper is to identify the m...,A survey questionnaire was developed based on ...,Lean and FMS are multi-dimensional philosophie...,This research empirically develops a framework...
2,"[Introduction, Methodology, Case description, ...","[[In the last 20 years, there has been a drama...","In the last 20 years, there has been a dramati...","In the last 20 years, there has been a dramati...",Study design We chose an in-depth case study a...,Empirical patterns In the chain of pre-merger ...,,,- The purpose of this paper is to examine how ...,- Based on extensive document analysis and 35 ...,"- Spanning nearly a decade, the pre-merger pro...",- This is the first systematic in-depth study ...


In [6]:
# Copied from https://github.com/memray/bigsum
import numpy as np
from nltk.stem.porter import PorterStemmer
import spacy
from helper import rouge

# Shared Porter stemmer instance; oracle_extract calls stemmer.stem() when stemming=True.
stemmer = PorterStemmer()

def lcs(X, Y):
    """Return the length of the longest common subsequence of X and Y.

    Args:
        X: a sequence (string, list of tokens, ...) supporting len() and indexing.
        Y: a sequence of the same kind; elements are compared with ==.

    Returns:
        int: length of the longest (not necessarily contiguous) subsequence
        that appears in both X and Y; 0 when either input is empty.
    """
    m = len(X)
    n = len(Y)
    if m == 0 or n == 0:
        return 0

    # Bottom-up DP. prev[j] holds the LCS length of X[0..i-2] and Y[0..j-1];
    # only the previous row is ever read, so two rows replace the full table.
    prev = [0] * (n + 1)
    for i in range(1, m + 1):
        curr = [0] * (n + 1)
        x_item = X[i - 1]
        for j in range(1, n + 1):
            if x_item == Y[j - 1]:
                curr[j] = prev[j - 1] + 1
            else:
                curr[j] = max(prev[j], curr[j - 1])
        prev = curr

    # prev[n] is the LCS length of X[0..m-1] and Y[0..n-1]
    return prev[n]

def oracle_extract(sents, summary_sents, match_idx_acc, lower=True, stemming=True):
    """Greedily pick, for each summary sentence, the best-overlapping source sentence.

    Overlap is the LCS length (see `lcs`) between token lists. A source
    sentence is selected at most once, and indices in `match_idx_acc` are
    never selected.

    Args:
        sents: tokenized source sentences (list of lists of str).
        summary_sents: tokenized summary sentences (list of lists of str).
        match_idx_acc: indices into `sents` already selected by earlier calls.
        lower: lowercase all tokens before matching.
        stemming: stem all tokens with the module-level Porter stemmer before matching.

    Returns:
        (match_idx, match_scores): the selected indices into `sents`, in
        summary-sentence order, and the LCS length of each selection.
    """
    match_idx = []
    match_scores = []

    # lowercase
    if lower:
        sents = [[w.lower() for w in sent] for sent in sents]
        summary_sents = [[w.lower() for w in sent] for sent in summary_sents]
    # stemming
    if stemming:
        sents = [[stemmer.stem(w) for w in sent] for sent in sents]
        summary_sents = [[stemmer.stem(w) for w in sent] for sent in summary_sents]

    for summary_id, summary_sent in enumerate(summary_sents):
        # rarely happens: there are more summary sentences than source sentences
        if summary_id >= len(sents):
            break

        match_score = [lcs(sent, summary_sent) for sent in sents]

        # Exclude already-selected sentences. Masking with -1 (not 0) matters:
        # with 0, a taken sentence ties with every zero-overlap candidate and
        # argmax (first maximum) could select it a second time.
        for taken_id in (match_idx + list(match_idx_acc)):
            match_score[taken_id] = -1

        # int() keeps the result a plain Python int rather than np.int64
        match_id = int(np.argmax(match_score))
        if match_score[match_id] < 0:
            # every source sentence is already taken; nothing left to select
            break

        match_idx.append(match_id)
        match_scores.append(match_score[match_id])

    return match_idx, match_scores


def eval_rouge(sents, summary_sents, extract_sent_idx, number_to_cutoff=3, stopwords_removal=False, stemming=True, logger=None):
    """Score the extracted sentences against a reference summary with ROUGE.

    Args:
        sents: tokenized source sentences (list of lists of str).
        summary_sents: the reference summary as a single string.
        extract_sent_idx: indices into `sents` forming the hypothesis, used in
            the given order; indices >= len(sents) are ignored.
        number_to_cutoff: currently unused (no cutoff is applied); kept for
            interface compatibility.
        stopwords_removal: forwarded to rouge.Rouge.
        stemming: forwarded to rouge.Rouge.
        logger: currently unused; kept for interface compatibility.

    Returns:
        dict mapping each ROUGE metric name to its F-score. When the
        hypothesis or the reference is empty, all of 'rouge-1', 'rouge-2'
        and 'rouge-l' are 0.0.
    """
    extracted_sents = [sents[idx] for idx in extract_sent_idx if idx < len(sents)]
    hypothesis = ' '.join(' '.join(tokens) for tokens in extracted_sents)
    reference = summary_sents

    if reference is None or not hypothesis.strip() or not reference.strip():
        # Nothing to compare. The keys are spelled out here because the
        # previously referenced `metric_keys` is not defined anywhere in
        # this notebook, so this branch used to raise NameError.
        return {k: 0.0 for k in ('rouge-1', 'rouge-2', 'rouge-l')}

    # Build the scorer only when there is something to score
    rouge_ = rouge.Rouge(stopwords_removal=stopwords_removal, stemming=stemming)
    scores = rouge_.get_scores(hypothesis, reference)
    return {k: v['f'] for k, v in scores[0].items()}

In [7]:
from multiprocessing import Pool

# spacy_nlp = spacy.load('en_core_web_sm')
from pysbd.utils import PySBDFactory
# Blank English pipeline with a pySBD component added; oracle_score iterates
# `.sents` of the documents it produces (presumably pySBD supplies those
# sentence boundaries — TODO confirm).
# NOTE(review): passing a component object to add_pipe looks like the spaCy v2
# calling convention — confirm against the installed spaCy version.
spacy_nlp = spacy.blank('en')
spacy_nlp.add_pipe(PySBDFactory(spacy_nlp))

def oracle_score(section_text, abstract_text):
    """Compute the oracle-extraction ROUGE F-scores of a section against an abstract part.

    Both texts are split into tokenized sentences with `spacy_nlp`; the best
    matching source sentences are chosen by `oracle_extract` and scored
    against the re-joined abstract tokens by `eval_rouge`.

    Args:
        section_text: source text to extract sentences from.
        abstract_text: reference text.

    Returns:
        The dict returned by eval_rouge.
    """
    def split_into_token_lists(text):
        # one list of token strings per sentence
        return [[token.text for token in sentence] for sentence in spacy_nlp(text).sents]

    source_sents = split_into_token_lists(section_text)
    target_sents = split_into_token_lists(abstract_text)

    extract_sent_idx, _ = oracle_extract(source_sents, target_sents, [])

    reference = ' '.join(token for sentence in target_sents for token in sentence)
    return eval_rouge(source_sents, reference, extract_sent_idx)

In [8]:
from multiprocessing import Pool
import sys
# Work around an issue in the rouge code by raising the recursion limit
# (presumably it recurses deeply on long inputs — TODO confirm).
sys.setrecursionlimit(25000)

from tqdm import tqdm


# One scoring task per (source section column, abstract part column) pair,
# named '<source>_TO_<target>'.
tasks = [
    (src_col, target_col, src_col + '_TO_' + target_col)
    for src_col in ('source_introduction', 'source_design', 'source_result', 'source_conclusion', 'source_related')
    for target_col in ('target_introduction', 'target_design', 'target_findings', 'target_originality')
]
        
def func(a):
    """Score one (source column, target column) task over every row of emerald_df.

    Args:
        a: tuple (src_col, target_col, task_name).

    Returns:
        (task_name, task_scores) where task_scores holds one oracle_score
        result per row, in row order.
    """
    src_col, target_col, task_name = a
    src_series = emerald_df[src_col]
    target_series = emerald_df[target_col]

    task_scores = []
    for row in range(len(emerald_df)):
        src_text = src_series[row]
        if not src_text.strip():
            # no section matched the cue words: fall back to the full body text
            src_text = emerald_df['fulltext'][row]
        task_scores.append(oracle_score(src_text, target_series[row]))
    return task_name, task_scores

scores = {}

# Run the tasks on 10 worker processes. The context manager shuts the workers
# down on exit; all results are consumed inside the block, so nothing is lost.
with Pool(10) as p:
    # Results arrive in completion order, hence the task_name key
    r = p.imap_unordered(func, tasks, chunksize=1)
    for task_name, task_scores in tqdm(r, total=len(tasks)):
        scores[task_name] = task_scores


100%|██████████| 20/20 [2:30:43<00:00, 529.15s/it] 


In [9]:
import json
# Persist the per-task scores; the context manager flushes and closes the file.
with open('oracle_score_testset_per_section.json', 'w') as f:
    json.dump(scores, f)

In [10]:
def _describe_task(task_name):
    """Summary statistics of one task's scores, columns prefixed with '<task_name>-'."""
    records = scores[task_name]
    stats = pd.DataFrame.from_records(records, columns=list(records[0].keys())).describe()
    stats.columns = [task_name + '-' + metric for metric in stats.columns]
    return stats

# One describe() table per task, placed side by side
df_scores = [_describe_task(col) for col in scores]
df_score = pd.concat(df_scores, axis=1)
df_score

Unnamed: 0,source_introduction_TO_target_originality-rouge-1,source_introduction_TO_target_originality-rouge-2,source_introduction_TO_target_originality-rouge-l,source_introduction_TO_target_introduction-rouge-1,source_introduction_TO_target_introduction-rouge-2,source_introduction_TO_target_introduction-rouge-l,source_introduction_TO_target_findings-rouge-1,source_introduction_TO_target_findings-rouge-2,source_introduction_TO_target_findings-rouge-l,source_introduction_TO_target_design-rouge-1,...,source_related_TO_target_introduction-rouge-l,source_related_TO_target_design-rouge-1,source_related_TO_target_design-rouge-2,source_related_TO_target_design-rouge-l,source_related_TO_target_originality-rouge-1,source_related_TO_target_originality-rouge-2,source_related_TO_target_originality-rouge-l,source_related_TO_target_findings-rouge-1,source_related_TO_target_findings-rouge-2,source_related_TO_target_findings-rouge-l
count,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,...,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0
mean,0.368808,0.155045,0.269508,0.457045,0.238482,0.354857,0.35723,0.147054,0.247229,0.332931,...,0.318317,0.35466,0.155778,0.261765,0.375155,0.172119,0.282177,0.419727,0.212939,0.313637
std,0.167722,0.181125,0.177858,0.178697,0.202183,0.200119,0.161776,0.177803,0.174079,0.165885,...,0.198707,0.181047,0.178707,0.18367,0.185575,0.204824,0.199178,0.190497,0.214868,0.210663
min,0.032,0.0,0.014038,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.027491,0.0,0.011916,0.0,0.0,0.0
25%,0.255319,0.045455,0.153249,0.328767,0.095973,0.207318,0.252632,0.04789,0.140375,0.215385,...,0.171882,0.220339,0.035503,0.130454,0.25,0.045977,0.149159,0.28,0.064516,0.161508
50%,0.334606,0.095849,0.219268,0.432266,0.178114,0.308848,0.321429,0.088999,0.195224,0.303571,...,0.262959,0.320755,0.094229,0.208929,0.333333,0.098361,0.219553,0.376747,0.134328,0.247592
75%,0.44,0.186145,0.328123,0.557692,0.322705,0.458208,0.412371,0.166667,0.285062,0.416667,...,0.414231,0.455508,0.211567,0.341025,0.451613,0.20339,0.341603,0.523364,0.285714,0.405685
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [25]:
# Pivot the mean F-scores: one column per source section, one row per
# '<target>-<metric>' label.
tmp = {}
for col, mean_f in df_score.loc['mean'].items():
    src, tgt = col.split('_TO_')
    tmp.setdefault(src, {})[tgt] = mean_f
df_cmp = pd.DataFrame(tmp)
# Sort rows by the last character of the label ('1', '2', 'l'), which groups
# them by ROUGE variant
df_cmp.sort_index(key=lambda labels: labels.str[-1])

Unnamed: 0,source_introduction,source_design,source_conclusion,source_result,source_related
target_originality-rouge-1,0.368808,0.313198,0.388202,0.345507,0.375155
target_introduction-rouge-1,0.457045,0.35489,0.396434,0.369133,0.41803
target_findings-rouge-1,0.35723,0.351481,0.446608,0.44052,0.419727
target_design-rouge-1,0.332931,0.396064,0.324536,0.317935,0.35466
target_originality-rouge-2,0.155045,0.113653,0.175267,0.141955,0.172119
target_introduction-rouge-2,0.238482,0.140232,0.163931,0.151171,0.202797
target_findings-rouge-2,0.147054,0.148875,0.229408,0.225146,0.212939
target_design-rouge-2,0.133771,0.179992,0.11519,0.11436,0.155778
target_originality-rouge-l,0.269508,0.223925,0.291038,0.252618,0.282177
target_introduction-rouge-l,0.354857,0.258315,0.290877,0.26928,0.318317


In [29]:
# Distribution of single-space-separated chunk counts in the matched introduction text
# (an empty string still counts as 1 because ''.split(' ') == [''])
emerald_df['source_introduction'].map(lambda text: len(text.split(' '))).describe()

count    6000.000000
mean      610.789500
std       506.085361
min         1.000000
25%       316.000000
50%       571.000000
75%       830.000000
max      7260.000000
Name: source_introduction, dtype: float64

In [31]:
# Distribution of single-space-separated chunk counts in the matched conclusion text
# (an empty string still counts as 1 because ''.split(' ') == [''])
emerald_df['source_conclusion'].map(lambda text: len(text.split(' '))).describe()

count    6000.000000
mean      494.767500
std       491.688532
min         1.000000
25%       132.000000
50%       393.500000
75%       705.250000
max      7239.000000
Name: source_conclusion, dtype: float64

In [30]:
# Show which columns emerald_df currently holds
emerald_df.columns

Index(['section_names', 'sections', 'fulltext', 'source_introduction',
       'source_design', 'source_result', 'source_conclusion', 'source_related',
       'target_introduction', 'target_design', 'target_findings',
       'target_originality'],
      dtype='object')