In [12]:
import json
import pandas as pd

def lines2df(lines):
    """Parse JSON-lines text into a DataFrame with one row per JSON record.

    Args:
        lines: iterable of strings, each holding one JSON object. Lines that
            are empty after stripping whitespace (e.g. a trailing newline at
            the end of the file) are skipped instead of crashing json.loads.

    Returns:
        A DataFrame whose columns follow the key order of the first record.
        An empty DataFrame is returned when there are no records.
    """
    stripped = (line.strip() for line in lines)
    ex_dicts = [json.loads(line) for line in stripped if line]
    if not ex_dicts:
        # Nothing to infer columns from; avoid IndexError on ex_dicts[0].
        return pd.DataFrame()
    return pd.DataFrame.from_records(ex_dicts, columns=list(ex_dicts[0].keys()))

# Alternative input (full corpus): '/home/ubuntu/efs/lei/emerald_new/emerald.jsonl'
# Read the test split; the context manager closes the file handle once the
# lines are in memory.
with open('/home/ubuntu/efs/lei/emerald_new/test.jsonl', 'r') as f:
    lines = f.readlines()

emerald_df = lines2df(lines)

In [13]:
def concat_paragraph(sections):
    """Flatten a list of sections (each a list of paragraph strings) into one
    newline-separated string, keeping the original order."""
    paragraphs = []
    for section in sections:
        paragraphs.extend(section)
    return '\n'.join(paragraphs)
# Build flat-text versions of the paper body and of the structured abstract.
for flat_col, nested_col in (('fulltext', 'sections'),
                             ('abstracttext', 'abstract_sections')):
    emerald_df[flat_col] = emerald_df[nested_col].apply(concat_paragraph)

In [14]:
def find_section_title_like(section_names, section_text, cuewords):
    """Return the text of every section whose title contains a cue word.

    Titles are lowercased before the substring test, so cue words are expected
    in lowercase. The paragraphs of each matching section are joined with a
    space, and the matching sections are joined with a space as well. An empty
    string is returned when no title matches.
    """
    matched = []
    for title, paragraphs in zip(section_names, section_text):
        lowered = title.lower()
        if any(cue in lowered for cue in cuewords):
            matched.append(' '.join(paragraphs))
    return ' '.join(matched)

def find_section_title_not_like(section_names, section_text, cuewords):
    """Return the text of every section whose title contains none of the cue words.

    This is the complement of find_section_title_like: titles are lowercased,
    sections matching any cue word are dropped, and the remaining sections are
    joined with spaces (paragraphs within a section are also space-joined).
    """
    kept = []
    for title, paragraphs in zip(section_names, section_text):
        lowered = title.lower()
        if not any(cue in lowered for cue in cuewords):
            kept.append(' '.join(paragraphs))
    return ' '.join(kept)

# Columns holding only the sections whose title matches the cue words.
for column, cues in (
    ('source_introduction', ['intro', 'purpose']),
    ('source_conclusion', ['conclu', 'future']),
):
    emerald_df[column] = emerald_df.apply(
        lambda row, cues=cues: find_section_title_like(row['section_names'], row['sections'], cues),
        axis=1,
    )

# "Leave-one-out" columns: the paper text with the sections matching the cue
# words removed.
for column, cues in (
    ('source_loo_introduction', ['intro', 'purpose']),
    ('source_loo_design', ['design', 'method', 'approach']),
    ('source_loo_result', ['result', 'find', 'discuss', 'analy']),
    ('source_loo_conclusion', ['conclu', 'future']),
    ('source_loo_related', ['related work', 'literat', 'background']),
):
    emerald_df[column] = emerald_df.apply(
        lambda row, cues=cues: find_section_title_not_like(row['section_names'], row['sections'], cues),
        axis=1,
    )

In [15]:
# Introduction text followed by conclusion text, separated by a single space.
intro_text = emerald_df['source_introduction']
conclusion_text = emerald_df['source_conclusion']
emerald_df['source_IC'] = intro_text + ' ' + conclusion_text

In [16]:
# Abstract section headings used as targets:
# ('Purpose', 'Design/methodology/approach', 'Findings', 'Originality/value')
for column, heading in (
    ('target_introduction', 'Purpose'),
    ('target_design', 'Design/methodology/approach'),
    ('target_findings', 'Findings'),
    ('target_originality', 'Originality/value'),
):
    emerald_df[column] = emerald_df.apply(
        lambda row, cues=[heading.lower()]: find_section_title_like(
            row['abstract_sections_names'], row['abstract_sections'], cues),
        axis=1,
    )

In [17]:
# Keep only the columns needed for the oracle experiments.
kept_columns = [
    'section_names', 'sections', 'fulltext', 'source_IC',
    'source_loo_introduction', 'source_loo_design',
    'source_loo_result', 'source_loo_conclusion', 'source_loo_related',
    'target_introduction', 'target_design', 'target_findings', 'target_originality',
]
emerald_df = emerald_df[kept_columns]
emerald_df.head(3)

Unnamed: 0,section_names,sections,fulltext,source_IC,source_loo_introduction,source_loo_design,source_loo_result,source_loo_conclusion,source_loo_related,target_introduction,target_design,target_findings,target_originality
0,"[1. Introduction, 2. Writing qualitatively in ...",[[The increasing institutional pressure to pub...,The increasing institutional pressure to publi...,The increasing institutional pressure to publi...,Research in the areas of entrepreneurship and ...,The increasing institutional pressure to publi...,The increasing institutional pressure to publi...,The increasing institutional pressure to publi...,The increasing institutional pressure to publi...,- The purpose of this paper is to report on a ...,- Scholars who had published qualitative paper...,- Entrepreneurship scholars perceive their qua...,- Although there is a vigorous debate within t...
1,"[1. Introduction, 2. Literature review and hyp...",[[Today's manufacturing companies are far more...,Today's manufacturing companies are far more a...,Today's manufacturing companies are far more a...,"LM, also known as the Toyota Production System...",Today's manufacturing companies are far more a...,Today's manufacturing companies are far more a...,Today's manufacturing companies are far more a...,Today's manufacturing companies are far more a...,The purpose of this paper is to identify the m...,A survey questionnaire was developed based on ...,Lean and FMS are multi-dimensional philosophie...,This research empirically develops a framework...
2,"[Introduction, Methodology, Case description, ...","[[In the last 20 years, there has been a drama...","In the last 20 years, there has been a dramati...","In the last 20 years, there has been a dramati...",Study design We chose an in-depth case study a...,"In the last 20 years, there has been a dramati...","In the last 20 years, there has been a dramati...","In the last 20 years, there has been a dramati...","In the last 20 years, there has been a dramati...",- The purpose of this paper is to examine how ...,- Based on extensive document analysis and 35 ...,"- Spanning nearly a decade, the pre-merger pro...",- This is the first systematic in-depth study ...


In [18]:
# Copied from https://github.com/memray/bigsum
import numpy as np
from nltk.stem.porter import PorterStemmer
import spacy
from helper import rouge

# Shared Porter stemmer instance; oracle_extract calls stemmer.stem() on every
# token when its `stemming` flag is set.
stemmer = PorterStemmer()

def lcs(X, Y):
    """Return the length of the longest common subsequence of X and Y.

    Args:
        X: first sequence (e.g. a string or a list of tokens).
        Y: second sequence; must support len().

    Returns:
        int: the LCS length, 0 when either sequence is empty.

    Standard bottom-up dynamic programme. After processing the first i items
    of X, prev_row[j] holds the LCS length of X[0..i-1] and Y[0..j-1]. Only the
    previous row is ever read, so two rows are kept instead of the full
    (len(X) + 1) x (len(Y) + 1) table.
    """
    prev_row = [0] * (len(Y) + 1)
    for x_item in X:
        curr_row = [0]  # column 0: LCS against an empty prefix of Y
        for j, y_item in enumerate(Y, start=1):
            if x_item == y_item:
                curr_row.append(prev_row[j - 1] + 1)
            else:
                curr_row.append(max(prev_row[j], curr_row[j - 1]))
        prev_row = curr_row
    return prev_row[-1]

def oracle_extract(sents, summary_sents, match_idx_acc, lower=True, stemming=True):
    """Greedily pick, for each summary sentence, the source sentence with the longest LCS.

    Args:
        sents: source sentences, each a list of token strings.
        summary_sents: summary sentences, each a list of token strings.
        match_idx_acc: positions in `sents` selected earlier; they are never
            picked again. Expected to be non-negative positions.
        lower: lowercase all tokens before matching.
        stemming: stem all tokens with the module-level `stemmer` before matching.

    Returns:
        (match_idx, match_scores): the selected position in `sents` for each
        processed summary sentence, and the LCS length of each selection.
        At most len(sents) summary sentences are processed.
    """
    # lowercase
    if lower:
        sents = [[w.lower() for w in sent] for sent in sents]
        summary_sents = [[w.lower() for w in sent] for sent in summary_sents]
    # stemming
    if stemming:
        sents = [[stemmer.stem(w) for w in sent] for sent in sents]
        summary_sents = [[stemmer.stem(w) for w in sent] for sent in summary_sents]

    match_idx = []
    match_scores = []
    # Sentences already taken (now or in earlier calls) are scored 0 without
    # running the LCS computation for them.
    excluded = set(match_idx_acc)

    for summary_id, summary_sent in enumerate(summary_sents):
        # rarely happens, number of summaries is larger than sents
        if summary_id >= len(sents):
            break

        match_score = [0 if sent_id in excluded else lcs(sent, summary_sent)
                       for sent_id, sent in enumerate(sents)]

        # np.argmax returns the first maximum, so when every remaining score
        # is 0 the result is position 0 even if it was already selected.
        match_id = int(np.argmax(match_score))
        match_idx.append(match_id)
        match_scores.append(match_score[match_id])
        excluded.add(match_id)

    return match_idx, match_scores


def eval_rouge(sents, summary_sents, extract_sent_idx, number_to_cutoff=3, stopwords_removal=False, stemming=True, logger=None):
    """Score the extracted sentences against a reference summary with ROUGE.

    Args:
        sents: source sentences, each a list of token strings.
        summary_sents: the reference summary as a single string (or None).
        extract_sent_idx: positions in `sents` to use as the hypothesis, in the
            given order; positions >= len(sents) are ignored.
        number_to_cutoff: accepted for compatibility; currently unused because
            truncating and sorting of the extracted indices are disabled.
        stopwords_removal: forwarded to rouge.Rouge.
        stemming: forwarded to rouge.Rouge.
        logger: accepted for compatibility; unused.

    Returns:
        dict mapping each metric name to its F-score. When the hypothesis or
        the reference is empty, every metric is 0.0.
    """
    # extract_sent_idx = extract_sent_idx[: min(number_to_cutoff, len(extract_sent_idx))]
    # sort extracted sentences in the order of their appearance
    # extract_sent_idx = sorted(extract_sent_idx)
    extracted_sents = [sents[idx] for idx in extract_sent_idx if idx < len(sents)]
    hypothesis = ' '.join(' '.join(i) for i in extracted_sents)
    reference = summary_sents

    if reference is None or not hypothesis.strip() or not reference.strip():
        # The original referenced an undefined `metric_keys` here and raised
        # NameError. These keys match the metric names of the scored path as
        # they appear in the notebook's results tables.
        return {k: 0.0 for k in ('rouge-1', 'rouge-2', 'rouge-l')}

    # Build the scorer only when there is something to score.
    rouge_ = rouge.Rouge(stopwords_removal=stopwords_removal, stemming=stemming)
    scores = rouge_.get_scores(hypothesis, reference)
    return {k: v['f'] for k, v in scores[0].items()}

In [19]:
from multiprocessing import Pool

# spacy_nlp = spacy.load('en_core_web_sm')
from pysbd.utils import PySBDFactory
# Blank English pipeline with the pySBD component added; oracle_score iterates
# over `.sents` of the documents it produces.
# NOTE(review): passing a component object to add_pipe looks like the
# spaCy 2.x API — confirm against the installed spaCy/pysbd versions.
spacy_nlp = spacy.blank('en')
spacy_nlp.add_pipe(PySBDFactory(spacy_nlp))

def oracle_score(section_text, abstract_text):
    """Compute the oracle-extraction ROUGE F-scores of a source text against a target text.

    Both texts are split into tokenized sentences with `spacy_nlp`; for each
    target sentence the best-matching source sentence is chosen by
    oracle_extract, and the chosen sentences are scored by eval_rouge against
    the space-joined target tokens.
    """
    def tokenized_sentences(text):
        return [[token.text for token in sentence] for sentence in spacy_nlp(text).sents]

    source_sents = tokenized_sentences(section_text)
    target_sents = tokenized_sentences(abstract_text)

    selected_idx, _ = oracle_extract(source_sents, target_sents, [])

    reference = ' '.join(token for sentence in target_sents for token in sentence)
    return eval_rouge(source_sents, reference, selected_idx)

In [20]:
from multiprocessing import Pool
import sys
# Raise the interpreter's recursion limit to work around an issue in the rouge
# code (presumably deep recursion on long inputs — TODO confirm).
sys.setrecursionlimit(25000)

from tqdm import tqdm


# One task per (source column, target column) pair; the task name is
# '<source>_TO_<target>'.
source_columns = ('fulltext', 'source_IC', 'source_loo_introduction', 'source_loo_design',
                  'source_loo_result', 'source_loo_conclusion', 'source_loo_related')
target_columns = ('target_introduction', 'target_design', 'target_findings', 'target_originality')

tasks = [
    (src_col, target_col, src_col + '_TO_' + target_col)
    for src_col in source_columns
    for target_col in target_columns
]
        
def func(a):
    """Score every row of `emerald_df` for one (source column, target column) task.

    Args:
        a: tuple (src_col, target_col, task_name).

    Returns:
        (task_name, task_scores), where task_scores holds one oracle_score
        result per row, in row order.

    Rows are walked positionally, so the result does not depend on
    `emerald_df` having a default 0..n-1 index (the previous
    `emerald_df[col][i]` lookups were label-based).
    """
    src_col, target_col, task_name = a
    task_scores = []
    rows = zip(emerald_df[src_col], emerald_df[target_col], emerald_df['fulltext'])
    for src_text, target_text, full_text in rows:
        # Fall back to the full text when the selected source is empty.
        if not src_text.strip():
            src_text = full_text
        task_scores.append(oracle_score(src_text, target_text))
    return task_name, task_scores

scores = {}
# Run the tasks in 10 worker processes. Results arrive in completion order, so
# they are keyed by task name. The context manager shuts the workers down once
# every result has been consumed (the pool was previously never closed).
with Pool(10) as p:
    r = p.imap_unordered(func, tasks, chunksize=1)
    for task_name, task_scores in tqdm(r, total=len(tasks)):
        scores[task_name] = task_scores


100%|██████████| 28/28 [10:16:39<00:00, 901.58s/it]   


In [21]:
import json
# Persist the per-task scores; the context manager ensures the file is flushed
# and closed.
with open('oracle_score_testset_per_section_loo.json', 'w') as f:
    json.dump(scores, f)

In [22]:
# Summary statistics (describe()) of every metric for every task, laid out
# side by side with columns named '<task>-<metric>'.
df_scores = []
for task, task_records in scores.items():
    metric_names = list(task_records[0].keys())
    stats = pd.DataFrame.from_records(task_records, columns=metric_names).describe()
    stats.columns = ['-'.join((task, metric)) for metric in stats.columns]
    df_scores.append(stats)
df_score = pd.concat(df_scores, axis=1)
df_score

Unnamed: 0,source_IC_TO_target_originality-rouge-1,source_IC_TO_target_originality-rouge-2,source_IC_TO_target_originality-rouge-l,source_IC_TO_target_design-rouge-1,source_IC_TO_target_design-rouge-2,source_IC_TO_target_design-rouge-l,source_IC_TO_target_introduction-rouge-1,source_IC_TO_target_introduction-rouge-2,source_IC_TO_target_introduction-rouge-l,source_IC_TO_target_findings-rouge-1,...,source_loo_related_TO_target_introduction-rouge-l,source_loo_related_TO_target_findings-rouge-1,source_loo_related_TO_target_findings-rouge-2,source_loo_related_TO_target_findings-rouge-l,source_loo_related_TO_target_design-rouge-1,source_loo_related_TO_target_design-rouge-2,source_loo_related_TO_target_design-rouge-l,source_loo_related_TO_target_originality-rouge-1,source_loo_related_TO_target_originality-rouge-2,source_loo_related_TO_target_originality-rouge-l
count,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,...,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0
mean,0.411291,0.202381,0.314888,0.34351,0.140927,0.246822,0.460803,0.241298,0.359503,0.434082,...,0.360751,0.474111,0.274937,0.370539,0.412812,0.209302,0.313914,0.415939,0.223127,0.325789
std,0.193979,0.220806,0.211459,0.165022,0.159036,0.165172,0.179731,0.202788,0.201515,0.192515,...,0.221776,0.205962,0.232769,0.231855,0.193023,0.194915,0.201481,0.217379,0.239191,0.235116
min,0.032,0.0,0.014038,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.02926,0.0,0.009626,0.0,0.0,0.0,0.013468,0.0,0.006851
25%,0.273684,0.056838,0.166449,0.228571,0.039216,0.133688,0.330935,0.095238,0.208654,0.294118,...,0.193454,0.326013,0.09943,0.19201,0.280374,0.070175,0.165802,0.26087,0.055925,0.154648
50%,0.36716,0.123077,0.250407,0.315789,0.092593,0.202678,0.436131,0.181818,0.313902,0.392157,...,0.32061,0.45283,0.205267,0.322916,0.4,0.15534,0.278892,0.373908,0.133333,0.260106
75%,0.5,0.255963,0.390953,0.426667,0.181048,0.309635,0.565657,0.333333,0.467365,0.53125,...,0.490291,0.597267,0.380636,0.500726,0.525547,0.285714,0.417734,0.528905,0.30078,0.428571
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [23]:
# Pivot the mean scores into a table: one column per source,
# one row per '<target>-<metric>'.
tmp = {}
for col in df_score.columns:
    src, tgt = col.split('_TO_')
    tmp.setdefault(src, {})[tgt] = df_score[col]['mean']
df_cmp = pd.DataFrame(tmp)
# Sort rows by the last character of the label ('1', '2', 'l') so that rows of
# the same ROUGE variant are grouped together.
df_cmp.sort_index(key=lambda labels: labels.str[-1])

Unnamed: 0,source_IC,fulltext,source_loo_introduction,source_loo_result,source_loo_design,source_loo_conclusion,source_loo_related
target_originality-rouge-1,0.411291,0.43817,0.377425,0.403969,0.417433,0.377115,0.415939
target_design-rouge-1,0.34351,0.434247,0.382421,0.409257,0.35706,0.405143,0.412812
target_introduction-rouge-1,0.460803,0.481072,0.385701,0.459135,0.457322,0.447145,0.455875
target_findings-rouge-1,0.434082,0.496096,0.460755,0.431595,0.473933,0.422686,0.474111
target_originality-rouge-2,0.202381,0.237694,0.184535,0.206075,0.225019,0.17848,0.223127
target_design-rouge-2,0.140927,0.223999,0.177804,0.206195,0.159489,0.20251,0.209302
target_introduction-rouge-2,0.241298,0.267863,0.174263,0.251069,0.251273,0.242354,0.250718
target_findings-rouge-2,0.224632,0.290113,0.260221,0.230368,0.274492,0.215559,0.274937
target_originality-rouge-l,0.314888,0.347934,0.28699,0.311206,0.327451,0.283194,0.325789
target_design-rouge-l,0.246822,0.337025,0.282871,0.310591,0.260639,0.305866,0.313914
