## In this notebook we experiment with train.jsonl to see how to extract the Introduction (Intro) and Conclusion (Conclu) sections.

In [1]:
# Read train dataset.
import json
import pandas as pd

def lines2df(lines):
    """Parse JSON-lines text into a DataFrame, one row per JSON object.

    Args:
        lines: iterable of strings, each holding one JSON object. Surrounding
            whitespace is stripped; lines that are empty after stripping
            (e.g. a trailing blank line at the end of a file) are skipped.

    Returns:
        pandas.DataFrame whose columns are the keys of the first record, in
        that record's key order. An empty DataFrame when there are no records.
    """
    stripped_lines = (line.strip() for line in lines)
    ex_dicts = [json.loads(line) for line in stripped_lines if line]
    if not ex_dicts:
        # Nothing to infer columns from; avoid IndexError on ex_dicts[0].
        return pd.DataFrame()
    return pd.DataFrame.from_records(ex_dicts, columns=list(ex_dicts[0].keys()))
# Load the training split. The context manager closes the handle as soon as
# the lines are read instead of leaving it open for the rest of the session.
with open('/home/ubuntu/efs/emerald/train.jsonl', 'r') as f:
    lines = f.readlines()

emerald_df = lines2df(lines)

In [65]:
# Functions for extracting Intro+Conclu (IC).
from tqdm.auto import tqdm
def find_section_title_like(section_names, section_text, cuewords, include_title=True):
    """Collect the text of every section whose title contains a cue word.

    Matching is case-insensitive: both the title and the cue words are
    lowercased before the substring test. The title is emitted with its
    original casing, which is also how the positional fallbacks in
    find_intro/find_conclu emit titles.

    Args:
        section_names: section titles, parallel to ``section_text``.
        section_text: for each section, an iterable of strings; they are
            joined with newlines to form the section body.
        cuewords: substrings to look for inside each title.
        include_title: when True, each matching section's title is emitted
            on its own line before its body.

    Returns:
        str: the matching sections (in document order) joined with newlines,
        or '' when no title matches.
    """
    lowered_cues = [cueword.lower() for cueword in cuewords]
    pieces = []
    for title, body in zip(section_names, section_text):
        lowered_title = title.lower()
        # A section is taken once, no matter how many cue words it matches.
        if any(cue in lowered_title for cue in lowered_cues):
            if include_title:
                pieces.append(title)
            pieces.append('\n'.join(body))
    return '\n'.join(pieces)

def find_intro(df, include_title=True, max_words=1000):
    """Extract an introduction text for every row of ``df``.

    Sections whose title contains 'intro' or 'purpose' are used when present.
    Otherwise the first section is used as a fallback, and only in that case
    the text is truncated to its first ``max_words`` whitespace-separated
    words (truncation also collapses whitespace to single spaces).

    Args:
        df: DataFrame with 'section_names' and 'sections' columns.
        include_title: prepend section titles to the extracted text. The
            placeholder title '__NO_TITLE__' is treated as no title.
        max_words: word cap applied to fallback texts.

    Returns:
        (intr_text, except_titles, except_texts): ``intr_text`` has one entry
        per row; the other two lists hold the title and text of each row that
        needed the fallback, in row order.
    """
    except_titles = []
    except_texts = []
    intr_text = []
    for _, row in tqdm(df.iterrows(), total=df.shape[0]):
        section_names, sections = row['section_names'], row['sections']
        text = find_section_title_like(section_names=section_names,
                                       section_text=sections,
                                       cuewords=['intro', 'purpose'],  #### keywords for intro sections
                                       include_title=include_title)
        if not text:
            # Fallback: take the first section. A document without any section
            # yields an empty entry so outputs stay aligned with the rows.
            has_sections = len(section_names) > 0 and len(sections) > 0
            title = section_names[0] if has_sections else ''
            text = '\n'.join(sections[0]) if has_sections else ''
            if title == '__NO_TITLE__':
                title = ''
            # add section title into text results
            if include_title and title:
                text = '\n'.join((title, text))

            # cut long intro to the first max_words words
            words = text.split()
            if len(words) > max_words:
                text = ' '.join(words[:max_words])

            except_titles.append(title)
            except_texts.append(text)

        intr_text.append(text)
    return intr_text, except_titles, except_texts

def find_conclu(df, include_title=True, max_words=1000):
    """Extract a conclusion text for every row of ``df``.

    Sections whose title contains 'conclu' or 'future' are used when present.
    Otherwise the last section is used as a fallback, and only in that case
    the text is truncated to ``max_words`` whitespace-separated words
    (truncation also collapses whitespace to single spaces).

    Args:
        df: DataFrame with 'section_names' and 'sections' columns.
        include_title: prepend section titles to the extracted text. The
            placeholder title '__NO_TITLE__' is treated as no title.
        max_words: word cap applied to fallback texts.

    Returns:
        (conclu_text, except_titles, except_texts): ``conclu_text`` has one
        entry per row; the other two lists hold the title and text of each
        row that needed the fallback, in row order.
    """
    except_titles = []
    except_texts = []
    conclu_text = []
    for _, row in tqdm(df.iterrows(), total=df.shape[0]):
        section_names, sections = row['section_names'], row['sections']
        text = find_section_title_like(section_names=section_names,
                                       section_text=sections,
                                       cuewords=['conclu', 'future'],  #### keywords for conclu sections
                                       include_title=include_title)
        if not text:
            # Fallback: take the last section. A document without any section
            # yields an empty entry so outputs stay aligned with the rows.
            has_sections = len(section_names) > 0 and len(sections) > 0
            title = section_names[-1] if has_sections else ''
            text = '\n'.join(sections[-1]) if has_sections else ''
            if title == '__NO_TITLE__':
                title = ''
            # add section title into text results
            if include_title and title:
                text = '\n'.join((title, text))

            # cut long conclu to max_words words
            words = text.split()
            if len(words) > max_words:
                lowered_title = title.lower()
                if len(section_names) == 1 or 'intro' in lowered_title or 'purpose' in lowered_title:
                    # The last section is also what find_intro would pick, so
                    # keep the tail to avoid an identical intro and conclu.
                    # Slicing from len - max_words also behaves for max_words == 0.
                    text = ' '.join(words[len(words) - max_words:])
                else:
                    text = ' '.join(words[:max_words])

            except_titles.append(title)
            except_texts.append(text)

        conclu_text.append(text)
    return conclu_text, except_titles, except_texts

# Run both extractors on the training frame. Each call returns the extracted
# texts plus the titles and texts of the documents that needed the fallback.
(intr_text,
 intr_except_titles,
 intr_except_texts) = find_intro(df=emerald_df, include_title=True)
(conclu_text,
 conclu_except_titles,
 conclu_except_texts) = find_conclu(df=emerald_df, include_title=True)

  0%|          | 0/48024 [00:00<?, ?it/s]

  0%|          | 0/48024 [00:00<?, ?it/s]

In [66]:
# Document count, and how many documents fell back to first/last section.
print("total: {}, missing intro: {}, missing conclu: {}".format(
    *map(len, (intr_text, intr_except_titles, conclu_except_titles))))

total: 48024, missing intro: 7578, missing conclu: 10061


### If we cannot find an intro section, what is the first section?

In [67]:
# 20 most frequent first-section titles among documents without an intro-like
# title; the blank entry is the '__NO_TITLE__' placeholder mapped to ''.
pd.DataFrame(
    {'intr_except_titles': intr_except_titles}
).value_counts().head(20)

intr_except_titles         
                               5624
Background                      304
Review                           72
1. Background                    63
Overview                         19
1 Background                     14
Context                           8
Preamble                          8
Background to the study           7
1. Background of the study        7
Literature review                 6
Background of the study           5
Summary                           5
The context                       5
Main body of article review       5
Theoretical framework             5
Motivation                        5
Background and context            4
I                                 4
Theoretical background            4
dtype: int64

### If we cannot find a conclu section, what is the last section?

In [68]:
# 20 most frequent last-section titles among documents without a conclu-like
# title; the blank entry is the '__NO_TITLE__' placeholder mapped to ''.
pd.DataFrame(
    {'conclu_except_titles': conclu_except_titles}
).value_counts().head(20)

conclu_except_titles            
Discussion                          2577
                                     498
Comment                              467
5. Discussion                        336
Discussion and implications          257
Summary                              218
General discussion                   211
Note                                 203
6. Discussion                        156
4. Discussion                        114
Implications                         113
Managerial implications               88
Limitations                           70
Commentary                            69
Limitations and further research      60
5. Discussion and implications        55
Recommendations                       40
5. Final considerations               39
Introduction                          36
Results and discussion                35
dtype: int64

### Length of first/last sections (in words)

In [69]:
## Word counts of the fallback intro texts (first section, capped at 1000 words).
pd.DataFrame(
    {'': [len(text.split()) for text in intr_except_texts]}
).describe(percentiles=[.01, .25, .5, .75, .99])

Unnamed: 0,Unnamed: 1
count,7578.0
mean,517.781869
std,309.98248
min,2.0
1%,33.0
25%,249.0
50%,468.0
75%,780.0
99%,1000.0
max,1000.0


In [70]:
## Word counts of the fallback conclu texts (last section, capped at 1000 words).
pd.DataFrame(
    {'': [len(text.split()) for text in conclu_except_texts]}
).describe(percentiles=[.01, .25, .5, .75, .99])

Unnamed: 0,Unnamed: 1
count,10061.0
mean,692.710566
std,350.490648
min,1.0
1%,12.0
25%,361.0
50%,877.0
75%,1000.0
99%,1000.0
max,1000.0


In [46]:
# Word counts of the first section of every training document (no truncation).
emerald_df['sections'].apply(
    lambda secs: len(' '.join(secs[0]).split())
).describe(percentiles=[.01, .25, .5, .75, .99])

count    48024.000000
mean       683.633662
std        499.337976
min          0.000000
1%          17.000000
25%        388.000000
50%        603.000000
75%        859.000000
99%       2469.770000
max      16668.000000
Name: sections, dtype: float64

In [47]:
# Word counts of the last section of every training document (no truncation).
emerald_df['sections'].apply(
    lambda secs: len(' '.join(secs[-1]).split())
).describe()

count    48024.000000
mean       654.837810
std        565.278487
min          0.000000
25%        278.000000
50%        496.000000
75%        868.000000
max      16668.000000
Name: sections, dtype: float64

In [71]:
## Word counts of the extracted intro for all documents (matched + fallback).
pd.DataFrame(
    {'': [len(text.split()) for text in intr_text]}
).describe(percentiles=[.01, .25, .5, .75, .99])

Unnamed: 0,Unnamed: 1
count,48024.0
mean,696.012202
std,456.711925
min,2.0
1%,92.0
25%,416.0
50%,621.0
75%,876.0
99%,2363.0
max,9871.0


In [72]:
## Word counts of the extracted conclu for all documents (matched + fallback).
pd.DataFrame(
    {'': [len(text.split()) for text in conclu_text]}
).describe(percentiles=[.01, .25, .5, .75, .99])

Unnamed: 0,Unnamed: 1
count,48024.0
mean,647.443258
std,456.652343
min,1.0
1%,61.0
25%,317.0
50%,552.0
75%,940.0
99%,2158.77
max,8338.0


# Extract IC (Intro + Conclu) for all datasets

In [73]:
import os

# Input .jsonl splits live in DATA_DIR; extracted text files go to IC_DIR.
DATA_DIR = '/home/ubuntu/efs/emerald/'
IC_DIR = os.path.join(DATA_DIR, 'data_ic')

# 'train.jsonl' was an empty string here, which would make open() target the
# directory itself; the recorded run processed train.jsonl at this position.
filenames = ['dev.jsonl',
             'dev_rm_oa.jsonl',
             'test.jsonl',
             'test_oa.jsonl',
             'train.jsonl',
             'train_plus.jsonl',
             'train_rm_oa.jsonl']

os.makedirs(IC_DIR, exist_ok=True)

for fname in filenames:
    print(f"reading file {fname}...")
    with open(os.path.join(DATA_DIR, fname), 'r') as f:
        lines = f.readlines()
    df = lines2df(lines)

    intr_text, intr_except_titles, _ = find_intro(df, include_title=True)
    print(f"  {len(intr_except_titles)}/{df.shape[0]} ({len(intr_except_titles)/df.shape[0]:.3f}) documents can not find a intro section, use 1st section instead.")

    conclu_text, conclu_except_titles, _ = find_conclu(df, include_title=True)
    print(f"  {len(conclu_except_titles)}/{df.shape[0]} ({len(conclu_except_titles)/df.shape[0]:.3f}) documents can not find a conclu section, use last section instead.")

    # '<split>.jsonl' -> '<split>.intro' / '<split>.conclu', one document per line.
    stem = os.path.splitext(fname)[0]
    # The context managers flush and close both outputs before the next split.
    with open(os.path.join(IC_DIR, stem + '.intro'), 'w') as out_intro, \
            open(os.path.join(IC_DIR, stem + '.conclu'), 'w') as out_conclu:
        for i, c in zip(intr_text, conclu_text):
            # remove \n from ic so that each document stays on a single line
            i, c = i.replace('\n', ' '), c.replace('\n', ' ')
            out_intro.write(i + '\n')
            out_conclu.write(c + '\n')
    print(f"done, {len(intr_text)} lines.")

reading file dev.jsonl...


  0%|          | 0/6000 [00:00<?, ?it/s]

  915/6000 (0.152) documents can not find a intro section, use 1st section instead.


  0%|          | 0/6000 [00:00<?, ?it/s]

  1261/6000 (0.210) documents can not find a conclu section, use last section instead.
done, 6000 lines.
reading file dev_rm_oa.jsonl...


  0%|          | 0/6000 [00:00<?, ?it/s]

  930/6000 (0.155) documents can not find a intro section, use 1st section instead.


  0%|          | 0/6000 [00:00<?, ?it/s]

  1269/6000 (0.211) documents can not find a conclu section, use last section instead.
done, 6000 lines.
reading file test.jsonl...


  0%|          | 0/6000 [00:00<?, ?it/s]

  955/6000 (0.159) documents can not find a intro section, use 1st section instead.


  0%|          | 0/6000 [00:00<?, ?it/s]

  1273/6000 (0.212) documents can not find a conclu section, use last section instead.
done, 6000 lines.
reading file test_oa.jsonl...


  0%|          | 0/2243 [00:00<?, ?it/s]

  188/2243 (0.084) documents can not find a intro section, use 1st section instead.


  0%|          | 0/2243 [00:00<?, ?it/s]

  305/2243 (0.136) documents can not find a conclu section, use last section instead.
done, 2243 lines.
reading file train.jsonl...


  0%|          | 0/48024 [00:00<?, ?it/s]

  7578/48024 (0.158) documents can not find a intro section, use 1st section instead.


  0%|          | 0/48024 [00:00<?, ?it/s]

  10061/48024 (0.209) documents can not find a conclu section, use last section instead.
done, 48024 lines.
reading file train_plus.jsonl...


  0%|          | 0/92733 [00:00<?, ?it/s]

  12319/92733 (0.133) documents can not find a intro section, use 1st section instead.


  0%|          | 0/92733 [00:00<?, ?it/s]

  16201/92733 (0.175) documents can not find a conclu section, use last section instead.
done, 92733 lines.
reading file train_rm_oa.jsonl...


  0%|          | 0/46289 [00:01<?, ?it/s]

  7429/46289 (0.160) documents can not find a intro section, use 1st section instead.


  0%|          | 0/46289 [00:00<?, ?it/s]

  9831/46289 (0.212) documents can not find a conclu section, use last section instead.
done, 46289 lines.
