In [11]:
import os
import json
import numpy as np

In [4]:
def find_section_title_like(j, cuewords):
    """Collect every section whose title contains one of ``cuewords``.

    Titles are lowercased before the substring check, so cue words are
    expected in lowercase. Each matching section contributes a ``<|title|>``
    marker (the lowercased, stripped title) followed by its paragraphs joined
    with single spaces.

    Args:
        j: Record with parallel ``'section_names'`` and ``'sections'`` lists;
            each entry of ``'sections'`` is an iterable of paragraph strings.
        cuewords: Substrings to look for in the lowercased section titles.

    Returns:
        All matches joined by single spaces, or ``''`` when nothing matches.
    """
    names, bodies = j['section_names'], j['sections']
    pieces = []
    for raw_name, paragraphs in zip(names, bodies):
        lowered = raw_name.lower()
        # One hit is enough: a section is emitted at most once even if several
        # cue words occur in its title.
        if any(cue in lowered for cue in cuewords):
            pieces += [f"<|{lowered.strip()}|>", ' '.join(paragraphs)]
    return ' '.join(pieces)

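# Cue words (matched as substrings of the lowercased section title) noted for each source field.
# Only the introduction and conclusion lists are used by get_intro_text / get_conclu_text below.
#   'source_introduction': ['intro', 'purpose']
#   'source_design':       ['design', 'method', 'approach']
#   'source_result':       ['result', 'find', 'discuss', 'analy']
#   'source_conclusion':   ['conclu', 'future']
#   'source_related':      ['related work', 'literat', 'background']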

# find intro section, select first section if intro doesn't exist
def get_intro_text(j):
    """Return the introduction-like text of record ``j``.

    Sections whose title contains ``'intro'`` or ``'purpose'`` are preferred.
    When none match, the first section is used instead: its title (unless it
    is the ``'__NO_TITLE__'`` placeholder) followed by its paragraphs.

    Returns:
        The stripped text, or ``''`` when the record has no sections.
    """
    matched = find_section_title_like(j, ['intro', 'purpose']).strip()
    if matched:
        return matched

    # Fallback: take whatever comes first in the document.
    names, bodies = j['section_names'], j['sections']
    fallback = []
    if names and names[0] != '__NO_TITLE__':
        fallback.append(names[0])
    if bodies:
        fallback.append(' '.join(bodies[0]))
    return ' '.join(fallback).strip()


def get_conclu_text(j):
    """Return the conclusion-like text of record ``j``.

    Sections whose title contains ``'conclu'`` or ``'future'`` are preferred.
    When none match, the last section is used instead: its title (unless it
    is the ``'__NO_TITLE__'`` placeholder) followed by its paragraphs.

    Returns:
        The stripped text, or ``''`` when the record has no sections.
    """
    matched = find_section_title_like(j, ['conclu', 'future']).strip()
    if matched:
        return matched

    # Fallback: take whatever comes last in the document.
    names, bodies = j['section_names'], j['sections']
    fallback = []
    if names and names[-1] != '__NO_TITLE__':
        fallback.append(names[-1])
    if bodies:
        fallback.append(' '.join(bodies[-1]))
    return ' '.join(fallback).strip()

In [12]:
def get_source_IC(j):
    """Build a single-line source string from introduction plus conclusion.

    The two texts are joined with one space, newlines are replaced by spaces
    and surrounding whitespace is removed.
    """
    intro = get_intro_text(j)
    conclusion = get_conclu_text(j)
    combined = intro + ' ' + conclusion
    return combined.replace('\n', ' ').strip()


def get_source_str(j):
    """Flatten the whole document of record ``j`` into one line.

    Every section with a non-empty title other than ``'__NO_TITLE__'`` is
    introduced by a ``<|title|>`` marker (title stripped, case kept), and each
    paragraph is followed by a ``<|par|>`` marker.

    Returns:
        The space-joined tokens with newlines replaced by spaces, stripped.
    """
    tokens = []
    for heading, paragraphs in zip(j['section_names'], j['sections']):
        has_real_title = bool(heading) and heading != '__NO_TITLE__'
        if has_real_title:
            tokens.append(f"<|{heading.strip()}|>")
        for paragraph in paragraphs:
            tokens += [paragraph, "<|par|>"]
    flat = ' '.join(tokens)
    return flat.replace('\n', ' ').strip()


def get_target_str(j):
    """Build the combined single-line target from the structured abstract.

    The four abstract parts are emitted in a fixed order, each introduced by a
    ``<|Heading|>`` marker where the heading is the text before the first
    ``'/'`` of the full heading (e.g. ``'Design/methodology/approach'`` becomes
    ``'Design'``).

    Raises:
        KeyError: if one of the four expected headings is missing from
            ``j['abstract_sections_names']``.
    """
    by_title = dict(zip(j['abstract_sections_names'], j['abstract_sections']))
    wanted = ('Purpose', 'Design/methodology/approach', 'Findings', 'Originality/value')

    parts = []
    for full_title in wanted:
        paragraphs = by_title[full_title]
        short_title = full_title.split('/')[0]
        parts.append(f"<|{short_title}|>")
        parts.extend(paragraphs)
    return ' '.join(parts).replace('\n', ' ').strip()


def get_target_sections(j):
    """Return the structured abstract as ``(heading, text)`` pairs.

    The four abstract parts come back in a fixed order. ``heading`` is the
    text before the first ``'/'`` of the full heading, and ``text`` is the
    part's paragraphs joined by spaces with newlines replaced by spaces and
    surrounding whitespace removed.

    Raises:
        KeyError: if one of the four expected headings is missing from
            ``j['abstract_sections_names']``.
    """
    by_title = dict(zip(j['abstract_sections_names'], j['abstract_sections']))
    wanted = ('Purpose', 'Design/methodology/approach', 'Findings', 'Originality/value')
    return [
        (
            full_title.split('/')[0],
            ' '.join(by_title[full_title]).replace('\n', ' ').strip(),
        )
        for full_title in wanted
    ]


def generate_combined_data(src_dir, dst_dir, func_get_source_str=get_source_str):
    """Write one source/target line pair per record for each data split.

    For every split in ``('train', 'dev', 'test')`` this reads
    ``<src_dir>/<split>.jsonl`` and writes ``<dst_dir>/<split>.source`` and
    ``<dst_dir>/<split>.target``. Line ``i`` of both output files comes from
    the same input record; the target is the combined abstract produced by
    ``get_target_str``.

    Args:
        src_dir: Directory containing the ``*.jsonl`` splits.
        dst_dir: Output directory; created if it does not exist yet.
        func_get_source_str: Callable mapping a parsed record to its
            single-line source string.
    """
    def jsonl2data(src, dst):
        # Context managers guarantee that all three handles are flushed and
        # closed, even when a record fails to parse or convert.
        with open(src, encoding='utf8') as records, \
                open(f"{dst}.source", 'w', encoding='utf8') as source_out, \
                open(f"{dst}.target", 'w', encoding='utf8') as target_out:
            for line in records:
                line = line.strip()
                if not line:
                    # Tolerate blank lines instead of failing in json.loads.
                    continue
                j = json.loads(line)
                # Compute both sides before writing so a failing record never
                # leaves the two files misaligned.
                s = func_get_source_str(j)
                t = get_target_str(j)
                source_out.write(s + '\n')
                target_out.write(t + '\n')

    if dst_dir:
        os.makedirs(dst_dir, exist_ok=True)
    for fname in ('train', 'dev', 'test'):
        src = os.path.join(src_dir, f"{fname}.jsonl")
        dst = os.path.join(dst_dir, fname)
        print(f"writing from \n  :{src} to \n  :{dst}")
        jsonl2data(src, dst)
        
        
def generate_seperate_data(src_dir, dst_dir, func_get_source_str=get_source_str):
    """Write one source/target line pair per abstract part for each data split.

    For every split in ``('train', 'dev', 'test')`` this reads
    ``<src_dir>/<split>.jsonl`` and writes ``<dst_dir>/<split>.source`` and
    ``<dst_dir>/<split>.target``. Each record yields one line pair per entry
    returned by ``get_target_sections``: the source line is the record's
    source string prefixed with ``<|Heading|> `` and the target line is that
    part's text.

    Args:
        src_dir: Directory containing the ``*.jsonl`` splits.
        dst_dir: Output directory; created if it does not exist yet.
        func_get_source_str: Callable mapping a parsed record to its
            single-line source string.
    """
    def jsonl2data(src, dst):
        # Context managers guarantee that all three handles are flushed and
        # closed, even when a record fails to parse or convert.
        with open(src, encoding='utf8') as records, \
                open(f"{dst}.source", 'w', encoding='utf8') as source_out, \
                open(f"{dst}.target", 'w', encoding='utf8') as target_out:
            for line in records:
                line = line.strip()
                if not line:
                    # Tolerate blank lines instead of failing in json.loads.
                    continue
                j = json.loads(line)
                # Compute everything for the record before writing so a
                # failing record never leaves the two files misaligned.
                s = func_get_source_str(j)
                ts = get_target_sections(j)
                for title, t in ts:
                    source_out.write(f"<|{title}|> " + s + '\n')
                    target_out.write(t + '\n')

    if dst_dir:
        os.makedirs(dst_dir, exist_ok=True)
    for fname in ('train', 'dev', 'test'):
        src = os.path.join(src_dir, f"{fname}.jsonl")
        dst = os.path.join(dst_dir, fname)
        print(f"writing from \n  :{src} to \n  :{dst}")
        jsonl2data(src, dst)


In [7]:
# Create the output directories. exist_ok=True keeps this cell re-runnable
# (a plain `mkdir` errors once the directory already exists).
os.makedirs('/home/ubuntu/efs/emerald/data_ic_to_combined_target/', exist_ok=True)
os.makedirs('/home/ubuntu/efs/emerald/data_ic_to_seperate_target/', exist_ok=True)

In [13]:
# Build both datasets using introduction + conclusion as the source text.
generate_combined_data(
    '/home/ubuntu/efs/emerald/',
    '/home/ubuntu/efs/emerald/data_ic_to_combined_target/',
    func_get_source_str=get_source_IC,
)
generate_seperate_data(
    '/home/ubuntu/efs/emerald/',
    '/home/ubuntu/efs/emerald/data_ic_to_seperate_target/',
    func_get_source_str=get_source_IC,
)

writing from 
  :/home/ubuntu/efs/emerald/train.jsonl to 
  :/home/ubuntu/efs/emerald/data_ic_to_combined_target/train
writing from 
  :/home/ubuntu/efs/emerald/dev.jsonl to 
  :/home/ubuntu/efs/emerald/data_ic_to_combined_target/dev
writing from 
  :/home/ubuntu/efs/emerald/test.jsonl to 
  :/home/ubuntu/efs/emerald/data_ic_to_combined_target/test
writing from 
  :/home/ubuntu/efs/emerald/train.jsonl to 
  :/home/ubuntu/efs/emerald/data_ic_to_seperate_target/train
writing from 
  :/home/ubuntu/efs/emerald/dev.jsonl to 
  :/home/ubuntu/efs/emerald/data_ic_to_seperate_target/dev
writing from 
  :/home/ubuntu/efs/emerald/test.jsonl to 
  :/home/ubuntu/efs/emerald/data_ic_to_seperate_target/test


In [3]:
import pandas as pd

# Length of every generated source line, counted as the number of pieces
# produced by splitting on single spaces, pooled over all three splits.
l = []
for fn in ('train', 'dev', 'test'):
    # Read with the same encoding the .source files were written with, rather
    # than the platform default.
    with open('/home/ubuntu/efs/emerald/data_ic_to_combined_target/{}.source'.format(fn), 'r', encoding='utf8') as f:
        l.extend(len(line.split(' ')) for line in f)
pd.DataFrame({'l': l}).describe()

Unnamed: 0,l
count,60024.0
mean,1425.079951
std,872.231249
min,27.0
25%,880.0
50%,1261.0
75%,1778.0
max,33336.0


In [4]:
# generate_combined_data('/home/ubuntu/efs/emerald/', '/home/ubuntu/efs/emerald/data_combined_target/')

writing from 
  :/home/ubuntu/efs/emerald/train.jsonl to 
  :/home/ubuntu/efs/emerald/data_combined_target/train
writing from 
  :/home/ubuntu/efs/emerald/dev.jsonl to 
  :/home/ubuntu/efs/emerald/data_combined_target/dev
writing from 
  :/home/ubuntu/efs/emerald/test.jsonl to 
  :/home/ubuntu/efs/emerald/data_combined_target/test


In [5]:
# generate_seperate_data('/home/ubuntu/efs/emerald/', '/home/ubuntu/efs/emerald/data_seperate_target/')

writing from 
  :/home/ubuntu/efs/emerald/train.jsonl to 
  :/home/ubuntu/efs/emerald/data_seperate_target/train
writing from 
  :/home/ubuntu/efs/emerald/dev.jsonl to 
  :/home/ubuntu/efs/emerald/data_seperate_target/dev
writing from 
  :/home/ubuntu/efs/emerald/test.jsonl to 
  :/home/ubuntu/efs/emerald/data_seperate_target/test
