 # Human-annotated portion of dataset (~5k samples)

In [3]:
import json, random
import utils_diff

# Load the human-annotated training split.
# See also: data/swipe_val.json, data/swipe_test_id.json, data/swipe_test_ood.json for the
# validation, in-domain test, and out-of-domain test sets.
# The encoding is passed explicitly so decoding does not depend on the platform's default
# locale encoding (the page text contains non-ASCII characters such as non-breaking spaces).
with open("data/swipe_train.json", "r", encoding="utf-8") as f:
    swipe_train = json.load(f)

# Pick one random sample to walk through in the following cells.
sample = random.choice(swipe_train)

In [4]:
# Report which English Wikipedia / Simple Wikipedia revisions are paired in this sample.
pairing_fields = (sample["r_page"], sample["r_revid"], sample["s_page"], sample["s_revid"])
print("Page Pairing: [En Wiki: %s; Revision ID: %s] [Simple Wiki: %s; Revision ID: %s]" % pairing_fields)

# The raw page texts are stored under sample["r_content"] (En page)
# and sample["s_content"] (Simple page).

# Stored edit operations. Can be recreated through:
# `utils_diff.get_edit_operations(sample["r_content"], sample["s_content"], split_replace=True, split_sentences=True)`
edits = sample["edits"]

# Header and legend first, then the colored diff itself.
print("---- Here is the edit sequence to go from the original page to the simplified page ----")
print("Legend: Green text is added in the simple page, red text is deleted from the original page")
print("---")
colored_diff = utils_diff.make_colored_text(sample["r_content"], sample["s_content"])
print(colored_diff)

Page Pairing: [En Wiki: Epsom; Revision ID: 1006880447] [Simple Wiki: Epsom; Revision ID: 3865189]
---- Here is the edit sequence to go from the original page to the simplified page ----
Legend: Green text is added in the simple page, red text is deleted from the original page
---
Epsom is[1;31mthe principal[0m[1;32ma[0m town[1;31mof the Borough of Epsom and Ewell[0m in[1;32mnorthern[0m Surrey[1;31m, England, approximately 13.5 mi(21.7 km) south of Charing Cross and 4.75 mi(7.64 km) northeast of Leatherhead[0m. The town is[1;31mrecorded as Ebbesham in[0m[1;32mknown for its race course and[0m the[1;31m13th century and its name probably derives from that of a Saxon landowner[0m[1;32mhorse race held there[0m.[1;31mFounded as a spring line settlement where the permeable chalk of the North Downs meets the impermeable London Clay, Epsom developed as a spa town in the Georgian period. The mineral waters were found to be rich in magnesium sulphate, which became known as Epso

In [9]:
# Each annotation describes one edit group: `opis` lists the operation indices (opi) of the
# edits that belong to the group, and `category` is the label assigned to that group.
sample["annotations"]

[{'gi': 0, 'opis': [1], 'category': 'syntactic_generic'},
 {'opis': [2], 'category': 'semantic_deletion'},
 {'opis': [4], 'category': 'semantic_deletion'},
 {'gi': 3, 'opis': [6], 'category': 'semantic_elaboration_background'},
 {'opis': [8, 9, 10, 11, 12], 'category': 'semantic_deletion'},
 {'gi': 5, 'opis': [14, 17], 'category': 'nonsim_extraneous_information'},
 {'gi': 6,
  'opis': [15, 18, 20, 21, 22, 23],
  'category': 'semantic_elaboration_generic'},
 {'opis': [24], 'category': 'nonsim_noise_deletion'}]

In [7]:
# To visualize which edits each annotated group corresponds to, use the utils_vis module
from utils_vis import visualize_edit_groups

visualize_edit_groups(sample["r_content"], sample["s_content"], sample["annotations"])

There are a total of 8 identified groups.
[syntactic_generic             ] Epsom is[1;32ma[0m [...]
[semantic_deletion             ] [...] [1;31mthe principal[0m town [...]
[semantic_deletion             ] [...] town[1;31mof the Borough of Epsom and Ewell[0m in [...]
[semantic_elaboration_background] [...] in[1;32mnorthern[0m Surrey [...]
[semantic_deletion             ] [...] Surrey[1;31m, England, approximately 13.[0m[1;31m5 mi(21.[0m[1;31m7 km) south of Charing Cross and 4.[0m[1;31m75 mi(7.[0m[1;31m64 km) northeast of Leatherhead[0m. The town is [...]
[nonsim_extraneous_information ] [...] . The town is[1;32mknown for its race course and[0m[1;31mrecorded as Ebbesham in[0m the[1;32mhorse race held there[0m [...]
[semantic_elaboration_generic  ] [...] [1;31mrecorded as Ebbesham in[0m the[1;32mhorse race held there[0m[1;31m13th century and its name probably derives from that of a Saxon landowner[0m.[1;31mFounded as a spring line settlement where the perme

In [8]:
# Rebuild the edit operations from the two raw page texts, then list them one per line.
edits = utils_diff.get_edit_operations(
    sample["r_content"],
    sample["s_content"],
    split_replace=True,
    split_sentences=True,
)
for operation in edits:
    print(operation)

{'type': 'equal', 'text': 'Epsom is', 'N_words': 2}
{'type': 'insert', 'insert': 'a', 'N_words': 1}
{'type': 'delete', 'delete': 'the principal', 'N_words': 2}
{'type': 'equal', 'text': 'town', 'N_words': 1}
{'type': 'delete', 'delete': 'of the Borough of Epsom and Ewell', 'N_words': 7}
{'type': 'equal', 'text': 'in', 'N_words': 1}
{'type': 'insert', 'insert': 'northern', 'N_words': 1}
{'type': 'equal', 'text': 'Surrey', 'N_words': 1}
{'type': 'delete', 'delete': ', England, approximately 13.', 'N_words': 4}
{'type': 'delete', 'delete': '5\xa0mi(21.', 'N_words': 1}
{'type': 'delete', 'delete': '7\xa0km) south of Charing Cross and 4.', 'N_words': 7}
{'type': 'delete', 'delete': '75\xa0mi(7.', 'N_words': 1}
{'type': 'delete', 'delete': '64\xa0km) northeast of Leatherhead', 'N_words': 4}
{'type': 'equal', 'text': '. The town is', 'N_words': 4}
{'type': 'insert', 'insert': 'known for its race course and', 'N_words': 6}
{'type': 'delete', 'delete': 'recorded as Ebbesham in', 'N_words': 4}
{

In [17]:
# For every annotated edit group, print the text covered by the group on each side
# (original page = "before", simplified page = "after") together with the matching
# token ranges in the tokenized pages.

# The token lists do not depend on the edit group, so tokenize both pages once
# instead of once per loop iteration.
r_tokens = utils_diff.tokenize(sample['r_content'])
s_tokens = utils_diff.tokenize(sample['s_content'])

for edit_group in sample["annotations"]:
    opis = edit_group['opis']
    # The group is treated as the contiguous span of operations from its first to its
    # last operation index, so operations lying in between are included as well.
    min_opi, max_opi = min(opis), max(opis)

    # Count the tokens that precede the span on each side: deletions only exist in the
    # original page, insertions only in the simplified page, anything else in both.
    before_N_tokens, after_N_tokens = 0, 0
    for opi in range(min_opi):
        edit = edits[opi]
        N_tokens = edit['N_words']
        if edit['type'] == 'delete':
            before_N_tokens += N_tokens
        elif edit['type'] == 'insert':
            after_N_tokens += N_tokens
        else:
            before_N_tokens += N_tokens
            after_N_tokens += N_tokens

    # Ranges are [start, end) token offsets; the end is extended while walking the span.
    before_token_range = [before_N_tokens, before_N_tokens]
    after_token_range = [after_N_tokens, after_N_tokens]

    # NOTE(review): text fragments are concatenated without any separator, so fragments
    # from consecutive operations run together — confirm whether a space is wanted here.
    before, after = "", ""
    for opi in range(min_opi, max_opi + 1):
        edit = edits[opi]
        N_tokens = edit['N_words']
        if edit['type'] == 'delete':
            before += edit['delete']
            before_token_range[1] += N_tokens
        elif edit['type'] == 'insert':
            after += edit['insert']
            after_token_range[1] += N_tokens
        else:
            before += edit['text']
            after += edit['text']
            before_token_range[1] += N_tokens
            after_token_range[1] += N_tokens

    print("\n>>===========================")
    print("before:", before)
    print(" after:", after)
    print("before_token_range:", before_token_range)
    print(r_tokens[before_token_range[0]: before_token_range[1]])
    print("after_token_range:", after_token_range)
    print(s_tokens[after_token_range[0]: after_token_range[1]])

    print("===========================<<\n")

    

opi 1
{'type': 'insert', 'insert': 'a', 'N_words': 1}
N_tokens 1

before: 
 after: a
before_token_range: [2, 2]
[]
after_token_range: [2, 3]
['a']

opi 2
{'type': 'delete', 'delete': 'the principal', 'N_words': 2}
N_tokens 2

before: the principal
 after: 
before_token_range: [2, 4]
['the', 'principal']
after_token_range: [3, 3]
[]

opi 4
{'type': 'delete', 'delete': 'of the Borough of Epsom and Ewell', 'N_words': 7}
N_tokens 7

before: of the Borough of Epsom and Ewell
 after: 
before_token_range: [5, 12]
['of', 'the', 'Borough', 'of', 'Epsom', 'and', 'Ewell']
after_token_range: [4, 4]
[]

opi 6
{'type': 'insert', 'insert': 'northern', 'N_words': 1}
N_tokens 1

before: 
 after: northern
before_token_range: [13, 13]
[]
after_token_range: [5, 6]
['northern']

opi 8
{'type': 'delete', 'delete': ', England, approximately 13.', 'N_words': 4}
N_tokens 4
opi 9
{'type': 'delete', 'delete': '5\xa0mi(21.', 'N_words': 1}
N_tokens 1
opi 10
{'type': 'delete', 'delete': '7\xa0km) south of Charing C

# Entire dataset (~140k samples)

In [63]:
from collections import Counter
import json, random

# Load the full dataset. The encoding is passed explicitly so decoding does not depend
# on the platform's default locale encoding.
with open("data/swipe_full.json", "r", encoding="utf-8") as f:
    swipe_full = json.load(f)

print("Size of dataset: %d" % (len(swipe_full)))

# Show one randomly chosen sample.
sample = random.choice(swipe_full)
print("Example sample. Input (English Wikipedia) -> Output (Simple Wikipedia)")
print(sample)

Size of dataset: 143359
Example sample. Input (English Wikipedia) -> Output (Simple Wikipedia)
{'input': "Burnham-on-Sea is a seaside town in Somerset, England, at the mouth of the River Parrett, upon Bridgwater Bay. Burnham was a small fishing village until the late 18th century when it began to grow because of its popularity as a seaside resort.\nBurnham-on-Sea forms part of the parish of Burnham-on-Sea and Highbridge and shares a town council with its neighbouring small market town of Highbridge. According to the 2011 census the population of the parish (including Highbridge) was 19,576, of which the most populous wards 'Burnham Central' and 'Burnham North'; totalled 13,601.\nBurnham-on-Sea is most famous for its low lighthouse. The now-decommissioned lighthouse was built in 1832 and is a Grade-II listed building. The lighthouse is famous for its red and white striped facade.\nThe position of the town on the edge of the Somerset Levels and moors where they meet the Bristol Channel, 

In [64]:
import utils_diff

# Show the edits that turn the input into the output as colored text
print(utils_diff.make_colored_text(sample["input"], sample["output"]))

Burnham-on-Sea is a[1;31mseaside[0m[1;32msmall[0m town in[1;32mthe county of[0m Somerset[1;31m,[0m[1;32min[0m England[1;31m, at[0m[1;32mon[0m the mouth of the River Parrett[1;31m, upon[0m[1;32mat[0m Bridgwater Bay. Burnham was a small fishing village until the[1;31mlate 18th[0m[1;32mlate-18th[0m century when it[1;31mbegan to grow because of its popularity[0m[1;32mbecame popular[0m as a seaside resort.[1;31mBurnham-on-Sea forms part of[0m[1;32mThis made[0m the[1;31mparish of Burnham-on-Sea and Highbridge and shares[0m[1;32mvillage grow larger into[0m a town[1;31mcouncil with its neighbouring small market town of Highbridge[0m.[1;31mAccording to the 2011 census the population of the parish(including Highbridge) was 19,576, of which the most populous wards'Burnham Central' and'Burnham North'; totalled 13,601. Burnham-on-Sea is most famous for its low lighthouse. The now-decommissioned lighthouse was built in 1832 and is a Grade-II listed building. The 