In [1]:
import pandas as pd
import numpy as np
import re
from alignment_utils import get_components, count_alignment_types
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from os import path
from nltk.tokenize import RegexpTokenizer

# Newsela Sentence Merging
The provided Newsela corpus stores sentence alignments only as individual sentence pairs, so multi-sentence alignments are not reflected properly: e.g., for a 1-n alignment a training instance should contain 1 complex sentence -> n simple sentences. In addition, the aligned data comes without proper train/test/dev splits.
This notebook does the following:
1. Loads the aligned sentence pairs
2. Loads the unaligned train/test/dev splits
3. Assigns the original splits to the aligned sentence pairs
4. Filters some data and removes some readability levels like the authors of Newsela did
5. Merges aligned sentences

In [3]:
# Config
# Root directory of the raw Newsela-Auto data (absolute, machine-specific path).
newsela_base_dir = "/homes/julez/datasets-raw/newsela-auto/newsela-auto/"
# Intended output directory (same location the merged CSV is written to below).
output_dir = "datasets-raw/newsela_test/"
# Register tqdm with pandas so that `progress_apply` is available.
tqdm.pandas()

In [None]:
# Show full cell contents when displaying DataFrames (no column-width truncation).
pd.set_option('display.max_colwidth', None)

# Load Data
This loads the aligned data from the Newsela corpus. Some lines contain multiple tab characters, so the file is read and split manually.

In [None]:
# Each line becomes one record: surrounding whitespace is stripped, then the
# line is cut at every tab character.
aligned_pairs_path = path.join(newsela_base_dir, "all_data/aligned-sentence-pairs-all.tsv")
with open(aligned_pairs_path) as file:
    data = [row.strip().split('\t') for row in file]

In [None]:
# Names are assigned positionally to the tab-separated fields of each row.
col_names = ['simple_sent_id', 'simple_sent', 'complex_sent_id', 'complex_sent']
df_aligned = pd.DataFrame(data, columns=col_names)

In [None]:
# Inspect the raw aligned sentence pairs.
df_aligned

# Reproduce splits
Reproduce the train/test/dev splits from the ACL2020 paper. The original dataset doesn't provide sentence IDs or document IDs for the train/test/dev splits, only the sentences themselves. Therefore, we match the sentences of the splits against the aligned data to recover the split information.

In [None]:
def read_split_data(split: str, base_dir=None):
    """Load one ACL2020 split as a DataFrame of complex/simple sentence pairs.

    Reads ``ACL2020/<split>.src`` (complex side) and ``ACL2020/<split>.dst``
    (simple side); line *i* of one file is paired with line *i* of the other.

    Args:
        split: Name of the split. Used as the file stem and as the value of
            the ``split`` column (e.g. "train", "test", "valid").
        base_dir: Directory that contains the ``ACL2020`` folder. Defaults to
            the notebook-level ``newsela_base_dir``.

    Returns:
        DataFrame with the columns ``complex_sent``, ``simple_sent`` and
        ``split``; sentences are stripped of surrounding whitespace.

    Raises:
        ValueError: If the two files contain a different number of lines
            (raised by the DataFrame constructor).
    """
    root = newsela_base_dir if base_dir is None else base_dir

    def _read_lines(suffix: str) -> list:
        # Iterate over the file object rather than using str.splitlines():
        # splitlines() also breaks on characters such as U+2028 or form feed,
        # which would shift the src/dst line pairing if they occur in a sentence.
        with open(path.join(root, f"ACL2020/{split}.{suffix}"), encoding="utf-8") as file:
            return [line.strip() for line in file]

    return pd.DataFrame({
        "complex_sent": _read_lines("src"),
        "simple_sent": _read_lines("dst"),
        "split": split,
    })

In [None]:
# Load the three splits and stack them into a single frame of
# complex/simple sentence pairs labelled with their split.
df_train, df_test, df_valid = (
    read_split_data(split_name) for split_name in ("train", "test", "valid")
)

df_splits = pd.concat([df_train, df_test, df_valid], ignore_index=True)

In [None]:
# Inspect the combined train/test/valid pairs.
df_splits

Since the text in the ACL2020 dataset is pre-processed (punctuation is surrounded by whitespace), the sentences cannot be matched directly.
Solution: the complex and simple sentences are tokenized to remove punctuation, and the joined tokens are then hashed.
This is done for both the splits and the auto-aligned data.

In [None]:
# Keep only runs of word characters, so punctuation and spacing differences
# between the two data sources do not affect the match key.
tokenizer = RegexpTokenizer(r'\w+')
df_splits["complex_tokenized"] = df_splits["complex_sent"].progress_apply(tokenizer.tokenize)
df_splits["simple_tokenized"] = df_splits["simple_sent"].progress_apply(tokenizer.tokenize)
# Match key: hash of the space-joined complex tokens followed by the simple tokens.
df_splits["hash"] = [
    hash(" ".join(complex_tokens + simple_tokens))
    for complex_tokens, simple_tokens in tqdm(
        zip(df_splits["complex_tokenized"], df_splits["simple_tokenized"]),
        total=len(df_splits),
    )
]

In [None]:
# Same tokenization and match key as for the split data, applied to the aligned pairs.
df_aligned["complex_tokenized"] = df_aligned["complex_sent"].progress_apply(tokenizer.tokenize)
df_aligned["simple_tokenized"] = df_aligned["simple_sent"].progress_apply(tokenizer.tokenize)
df_aligned["hash"] = [
    hash(" ".join(complex_tokens + simple_tokens))
    for complex_tokens, simple_tokens in tqdm(
        zip(df_aligned["complex_tokenized"], df_aligned["simple_tokenized"]),
        total=len(df_aligned),
    )
]

In [None]:
# Inspect the aligned pairs with the added token and hash columns.
df_aligned

In [None]:
# Join aligned pairs with the split data on the match key (inner join, so
# aligned pairs without a matching split entry are dropped), discard the
# helper columns and the duplicate text columns contributed by df_splits,
# and restore the original names of the text columns from df_aligned.
helper_columns = [
    "complex_tokenized_x",
    "simple_tokenized_x",
    "complex_sent_y",
    "simple_sent_y",
    "complex_tokenized_y",
    "simple_tokenized_y"
]
df_auto_all = (
    df_aligned.merge(df_splits, on="hash")
    .drop(columns=helper_columns)
    .rename(columns={"simple_sent_x": "simple_sent", "complex_sent_x": "complex_sent"})
)

In [None]:
# Inspect the aligned pairs that were matched to a split.
df_auto_all

# Filter Data
The original paper filters out:
* Instances where Simple == Complex
* Some readability-level transitions (0-1, 1-2, 2-3)

In [None]:
# Set article ID
# The article ID is the leading <doc_id.lang> part of the sentence ID. The
# pattern is a raw string so that \w and \. reach the regex engine unchanged
# (a plain string literal triggers invalid-escape-sequence warnings).
df_auto_all['article_id'] = df_auto_all['simple_sent_id'].str.extract(
    r'^(\w*-?\w*\.?\w*)', expand=False
)

In [None]:
# 666k mentioned in paper
# Drop pairs whose simple and complex sentences are identical strings.
df_filtered = df_auto_all[df_auto_all['simple_sent'] != df_auto_all['complex_sent']]

In [None]:
# Inspect the pairs remaining after dropping identical simple/complex sentences.
df_filtered

In [None]:
# Set Reading Level Transition
# Sentence IDs have the form <doc_id.lang>-<level>-<par_id>-<sent_id>. The
# level is taken as the third field from the right (like the LEVEL helper
# below), because the number of hyphens inside doc_id varies; indexing from
# the left would pick the wrong field for such IDs.
simple_rl = df_filtered['simple_sent_id'].str.split('-').str[-3]
complex_rl = df_filtered['complex_sent_id'].str.split('-').str[-3]

# assign() returns a new frame, which avoids writing into a slice of df_auto_all.
df_filtered = df_filtered.assign(
    rl_transition=complex_rl.astype(str) + '-' + simple_rl.astype(str)
)

In [None]:
# Remove Readability Levels 0-1, 1-2, 2-3
# Keep 0-2, 0-3, 0-4, 1-3, 1-4, 2-4, 3-4
# 481k mentioned in paper -> 482k
filters = ['0-1', '1-2', '2-3']
transitions = ["0-2", "0-3", "0-4", "1-3", "1-4", "2-4", "3-4"]
# assign() returns a new frame, so the flag is not written into a slice.
df_filtered = (
    df_filtered[~df_filtered.rl_transition.isin(filters)]
    .assign(is_aligned=True)
)

In [None]:
# Test to get Text Simplification Phenomenon Stats
# NOTE(review): this overrides `transitions` with all ten level pairs. If the
# filtering cell above has already been run, df_filtered no longer contains
# "0-1", "1-2" and "2-3" rows, so those transitions select nothing downstream.
transitions = ["0-1", "0-2", "0-3", "0-4", "1-2", "1-3", "1-4", "2-3", "2-4", "3-4"]
df_filtered["is_aligned"] = True

In [None]:
# One sub-frame per reading-level transition, keyed "df_filtered_<transition>".
dataframes = {
    f"df_filtered_{transition}": df_filtered[df_filtered["rl_transition"] == transition]
    for transition in transitions
}

# Get Alignments for Sentences

In [None]:
# Sentence ID format: <doc_id.lang>-<level>-<par_id>-<sent_id>
# e.g. "brain-gender.en-0-4-0". For a transition such as "0-2" the first
# (lower) level is the complex side and the second one the simple side.

# Leading <doc_id.lang> part of a sentence ID (raw string: \w and \. are regex escapes).
DOC_ID = lambda x: re.search(r'^\w*-?\w*\.?\w*', x).group()
# Reading level of a sentence ID, as a string (third field from the right).
LEVEL = lambda x: x.split('-')[-3]
SENT_ID = lambda x: int(x.split('-')[-1]) # cast to int for numeric ordering
# True if sentence ID x belongs to reading level y (y given as a string).
IS_COMPLEX = lambda x, y: LEVEL(x) == y
# First run of digits in x, e.g. the complex level "0" of the key "df_filtered_0-2".
GET_COMPLEXITY_LVL = lambda x: re.search(r'\d+', x).group()

In [None]:
def _sentence_position(sent_id: str) -> tuple:
    """Return ``(par_id, sent_id)`` of a sentence ID as integers, for numeric ordering."""
    par_id, sent_idx = sent_id.split('-')[-2:]
    return int(par_id), int(sent_idx)


def get_corresponding_nodes(components: set, complexity_level) -> list:
    """Split every alignment component into its complex and simple sentence IDs.

    Args:
        components: Iterable of components, each an iterable of sentence IDs of
            the form ``<doc_id.lang>-<level>-<par_id>-<sent_id>``.
        complexity_level: Reading level (as a string) that marks the complex
            side; IDs with any other level are treated as simple.

    Returns:
        One dict per component with the keys ``"complex"`` and ``"simple"``,
        each holding the matching sentence IDs in document order.

    Raises:
        ValueError: If a sentence ID's paragraph or sentence field is not an integer.
    """
    output = []
    for component in components:
        alignment = {"complex": [], "simple": []}

        for key in component:
            side = "complex" if IS_COMPLEX(key, complexity_level) else "simple"
            alignment[side].append(key)

        # Order by paragraph and sentence number numerically. A plain string
        # sort would place paragraph 11 before paragraph 8 and sentence 10
        # before sentence 2, scrambling the merged text.
        alignment["complex"].sort(key=_sentence_position)
        alignment["simple"].sort(key=_sentence_position)

        output.append(alignment)
    return output

In [None]:
def merge_sents(nodes: list[dict], df):
    """Join the sentences of every alignment group into one complex/simple pair.

    Args:
        nodes: One dict per alignment group with the keys ``"complex"`` and
            ``"simple"``, each holding a list of sentence IDs in output order.
        df: Frame with the columns ``complex_sent_id``, ``complex_sent``,
            ``simple_sent_id``, ``simple_sent`` and ``split``.

    Returns:
        DataFrame with one row per node and the columns ``complex_sent_ids``,
        ``complex_sent``, ``simple_sent_ids``, ``simple_sent`` and ``split``.
        Sentences are joined with a single space. ``split`` is taken from the
        last simple sentence of the group, or is None if the group has no
        simple sentence.

    Raises:
        KeyError: If a sentence ID in ``nodes`` does not occur in ``df``.
    """
    # Build the ID -> value lookups once instead of scanning the whole frame
    # for every sentence ID. drop_duplicates keeps the first occurrence, which
    # is the row a boolean-mask lookup followed by .iloc[0] would return.
    complex_text = (
        df.drop_duplicates("complex_sent_id")
        .set_index("complex_sent_id")["complex_sent"]
        .to_dict()
    )
    simple_rows = df.drop_duplicates("simple_sent_id").set_index("simple_sent_id")
    simple_text = simple_rows["simple_sent"].to_dict()
    simple_split = simple_rows["split"].to_dict()

    d = {
        "complex_sent_ids": [],
        "complex_sent": [],
        "simple_sent_ids": [],
        "simple_sent": [],
        "split": [],
    }

    for node in tqdm(nodes):
        complex_ids = node["complex"]
        simple_ids = node["simple"]

        d["complex_sent_ids"].append(complex_ids)
        d["complex_sent"].append(" ".join(complex_text[sent_id] for sent_id in complex_ids))

        d["simple_sent_ids"].append(simple_ids)
        d["simple_sent"].append(" ".join(simple_text[sent_id] for sent_id in simple_ids))

        # Determine the split per node, so a group without simple sentences
        # never inherits the split of the previously processed group.
        d["split"].append(simple_split[simple_ids[-1]] if simple_ids else None)

    return pd.DataFrame(data=d)

In [None]:
# Trial run on a small sample: merge the first 1000 pairs of the 0-2 transition.
df_key = "df_filtered_0-2"
orig_df = dataframes[df_key].iloc[:1000]

alignments = list(orig_df[['complex_sent_id', 'simple_sent_id', 'is_aligned']].itertuples(index=False))
components = get_components(alignments)
nodes = get_corresponding_nodes(components, GET_COMPLEXITY_LVL(df_key))
# merge_sents already returns a fresh frame with a default index, so there is
# no need to concatenate it onto an empty DataFrame.
df_out = merge_sents(nodes, orig_df)

In [None]:
# This iterates over DataFrames for all Transitions (Readability Level 0-2, ...)
# Computes the alignments and gets corresponding sentences
# All aligned sentences are then added to one DataFrame
merged_frames = []
for df_key, orig_df in dataframes.items():
    print("---")
    print(df_key)
    alignments = list(orig_df[['complex_sent_id', 'simple_sent_id', 'is_aligned']].itertuples(index=False))
    components = get_components(alignments)
    nodes = get_corresponding_nodes(components, GET_COMPLEXITY_LVL(df_key))
    merged_frames.append(merge_sents(nodes, orig_df))

# Concatenate once at the end instead of growing df_out inside the loop, which
# re-copies all previously merged rows on every iteration.
df_out = pd.concat(merged_frames, ignore_index=True) if merged_frames else pd.DataFrame()

In [None]:
# Write all data
# Uses the configured output_dir instead of repeating the directory literal.
# The list-valued ID columns are stored as their string representation in the CSV.
df_out.to_csv(path.join(output_dir, "aligned-sentence-pairs-merged.csv"))

In [7]:
# NOTE(review): this reads from a different location than the one the merged CSV
# is written to above — presumably the file was copied there manually; confirm.
df = pd.read_csv("/homes/julez/ts-adapters/src/data/newsela/aligned-sentence-pairs-merged.csv", index_col=0)  

In [8]:
# Inspect the merged dataset loaded from disk.
df

Unnamed: 0,complex_sent_ids,complex_sent,simple_sent_ids,simple_sent,split
0,['brain-gender.en-0-4-0'],"To figure this out, the team — led by psychobi...",['brain-gender.en-2-8-0'],"To figure out more, the team did more research...",valid
1,['brain-gender.en-0-4-1'],"In other words, they looked for examples of me...",['brain-gender.en-2-8-1'],They looked for measurements that appeared to ...,valid
2,['brain-gender.en-0-4-2'],"Then, after identifying these elements, the re...",['brain-gender.en-2-8-2'],"After identifying the elements, researchers lo...",valid
3,['brain-gender.en-0-5-1'],"On these scans, they examined 116 separate reg...","['brain-gender.en-2-11-1', 'brain-gender.en-2-...","On their scans, researchers examined 116 separ...",valid
4,['brain-gender.en-0-5-2'],"In each case, the 281 scans were divided into ...",['brain-gender.en-2-11-4'],"In each case, the scans were divided into thre...",valid
...,...,...,...,...,...
299226,['bbking-obit.en-3-21-3'],When he found out that the men had been fighti...,"['bbking-obit.en-4-11-6', 'bbking-obit.en-4-11...",When he found out that the men had been fighti...,train
299227,['bbking-obit.en-3-22-0'],"King's guitar was stolen two years later, but ...",['bbking-obit.en-4-12-0'],"From then on, King named every guitar he owned...",train
299228,['bbking-obit.en-3-23-0'],B.B. King recorded more than 50 albums through...,['bbking-obit.en-4-21-0'],King recorded more than 50 albums.,train
299229,['bbking-obit.en-3-23-1'],He once said he had lost count of how many rec...,['bbking-obit.en-4-21-1'],"At one time, he said he could not remember how...",train


In [14]:
# Partition the merged pairs by the value of their split label.
split_labels = df["split"]
train = df.loc[split_labels.eq("train")]
test = df.loc[split_labels.eq("test")]
validation = df.loc[split_labels.eq("valid")]

In [15]:
# Persist each split as its own CSV (the DataFrame index is written as the first column).
train.to_csv("/homes/julez/ts-adapters/src/data/newsela/train.csv")
test.to_csv("/homes/julez/ts-adapters/src/data/newsela/test.csv")
validation.to_csv("/homes/julez/ts-adapters/src/data/newsela/valid.csv")