In [None]:
!git clone https://dev:dtKN5sX9We7pw1soPB19@gitlab.lrz.de/josh-o/leichte-sprache-corpus.git

# Compute Control Tokens

In [None]:
%%capture
!pip install transformers
!pip install sentencepiece
!pip install sacremoses
!pip install levenshtein

In [None]:
!python -m spacy download de_core_news_sm

In [None]:
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.de.vec

In [None]:
#compute char ratio
def char_ratio(row):
  """Return the character-length ratio simple/normal for one row.

  `row` must support lookup of the keys 'simple_phrase' and 'normal_phrase';
  the length of the former is divided by the length of the latter.
  """
  simple_length = len(row['simple_phrase'])
  normal_length = len(row['normal_phrase'])
  return simple_length / normal_length

# Store the per-row result of char_ratio in the 'nbchars' column.
# NOTE(review): no cell above this one creates `dataframe`; it has to be loaded
# before this cell runs — confirm the intended source file and cell order.
dataframe['nbchars'] = dataframe.apply(char_ratio, axis=1, result_type='reduce')

In [None]:
#compute Levenshtein similarity

import Levenshtein as lev

def lev_ratio(row):
  """Return `lev.ratio` of a row's normal phrase and simple phrase.

  `row` must support lookup of the keys 'normal_phrase' and 'simple_phrase';
  both values are passed unchanged to `lev.ratio`.
  """
  source_text = row['normal_phrase']
  target_text = row['simple_phrase']
  return lev.ratio(source_text, target_text)

# Store the per-row result of lev_ratio in the 'lev_sim' column.
# NOTE(review): relies on `dataframe` already existing in the kernel — confirm cell order.
dataframe['lev_sim'] = dataframe.apply(lev_ratio, axis=1, result_type='reduce')

In [None]:
#compute mean sentence depth

import spacy
import numpy as np

nlp = spacy.load("de_core_news_sm")

def get_dependency_tree_depth(sentence):
  """Return the mean dependency-tree depth over all sentences found in `sentence`.

  The text is parsed with the module-level `nlp` pipeline. The depth of one
  parsed sentence is the number of edges on the longest path from its root
  token down to a token without children. Returns 0 when the parse yields no
  sentences.
  """
  def get_subtree_depth(node):
      # A token without children is a leaf and has depth 0.
      child_depths = [get_subtree_depth(child) for child in node.children]
      if not child_depths:
          return 0
      return 1 + max(child_depths)

  parsed = nlp(sentence)
  tree_depths = [get_subtree_depth(sent.root) for sent in parsed.sents]

  if not tree_depths:
      return 0
  return np.mean(tree_depths)

def dep_ratio(row):
  """Return the ratio of mean dependency-tree depths for one row.

  The depth of 'normal_phrase' is divided by the depth of 'simple_phrase'.

  NOTE(review): `char_ratio` and `complexity_ratio` divide the simple value by
  the normal value; this function uses the opposite orientation — confirm that
  this is intended. A simple-phrase depth of 0 makes the division ill-defined.
  """
  normal_depth = get_dependency_tree_depth(row['normal_phrase'])
  simple_depth = get_dependency_tree_depth(row['simple_phrase'])
  return normal_depth / simple_depth

# Store the per-row result of dep_ratio in the 'dep' column.
# NOTE(review): relies on `dataframe` already existing in the kernel — confirm cell order.
dataframe['dep'] = dataframe.apply(dep_ratio, axis=1, result_type='reduce')

In [None]:
#compute complexity

from functools import lru_cache
import numpy as np
from pathlib import Path

from nltk.corpus import stopwords as nltk_stopwords
from string import punctuation

import nltk
nltk.download('stopwords')
stopwords = set(nltk_stopwords.words('german'))

#FASTTEXT_EMBEDDINGS_PATH = "/content/cc.de.300.vec.gz"
FASTTEXT_EMBEDDINGS_PATH = "/content/wiki.de.vec"

def remove_stopwords(text):
    """Return `text` without the words whose lowercase form is in `stopwords`.

    The text is split with `to_words`; the remaining words are joined with
    single spaces.
    """
    kept_words = []
    for word in to_words(text):
        if word.lower() in stopwords:
            continue
        kept_words.append(word)
    return ' '.join(kept_words)

def to_words(sentence):
    """Split `sentence` on runs of whitespace and return the list of words."""
    words = sentence.split()
    return words

def remove_punctuation_characters(text):
    """Return `text` with every character contained in `string.punctuation` removed.

    Only the characters listed in `string.punctuation` are stripped; every
    other character (letters, digits, whitespace, ...) is kept in order.
    """
    return ''.join(char for char in text if char not in punctuation)


@lru_cache(maxsize=1000)
def is_punctuation(word):
    """Return True when nothing is left of `word` after removing punctuation characters.

    Results are memoised for up to 1000 distinct words.
    """
    stripped = remove_punctuation_characters(word)
    return stripped == ''


@lru_cache(maxsize=100)
def remove_punctuation_tokens(text):
    """Return `text` without the words for which `is_punctuation` is true.

    The text is split with `to_words`; the remaining words are joined with
    single spaces. Results are memoised for up to 100 distinct inputs.
    """
    kept_words = filter(lambda token: not is_punctuation(token), to_words(text))
    return ' '.join(kept_words)

def count_lines(filepath):
    """Return the number of lines in the text file at `filepath`.

    The count is returned explicitly: a bare `return` here would hand None to
    `yield_lines`, which multiplies the result by `prop`.
    """
    with Path(filepath).open() as f:
        return sum(1 for _ in f)

def yield_lines(filepath, n_lines=float('inf'), prop=1):
    """Yield the lines of the file at `filepath` without their trailing newline.

    Args:
        filepath: path of the text file to read.
        n_lines: maximum number of lines to yield (unbounded by default).
        prop: when smaller than 1, `n_lines` must be left at its default and is
            replaced by `int(prop * count_lines(filepath))`.
    """
    if prop < 1:
        assert n_lines == float('inf')
        n_lines = int(prop * count_lines(filepath))
    with open(filepath, 'r') as handle:
        for line_index, raw_line in enumerate(handle):
            if line_index >= n_lines:
                break
            yield raw_line.rstrip('\n')

@lru_cache(maxsize=1)
def get_word2rank(vocab_size=np.inf):
    """Build a word -> rank mapping from the file at FASTTEXT_EMBEDDINGS_PATH.

    The first line of the file is skipped as a header. For every following
    line the text before the first space is taken as the word and mapped to its
    zero-based line position. At most `vocab_size` lines are read. The result
    of the most recent call is cached.
    """
    # TODO: Decrease vocab size or load from smaller file
    lines = yield_lines(FASTTEXT_EMBEDDINGS_PATH)
    next(lines)  # Skip the first line (header)
    ranks = {}
    for rank, entry in enumerate(lines):
        if rank + 1 > vocab_size:
            break
        ranks[entry.split(' ')[0]] = rank
    return ranks


def get_rank(word):
    """Return the rank of `word`, or the vocabulary size when the word is unknown."""
    word2rank = get_word2rank()
    return word2rank.get(word, len(word2rank))

def get_log_rank(word):
    """Return log(1 + rank) for `word`, with the rank taken from `get_rank`."""
    rank = get_rank(word)
    return np.log(1 + rank)

def get_lexical_complexity_score(sentence):
    """Return the 0.75 quantile of the log-ranks of the known content words in `sentence`.

    Punctuation-only tokens and stopwords are removed first; words missing
    from the `get_word2rank()` vocabulary are ignored. When no word is left the
    score is log(1 + vocabulary size).
    """
    content_words = to_words(remove_stopwords(remove_punctuation_tokens(sentence)))
    word2rank = get_word2rank()
    known_words = [word for word in content_words if word in word2rank]
    if not known_words:
        return np.log(1 + len(word2rank))  # TODO: This is completely arbitrary
    log_ranks = [get_log_rank(word) for word in known_words]
    return np.quantile(log_ranks, 0.75)


def complexity_ratio(row):
  """Return the lexical-complexity ratio simple/normal for one row.

  The score of 'simple_phrase' is divided by the score of 'normal_phrase',
  both computed with `get_lexical_complexity_score`.
  """
  simple_score = get_lexical_complexity_score(row['simple_phrase'])
  normal_score = get_lexical_complexity_score(row['normal_phrase'])
  return simple_score / normal_score

# Store the per-row result of complexity_ratio in the 'rank' column.
# NOTE(review): relies on `dataframe` already existing in the kernel — confirm cell order.
dataframe['rank'] = dataframe.apply(complexity_ratio, axis=1, result_type='reduce')

In [None]:
dataframe.to_csv("valid_tokens.csv", index=False)

In [None]:
dataframe.describe()

# Prepare MLSUM dataset

In [None]:
%%capture
!pip install datasets

In [None]:
import pandas as pd
from datasets import load_dataset

# Load the "de" configuration of the "mlsum" dataset.
dataset = load_dataset("mlsum", "de")

# Keep the 'text' field of the first 17100 training examples.
articles = dataset['train']['text'][:17100]
# Both columns receive the same texts, so every row pairs an article with itself.
data = {'normal_phrase' : articles, 'simple_phrase' : articles}

regular_df = pd.DataFrame(data)
regular_df.head()

# Prepare DeepL files
As DeepL requires .txt files with a maximum of 1 million characters, we split the CSV file into .txt files of at most 1 million characters each.

In [None]:
!mkdir full_text

In [None]:
import pandas as pd

#dataframe = pd.read_csv("/content/leichte-sprache-corpus/aligned/20min/20min_aligned_train.csv")
#dataframe = pd.read_csv("/content/leichte-sprache-corpus/aligned/20min/augmented/pure_simple_english_deepl.csv")
dataframe = pd.read_csv("/content/leichte-sprache-corpus/aligned/kurier/kurier_aligned_train.csv")

# Character budget per uploaded file, and the separator placed between samples
# so the translated files can be split back into individual samples.
MAX_CHARS_PER_PART = 1_000_000
SAMPLE_SEPARATOR = "\n<#>\n"

def split_into_parts(texts, max_chars=MAX_CHARS_PER_PART, separator_length=len(SAMPLE_SEPARATOR)):
  """Group `texts`, in order, into parts whose accounted length stays below `max_chars`.

  Every text is accounted with `len(text) + separator_length` characters. A
  text that exceeds the budget on its own is placed alone in a part instead of
  blocking the loop forever, so such a part may be longer than `max_chars`.

  Returns:
    A list of `(texts_in_part, accounted_length)` tuples.
  """
  parts = []
  current_items, total_length = [], 0
  for text in texts:
    cost = len(text) + separator_length
    # Close the running part when the next text would reach the budget.
    if current_items and total_length + cost >= max_chars:
      parts.append((current_items, total_length))
      current_items, total_length = [], 0
    current_items.append(text)
    total_length += cost
  if current_items:
    parts.append((current_items, total_length))
  return parts

full_text = list(dataframe['normal_phrase'].values)

for i, (current_items, total_length) in enumerate(split_into_parts(full_text)):
  full_text_part = SAMPLE_SEPARATOR.join(current_items)
  print(f"Part {i} has {total_length} chars")
  with open(f'full_text/part_{str(i).zfill(2)}.txt', 'w') as f:
    f.write(full_text_part)

In [None]:
!zip -r /content/full_text.zip /content/full_text

In [None]:
!pip install sentencepiece
!pip install evaluate
!pip install sacremoses
!pip install sacrebleu==2.3.1

## DeepL to CSV
Finally, the .txt files are parsed back into the original .csv format.

In [None]:
number_of_files = 10

PDF_PREFIX = "/content/"

# Collect the pieces in a list and join them once at the end instead of growing
# one string inside the loop.
text_pieces = []

parts = [str(i).zfill(2) for i in range(0, number_of_files)]
for number in parts:
  print(f"Process {number}")
  file_path = (PDF_PREFIX + f'part_{number} de.txt')
  # The context manager closes the file handle after reading.
  with open(file_path, 'r') as f:
    file_str = f.read()
  # Newlines are dropped, so the samples are delimited by "<#>" alone.
  text_pieces.append(file_str.replace("\n",""))
  # Separator between the last sample of this file and the first of the next.
  text_pieces.append("<#>")

full_text = "".join(text_pieces)

# The trailing separator produces one empty string at the end of the list.
en_texts = full_text.split("<#>")

In [None]:
import csv
import pandas as pd

dataframe = pd.read_csv("/content/leichte-sprache-corpus/aligned/kurier/kurier_aligned_train.csv")

# Open the output once for the whole loop instead of re-opening it in append
# mode for every row; newline='' is what the csv module expects for its files.
with open('inputs_back_deepl.csv', 'w', newline='') as myfile:
  myfile.write('normal_phrase,simple_phrase\n')
  csvwriter = csv.writer(myfile)
  for i, en_text in enumerate(en_texts):
    # Skip the empty string left behind by the trailing "<#>" separator.
    if en_text == "" and i == len(en_texts) -1:
      continue
    normal_sample = en_text.strip()
    # Rows are paired by position: text i belongs to row i of the aligned corpus.
    simple_sample = dataframe.iloc[i]['simple_phrase']
    csvwriter.writerow([normal_sample,simple_sample])

In [None]:
dataframe = pd.read_csv("inputs_back_deepl.csv")
dataframe.tail()

# Prepare Google Translate files
Same procedure as for DeepL, but Google Translate accepts Excel files, which is more convenient.

In [None]:
!mkdir full_text

In [None]:
import pandas as pd
#to excel
# Load the CSV and keep only the 'normal_phrase' column for the export.
dataframe = pd.read_csv("/content/leichte-sprache-corpus/aligned/kurier/augmented/inputs_english_deepl.csv")
cropped_df = dataframe.copy()['normal_phrase']

splits = 1

total_rows = len(cropped_df.index)
part_length = int(total_rows/splits)

def _write_excel_part(series, part_index):
  """Write `series` to full_text/output_<two-digit part_index>.xlsx on sheet 'Sheet_1'."""
  with pd.ExcelWriter(f'full_text/output_{str(part_index).zfill(2)}.xlsx') as writer:
    series.to_excel(writer, sheet_name='Sheet_1')

# Write `splits` parts of `part_length` rows each.
for i in range(splits):
  _write_excel_part(cropped_df[i*part_length:(i+1)*part_length], i)

# Rows that did not fit into the equally sized parts go into one extra file.
if total_rows > part_length*splits:
  _write_excel_part(cropped_df[splits*part_length:], splits)

## Google -> CSV

In [None]:
from pandas.io.formats.format import DataFrameFormatter
full_text = ""

number_of_files = 1

PDF_PREFIX = "/content/"

all_dataframes = []

# Read every translated workbook; its first column is used as the index.
parts = [str(i).zfill(2) for i in range(0, number_of_files)]
for number in parts:
  print(f"Process {number}")
  file_path = (PDF_PREFIX + f'output_{number}_de.xlsx')
  current_dataframe = pd.read_excel(file_path, index_col=0)  
  all_dataframes.append(current_dataframe)

# Stack all parts into one frame.
back_translated_df = pd.concat(all_dataframes)
dataframe = pd.read_csv("/content/leichte-sprache-corpus/aligned/kurier/kurier_aligned_train.csv")
# NOTE(review): the column is looked up as 'normale_phrase', not 'normal_phrase' —
# presumably the header was translated together with the content; verify against
# the downloaded workbooks.
dataframe['normal_phrase'] = back_translated_df['normale_phrase']
# NOTE(review): unlike the other to_csv calls in this notebook, index=False is
# not passed here, so the index is written as an extra column — confirm intended.
dataframe.to_csv('pure_simple_back_google.csv') 

#Pure Simple


In [None]:
# NOTE(review): this cell uses `pd` without importing it and therefore relies on
# an earlier cell having run `import pandas as pd`.
dataframe = pd.read_csv("/content/leichte-sprache-corpus/aligned/20min/20min_aligned_train.csv")
# Overwrite the source column with the simple text, so both columns are identical.
dataframe['normal_phrase'] = dataframe['simple_phrase']
dataframe.to_csv("pure_simple.csv", index=False)
dataframe.head()

#Add Simple Noise

In [None]:
!git clone https://github.com/valentinmace/noisy-text.git noisy_text

In [None]:
import os
os.chdir('/content/noisy_text')

In [None]:
from noise_functions import delete_random_token, replace_random_token, random_token_permutation

def add_noise(line):
  """Apply the three noise functions to `line` in sequence and return the result.

  Order: `delete_random_token` (probability 0.1), then `replace_random_token`
  (probability 0.1, filler "<mask>"), then `random_token_permutation` (_range 3).
  """
  after_deletion = delete_random_token(line, probability=0.1)
  after_replacement = replace_random_token(after_deletion, probability=0.1, filler_token="<mask>")
  return random_token_permutation(after_replacement, _range=3)

In [None]:
import pandas as pd
dataframe = pd.read_csv("/content/leichte-sprache-corpus/aligned/kurier/augmented/inputs_back_google.csv")
# Replace every source text by its noised version.
dataframe['normal_phrase'] = dataframe['normal_phrase'].apply(add_noise)
# The relative path is resolved against the working directory, which an earlier
# cell changes to /content/noisy_text via os.chdir.
dataframe.to_csv("../inputs_back_google_simple_noise.csv", index=False)
dataframe.tail()

#Add Bart Noise

In [None]:
!pip install transformers
!pip install sentencepiece

In [None]:
from transformers import MBartTokenizer

tokenizer =  MBartTokenizer.from_pretrained("josh-oo/modified-mbart")

In [None]:
#adapted from https://github.com/facebookresearch/fairseq/blob/58cc6cca18f15e6d56e3f60c959fe4f878960a60/fairseq/data/denoising_dataset.py#L257

import re
import numpy as np
import torch
import math
from random import randrange

def poison_distribution(poisson_lambda=3):
  """Return a categorical distribution over k = 0, 1, 2, ... with Poisson weights.

  The weight of k is exp(-lambda) * lambda**k / k!. Weights are generated for
  at most 128 values of k; generation stops after the first weight that drops
  below 1e-7 (that weight is still included).
  """
  exp_neg_lambda = math.exp(-poisson_lambda)
  lambda_power = 1
  factorial = 1
  weights = []
  k = 0
  while k < 128:
      weight = exp_neg_lambda * lambda_power / factorial
      weights.append(weight)
      # Prepare lambda**(k+1) and (k+1)! for the next step.
      lambda_power *= poisson_lambda
      factorial *= k + 1
      if weight < 0.0000001:
          break
      k += 1
  return torch.distributions.Categorical(torch.FloatTensor(weights))

def split_into_sentences(text):
  """Split `text` into sentences and return the non-empty pieces as a list.

  A split happens at a whitespace character that directly follows '.', '?' or
  '!', unless one of the negative lookbehinds of the pattern matches.
  """
  # Rules encoded in the pattern:
  # (?<!\w\.\w.)         -> no split after word char, dot, word char, any char (e.g. "z.B.")
  # (?<![0-9]\.)         -> no split after one digit followed by a dot ("9.")
  # (?<![0-9][0-9]\.)    -> no split after two digits followed by a dot ("18.")
  # (?<![A-Z]\.)         -> no split after an uppercase letter followed by a dot ("W.")
  # (?<![A-Z][a-z]\.)    -> no split after uppercase + lowercase letter followed by a dot ("Dr.")
  # (?<=\.|\?|\!)\s      -> split at whitespace preceded by a dot, question mark or exclamation mark
  # NOTE(review): an earlier version of this comment also listed (?<!\.\.)\s
  # ("no split after ..."), but the pattern below contains no such lookbehind.
  sentence_boundary = r"(?<!\w\.\w.)(?<![0-9]\.)(?<![0-9][0-9]\.)(?<![A-Z]\.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s"
  return [piece for piece in re.split(sentence_boundary, text) if piece]


def permute_sentences(line, p=1.0):
  """Shuffle sentences of `line` and return them joined with single spaces.

  The text is split with `split_into_sentences`. `ceil(n * p)` of the n
  sentence positions are picked at random and the sentences at those positions
  are permuted among themselves; all other sentences keep their position.
  """
  sentences = split_into_sentences(line)
  sentence_count = len(sentences)

  shuffle_count = math.ceil((sentence_count * 2 * p) / 2.0)
  chosen = torch.randperm(sentence_count)[:shuffle_count]
  order = torch.arange(0, sentence_count)
  # Redistribute the chosen positions among themselves in random order.
  order[chosen] = chosen[torch.randperm(shuffle_count)]

  return " ".join(sentences[position] for position in order)

def get_random_token():
  """Return the decoded text of a randomly drawn vocabulary entry.

  Ids are drawn uniformly from `range(tokenizer.vocab_size)` until the decoded
  text re-encodes to exactly 3 input ids (presumably one token plus two
  special tokens for this tokenizer — verify).
  """
  candidate = None
  while candidate is None:
    token_id = randrange(tokenizer.vocab_size)
    decoded = tokenizer.decode([token_id])
    encoded_ids = tokenizer([decoded])['input_ids'][0]
    if len(encoded_ids) == 3: #TODO adapt for other tokenizers
      candidate = decoded
  return candidate

def add_whole_word_mask(line, p=0.3, random_ratio=0.1):
  """Mask spans of space-separated words in `line` and return the noised text.

  `ceil(len(words) * p)` words are budgeted for masking. Span lengths are
  sampled from `poison_distribution(poisson_lambda=1.8)`. The word at each span
  start is replaced by "<mask>" (or, with probability `random_ratio`, by the
  result of `get_random_token()`); the following words covered by the span are
  dropped from the output.

  Args:
    line: input text; it is split on single spaces.
    p: fraction of the words used as masking budget.
    random_ratio: probability that a span start gets a random token instead of "<mask>".

  Returns:
    The remaining words joined with single spaces.
  """
  words = line.split(" ")
  num_to_mask = int(math.ceil(len(words) * p))
  num_inserts = 0
  if num_to_mask == 0:
      return line

  mask_span_distribution = poison_distribution(poisson_lambda=1.8)
  lengths = mask_span_distribution.sample(sample_shape=(num_to_mask,))
  # Make sure we have enough to mask
  cum_length = torch.cumsum(lengths, 0)
  while cum_length[-1] < num_to_mask:
      lengths = torch.cat(
          [
              lengths,
              mask_span_distribution.sample(sample_shape=(num_to_mask,)),
          ],
          dim=0,
      )
      cum_length = torch.cumsum(lengths, 0)

  # Trim to masking budget
  # Find the first span at which the cumulative length reaches the budget and
  # shorten that span so the total equals the budget exactly.
  i = 0
  while cum_length[i] < num_to_mask:
      i += 1
  lengths[i] = num_to_mask - (0 if i == 0 else cum_length[i - 1])
  # From here on num_to_mask counts spans, not words.
  num_to_mask = i + 1
  lengths = lengths[:num_to_mask]

  # Handle 0-length mask (inserts) separately
  # Zero-length spans are only counted and then discarded; nothing is inserted.
  lengths = lengths[lengths > 0]
  num_inserts = num_to_mask - lengths.size(0)
  num_to_mask -= num_inserts
  if num_to_mask == 0:
      return " ".join(words)

  assert (lengths > 0).all()
  # Random span start positions, one per remaining span.
  indices = torch.randperm(len(words))[:num_to_mask]
  # Per span: True means a random token is used instead of "<mask>".
  mask_random = torch.FloatTensor(num_to_mask).uniform_() < random_ratio
  # NOTE(review): source_length is only referenced by the commented-out line below.
  source_length = len(words)

  #to_keep = torch.ones(source_length, dtype=torch.bool)
  # keep index, but replace it with [MASK]
  # NOTE(review): index_to_remove is never used.
  index_to_remove = []
  to_keep = torch.ones(len(words), dtype=torch.bool)
  for i, index in enumerate(indices):
    words[index] = "<mask>"
    if mask_random[i]:
      words[index] = get_random_token()
    current_length = lengths[i]
    # Drop the words after the span start that the span covers, without
    # running past the end of the text.
    for shift in range(1,current_length):
      if index + shift < len(to_keep):
        to_keep[index + shift] = False

  assert len(lengths.size()) == 1
  assert lengths.size() == indices.size()
  
  # Keep only the words that were not covered by a span.
  words = np.array(words)[to_keep]

  return " ".join(words)

def crop_texts(line, max_length=512):
  """Truncate `line` to at most `max_length` tokenizer tokens and return it as a string."""
  all_tokens = tokenizer.tokenize(line)
  kept_tokens = all_tokens[:max_length]
  return tokenizer.convert_tokens_to_string(kept_tokens)

def add_bart_noise(line, permutation=1.0, masking=0.3, random_masking=0.1):
  """Apply sentence permutation and whole-word masking to `line`.

  Args:
    line: input text.
    permutation: passed as `p` to `permute_sentences`; the step is skipped unless > 0.
    masking: passed as `p` to `add_whole_word_mask`; the step is skipped unless > 0.
    random_masking: passed as `random_ratio` to `add_whole_word_mask`.

  Returns:
    The noised text.
  """
  noised = line
  if permutation > 0:
    noised = permute_sentences(noised, permutation)
  if masking > 0:
    noised = add_whole_word_mask(noised, masking, random_masking)
  return noised

In [None]:
import pandas as pd
dataframe = pd.read_csv("/content/leichte-sprache-corpus/aligned/20min/20min_aligned_train.csv")
# Optional cropping step, currently disabled.
#dataframe['normal_phrase'] = dataframe['normal_phrase'].apply(crop_texts)
# Replace every source text by its noised version (default add_bart_noise settings).
dataframe['normal_phrase'] = dataframe['normal_phrase'].apply(add_bart_noise)
dataframe.to_csv("inputs_back_bart_noise.csv", index=False)
dataframe.head()