In [4]:
# !pip install sentence_transformers
# !pip install --upgrade jupyterlab_widgets
# !jupyter lab build
# !pip install python-Levenshtein
# !pip install ipywidgets --upgrade


In [5]:
from dotenv import load_dotenv
import os
from datasets import load_dataset, Dataset
import pandas as pd
import Levenshtein
import re

In [6]:
# Load environment variables (API keys) from .env file
load_dotenv()

# Hugging Face access token used when pushing the dataset to the Hub.
# The correctly spelled variable name is tried first; the historical misspelling
# ("HUGGINFACE_TOKEN") is still accepted so existing .env files keep working.
HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN") or os.getenv("HUGGINFACE_TOKEN")

# Backward-compatible alias for code that still uses the misspelled name.
HUGGINFACE_TOKEN = HUGGINGFACE_TOKEN

In [7]:
## Functions for preprocessing

# Word count helper (whitespace-separated tokens)
def length(text):
    """Return the number of whitespace-separated words in ``text``."""
    tokens = text.split()
    return len(tokens)

# Edit distance between a source sentence and its simplification
def calculate_levenshtein(prompt, result):
    """Return the Levenshtein distance between ``prompt`` and ``result``.

    Delegates to ``Levenshtein.distance`` with the arguments in the same order.
    """
    distance = Levenshtein.distance(prompt, result)
    return distance

def is_list_like(text, threshold=0.6):
    """Heuristically decide whether ``text`` looks like a list rather than prose.

    The check compares the number of punctuation/symbol characters (anything
    that is neither a word character nor whitespace) with the number of words
    (runs of word characters).

    Args:
        text: The string to inspect.
        threshold: Ratio of symbol characters to words above which the text is
            treated as list-like. Defaults to 0.6; adjust it based on your data.

    Returns:
        True if the symbol-to-word ratio exceeds ``threshold``, or if the text
        contains symbols but no words at all; False otherwise (including for
        empty or whitespace-only text).
    """
    # Characters that are neither word characters nor whitespace
    non_word_chars = re.findall(r'[^\w\s]', text)
    # Runs of word characters
    words = re.findall(r'\w+', text)

    if not words:
        # With zero words the ratio is undefined: text made up purely of
        # symbols is certainly not prose, while empty text is simply not a list.
        return bool(non_word_chars)

    return len(non_word_chars) / len(words) > threshold


In [8]:
## Load the training split and shuffle it with a fixed seed
ds = load_dataset(
    "UWV/Leesplank_NL_wikipedia_simplifications",
    split="train",
).shuffle(seed=42)

# Deduplicate in pandas: convert, drop exact duplicate rows, renumber the index
df = (
    ds.to_pandas()
    .drop_duplicates()
    .reset_index(drop=True)
)


Generating train split: 100%|██████████| 2867757/2867757 [00:23<00:00, 120666.91 examples/s]


In [9]:
# Word counts of the source sentence and of its simplification.
# NOTE: the column names keep the 'lenght' spelling because the filtering
# step further down selects on 'prompt_lenght' by that exact name.
df['prompt_lenght'] = df['prompt'].apply(length)
df['result_lenght'] = df['result'].apply(length)

# Edit distance between each prompt and its result.
# Walking the two columns in lockstep yields the same values as
# DataFrame.apply(axis=1) but avoids building a Series object for every row,
# which is very slow on a frame with millions of rows.
df['levenshtein_distance'] = [
    calculate_levenshtein(prompt, result)
    for prompt, result in zip(df['prompt'], df['result'])
]
print(df.shape)

(2771144, 5)


In [6]:
# Keep only prompts of at least 7 words, then order the rows by
# Levenshtein distance (smallest edit distance first)
has_enough_words = df['prompt_lenght'] >= 7
df = df[has_enough_words].sort_values(by='levenshtein_distance', ascending=True)

In [7]:
# Discard prompts that the heuristic flags as list-like
looks_like_list = df['prompt'].apply(is_list_like)
df = df[~looks_like_list]
print(df.shape)

(2694555, 5)


In [8]:
# Prepend the [S2S] marker to every prompt
prefixed_prompts = '[S2S] ' + df['prompt']
df['prompt'] = prefixed_prompts

In [9]:
# Keep only the two text columns; the helper feature columns are dropped here
output_columns = ["prompt", "result"]
df = df[output_columns]
print(df.head())

                                                    prompt  \
1945736  [S2S] Jan Waaijer is getrouwd en heeft twee ki...   
2001729       [S2S] Reinout III was een van zijn kinderen.   
1141980      [S2S] Hij is de zoon van André Van Den Bosch.   
2677840    [S2S] Beute is getrouwd en heeft drie kinderen.   
1624     [S2S] Herrera is getrouwd met Lourdes Betia Cu...   

                                                  result  
1945736  Jan Waaijer is getrouwd en heeft twee kinderen.  
2001729           Reinout III was een van zijn kinderen.  
1141980          Hij is de zoon van André Van Den Bosch.  
2677840        Beute is getrouwd en heeft drie kinderen.  
1624        Herrera is getrouwd met Lourdes Betia Cuico.  


In [11]:
# Convert back to a Hugging Face Dataset containing only the needed columns.
# After filtering and sorting, the DataFrame index is no longer a plain
# RangeIndex, so by default it would be stored as an extra
# "__index_level_0__" column; preserve_index=False keeps it out of the dataset.
ds = Dataset.from_pandas(df, preserve_index=False)


In [13]:
from huggingface_hub import login

# Read the token from the environment in this cell instead of relying on a
# notebook global, so the cell also works on a freshly restarted kernel.
# Both the correctly spelled variable and the legacy misspelling are accepted.
hf_token = os.getenv("HUGGINGFACE_TOKEN") or os.getenv("HUGGINFACE_TOKEN")
login(hf_token)

# Push the dataset to Hugging Face Hub
ds.push_to_hub("UWV/Leesplank_NL_wikipedia_simplifications_preprocessed")

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to C:\Users\ElinedeKinkelder\.cache\huggingface\token
Login successful


Uploading the dataset shards:   0%|          | 0/4 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/674 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/674 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/674 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/674 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/UWV/Leesplank_NL_wikipedia_simplifications_preprocessed/commit/b995946b7e8d2412a7a65d9a2e5bb7c67f25f85a', commit_message='Upload dataset', commit_description='', oid='b995946b7e8d2412a7a65d9a2e5bb7c67f25f85a', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/UWV/Leesplank_NL_wikipedia_simplifications_preprocessed', endpoint='https://huggingface.co', repo_type='dataset', repo_id='UWV/Leesplank_NL_wikipedia_simplifications_preprocessed'), pr_revision=None, pr_num=None)