GED (grammatical error detection) dataset

In [1]:
import json
import random
from collections import defaultdict
from sklearn.model_selection import train_test_split

In [2]:
def jsonl_to_token_dataset(input_file, output_prefix, give_splits=True, test_size=0.2, dev_size=0.1, seed=42):
    """Convert span-annotated sentences into binary token-level GED data.

    Every non-blank line of ``input_file`` is parsed as a JSON object with a
    ``text`` string and an optional ``tags`` list whose entries carry
    ``span_start`` / ``span_end`` character offsets into ``text``. The text is
    split on whitespace; a token is labelled ``'erroneous'`` when its character
    range overlaps any tagged span, otherwise ``'correct'``.

    Args:
        input_file: Path of the source JSONL file.
        output_prefix: Prefix of the output files; each split is written to
            ``{output_prefix}_{split}.jsonl``.
        give_splits: If truthy, write ``train``/``dev``/``test`` splits;
            otherwise write all sentences to a single ``test`` file.
        test_size: Fraction of all sentences placed in the test split.
        dev_size: Fraction of all sentences placed in the dev split.
        seed: ``random_state`` passed to both ``train_test_split`` calls.

    Returns:
        None. The split files are written and per-split statistics printed.
    """
    with open(input_file, 'r', encoding='utf-8') as f:
        # Blank lines (e.g. a trailing newline) are skipped rather than fed to json.loads.
        sentences = [json.loads(line) for line in f if line.strip()]

    token_data = []
    for item in sentences:
        text = item['text']
        tokens = text.split()
        token_labels = ['correct'] * len(tokens)

        # Recover each token's (start, end) character span by scanning forward
        # from the end of the previous token.
        token_positions = []
        current_pos = 0
        for token in tokens:
            start = text.find(token, current_pos)
            end = start + len(token)
            token_positions.append((start, end))
            current_pos = end

        for tag in item.get('tags', []):
            error_start = int(tag['span_start'])
            error_end = int(tag['span_end'])
            for i, (token_start, token_end) in enumerate(token_positions):
                # Half-open interval overlap test between token and error span.
                if token_start < error_end and token_end > error_start:
                    token_labels[i] = 'erroneous'

        token_data.append({
            'tokens': tokens,
            'labels': token_labels,
            'text': text
        })

    # Plain truthiness: the former `== True` / `== False` pair left `splits`
    # unbound for any other value (for example None).
    if give_splits:
        train, test = train_test_split(token_data, test_size=test_size, random_state=seed)
        # The remaining data is (1 - test_size) of the total, so rescale to make
        # the dev split `dev_size` of the whole dataset.
        train, dev = train_test_split(train, test_size=dev_size/(1-test_size), random_state=seed)
        splits = {
            'train': train,
            'dev': dev,
            'test': test
        }
    else:
        splits = {
            'test': token_data
        }

    for split_name, data in splits.items():
        output_file = f"{output_prefix}_{split_name}.jsonl"
        with open(output_file, 'w', encoding='utf-8') as f:
            for item in data:
                f.write(json.dumps(item) + '\n')
        total_tokens = sum(len(item['tokens']) for item in data)
        error_tokens = sum(label == 'erroneous' for item in data for label in item['labels'])
        # An empty split has no tokens; report 0% instead of dividing by zero.
        error_rate = error_tokens / total_tokens if total_tokens else 0.0
        print(f"{split_name.upper()}: {len(data)} sentences, {total_tokens} tokens")
        print(f"  Errors: {error_tokens} ({error_rate:.2%})")
        print(f"  Saved to {output_file}\n")

In [None]:
# Build the binary (correct / erroneous) GED files with train/dev/test splits;
# output goes to bea_grammar_{train,dev,test}.jsonl in the working directory.
# NOTE(review): the input path is absolute and machine-specific.
jsonl_to_token_dataset('/home/zlovoblachko/GD_correction_diploma/data_preparation/sentencewise_full.jsonl', 'bea_grammar', True)

TRAIN: 24906 sentences, 526954 tokens
  Errors: 62577 (11.88%)
  Saved to bea_grammar_train.jsonl

DEV: 3559 sentences, 75406 tokens
  Errors: 8957 (11.88%)
  Saved to bea_grammar_dev.jsonl

TEST: 7117 sentences, 150908 tokens
  Errors: 18079 (11.98%)
  Saved to bea_grammar_test.jsonl



In [3]:
# Convert the overlapping-annotation file into a single binary-label test file
# (give_splits=False); output goes to overlapping_benchmark_test.jsonl.
jsonl_to_token_dataset('/home/zlovoblachko/GD_correction_diploma/data_preparation/overlapping_full.jsonl', 'overlapping_benchmark', False)

TEST: 5518 sentences, 119831 tokens
  Errors: 33218 (27.72%)
  Saved to overlapping_benchmark_test.jsonl



In [12]:
def jsonl_to_four_label_token(input_file, output_prefix, give_splits=True, test_size=0.2, dev_size=0.1, seed=42):
    """Convert span-annotated sentences into token-level data labelled by first-level tag.

    Every non-blank line of ``input_file`` is parsed as a JSON object with a
    ``text`` string and an optional ``tags`` list. Each tag carries
    ``span_start`` / ``span_end`` character offsets and, optionally, a
    ``first_level_tag`` (``'Unknown'`` is used when it is absent). Tokens whose
    character range overlaps a span receive that tag as their label; all other
    tokens stay ``'correct'``. When several spans cover the same token, the tag
    processed last wins.

    Args:
        input_file: Path of the source JSONL file.
        output_prefix: Prefix of the output files; each split is written to
            ``{output_prefix}_{split}.jsonl``.
        give_splits: If truthy, write ``train``/``dev``/``test`` splits;
            otherwise write all sentences to a single ``test`` file.
        test_size: Fraction of all sentences placed in the test split.
        dev_size: Fraction of all sentences placed in the dev split.
        seed: ``random_state`` passed to both ``train_test_split`` calls.

    Returns:
        None. The split files are written and per-split statistics printed.
    """
    with open(input_file, 'r', encoding='utf-8') as f:
        # Blank lines (e.g. a trailing newline) are skipped rather than fed to json.loads.
        sentences = [json.loads(line) for line in f if line.strip()]

    # Display names used only when printing the error distribution.
    first_level_tags = {
        'M': 'Missing',
        'U': 'Unnecessary',
        'R': 'Replace',
        'correct': 'correct'
    }

    token_data = []
    for item in sentences:
        text = item['text']
        tokens = text.split()
        token_labels = ['correct'] * len(tokens)

        # Recover each token's (start, end) character span by scanning forward
        # from the end of the previous token.
        token_positions = []
        current_pos = 0
        for token in tokens:
            start = text.find(token, current_pos)
            end = start + len(token)
            token_positions.append((start, end))
            current_pos = end

        for tag in item.get('tags', []):
            error_start = int(tag['span_start'])
            error_end = int(tag['span_end'])
            first_level_tag = tag.get('first_level_tag', 'Unknown')
            for i, (token_start, token_end) in enumerate(token_positions):
                # Half-open interval overlap test between token and error span.
                if token_start < error_end and token_end > error_start:
                    token_labels[i] = first_level_tag

        token_data.append({
            'tokens': tokens,
            'labels': token_labels,
            'text': text
        })

    if give_splits:
        train, test = train_test_split(token_data, test_size=test_size, random_state=seed)
        # The remaining data is (1 - test_size) of the total, so rescale to make
        # the dev split `dev_size` of the whole dataset.
        train, dev = train_test_split(train, test_size=dev_size/(1-test_size), random_state=seed)
        splits = {
            'train': train,
            'dev': dev,
            'test': test
        }
    else:
        splits = {
            'test': token_data
        }

    for split_name, data in splits.items():
        output_file = f"{output_prefix}_{split_name}.jsonl"
        with open(output_file, 'w', encoding='utf-8') as f:
            for item in data:
                f.write(json.dumps(item) + '\n')

        total_tokens = sum(len(item['tokens']) for item in data)
        error_counts = {}
        for item in data:
            for label in item['labels']:
                if label != 'correct':
                    error_counts[label] = error_counts.get(label, 0) + 1
        total_errors = sum(error_counts.values())
        # An empty split has no tokens; report 0% instead of dividing by zero.
        error_rate = total_errors / total_tokens if total_tokens else 0.0

        print(f"{split_name.upper()}: {len(data)} sentences, {total_tokens} tokens")
        print(f"  Total Errors: {total_errors} ({error_rate:.2%})")
        if error_counts:
            print("  Error distribution:")
            for error_type, count in error_counts.items():
                tag_name = first_level_tags.get(error_type, error_type)
                print(f"    {error_type} ({tag_name}): {count} ({count/total_errors:.2%})")

        print(f"  Saved to {output_file}\n")

In [13]:
# Label the sentence-level file with first-level tags and write
# 4tag_grammar_GED_{train,dev,test}.jsonl to the working directory.
jsonl_to_four_label_token('/home/zlovoblachko/GD_correction_diploma/data_preparation/sentencewise_full.jsonl', '4tag_grammar_GED', True)

TRAIN: 24906 sentences, 526954 tokens
  Total Errors: 62577 (11.88%)
  Error distribution:
    M (Missing): 19983 (31.93%)
    R (Replace): 40316 (64.43%)
    U (Unnecessary): 2278 (3.64%)
  Saved to 4tag_grammar_GED_train.jsonl

DEV: 3559 sentences, 75406 tokens
  Total Errors: 8957 (11.88%)
  Error distribution:
    R (Replace): 5682 (63.44%)
    M (Missing): 2942 (32.85%)
    U (Unnecessary): 333 (3.72%)
  Saved to 4tag_grammar_GED_dev.jsonl

TEST: 7117 sentences, 150908 tokens
  Total Errors: 18079 (11.98%)
  Error distribution:
    M (Missing): 5734 (31.72%)
    R (Replace): 11661 (64.50%)
    U (Unnecessary): 684 (3.78%)
  Saved to 4tag_grammar_GED_test.jsonl



In [14]:
# First-level-tag labels for the overlapping-annotation file, written as a single test file.
# NOTE(review): the prefix 'overlapping_benchmark' is the same one the binary-label
# run used, so this call overwrites overlapping_benchmark_test.jsonl — confirm that
# is intended, or choose a distinct prefix.
jsonl_to_four_label_token('/home/zlovoblachko/GD_correction_diploma/data_preparation/overlapping_full.jsonl', 'overlapping_benchmark', False)

TEST: 5518 sentences, 119831 tokens
  Total Errors: 33218 (27.72%)
  Error distribution:
    M (Missing): 6430 (19.36%)
    R (Replace): 25426 (76.54%)
    U (Unnecessary): 1362 (4.10%)
  Saved to overlapping_benchmark_test.jsonl



In [19]:
def jsonl_to_eleven_label_token(input_file, output_prefix, give_splits=True, test_size=0.2, dev_size=0.1, seed=42):
    """Convert span-annotated sentences into token-level data labelled by second-level tag.

    Every non-blank line of ``input_file`` is parsed as a JSON object with a
    ``text`` string and an optional ``tags`` list. Each tag carries
    ``span_start`` / ``span_end`` character offsets and, optionally, a
    ``second_level_tag`` (``'UNKNOWN'`` is used when it is absent). Tokens
    whose character range overlaps a span receive that tag as their label; all
    other tokens stay ``'correct'``. When several spans cover the same token,
    the tag processed last wins.

    Note that, despite the function name, the display-name table below lists
    twelve error tags.

    Args:
        input_file: Path of the source JSONL file.
        output_prefix: Prefix of the output files; each split is written to
            ``{output_prefix}_{split}.jsonl``.
        give_splits: If truthy, write ``train``/``dev``/``test`` splits;
            otherwise write all sentences to a single ``test`` file.
        test_size: Fraction of all sentences placed in the test split.
        dev_size: Fraction of all sentences placed in the dev split.
        seed: ``random_state`` passed to both ``train_test_split`` calls.

    Returns:
        None. The split files are written and per-split statistics printed.
    """
    with open(input_file, 'r', encoding='utf-8') as f:
        # Blank lines (e.g. a trailing newline) are skipped rather than fed to json.loads.
        sentences = [json.loads(line) for line in f if line.strip()]

    # Display names used only when printing the error distribution.
    second_level_tags = {
        'ORTH': 'Orthography',
        'FORM': 'Formation',
        'MORPH': 'Morphology',
        'DET': 'Determiners',
        'POS': 'Part of Speech',
        'VERB': 'Verb Issues',
        'NUM': 'Number Issues',
        'WORD': 'Word Choice',
        'PUNCT': 'Punctuation',
        'RED': 'Redundancy',
        'MULTIWORD': 'Multi-word Structures',
        'SPELL': 'Spelling',
        'correct': 'correct'
    }

    token_data = []
    for item in sentences:
        text = item['text']
        tokens = text.split()
        token_labels = ['correct'] * len(tokens)

        # Recover each token's (start, end) character span by scanning forward
        # from the end of the previous token.
        token_positions = []
        current_pos = 0
        for token in tokens:
            start = text.find(token, current_pos)
            end = start + len(token)
            token_positions.append((start, end))
            current_pos = end

        # Process each error tag
        for tag in item.get('tags', []):
            error_start = int(tag['span_start'])
            error_end = int(tag['span_end'])
            second_level = tag.get('second_level_tag', 'UNKNOWN')

            # Label every token that overlaps the error span (half-open intervals).
            for i, (token_start, token_end) in enumerate(token_positions):
                if token_start < error_end and token_end > error_start:
                    token_labels[i] = second_level

        token_data.append({
            'tokens': tokens,
            'labels': token_labels,
            'text': text
        })

    # Create train/dev/test splits if requested
    if give_splits:
        train, test = train_test_split(token_data, test_size=test_size, random_state=seed)
        # The remaining data is (1 - test_size) of the total, so rescale to make
        # the dev split `dev_size` of the whole dataset.
        train, dev = train_test_split(train, test_size=dev_size/(1-test_size), random_state=seed)
        splits = {
            'train': train,
            'dev': dev,
            'test': test
        }
    else:
        splits = {
            'test': token_data
        }

    # Save and print statistics for each split
    for split_name, data in splits.items():
        output_file = f"{output_prefix}_{split_name}.jsonl"
        with open(output_file, 'w', encoding='utf-8') as f:
            for item in data:
                f.write(json.dumps(item) + '\n')

        total_tokens = sum(len(item['tokens']) for item in data)

        # Count occurrences of each error type
        error_counts = {}
        for item in data:
            for label in item['labels']:
                if label != 'correct':
                    error_counts[label] = error_counts.get(label, 0) + 1

        total_errors = sum(error_counts.values())
        # An empty split has no tokens; report 0% instead of dividing by zero.
        error_rate = total_errors / total_tokens if total_tokens else 0.0

        # Print statistics, most frequent error type first
        print(f"{split_name.upper()}: {len(data)} sentences, {total_tokens} tokens")
        print(f"  Total Errors: {total_errors} ({error_rate:.2%})")
        if error_counts:
            print("  Error distribution:")
            for error_type, count in sorted(error_counts.items(), key=lambda x: -x[1]):
                tag_name = second_level_tags.get(error_type, error_type)
                print(f"    {error_type} ({tag_name}): {count} ({count/total_errors:.2%})")

        print(f"  Saved to {output_file}\n")

In [None]:
# Label the sentence-level file with second-level tags and write
# 11tag_grammar_GED_{train,dev,test}.jsonl (the files loaded for the Hub upload).
# The previous line was a pasted-over, unbalanced nested call that did not parse.
jsonl_to_eleven_label_token('/home/zlovoblachko/GD_correction_diploma/data_preparation/sentencewise_full.jsonl', '11tag_grammar_GED', True)

TRAIN: 24906 sentences, 526954 tokens
  Total Errors: 62577 (11.88%)
  Error distribution:
    DET (Determiners): 20519 (32.79%)
    SPELL (Spelling): 11961 (19.11%)
    WORD (Word Choice): 5590 (8.93%)
    MULTIWORD (Multi-word Structures): 4791 (7.66%)
    PUNCT (Punctuation): 4445 (7.10%)
    VERB (Verb Issues): 3899 (6.23%)
    FORM (Formation): 3594 (5.74%)
    POS (Part of Speech): 2736 (4.37%)
    RED (Redundancy): 2051 (3.28%)
    ORTH (Orthography): 1424 (2.28%)
    NUM (Number Issues): 956 (1.53%)
    MORPH (Morphology): 611 (0.98%)
  Saved to 11tag_grammar_GED_train.jsonl

DEV: 3559 sentences, 75406 tokens
  Total Errors: 8957 (11.88%)
  Error distribution:
    DET (Determiners): 3034 (33.87%)
    SPELL (Spelling): 1651 (18.43%)
    WORD (Word Choice): 702 (7.84%)
    PUNCT (Punctuation): 682 (7.61%)
    MULTIWORD (Multi-word Structures): 666 (7.44%)
    VERB (Verb Issues): 577 (6.44%)
    FORM (Formation): 513 (5.73%)
    POS (Part of Speech): 401 (4.48%)
    RED (Redundanc

In [21]:
# Second-level-tag labels for the overlapping-annotation file, written as a
# single test file: 11tag_overlapping_test.jsonl.
jsonl_to_eleven_label_token('/home/zlovoblachko/GD_correction_diploma/data_preparation/overlapping_full.jsonl', '11tag_overlapping', False)

TEST: 5518 sentences, 119831 tokens
  Total Errors: 33218 (27.72%)
  Error distribution:
    MULTIWORD (Multi-word Structures): 6224 (18.74%)
    DET (Determiners): 5705 (17.17%)
    PUNCT (Punctuation): 5215 (15.70%)
    WORD (Word Choice): 4607 (13.87%)
    SPELL (Spelling): 2528 (7.61%)
    VERB (Verb Issues): 2361 (7.11%)
    POS (Part of Speech): 1854 (5.58%)
    FORM (Formation): 1588 (4.78%)
    RED (Redundancy): 1488 (4.48%)
    MORPH (Morphology): 676 (2.04%)
    NUM (Number Issues): 517 (1.56%)
    ORTH (Orthography): 455 (1.37%)
  Saved to 11tag_overlapping_test.jsonl



In [4]:
from datasets import Dataset, DatasetDict
import json
from huggingface_hub import notebook_login, HfApi
from datasets import ClassLabel, Value, Features, Sequence

In [5]:
# Interactive Hugging Face login (renders a widget in the notebook);
# presumably needed before the push_to_hub call further down — confirm.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [6]:
def load_split(file_path):
    """Read a JSONL split file and return it as a column-oriented ``Dataset``.

    Each line must be a JSON object with ``tokens``, ``labels`` and ``text``
    keys; the result has one column per key, in that order.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        records = [json.loads(raw_line) for raw_line in handle]

    columns = {
        "tokens": [record["tokens"] for record in records],
        "labels": [record["labels"] for record in records],
        "text": [record["text"] for record in records],
    }
    return Dataset.from_dict(columns)

In [22]:
# Assemble the three second-level-tag split files into one DatasetDict.
# The on-disk "dev" file is exposed under the split name "validation".
dataset_dict = DatasetDict({
    "train": load_split("11tag_grammar_GED_train.jsonl"),
    "validation": load_split("11tag_grammar_GED_dev.jsonl"), 
    "test": load_split("11tag_grammar_GED_test.jsonl")
})

In [23]:
# Target schema for the cast: string tokens, class-encoded labels, raw text.
# The label vocabulary is 'correct' plus the twelve second-level error tags.
# NOTE(review): jsonl_to_eleven_label_token falls back to the label 'UNKNOWN'
# for tags without a second_level_tag; that value is not listed here, so the
# cast presumably fails if any such label is present — confirm.
features = Features({
    'tokens': Sequence(Value('string')),
    'labels': Sequence(ClassLabel(names=[
        'correct', 
        'ORTH',
        'FORM',
        'MORPH',
        'DET',
        'POS',
        'VERB',
        'NUM',
        'WORD',
        'PUNCT',
        'RED',
        'MULTIWORD',
        'SPELL'
    ])),
    'text': Value('string')
})

In [24]:
# Apply the `features` schema to every split; `dataset_dict` is rebound to the cast result.
dataset_dict = dataset_dict.cast(features)

Casting the dataset:   0%|          | 0/24906 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/3559 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/7117 [00:00<?, ? examples/s]

In [25]:
# Publish the cast GED dataset to the Hugging Face Hub under this repository id.
repo_name = "Zlovoblachko/REALEC_GED_11tag_errors"
dataset_dict.push_to_hub(repo_name)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/25 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/Zlovoblachko/REALEC_GED_11tag_errors/commit/9f7172c4ff557b8c28c7882673c4074fdc1c860d', commit_message='Upload dataset', commit_description='', oid='9f7172c4ff557b8c28c7882673c4074fdc1c860d', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/Zlovoblachko/REALEC_GED_11tag_errors', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Zlovoblachko/REALEC_GED_11tag_errors'), pr_revision=None, pr_num=None)

GEC (grammatical error correction) dataset

In [1]:
import json
import pandas as pd
import numpy as np
from datasets import Dataset, DatasetDict
from tqdm import tqdm

In [2]:
def process_data_for_gec(json_lines):
    """Build source/target sentence pairs by applying each example's corrections.

    Args:
        json_lines: Iterable of examples. Each element is either a JSON string
            or an already-parsed dict with a ``text`` string and an optional
            ``tags`` list. Every tag supplies ``span_start`` / ``span_end``
            character offsets into ``text`` and a ``correction`` string that
            replaces that span.

    Returns:
        A list of ``{'source': original_text, 'target': corrected_text}``
        dicts, one per non-blank input element, in input order.

    Notes:
        Tags are applied left to right by ``span_start``. A tag that starts
        inside a span that has already been replaced is skipped: applying it
        through a running offset would rewrite characters outside both spans.
    """
    data = []
    for line in tqdm(json_lines):
        if isinstance(line, str):
            # Skip blank lines (e.g. a trailing newline) instead of crashing in json.loads.
            if not line.strip():
                continue
            example = json.loads(line)
        else:
            example = line

        source_text = example['text']
        # Tolerate examples with no 'tags' key, as the GED converters do.
        sorted_tags = sorted(example.get('tags', []), key=lambda x: int(x['span_start']))

        # Assemble the corrected text from untouched slices of the source and
        # the corrections, so no offset bookkeeping is needed.
        pieces = []
        cursor = 0  # first source character not yet emitted or replaced
        for tag in sorted_tags:
            start = int(tag['span_start'])
            end = int(tag['span_end'])
            if start < cursor:
                # Overlaps a span that was already replaced; see Notes.
                continue
            pieces.append(source_text[cursor:start])
            pieces.append(tag['correction'])
            cursor = end
        pieces.append(source_text[cursor:])

        data.append({
            'source': source_text,
            'target': ''.join(pieces)
        })
    return data

In [3]:
def create_huggingface_dataset(data, seed=42):
    """Shuffle ``data`` and split it 80/10/10 into a ``DatasetDict``.

    Args:
        data: List of records (dicts) accepted by ``pd.DataFrame``.
        seed: Seed for the shuffle, so the split is reproducible across runs.
            Pass ``None`` for a fresh random split each time.

    Returns:
        A ``DatasetDict`` with ``train``, ``validation`` and ``test`` splits.
    """
    df = pd.DataFrame(data)
    n = len(df)
    # A local generator makes the split reproducible without touching NumPy's global state.
    indices = np.random.default_rng(seed).permutation(n)
    train_end = int(0.8 * n)
    val_end = int(0.9 * n)
    split_indices = {
        'train': indices[:train_end],
        'validation': indices[train_end:val_end],
        'test': indices[val_end:],
    }
    # Reset the index of each slice: otherwise the shuffled row numbers are a
    # non-default index, which from_pandas stores as an extra dataset column.
    return DatasetDict({
        split_name: Dataset.from_pandas(df.iloc[idx].reset_index(drop=True))
        for split_name, idx in split_indices.items()
    })

In [4]:
def upload_to_huggingface(dataset_dict, repo_name):
    """Push ``dataset_dict`` to the Hub repository ``repo_name`` and print a confirmation."""
    dataset_dict.push_to_hub(repo_name)
    confirmation = f"Dataset uploaded to {repo_name}"
    print(confirmation)

In [5]:
# Read the sentence-level JSONL, build source/target pairs, split them and upload.
# The encoding is given explicitly (as in the GED readers) so decoding does not
# depend on the platform default.
with open('/home/zlovoblachko/GD_correction_diploma/data_preparation/sentencewise_full.jsonl', 'r', encoding='utf-8') as f:
    json_lines = f.readlines()

# The file is only needed for reading; process and upload after it is closed.
processed_data = process_data_for_gec(json_lines)
dataset = create_huggingface_dataset(processed_data)
upload_to_huggingface(dataset, "Zlovoblachko/REALEC_GEC_dataset")

100%|██████████| 35582/35582 [00:00<00:00, 128710.81it/s]


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/29 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Dataset uploaded to Zlovoblachko/REALEC_GEC_dataset
