In [1]:
import os
import json
import re
import nltk
import random
from collections import defaultdict
from transformers import ElectraTokenizerFast
from datasets import Dataset, DatasetDict
import spacy
import numpy as np
from tqdm import tqdm

In [3]:
nltk.download('punkt')
nlp = spacy.load("en_core_web_sm")
electra_tokenizer = ElectraTokenizerFast.from_pretrained("google/electra-small-discriminator")

[nltk_data] Downloading package punkt to
[nltk_data]     /home/zlovoblachko/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [16]:
errors_correspondence = {
    'Capitalisation': 'ORTH',
    'Formational_affixes': 'FORM',
    'Derivation': 'FORM',
    'Noun_number': 'FORM',
    'Countable_uncountable': 'FORM',
    'Category_confusion': 'MORPH',
    'Articles': 'DET',
    'Determiners': 'DET',
    'Prepositional_noun': 'POS',
    'Prepositions': 'POS',
    'Verb_pattern': 'POS',
    'Conjunctions': 'POS',
    'Pronouns': 'POS',
    'Tense_choice': 'VERB',
    'Tense_form': 'VERB',
    'Voice': 'VERB',
    'Modals': 'VERB',
    'Numerals': 'NUM',
    'lex_item_choice': 'WORD',
    'Absence_comp_sent': 'WORD',
    'Inappropriate_register': 'WORD',
    'Ref_device': 'WORD',
    'Linking_device': 'WORD',
    'Punctuation': 'PUNCT',
    'Relative_clause': 'PUNCT',
    'Redundant_comp': 'RED',
    'Confusion_of_structures': 'MULTIWORD',
    'Word_order': 'MULTIWORD',
    'Word_choice': 'MULTIWORD',
    'Absence_explanation': 'MULTIWORD',
    'Coherence': 'MULTIWORD',
    'Spelling': 'SPELL'
}

In [17]:
binary_labels = ["CORRECT", "ERROR"]
first_level_labels = ["C", "M", "R", "U"]  # C = correct, M, R, U are the first-level error types
second_level_labels = list(set(errors_correspondence.values())) + ["CORRECT"]
combined_labels = []
for l1 in ["M", "R", "U"]:
    for l2 in set(errors_correspondence.values()):
        combined_labels.append(f"{l1}-{l2}")
combined_labels.append("CORRECT")

In [43]:
class TokenClassificationDataCollator:
    """
    Data collator that will dynamically pad the inputs received, as well as the labels.
    """
    def __init__(self, tokenizer, label_info, max_length=None):
        self.tokenizer = tokenizer
        self.label_info = label_info
        self.max_length = max_length
    
    def __call__(self, features):
        # Tokenize texts and align the labels with wordpiece tokens
        batch = []
        for feature in features:
            # Get text and labels
            text = feature["text"]
            word_labels = feature["word_labels"]
            
            # Tokenize
            encodings = self.tokenizer(
                text, 
                truncation=True,
                max_length=self.max_length,
                padding="max_length",
                return_offsets_mapping=True
            )
            
            # Align labels with tokens
            word_ids = encodings.word_ids()
            label_ids = []
            
            for word_idx in word_ids:
                # Special tokens
                if word_idx is None:
                    label_ids.append(-100)
                else:
                    # Use the label of the first token of the word
                    try:
                        label_ids.append(word_labels[word_idx])
                    except IndexError:
                        # Handle the case where word_idx is out of bounds
                        label_ids.append(-100)
            
            # Add labels to encodings
            encodings["labels"] = label_ids
            
            # Remove offset mapping as it's not needed anymore
            del encodings["offset_mapping"]
            
            # Convert to PyTorch tensors
            batch.append({k: torch.tensor(v) for k, v in encodings.items()})
        
        # Concatenate all tensors
        batch = self.tokenizer.pad(
            batch,
            padding=True,
            return_tensors="pt",
        )
        
        return batch

In [18]:
def read_raw_files(directory):
    """Read all raw files (.txt and .ann) and extract text and annotation data."""
    file_pairs = []
    
    for filename in os.listdir(directory):
        if filename.endswith('.txt') and filename.split(".")[0][-1] == '1':
            txt_path = os.path.join(directory, filename)
            ann_path = os.path.join(directory, filename.split(".")[0] + ".ann")
            
            if os.path.exists(ann_path):
                with open(txt_path, 'r', encoding='utf-8') as f:
                    text = f.read()
                
                with open(ann_path, 'r', encoding='utf-8') as f:
                    annotations = f.readlines()
                
                file_pairs.append({
                    'filename': filename.split(".")[0],
                    'text': text,
                    'annotations': annotations
                })
    
    print(f"Found {len(file_pairs)} text-annotation pairs")
    return file_pairs

In [19]:
def parse_annotations(annotations):
    """Parse annotation lines into structured error spans with extended error type information."""
    error_spans = []
    tag_to_correction = {}
    
    # First pass: Collect all T tags and their span information
    for line in annotations:
        if line.startswith('T'):
            try:
                parts = line.strip().split('\t')
                tag_id = parts[0]
                error_info = parts[1].split(' ')
                
                # Skip if tag type length is 3 (these are POS tags, not error tags)
                if len(error_info[0]) == 3:
                    continue
                
                error_type = error_info[0]
                span_start = int(error_info[1]) if error_info[1].isdigit() else None
                span_end = int(error_info[2]) if error_info[2].isdigit() else None
                
                if span_start is not None and span_end is not None:
                    error_text = parts[2].strip()
                    
                    # Get second-level tag based on error type
                    second_level_tag = errors_correspondence.get(error_type, "OTHER")
                    
                    error_spans.append({
                        'tag_id': tag_id,
                        'error_type': error_type,
                        'span_start': span_start,
                        'span_end': span_end,
                        'error_text': error_text,
                        'correction': None,
                        'first_level_tag': None,
                        'second_level_tag': second_level_tag
                    })
            except (IndexError, ValueError) as e:
                # Skip malformed annotation lines
                continue
    
    # Second pass: Find corrections and first-level tags
    for line in annotations:
        if line.startswith('A'):
            # This is a deletion annotation
            parts = line.strip().split('\t')
            if len(parts) >= 2:
                ref_tag = None
                for part in parts[1].split():
                    if part.startswith('T'):
                        ref_tag = part
                        break
                
                if ref_tag:
                    for span in error_spans:
                        if span['tag_id'] == ref_tag:
                            span['correction'] = ""
                            span['first_level_tag'] = "U"  # Unnecessary (deletion)
        
        elif line.startswith('#') and 'lemma' not in line:
            # This is a correction annotation
            parts = line.strip().split('\t')
            if len(parts) >= 3:
                ref_tag = None
                for part in parts[0].split():
                    if part.startswith('T'):
                        ref_tag = part
                        break
                
                if ref_tag:
                    correction = parts[2].strip()
                    
                    for span in error_spans:
                        if span['tag_id'] == ref_tag:
                            span['correction'] = correction
                            
                            # Determine first level tag
                            if span['error_text'] in correction:
                                span['first_level_tag'] = "M"  # Missing
                            else:
                                span['first_level_tag'] = "R"  # Replace
    
    return error_spans

In [20]:
def split_into_sentences(text):
    """Split text into sentences using SpaCy and track character offsets."""
    sentences = []
    sent_spans = []
    
    # Use SpaCy for sentence splitting
    doc = nlp(text)
    
    for sent in doc.sents:
        sentences.append(sent.text)
        sent_spans.append((sent.start_char, sent.end_char))
    
    return sentences, sent_spans

In [21]:
def assign_errors_to_sentences(text, error_spans, sent_spans):
    """Assign error spans to the appropriate sentences."""
    sentence_errors = [[] for _ in range(len(sent_spans))]
    overlapping_spans = []
    
    # Create a map of character positions to error span indices
    char_to_error = defaultdict(list)
    for i, span in enumerate(error_spans):
        for pos in range(span['span_start'], span['span_end']):
            char_to_error[pos].append(i)
    
    # Find overlapping spans
    for pos, span_indices in char_to_error.items():
        if len(span_indices) > 1:
            # Only add unique combinations of overlapping spans
            overlap_set = tuple(sorted(span_indices))
            if overlap_set not in overlapping_spans:
                overlapping_spans.append(overlap_set)
    
    # Assign error spans to sentences
    for i, (start, end) in enumerate(sent_spans):
        for span in error_spans:
            # Check if the error span overlaps with this sentence
            if max(start, span['span_start']) < min(end, span['span_end']):
                # Adjust span positions relative to the sentence start
                adjusted_span = span.copy()
                adjusted_span['span_start'] = max(0, span['span_start'] - start)
                adjusted_span['span_end'] = min(end - start, span['span_end'] - start)
                sentence_errors[i].append(adjusted_span)
    
    # Identify sentences with overlapping error spans
    sentences_with_overlaps = set()
    for overlap in overlapping_spans:
        for span_idx in overlap:
            span = error_spans[span_idx]
            for i, (start, end) in enumerate(sent_spans):
                if max(start, span['span_start']) < min(end, span['span_end']):
                    sentences_with_overlaps.add(i)
    
    return sentence_errors, list(sentences_with_overlaps), overlapping_spans

In [42]:
def tokenize_and_align_labels(examples, tokenizer):
    """
    Function to tokenize and align labels during training.
    This will be called by the Trainer during training.
    
    Parameters:
    -----------
    examples : dict
        Dictionary of examples
    tokenizer : AutoTokenizer
        Tokenizer to use
        
    Returns:
    --------
    dict
        Dictionary with tokenized inputs and aligned labels
    """
    tokenized_inputs = tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        # We use this argument because the texts in our dataset are lists of words and not full sentences.
        is_split_into_words=False,
        return_offsets_mapping=True
    )
    
    labels = []
    
    for i, label in enumerate(examples["word_labels"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
            
        labels.append(label_ids)
        
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [23]:
def create_sentence_dataset(file_pairs):
    """Create a dataset of sentences with token-level error labels."""
    regular_sentences = []
    overlapping_sentences = []
    
    for file_data in tqdm(file_pairs, desc="Processing files"):
        text = file_data['text']
        error_spans = parse_annotations(file_data['annotations'])
        
        # Split text into sentences
        sentences, sent_spans = split_into_sentences(text)
        
        # Assign error spans to sentences
        sentence_errors, sentences_with_overlaps, overlap_spans = assign_errors_to_sentences(text, error_spans, sent_spans)
        
        # Process each sentence
        for i, (sentence, errors) in enumerate(zip(sentences, sentence_errors)):
            if i in sentences_with_overlaps:
                # Add to overlapping sentences dataset
                overlapping_sentences.append({
                    'text': sentence,
                    'error_spans': errors,
                    'file_id': file_data['filename'],
                    'sentence_id': i
                })
            else:
                # Add to regular sentences dataset
                regular_sentences.append({
                    'text': sentence,
                    'error_spans': errors,
                    'file_id': file_data['filename'],
                    'sentence_id': i
                })
    
    print(f"Created dataset with {len(regular_sentences)} regular sentences and {len(overlapping_sentences)} sentences with overlapping errors")
    return regular_sentences, overlapping_sentences

In [24]:
def create_token_labels(sentence_data):
    """Create multi-type token-level labels for each sentence."""
    for sentence in tqdm(sentence_data, desc="Creating token labels"):
        text = sentence['text']
        doc = nlp(text)
        
        # Initialize token labels (all correct by default)
        binary_token_labels = ["CORRECT"] * len(doc)
        first_level_token_labels = ["C"] * len(doc)  # C for correct
        second_level_token_labels = ["CORRECT"] * len(doc)
        combined_token_labels = ["CORRECT"] * len(doc)
        
        token_spans = [(token.idx, token.idx + len(token.text)) for token in doc]
        
        # Assign error labels to tokens
        for error in sentence['error_spans']:
            error_start = error['span_start']
            error_end = error['span_end']
            first_level = error['first_level_tag']
            second_level = error['second_level_tag']
            
            if first_level is None:
                first_level = "R"  # Default to R if no first level tag
            
            combined = f"{first_level}-{second_level}" if first_level != "C" else "CORRECT"
            
            for i, (start, end) in enumerate(token_spans):
                # Check if token overlaps with error span
                if max(start, error_start) < min(end, error_end):
                    binary_token_labels[i] = "ERROR"
                    first_level_token_labels[i] = first_level
                    second_level_token_labels[i] = second_level
                    combined_token_labels[i] = combined
        
        # Add token labels to the sentence data
        sentence['tokens'] = [token.text for token in doc]
        sentence['binary_labels'] = binary_token_labels
        sentence['first_level_labels'] = first_level_token_labels
        sentence['second_level_labels'] = second_level_token_labels
        sentence['combined_labels'] = combined_token_labels

In [45]:
def create_transformer_dataset(sentence_data, label_type="binary"):
    """
    Create a dataset in the format required by the Transformers library.
    This version doesn't pre-tokenize the data, making it usable with any transformer model.
    
    Parameters:
    -----------
    sentence_data : list
        List of dictionaries containing sentences with token-level labels
    label_type : str
        Type of labels to use ('binary', 'first_level', 'second_level', or 'combined')
    
    Returns:
    --------
    list
        List of examples in a format compatible with the Transformers library
    """
    transformer_data = []
    
    # Map label names to indices based on label type
    if label_type == "binary":
        label_list = ["CORRECT", "ERROR"]
    elif label_type == "first_level":
        label_list = ["C", "M", "R", "U"]
    elif label_type == "second_level":
        label_list = list(set(errors_correspondence.values())) + ["CORRECT"]
    elif label_type == "combined":
        label_list = []
        for l1 in ["M", "R", "U"]:
            for l2 in set(errors_correspondence.values()):
                label_list.append(f"{l1}-{l2}")
        label_list.append("CORRECT")
    else:
        raise ValueError(f"Unknown label type: {label_type}")
    
    label_map = {label: i for i, label in enumerate(label_list)}
    
    for sentence in tqdm(sentence_data, desc=f"Creating transformer dataset ({label_type})"):
        # Select the appropriate labels based on label_type
        if label_type == "binary":
            token_labels = sentence['binary_labels']
        elif label_type == "first_level":
            token_labels = sentence['first_level_labels']
        elif label_type == "second_level":
            token_labels = sentence['second_level_labels']
        elif label_type == "combined":
            token_labels = sentence['combined_labels']
        
        # Create a dictionary containing the text and word-level labels
        # (not tokenized yet - will be tokenized during training)
        example = {
            "text": sentence['text'],
            "tokens": sentence['tokens'],
            "word_labels": [label_map.get(label, 0) for label in token_labels]
        }
        
        # Add metadata
        example["file_id"] = sentence.get("file_id", "")
        example["sentence_id"] = sentence.get("sentence_id", 0)
        
        transformer_data.append(example)
    
    # Add label information to the dataset features
    features = {
        "text": {"dtype": "string"},
        "tokens": {"dtype": "string", "sequence": True},
        "word_labels": {"dtype": "int32", "sequence": True}
    }
    
    # Include label map for reference
    label_info = {
        "label_type": label_type,
        "labels": label_list,
        "label_map": label_map
    }
    
    return transformer_data, features, label_info

In [None]:
def create_spacy_dataset(sentence_data, label_type="binary"):
    """Create a dataset in the format required for SpaCy's span detection training."""
    spacy_data = []
    
    for sentence in tqdm(sentence_data, desc=f"Creating SpaCy dataset ({label_type})"):
        text = sentence['text']
        error_spans = []
        
        # Get appropriate token labels based on label_type
        if label_type == "binary":
            token_labels = sentence['binary_labels']
        elif label_type == "first_level":
            token_labels = sentence['first_level_labels']
        elif label_type == "second_level":
            token_labels = sentence['second_level_labels']
        else:
            raise ValueError(f"Unknown label type: {label_type}")
        
        # Collect spans with their labels
        spans = []
        for error in sentence['error_spans']:
            error_start = error['span_start']
            error_end = error['span_end']
            
            if label_type == "binary":
                label = "ERROR"
            elif label_type == "first_level":
                label = error['first_level_tag']
            elif label_type == "second_level":
                label = error['second_level_tag']
            
            spans.append((error_start, error_end, label))
        
        # Create SpaCy training example
        spacy_example = {
            "text": text,
            "spans": spans,
            "meta": {
                "file_id": sentence["file_id"],
                "sentence_id": sentence["sentence_id"]
            }
        }
        
        spacy_data.append(spacy_example)
    
    return spacy_data

In [48]:
def save_datasets(transformer_data, features, label_info, output_dir, hf_repo_name, dataset_type="binary"):
    """
    Save model-agnostic transformer dataset to disk and upload to Hugging Face.
    
    Parameters:
    -----------
    transformer_data : list
        List of examples in model-agnostic format
    features : dict
        Dictionary describing the features of the dataset
    label_info : dict
        Dictionary containing label information
    output_dir : str
        Directory to save the dataset
    hf_repo_name : str
        Name of the Hugging Face repository
    dataset_type : str
        Type of labels in the dataset
    """
    from datasets import Dataset, DatasetDict, Features, Sequence, Value
    
    # Convert features dictionary to HF Features format
    hf_features = Features({
        "text": Value("string"),
        "tokens": Sequence(Value("string")),
        "word_labels": Sequence(Value("int32")),
        "file_id": Value("string"),
        "sentence_id": Value("int32")
    })
    
    # Create dataset from the data
    dataset = Dataset.from_list(transformer_data, features=hf_features)
    
    # Train/eval/test split (80/10/10)
    train_test_val = dataset.train_test_split(test_size=0.2, seed=42)
    test_val = train_test_val["test"].train_test_split(test_size=0.5, seed=42)
    
    dataset_dict = DatasetDict({
        "train": train_test_val["train"],
        "validation": test_val["train"],
        "test": test_val["test"]
    })
    
    # Save dataset to disk
    dataset_dir = os.path.join(output_dir, f"transformer_dataset_{dataset_type}_agnostic")
    dataset_dict.save_to_disk(dataset_dir)
    
    # Save label information
    with open(os.path.join(dataset_dir, "label_info.json"), "w") as f:
        json.dump(label_info, f, indent=2)
    
    # Upload to Hugging Face
    try:
        repo_id = f"{hf_repo_name}-transformer-{dataset_type}-agnostic"
        dataset_dict.push_to_hub(repo_id)
        print(f"Uploaded dataset to Hugging Face: {repo_id}")
    except Exception as e:
        print(f"Failed to upload to Hugging Face: {str(e)}")
    
    return dataset_dict

In [28]:
raw_dir = "/home/zlovoblachko/GD_correction_diploma/data/rawfiles"
output_dir = "/home/zlovoblachko/GD_correction_diploma/data/new_datasets"
hf_repo = "Zlovoblachko/REALEC_GED"
process_only = ["binary", "first_level", "second_level"]
file_pairs = read_raw_files(raw_dir)
regular_sentences, overlapping_sentences = create_sentence_dataset(file_pairs)
os.makedirs(output_dir, exist_ok=True)
with open(os.path.join(output_dir, "regular_sentences.json"), "w") as f:
    json.dump(regular_sentences, f, ensure_ascii=False)
with open(os.path.join(output_dir, "overlapping_sentences.json"), "w") as f:
    json.dump(overlapping_sentences, f, ensure_ascii=False)
save_datasets(output_dir, hf_repo, dataset_types=process_only)
print("Dataset processing complete!")

Found 9389 text-annotation pairs


Processing files: 100%|██████████| 9389/9389 [03:56<00:00, 39.76it/s]


Created dataset with 77004 regular sentences and 8427 sentences with overlapping errors


Creating token labels: 100%|██████████| 77004/77004 [08:11<00:00, 156.56it/s]
Creating token labels: 100%|██████████| 8427/8427 [00:56<00:00, 148.31it/s]
Creating transformer dataset (binary): 100%|██████████| 77004/77004 [07:50<00:00, 163.53it/s] 
Creating transformer dataset (binary): 100%|██████████| 8427/8427 [00:52<00:00, 161.44it/s]


Saving the dataset (0/1 shards):   0%|          | 0/61603 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7700 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7701 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/8427 [00:00<?, ? examples/s]

Saved transformer dataset to /home/zlovoblachko/GD_correction_diploma/data/new_datasets/transformer_dataset_binary
Failed to upload to Hugging Face: 401 Client Error: Unauthorized for url: https://huggingface.co/api/repos/create (Request ID: Root=1-67f2e35a-2644a14e5dfb90e10a9770d0;b9bad6cc-daf5-4589-8a01-7a8f882ce08b)

Invalid username or password.


Creating SpaCy dataset (binary): 100%|██████████| 77004/77004 [00:00<00:00, 690806.28it/s]
Creating SpaCy dataset (binary): 100%|██████████| 8427/8427 [00:00<00:00, 527275.71it/s]


Saved SpaCy datasets to /home/zlovoblachko/GD_correction_diploma/data/new_datasets/spacy_dataset_binary


Creating token labels: 100%|██████████| 77004/77004 [07:31<00:00, 170.45it/s] 
Creating token labels: 100%|██████████| 8427/8427 [00:52<00:00, 160.63it/s]
Creating transformer dataset (first_level): 100%|██████████| 77004/77004 [07:59<00:00, 160.74it/s] 
Creating transformer dataset (first_level): 100%|██████████| 8427/8427 [00:54<00:00, 154.25it/s]


Saving the dataset (0/1 shards):   0%|          | 0/61603 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7700 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7701 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/8427 [00:00<?, ? examples/s]

Saved transformer dataset to /home/zlovoblachko/GD_correction_diploma/data/new_datasets/transformer_dataset_first_level
Failed to upload to Hugging Face: (MaxRetryError("HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /api/repos/create (Caused by SSLError(SSLEOFError(8, '[SSL: UNEXPECTED_EOF_WHILE_READING] EOF occurred in violation of protocol (_ssl.c:1000)')))"), '(Request ID: 7614224c-263c-472b-8de6-397953047d28)')


Creating SpaCy dataset (first_level): 100%|██████████| 77004/77004 [00:00<00:00, 713533.72it/s]
Creating SpaCy dataset (first_level): 100%|██████████| 8427/8427 [00:00<00:00, 459222.01it/s]


Saved SpaCy datasets to /home/zlovoblachko/GD_correction_diploma/data/new_datasets/spacy_dataset_first_level


Creating token labels: 100%|██████████| 77004/77004 [08:01<00:00, 160.00it/s] 
Creating token labels: 100%|██████████| 8427/8427 [00:57<00:00, 147.34it/s]
Creating transformer dataset (second_level): 100%|██████████| 77004/77004 [07:46<00:00, 165.21it/s] 
Creating transformer dataset (second_level): 100%|██████████| 8427/8427 [00:51<00:00, 165.03it/s]


Saving the dataset (0/1 shards):   0%|          | 0/61603 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7700 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7701 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/8427 [00:00<?, ? examples/s]

Saved transformer dataset to /home/zlovoblachko/GD_correction_diploma/data/new_datasets/transformer_dataset_second_level
Failed to upload to Hugging Face: 401 Client Error: Unauthorized for url: https://huggingface.co/api/repos/create (Request ID: Root=1-67f2eb93-00fc8d797e039a6b2b9d6263;7f3cb43f-cd81-40df-b2e5-5f3a603ea82c)

Invalid username or password.


Creating SpaCy dataset (second_level): 100%|██████████| 77004/77004 [00:00<00:00, 642532.95it/s]
Creating SpaCy dataset (second_level): 100%|██████████| 8427/8427 [00:00<00:00, 486355.50it/s]


Saved SpaCy datasets to /home/zlovoblachko/GD_correction_diploma/data/new_datasets/spacy_dataset_second_level
Dataset processing complete!


In [49]:
raw_dir = "/home/zlovoblachko/GD_correction_diploma/data/rawfiles"
output_dir = "/home/zlovoblachko/GD_correction_diploma/data/new_new_datasets"
hf_repo = "Zlovoblachko/REALEC_GED"
process_only = ["binary", "first_level", "second_level", "combined"]
file_pairs = read_raw_files(raw_dir)
regular_sentences, overlapping_sentences = create_sentence_dataset(file_pairs)
create_token_labels(regular_sentences)
create_token_labels(overlapping_sentences)

for label_type in process_only:
    print(f"\nProcessing {label_type} label type...")
    transformer_data, features, label_info = create_transformer_dataset(
        regular_sentences, 
        label_type=label_type
    )
    dataset_dict = save_datasets(
        transformer_data, 
        features, 
        label_info, 
        output_dir, 
        hf_repo, 
        dataset_type=label_type  # Pass a single string, not a list
    )
    
    print(f"Completed processing {label_type} label type.")

Processing files: 100%|██████████| 9389/9389 [04:06<00:00, 38.09it/s]
Creating token labels: 100%|██████████| 77004/77004 [07:54<00:00, 162.32it/s] 
Creating token labels: 100%|██████████| 8427/8427 [00:55<00:00, 150.99it/s]
Creating transformer dataset (binary): 100%|██████████| 77004/77004 [00:00<00:00, 464764.96it/s]


Saving the dataset (0/1 shards):   0%|          | 0/61603 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7700 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7701 [00:00<?, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/62 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

Creating transformer dataset (first_level): 100%|██████████| 77004/77004 [00:00<00:00, 586513.25it/s]


Saving the dataset (0/1 shards):   0%|          | 0/61603 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7700 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7701 [00:00<?, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/62 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

Creating transformer dataset (second_level): 100%|██████████| 77004/77004 [00:00<00:00, 450996.15it/s]


Saving the dataset (0/1 shards):   0%|          | 0/61603 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7700 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7701 [00:00<?, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/62 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

Creating transformer dataset (combined): 100%|██████████| 77004/77004 [00:00<00:00, 116154.34it/s]


Saving the dataset (0/1 shards):   0%|          | 0/61603 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7700 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7701 [00:00<?, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/62 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

In [30]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [32]:
from datasets import load_from_disk, Dataset, DatasetDict
import os
import json
from huggingface_hub import HfApi, login
import glob

In [38]:
def upload_transformer_datasets(
    base_dir,
    hf_username,
    repo_prefix="grammar-error-detection",
    dataset_types=["binary", "first_level", "second_level", "combined"],
    use_auth_token=None
):
    """
    Upload only transformer datasets to Hugging Face.
    
    Parameters:
    -----------
    base_dir : str
        Base directory containing the datasets
    hf_username : str
        Your Hugging Face username
    repo_prefix : str
        Prefix for the repository names
    dataset_types : list
        List of dataset types to upload
    use_auth_token : str, optional
        Hugging Face authentication token. If None, will prompt for login.
    """
    # Login to Hugging Face if token not provided
    if use_auth_token is None:
        login()
    else:
        login(token=use_auth_token)
    
    api = HfApi()
    
    # Upload transformer datasets
    for dataset_type in dataset_types:
        # Upload transformer dataset
        transformer_path = os.path.join(base_dir, f"transformer_dataset_{dataset_type}")
        if os.path.exists(transformer_path):
            print(f"Uploading transformer dataset ({dataset_type})...")
            
            # Load dataset
            dataset = load_from_disk(transformer_path)
            
            # Create repository name
            repo_name = f"{hf_username}/{repo_prefix}-transformer-{dataset_type}"
            
            # Push to hub
            try:
                dataset.push_to_hub(repo_name)
                print(f"Successfully uploaded transformer dataset to {repo_name}")
            except Exception as e:
                print(f"Error uploading transformer dataset ({dataset_type}): {str(e)}")
        else:
            print(f"Transformer dataset not found at {transformer_path}")
    
    print("Transformer dataset upload complete!")

In [50]:
def upload_spacy_datasets(
    base_dir,
    hf_username,
    repo_prefix="grammar-error-detection",
    dataset_types=["binary", "first_level", "second_level"],
    use_auth_token=None
):
    """
    Upload SpaCy datasets to Hugging Face as datasets, not models.
    
    Parameters:
    -----------
    base_dir : str
        Base directory containing the datasets
    hf_username : str
        Your Hugging Face username
    repo_prefix : str
        Prefix for the repository names
    dataset_types : list
        List of dataset types to upload
    use_auth_token : str, optional
        Hugging Face authentication token. If None, will prompt for login.
    """
    # Login to Hugging Face if token not provided
    if use_auth_token is None:
        login()
    else:
        login(token=use_auth_token)
    
    api = HfApi()
    
    # Upload SpaCy datasets
    for dataset_type in dataset_types:
        if dataset_type == "combined":
            print("Skipping combined type for SpaCy dataset (not supported)")
            continue
            
        spacy_path = os.path.join(base_dir, f"spacy_dataset_{dataset_type}")
        if os.path.exists(spacy_path):
            print(f"Uploading SpaCy dataset ({dataset_type})...")
            
            # Create repository name
            repo_name = f"{hf_username}/{repo_prefix}-spacy-{dataset_type}"
            
            # First, create the repository if it doesn't exist
            try:
                api.create_repo(
                    repo_id=repo_name,
                    repo_type="dataset",  # Specify dataset type here
                    exist_ok=True
                )
                print(f"Created dataset repository: {repo_name}")
            except Exception as e:
                print(f"Error creating repository {repo_name}: {str(e)}")
                continue
            
            # Upload each JSON file directly using the API
            for split in ["train", "eval", "test", "overlapping"]:
                file_path = os.path.join(spacy_path, f"{split}.json")
                if os.path.exists(file_path):
                    try:
                        api.upload_file(
                            path_or_fileobj=file_path,
                            path_in_repo=f"{split}.json",
                            repo_id=repo_name,
                            repo_type="dataset"  # Specify dataset type here
                        )
                        print(f"Uploaded {split}.json to {repo_name}")
                    except Exception as e:
                        print(f"Error uploading {split}.json: {str(e)}")
            
            # Create a simple README with dataset information
            readme_content = f"""# SpaCy Grammar Error Detection Dataset - {dataset_type.capitalize()} Labels

This repository contains the SpaCy-formatted dataset for grammar error detection with {dataset_type} labels.

## Files:
- `train.json`: Training data
- `eval.json`: Evaluation data
- `test.json`: Test data
- `overlapping.json`: Sentences with overlapping error spans

## Format:
Each file contains a JSON array of examples in SpaCy format, with text and span annotations.

## Usage:
These files can be used directly with SpaCy's span categorization training.
"""
            
            # Upload README
            try:
                with open("temp_readme.md", "w") as f:
                    f.write(readme_content)
                
                api.upload_file(
                    path_or_fileobj="temp_readme.md",
                    path_in_repo="README.md",
                    repo_id=repo_name,
                    repo_type="dataset"  # Specify dataset type here
                )
                print(f"Uploaded README.md to {repo_name}")
                
                # Clean up temporary file
                os.remove("temp_readme.md")
            except Exception as e:
                print(f"Error uploading README.md: {str(e)}")
                
        else:
            print(f"SpaCy dataset not found at {spacy_path}")
    
    print("SpaCy dataset upload complete!")

In [40]:
upload_transformer_datasets(
    base_dir="/home/zlovoblachko/GD_correction_diploma/data/new_datasets",
    hf_username="Zlovoblachko",
    repo_prefix="REALEC_GED",
    dataset_types=["binary", "first_level", "second_level"]
)

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Uploading transformer dataset (binary)...


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/62 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/9 [00:00<?, ?ba/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Successfully uploaded transformer dataset to Zlovoblachko/REALEC_GED-transformer-binary
Uploading transformer dataset (first_level)...


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/62 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/9 [00:00<?, ?ba/s]

Successfully uploaded transformer dataset to Zlovoblachko/REALEC_GED-transformer-first_level
Uploading transformer dataset (second_level)...


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/62 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/9 [00:00<?, ?ba/s]

Successfully uploaded transformer dataset to Zlovoblachko/REALEC_GED-transformer-second_level
Transformer dataset upload complete!


In [51]:
upload_spacy_datasets(
    base_dir="/home/zlovoblachko/GD_correction_diploma/data/new_datasets",
    hf_username="Zlovoblachko",
    repo_prefix="REALEC_GED",
    dataset_types=["binary", "first_level", "second_level"]
)

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

train.json:   0%|          | 0.00/22.8M [00:00<?, ?B/s]

- empty or missing yaml metadata in repo card


train.json:   0%|          | 0.00/22.3M [00:00<?, ?B/s]

train.json:   0%|          | 0.00/22.8M [00:00<?, ?B/s]

In [52]:
from huggingface_hub import HfApi, login
import json
import os
import tempfile
from datasets import Dataset, DatasetDict

def fix_and_upload_spacy_dataset(
    repo_id,  # e.g., "Zlovoblachko/REALEC_GED-spacy-binary"
    output_repo_id=None,  # Use the same repo ID by default
    use_auth_token=None
):
    """
    Fix the SpaCy dataset format and re-upload it to be compatible with the datasets library.
    
    Parameters:
    -----------
    repo_id : str
        Source repository ID
    output_repo_id : str, optional
        Target repository ID (defaults to source repository ID)
    use_auth_token : str, optional
        Hugging Face authentication token
    """
    # Login to Hugging Face
    if use_auth_token:
        login(token=use_auth_token)
    else:
        login()
    
    if output_repo_id is None:
        output_repo_id = repo_id
    
    api = HfApi()
    
    # Create a temporary directory for processing files
    with tempfile.TemporaryDirectory() as temp_dir:
        datasets_by_split = {}
        
        # Process each split
        for split in ["train", "eval", "test", "overlapping"]:
            try:
                # Download the JSON file
                file_path = api.hf_hub_download(
                    repo_id=repo_id,
                    filename=f"{split}.json",
                    repo_type="dataset"
                )
                
                # Load the JSON data
                with open(file_path, 'r') as f:
                    data = json.load(f)
                
                # Transform the data to make it compatible with datasets
                transformed_data = []
                for example in data:
                    # Convert span tuples with string labels to consistent format
                    new_spans = []
                    for span in example["spans"]:
                        # Make sure spans have consistent format with numeric indices and strings
                        start, end, label = span
                        new_spans.append({
                            "start": int(start),
                            "end": int(end),
                            "label": str(label)
                        })
                    
                    # Create a transformed example
                    transformed_example = {
                        "text": example["text"],
                        "spans": new_spans
                    }
                    
                    # Add metadata if present
                    if "meta" in example:
                        transformed_example["meta"] = example["meta"]
                    
                    transformed_data.append(transformed_example)
                
                # Create and save a Hugging Face dataset
                dataset = Dataset.from_list(transformed_data)
                datasets_by_split[split] = dataset
                
                print(f"Processed {split} split: {len(dataset)} examples")
            
            except Exception as e:
                print(f"Error processing {split} split: {str(e)}")
        
        # Create a dataset dictionary if we have at least some data
        if datasets_by_split:
            # Create a DatasetDict
            datasets_dict = DatasetDict(datasets_by_split)
            
            # Push to Hugging Face
            try:
                datasets_dict.push_to_hub(
                    output_repo_id,
                    private=False,
                )
                print(f"Successfully uploaded dataset to {output_repo_id}")
            except Exception as e:
                print(f"Error uploading dataset: {str(e)}")
        else:
            print("No data to upload.")

# Example usage:
# fix_and_upload_spacy_dataset("Zlovoblachko/REALEC_GED-spacy-binary")

In [53]:
# Fix all SpaCy datasets
for dataset_type in ["binary", "first_level", "second_level"]:
    repo_id = f"Zlovoblachko/REALEC_GED-spacy-{dataset_type}"
    fix_and_upload_spacy_dataset(repo_id)

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

train.json:   0%|          | 0.00/22.8M [00:00<?, ?B/s]

eval.json:   0%|          | 0.00/2.86M [00:00<?, ?B/s]

test.json:   0%|          | 0.00/2.87M [00:00<?, ?B/s]

overlapping.json:   0%|          | 0.00/4.73M [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/62 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/9 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/508 [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

train.json:   0%|          | 0.00/22.3M [00:00<?, ?B/s]

eval.json:   0%|          | 0.00/2.79M [00:00<?, ?B/s]

test.json:   0%|          | 0.00/2.80M [00:00<?, ?B/s]

overlapping.json:   0%|          | 0.00/4.54M [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/62 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/9 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/518 [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

train.json:   0%|          | 0.00/22.8M [00:00<?, ?B/s]

eval.json:   0%|          | 0.00/2.86M [00:00<?, ?B/s]

test.json:   0%|          | 0.00/2.87M [00:00<?, ?B/s]

overlapping.json:   0%|          | 0.00/4.71M [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/62 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/9 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/520 [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.
