# **3. NLI Filtering**

Labels aligned sentence pairs with edit types (e.g., addition, deletion, rewrite) using a Dutch Natural Language Inference (NLI) model.

Edit Types:
- **NO_CONTENT_CHANGE**
- **CONTENT_ADDITION**
- **CONTENT_DELETION**
- **CONTENT_CHANGE**

In [None]:
import yaml

# Load the notebook configuration from config.yaml.
# The encoding is given explicitly so the file is decoded the same way on
# every platform instead of depending on the locale's default encoding.
with open('config.yaml', 'r', encoding='utf-8') as file:
    config = yaml.safe_load(file)

In [None]:
import pandas as pd
import torch
from transformers import pipeline
from tqdm import tqdm

def classify_sentence_edits(input_file,
                            output_file,
                            sim_threshold=0.45):
    """Label aligned sentence pairs with an edit type and save the result.

    Reads aligned pairs from ``input_file`` (parquet), assigns each pair one
    of ``NO_CONTENT_CHANGE``, ``CONTENT_ADDITION``, ``CONTENT_DELETION`` or
    ``CONTENT_CHANGE`` and writes the labeled rows to ``output_file``
    (parquet).

    Labeling rules:
      * A pair whose similarity is below ``sim_threshold`` and that has both
        an old and a new sentence is split into two rows: a
        ``CONTENT_DELETION`` (old sentence only) and a ``CONTENT_ADDITION``
        (new sentence only). Both rows keep the original similarity value.
      * A missing/blank old sentence yields ``CONTENT_ADDITION``; a
        missing/blank new sentence yields ``CONTENT_DELETION``.
      * Identical sentences yield ``NO_CONTENT_CHANGE``.
      * Otherwise the NLI model is run in both directions:
        entailment both ways -> ``NO_CONTENT_CHANGE``; only old->new ->
        ``CONTENT_DELETION``; only new->old -> ``CONTENT_ADDITION``;
        neither -> ``CONTENT_CHANGE``.
      * If the NLI call raises, the error is printed and the pair is labeled
        ``CONTENT_CHANGE``.

    Args:
        input_file: Path of the parquet file with the aligned pairs. Rows are
            read for ``isbn``, ``from_version``, ``to_version``,
            ``old_sentence``, ``new_sentence`` and ``similarity`` (falling
            back to ``similarity_score``, then to 0).
        output_file: Path of the parquet file to write.
        sim_threshold: Pairs with a similarity strictly below this value are
            split into a deletion and an addition.

    Returns:
        The labeled DataFrame with columns ``isbn``, ``from_version``,
        ``to_version``, ``old_sentence``, ``new_sentence``, ``similarity``
        and ``edit_type``. The columns are present even when the input has
        no rows.
    """
    # Fixed output schema: guarantees the columns exist even when there are
    # no records, so the statistics below cannot fail on an empty input.
    output_columns = [
        'isbn',
        'from_version',
        'to_version',
        'old_sentence',
        'new_sentence',
        'similarity',
        'edit_type',
    ]

    # Load the Dutch NLI pipeline (GPU 0 when available, otherwise CPU)
    print("Loading Dutch NLI model...")
    device = 0 if torch.cuda.is_available() else -1
    nli = pipeline('text-classification',
                   model='LoicDL/bert-base-dutch-cased-finetuned-snli',
                   tokenizer='LoicDL/bert-base-dutch-cased-finetuned-snli',
                   device=device)

    # Load the aligned pairs
    print(f"Loading aligned pairs from {input_file}...")
    df = pd.read_parquet(input_file)

    def predicted_label(premise, hypothesis):
        # Upper-cased label of the first entry returned by the pipeline.
        # NOTE(review): relies on the pipeline returning scores in descending
        # order for top_k=None, and on the model naming its entailment class
        # "entailment" (any casing) - confirm against the model config.
        scores = nli({"text": premise, "text_pair": hypothesis},
                     truncation=True, top_k=None)
        return scores[0]["label"].upper()

    def classify_change(old, new):
        # Classify the type of change between two sentences.

        # Handle None/empty cases
        if not old or pd.isna(old) or old.strip() == '':
            return 'CONTENT_ADDITION'
        if not new or pd.isna(new) or new.strip() == '':
            return 'CONTENT_DELETION'

        # Check for exact match (no model call needed)
        if old == new:
            return 'NO_CONTENT_CHANGE'

        # Bidirectional NLI check
        try:
            fwd_entails = predicted_label(old, new) == 'ENTAILMENT'  # old -> new
            bwd_entails = predicted_label(new, old) == 'ENTAILMENT'  # new -> old
        except Exception as e:
            # Best effort: a failing pair is reported and labeled as a change
            # instead of aborting the whole run.
            print(f"NLI error: {e}")
            return 'CONTENT_CHANGE'

        if fwd_entails and bwd_entails:
            return 'NO_CONTENT_CHANGE'
        if fwd_entails:
            # Old implies new but not the reverse: information was removed.
            return 'CONTENT_DELETION'
        if bwd_entails:
            # New implies old but not the reverse: information was added.
            return 'CONTENT_ADDITION'
        return 'CONTENT_CHANGE'

    def make_record(row, old, new, similarity, edit_type):
        # Build one output row, copying the identifying columns from `row`.
        return {
            'isbn': row['isbn'],
            'from_version': row['from_version'],
            'to_version': row['to_version'],
            'old_sentence': old,
            'new_sentence': new,
            'similarity': similarity,
            'edit_type': edit_type,
        }

    # Process all records
    records = []

    print(f"Processing {len(df)} aligned pairs...")
    for _, row in tqdm(df.iterrows(), total=len(df)):
        old_sentence = row.get('old_sentence', '')
        new_sentence = row.get('new_sentence', '')
        similarity = row.get('similarity', row.get('similarity_score', 0))

        # Convert None/NaN to empty string
        if pd.isna(old_sentence):
            old_sentence = ''
        if pd.isna(new_sentence):
            new_sentence = ''

        # Split low similarity pairs into DELETION + ADDITION
        is_low_similarity = similarity is not None and similarity < sim_threshold
        if is_low_similarity and old_sentence and new_sentence:
            records.append(
                make_record(row, old_sentence, '', similarity, 'CONTENT_DELETION'))
            records.append(
                make_record(row, '', new_sentence, similarity, 'CONTENT_ADDITION'))
        else:
            # For high similarity pairs or actual deletions/additions, classify normally
            edit_type = classify_change(old_sentence, new_sentence)
            records.append(
                make_record(row, old_sentence, new_sentence, similarity, edit_type))

    # Create output dataframe with a stable schema
    df_output = pd.DataFrame(records, columns=output_columns)

    # Save results
    print(f"Saving results to {output_file}...")
    df_output.to_parquet(output_file, index=False)

    # Print statistics
    print("\nEdit type distribution:")
    print(df_output['edit_type'].value_counts())
    print(f"\nTotal sentence pairs processed: {len(df_output)}")
    print(f"Note: Low similarity pairs (< {sim_threshold}) were split into DELETION + ADDITION")

    return df_output

# Label every aligned pair (basic version, no punctuation stripping).
# Input and output paths come from the loaded configuration.
df_labeled = classify_sentence_edits(
    config['aligned_file'],
    config['labeled_file'],
    sim_threshold=0.45,
)

## **Extract micro-edits**

This code selects the high-similarity NO_CONTENT_CHANGE sentence pairs (similarity of at least 0.9 but below 1, which excludes exact matches) as likely linguistic/stylistic edits and saves them.

In [None]:
# Similarity band for micro-edits: LOWER_SIM inclusive, UPPER_SIM exclusive.
LOWER_SIM = 0.9
UPPER_SIM = 1     # exclude exact duplicates
# NOTE(review): exact duplicates are excluded only through similarity < 1;
# confirm the alignment stage scores identical sentences as exactly 1.

df = pd.read_parquet(config["labeled_file"])

# Keep rows labeled NO_CONTENT_CHANGE whose similarity lies inside the band.
df_q = df.loc[
    df["edit_type"].eq("NO_CONTENT_CHANGE")
    & df["similarity"].ge(LOWER_SIM)
    & df["similarity"].lt(UPPER_SIM)
].copy()

df_q.to_parquet(config["nli_filtered_file"], index=False)
print(f"Kept {len(df_q):,} micro-edits between {LOWER_SIM}–{UPPER_SIM} similarity")
