In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Merged alignment preparation script with stratified K-fold cross-validation using sklearn
Reads CSV, applies stratified sampling, and creates alignment input files
"""
#1 FIXED tested sample randomly and in equal proportion
#2 stratified library random
#3 FIXED sets - not Cross-Val

import re
import os
import pandas as pd
import numpy as np 
from sklearn.model_selection import StratifiedKFold

# ============================================================
# TOKENIZATION
# ============================================================

def simple_tokenize(text):
    """Split *text* into a flat list of tokens using one regex pass.

    A token is either a maximal run of word characters or a single
    character that is neither a word character nor whitespace
    (i.e. one punctuation/symbol character). Whitespace is discarded.
    """
    # The pattern has no capture groups, so each match's full text is the token.
    return [hit.group(0) for hit in re.finditer(r"\w+|[^\w\s]", text, re.UNICODE)]

def segment_with_underscores(text):
    """Tokenize *text* and glue the tokens back together with underscores."""
    pieces = simple_tokenize(text)
    separator = "_"
    return separator.join(pieces)

# ============================================================
# STRATIFIED SAMPLING WITH SKLEARN
# ============================================================

def stratified_split(df, k_folds=5, random_state=42):
    """
    Perform a stratified K-fold split using sklearn.model_selection.StratifiedKFold

    Rows are stratified on the combination of 'corpus' and 'corrected', so
    every fold reproduces (as closely as possible) the corpus / correction
    proportions that are present in ``df`` itself. The hard-coded target
    proportions below are NOT enforced; they are only printed next to the
    observed proportions so that deviations are easy to spot.

    The input DataFrame is not modified, and the returned frames contain
    exactly the columns of ``df`` (no temporary stratification column).

    Parameters:
    -----------
    df : DataFrame with columns 'corpus', 'corrected', 'src', 'tgt'
    k_folds : number of folds for cross-validation
    random_state : random seed for reproducibility

    Returns:
    --------
    list of tuples: [(train_df, test_df), ...] for each fold
    """
    # Reference proportions, used for reporting only
    target_corpus_props = {
        'Kolipsi_1_L1': 0.2287,
        'Kolipsi_1_L2': 0.2728,
        'Kolipsi_2': 0.3076,
        'LEONIDE': 0.1908
    }

    target_correction_props = {
        False: 0.65,  # left-as-is
        True: 0.35    # corrected
    }

    # Stratification labels combining corpus and correction status.
    # Kept as a local array instead of a DataFrame column so the caller's
    # frame is never mutated and the labels cannot leak into the splits.
    strat_labels = (
        df['corpus'].astype(str) + '_' + df['corrected'].astype(str)
    ).to_numpy()

    # Initialize StratifiedKFold from sklearn
    skf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=random_state)

    # X only needs the right length; the split is driven by the labels
    X = df.index.to_numpy()

    splits = []

    # skf.split yields positional indices, hence .iloc below
    for fold_idx, (train_indices, test_indices) in enumerate(skf.split(X, strat_labels), 1):
        train_df = df.iloc[train_indices].copy()
        test_df = df.iloc[test_indices].copy()

        splits.append((train_df, test_df))

        # Print statistics
        print(f"\n=== FOLD {fold_idx}/{k_folds} ===")
        print(f"Train size: {len(train_df)}, Test size: {len(test_df)}")

        for split_name, split_df in [("Train", train_df), ("Test", test_df)]:
            n_rows = len(split_df)

            print(f"\n{split_name} set:")
            print("  Corpus distribution:")
            for corpus, target in target_corpus_props.items():
                count = (split_df['corpus'] == corpus).sum()
                prop = count / n_rows if n_rows > 0 else 0
                print(f"    {corpus}: {count} ({prop:.2%}) [target: {target:.2%}]")

            print("  Correction distribution:")
            for corrected_val in [False, True]:
                count = (split_df['corrected'] == corrected_val).sum()
                prop = count / n_rows if n_rows > 0 else 0
                label = "left-as-is" if not corrected_val else "corrected"
                print(f"    {label}: {count} ({prop:.2%}) [target: {target_correction_props[corrected_val]:.2%}]")

    return splits

# ============================================================
# SAVE ALIGNMENT FILES
# ============================================================

def _clean_sentence(value):
    """Return *value* as one stripped, single-line string ('' if missing)."""
    if pd.isna(value):
        # A missing cell would otherwise be written as the literal text "nan".
        return ""
    # Collapse embedded line breaks: every sentence must occupy exactly one
    # line, otherwise the src/tgt files stop being line-aligned.
    return re.sub(r"\s*[\r\n]+\s*", " ", str(value).strip())


def save_alignment_files(df, out_dir, set_name):
    """
    Save alignment input files from DataFrame

    For every row, one line is written to each of six files in ``out_dir``:
    ``{set_name}_src.txt`` / ``{set_name}_tgt.txt`` (plain text),
    ``{set_name}_src.tok`` / ``{set_name}_tgt.tok`` (space-separated tokens)
    and ``{set_name}_src.segm`` / ``{set_name}_tgt.segm``
    (underscore-separated tokens). All files therefore have exactly
    ``len(df)`` lines, in row order. Missing values produce an empty line
    and embedded line breaks are replaced by a single space.

    Parameters:
    -----------
    df : DataFrame with 'src' and 'tgt' columns
    out_dir : output directory (created if it does not exist)
    set_name : name for the file set (e.g., 'train', 'test')
    """
    os.makedirs(out_dir, exist_ok=True)

    print(f"\nCreating alignment files for: {set_name}")
    print(f"Total sentence pairs: {len(df)}")

    # File paths
    src_path = os.path.join(out_dir, f"{set_name}_src.txt")
    tgt_path = os.path.join(out_dir, f"{set_name}_tgt.txt")
    src_tok = os.path.join(out_dir, f"{set_name}_src.tok")
    tgt_tok = os.path.join(out_dir, f"{set_name}_tgt.tok")
    src_segm = os.path.join(out_dir, f"{set_name}_src.segm")
    tgt_segm = os.path.join(out_dir, f"{set_name}_tgt.segm")

    with open(src_path, "w", encoding="utf-8") as fsrc, \
         open(tgt_path, "w", encoding="utf-8") as ftgt, \
         open(src_tok, "w", encoding="utf-8") as fsrc_tok, \
         open(tgt_tok, "w", encoding="utf-8") as ftgt_tok, \
         open(src_segm, "w", encoding="utf-8") as fsrc_seg, \
         open(tgt_segm, "w", encoding="utf-8") as ftgt_seg:

        # Only two columns are needed, so iterate them directly instead of
        # building a Series per row with iterrows().
        for raw_src, raw_tgt in zip(df['src'], df['tgt']):
            s = _clean_sentence(raw_src)
            t = _clean_sentence(raw_tgt)

            # Write plain text
            fsrc.write(s + "\n")
            ftgt.write(t + "\n")

            # Tokenized
            fsrc_tok.write(" ".join(simple_tokenize(s)) + "\n")
            ftgt_tok.write(" ".join(simple_tokenize(t)) + "\n")

            # Segmented (underscore-separated)
            fsrc_seg.write(segment_with_underscores(s) + "\n")
            ftgt_seg.write(segment_with_underscores(t) + "\n")

    print(f"✓ Alignment files created in: {out_dir}")
    print(f"  - {set_name}_src.txt / {set_name}_tgt.txt (plain)")
    print(f"  - {set_name}_src.tok / {set_name}_tgt.tok (tokenized)")
    print(f"  - {set_name}_src.segm / {set_name}_tgt.segm (segmented)")

# ============================================================
# MAIN PROCESSING FUNCTION
# ============================================================

def process_csv_to_alignment(csv_path, out_dir="alignment_output", k_folds=5, random_state=42):
    """
    Main function: Load CSV, apply stratified sampling with sklearn, create alignment files

    For each fold, train and test alignment files are written to
    ``<out_dir>/fold_<n>/``.

    Parameters:
    -----------
    csv_path : path to input CSV file
    out_dir : output directory for alignment files
    k_folds : number of cross-validation folds
    random_state : random seed

    Returns:
    --------
    list of tuples: [(train_df, test_df), ...] as returned by stratified_split

    Raises:
    -------
    ValueError : if a required column is missing, or if the 'corrected'
        column contains a value that cannot be read as a boolean
    """
    # Load CSV
    print(f"Loading CSV from: {csv_path}")
    df = pd.read_csv(csv_path)

    # Verify required columns
    required_cols = ['src', 'tgt', 'corpus', 'corrected']
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        raise ValueError(f"Missing required columns: {missing_cols}")

    print(f"\nTotal sentences loaded: {len(df)}")
    print(f"Columns: {list(df.columns)}")

    # Convert 'corrected' to boolean if needed. Matching is case- and
    # whitespace-insensitive, and anything unrecognised is rejected: a silent
    # NaN here would end up as its own bogus stratum in the split.
    if not pd.api.types.is_bool_dtype(df['corrected']):
        bool_lookup = {'true': True, 'false': False, '1': True, '0': False}
        normalized = df['corrected'].astype(str).str.strip().str.lower()
        converted = normalized.map(bool_lookup)
        unrecognized = converted.isna()
        if unrecognized.any():
            bad_values = sorted(set(df.loc[unrecognized, 'corrected'].astype(str)))
            raise ValueError(
                f"Column 'corrected' contains non-boolean values: {bad_values}"
            )
        df['corrected'] = converted.astype(bool)

    # Perform stratified split with sklearn
    print("\n" + "="*60)
    print("PERFORMING STRATIFIED K-FOLD SPLIT (using sklearn)")
    print("="*60)
    splits = stratified_split(df, k_folds=k_folds, random_state=random_state)

    # Save alignment files for each fold
    print("\n" + "="*60)
    print("CREATING ALIGNMENT FILES")
    print("="*60)

    for fold_idx, (train_df, test_df) in enumerate(splits, 1):
        fold_dir = os.path.join(out_dir, f"fold_{fold_idx}")

        save_alignment_files(train_df, fold_dir, "train")
        save_alignment_files(test_df, fold_dir, "test")

    print("\n" + "="*60)
    print("✓ ALL PROCESSING COMPLETE")
    print("="*60)
    print(f"Output directory: {out_dir}")
    print(f"Number of folds: {k_folds}")
    print("Method: sklearn.model_selection.StratifiedKFold")

    return splits

# ============================================================
# EXAMPLE USAGE
# ============================================================

if __name__ == "__main__":
    # Demo run: point these two settings at your own data / output location.
    output_directory = "aligned_sets"
    csv_file = "all_corpora.csv"  # Replace with your CSV path

    # Build the folds and write the alignment inputs for each of them.
    splits = process_csv_to_alignment(
        csv_file,
        output_directory,
        k_folds=5,
        random_state=42,
    )

    # Short reminder of how the returned structure is laid out.
    print("\n".join([
        "\nYou can access the splits programmatically:",
        "  splits[0] = (train_df_fold1, test_df_fold1)",
        "  splits[1] = (train_df_fold2, test_df_fold2)",
        "  etc.",
    ]))

Loading CSV from: all_corpora.csv

Total sentences loaded: 27889
Columns: ['corpus', 'lang_prof', 'xml_file', 'file_id', 'sent_num', 'src', 'tgt', 'corrected']

PERFORMING STRATIFIED K-FOLD SPLIT (using sklearn)

=== FOLD 1/5 ===
Train size: 22311, Test size: 5578

Train set:
  Corpus distribution:
    Kolipsi_1_L1: 4013 (17.99%) [target: 22.87%]
    Kolipsi_1_L2: 4065 (18.22%) [target: 27.28%]
    Kolipsi_2: 10714 (48.02%) [target: 30.76%]
    LEONIDE: 3519 (15.77%) [target: 19.08%]
  Correction distribution:
    left-as-is: 14583 (65.36%) [target: 65.00%]
    corrected: 7728 (34.64%) [target: 35.00%]

Test set:
  Corpus distribution:
    Kolipsi_1_L1: 1004 (18.00%) [target: 22.87%]
    Kolipsi_1_L2: 1016 (18.21%) [target: 27.28%]
    Kolipsi_2: 2678 (48.01%) [target: 30.76%]
    LEONIDE: 880 (15.78%) [target: 19.08%]
  Correction distribution:
    left-as-is: 3646 (65.36%) [target: 65.00%]
    corrected: 1932 (34.64%) [target: 35.00%]

=== FOLD 2/5 ===
Train size: 22311, Test size: 5