In [1]:
import pandas as pd

# Read the normalized dataset written by the preprocessing step into memory.
df = pd.read_csv(filepath_or_buffer="./data/preprocessed/data_normalized.csv")

# Leave the first five rows as the cell's last expression so they are displayed.
df.head(n=5)

Unnamed: 0,language,sentence,n,vertex,degree,closeness,betweenness,pagerank,katz,load,is_root,degree_norm,closeness_norm,betweenness_norm,pagerank_norm,katz_norm,load_norm,n_norm
0,Arabic,2,21,10,0.15,7.547655,0.415789,0.070385,0.24333,0.415789,1,1.0,0.933542,0.724771,0.932971,0.996388,0.724771,0.268657
1,Arabic,2,21,8,0.15,7.803968,0.568421,0.068442,0.2435,0.568421,0,1.0,1.0,0.990826,0.891309,1.0,0.990826,0.268657
2,Arabic,2,21,5,0.1,6.247655,0.1,0.051047,0.218981,0.1,0,0.5,0.596475,0.174312,0.518343,0.477072,0.174312,0.268657
3,Arabic,2,21,13,0.05,4.803211,0.0,0.028838,0.196845,0.0,0,0.0,0.221956,0.0,0.042182,0.004953,0.0,0.268657
4,Arabic,2,21,6,0.1,7.171825,0.521053,0.046262,0.22322,0.521053,0,0.5,0.836096,0.908257,0.415764,0.567474,0.908257,0.268657


In [2]:
from sklearn.model_selection import KFold
import pandas as pd
import os

def kfold_split_by_sentence_language(df, n_splits=5, random_state=42, shuffle=True, output_dir="./data/cross_validation"):
    """
    Perform a K-fold split in which every (language, sentence) pair stays together.

    The split is made over the unique (language, sentence) pairs rather than
    over individual rows, so all rows that share a pair land on the same side
    (train or test) of any given fold. Each fold is also written to disk as
    ``fold_<k>_train.csv`` and ``fold_<k>_test.csv`` (k starts at 1) inside
    ``output_dir``.

    Args:
        df: Input DataFrame containing the columns 'language' and 'sentence'.
        n_splits: Number of folds (default: 5).
        random_state: Random seed for reproducibility. Only used when
            ``shuffle`` is True; it is ignored otherwise.
        shuffle: Whether to shuffle the pairs before splitting.
        output_dir: Directory to save the fold CSVs; created if missing.

    Returns:
        List of (train_df, test_df) tuples, one per fold.

    Raises:
        KeyError: If ``df`` lacks the 'language' or 'sentence' column.
        ValueError: Propagated from ``KFold`` for invalid split settings
            (e.g. more folds than unique pairs).
    """
    group_cols = ['language', 'sentence']

    # Fail before touching the filesystem if the grouping columns are absent.
    missing_cols = [col for col in group_cols if col not in df.columns]
    if missing_cols:
        raise KeyError(f"df is missing required column(s): {missing_cols}")

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Step 1: Get unique sentence-language pairs; these are the units being split.
    sentence_lang_pairs = df[group_cols].drop_duplicates().reset_index(drop=True)

    # Step 2: Setup KFold.
    # KFold raises ValueError when a seed is supplied together with
    # shuffle=False, so the seed is only forwarded when shuffling is requested.
    kf = KFold(
        n_splits=n_splits,
        shuffle=shuffle,
        random_state=random_state if shuffle else None,
    )

    folds = []

    # Step 3: Apply KFold to the pairs (fold numbering starts at 1)
    for fold_idx, (train_idx, test_idx) in enumerate(kf.split(sentence_lang_pairs), 1):
        train_pairs = sentence_lang_pairs.iloc[train_idx]
        test_pairs = sentence_lang_pairs.iloc[test_idx]

        # Inner-merge back onto the full data to pull in every row of each selected pair.
        train_df = pd.merge(df, train_pairs, on=group_cols, how='inner')
        test_df = pd.merge(df, test_pairs, on=group_cols, how='inner')

        # Save to CSV
        train_df.to_csv(os.path.join(output_dir, f"fold_{fold_idx}_train.csv"), index=False)
        test_df.to_csv(os.path.join(output_dir, f"fold_{fold_idx}_test.csv"), index=False)

        folds.append((train_df, test_df))

    return folds

# Example run: split the loaded data with the function's default settings
# and keep the per-fold (train, test) frames for later cells.
folds = kfold_split_by_sentence_language(df=df)