In [None]:
from sklearn.model_selection import KFold
import pandas as pd

def kfold_split_by_sentence_language(df, n_splits=5, random_state=42, shuffle=True):
    """Build K-fold train/test splits over unique (language, sentence) pairs.

    The folds are drawn over the distinct ``(language, sentence)`` pairs
    rather than over individual rows, so all rows of ``df`` that share a
    pair end up on the same side of a given split.

    Args:
        df: DataFrame containing at least the columns ``language`` and
            ``sentence``.
        n_splits: Number of folds, forwarded to ``KFold``.
        random_state: Seed forwarded to ``KFold`` when ``shuffle`` is true.
            It is ignored when ``shuffle`` is false.
        shuffle: Whether ``KFold`` shuffles the pairs before splitting.

    Returns:
        A list with one ``(train_df, test_df)`` tuple per fold. Each frame
        is the inner merge of ``df`` with that fold's pairs, so the index
        of ``df`` is not carried over into the results.
    """
    key_cols = ['language', 'sentence']

    # One row per distinct pair; KFold will split these, not the raw rows.
    sentence_lang_pairs = df[key_cols].drop_duplicates().reset_index(drop=True)

    # KFold rejects a non-None random_state when shuffle is False (it raises
    # ValueError in current scikit-learn), so only forward the seed when it
    # is actually going to be used.
    kf = KFold(
        n_splits=n_splits,
        shuffle=shuffle,
        random_state=random_state if shuffle else None,
    )

    folds = []
    for train_idx, test_idx in kf.split(sentence_lang_pairs):
        train_pairs = sentence_lang_pairs.iloc[train_idx]
        test_pairs = sentence_lang_pairs.iloc[test_idx]

        # Inner-merge back onto the full data to recover every row that
        # belongs to the selected pairs.
        train_df = df.merge(train_pairs, on=key_cols, how='inner')
        test_df = df.merge(test_pairs, on=key_cols, how='inner')

        folds.append((train_df, test_df))

    return folds
