In [None]:
import pandas as pd

In [None]:
# Root folder holding one sub-folder per dataset variant.
# This is a machine-specific absolute path; change it here (once) to relocate the data.
DATA_ROOT = r'C:\Users\jiayue.tan\OneDrive - YTL\Workstation\20251002LanguageFollowing_V2'

# Each JSONL file is read line-by-line into its own frame and tagged with the
# name of the sub-folder it came from.
tp_multi = pd.read_json(DATA_ROOT + r'\text_processing_ext_ref_multi_turn\all_v2.jsonl', lines=True)
tp_multi['source'] = 'text_processing_ext_ref_multi_turn'

tp_single = pd.read_json(DATA_ROOT + r'\text_processing_ext_ref_single_turn\all_v2.jsonl', lines=True)
tp_single['source'] = 'text_processing_ext_ref_single_turn'

# ignore_index=True relabels the combined rows 0..N-1. Without it both frames keep
# their own 0..n-1 labels, so labels repeat and anything keyed on tp.index
# (sequential IDs, label-based row selection) silently collides.
tp = pd.concat([tp_multi, tp_single], ignore_index=True)
tp.head()

In [None]:
import re

def get_total_word_count(question: str, answer: str, reason: str = "") -> int:
    """Count the words contained in a question / answer / reason triple.

    Two kinds of tokens are counted and summed:
      * runs of ASCII letters, digits and apostrophes (each run is one word);
      * individual characters in the U+4E00-U+9FFF range (each one is one word).

    Args:
        question: First text to count.
        answer: Second text to count.
        reason: Optional third text to count; defaults to the empty string.

    Returns:
        Total number of tokens found across the three texts.
    """
    def _as_text(value) -> str:
        # None and float NaN stand for "no text"; formatting them directly would
        # inject the literal words "None" / "nan" into the count.
        if value is None or (isinstance(value, float) and value != value):
            return ""
        return str(value)

    # Join with a space so the last word of one field cannot fuse with the first
    # word of the next ("hello" + "world" must count as two words, not one).
    combined_text = " ".join(_as_text(part) for part in (question, answer, reason))

    english_number_word_count = len(re.findall(r"[a-zA-Z0-9']+", combined_text))
    chinese_char_count = len(re.findall(r'[\u4e00-\u9fff]', combined_text))

    return english_number_word_count + chinese_char_count

def get_turn_languages_standard(data_list):
    """Collect the 'language' metadata value of every turn after the first.

    The first element of ``data_list`` is skipped. Turns whose 'metadata' entry
    is missing or falsy contribute nothing; turns that have metadata but no
    'language' key contribute the placeholder 'N/A'.
    """
    # Look up each turn's metadata once, lazily, skipping the first element.
    metadata_entries = (turn.get('metadata') for turn in data_list[1:])
    return [meta.get('language', 'N/A') for meta in metadata_entries if meta]

def flag_if_list_contains_chinese(language_list: list) -> int:
    """Return 1 when any string in ``language_list`` mentions "chinese", else 0.

    The match is a case-insensitive substring test. Non-string entries are
    ignored, and any argument that is not a list yields 0.
    """
    if isinstance(language_list, list):
        mentions_chinese = any(
            isinstance(entry, str) and "chinese" in entry.lower()
            for entry in language_list
        )
        return 1 if mentions_chinese else 0
    return 0

In [None]:
# Number rows by position (1..N), not by index label: `tp` is the concat of two
# frames, so its index labels can repeat and label-based IDs would collide.
tp['original_id'] = [f"langfol1014_{row_number:05d}" for row_number in range(1, len(tp) + 1)]
tp['sft_round'] = "LanguageFollowing"
tp['question'] = tp['prompt']
tp['answer'] = tp['response']

# 'language' value stored in each row's `reference` dict (None when the key is absent).
tp['language'] = tp['reference'].apply(lambda x: x.get('language'))

# 1. Define a function that does the work for a single row's reference dictionary
def extract_languages_from_reference(ref_dict):
    """
    Safely extracts 'turn_language' from a single reference dictionary.

    Args:
        ref_dict: Mapping expected to hold an 'all_turn_metadata' list whose items
            each carry a 'metadata' mapping with a 'turn_language' entry.

    Returns:
        A list with one entry per item of 'all_turn_metadata': the item's
        'turn_language', or 'N/A' when that item has no usable metadata or no
        'turn_language' key. Returns ['N/A'] when ``ref_dict`` is not dict-like
        or has no (or an empty) 'all_turn_metadata' list.
    """
    # A missing reference (None, NaN, ...) has nothing to look up.
    if not hasattr(ref_dict, 'get'):
        return ['N/A']

    # Get the list of metadata, defaulting to an empty list if the key is missing
    metadata_list = ref_dict.get('all_turn_metadata', [])

    # If the list is empty (or None), return a list with a single 'N/A' placeholder
    if not metadata_list:
        return ['N/A']

    languages = []
    for item in metadata_list:
        metadata = item.get('metadata') if hasattr(item, 'get') else None
        # 'metadata' may be present but None; .get(..., {}) does not cover that
        # case, so fall back to the placeholder instead of raising AttributeError.
        if hasattr(metadata, 'get'):
            languages.append(metadata.get('turn_language', 'N/A'))
        else:
            languages.append('N/A')
    return languages

# Per-turn languages from each reference, and a 0/1 flag for any Chinese turn.
tp['turn_languages'] = tp['reference'].apply(extract_languages_from_reference)
tp['chinese_flag'] = tp['turn_languages'].apply(flag_if_list_contains_chinese)

tp['style'] = tp['reference'].apply(lambda x: x.get('style'))
# NOTE(review): render_messages_safe is not defined in any cell visible here;
# it must be defined or imported earlier for this cell to run on a fresh kernel.
tp['rendered_history'] = tp['history'].apply(render_messages_safe)

# The rendered history doubles as the 'reason' text and is included in the word count.
tp['reason'] = tp['rendered_history']
tp['task'] = "LANGUAGE: " + tp['language'].astype(str) + " | STYLE: " + tp['style'].astype(str) + " | SOURCE: " + tp['source']
tp['domain'] = tp['language']
tp['total_word_count'] = tp.apply(lambda row: get_total_word_count(row['question'], row['answer'], row['reason']), axis=1)

# .copy() makes tp2 an independent frame, so adding columns to it later neither
# raises SettingWithCopyWarning nor depends on whether pandas returned a view.
tp2 = tp[['original_id','sft_round','question','answer','reason','turn_languages','chinese_flag','task','domain','style','total_word_count']].copy()

def classify_wordcount_class(total_words_count):
    """Map a total word count to its length-bucket label.

    Buckets (bounds inclusive):
        <= 600        -> 'short(1-600)'
        601 .. 1100   -> 'medium(601-1100)'
        1101 .. 1900  -> 'long(1101-1900)'
        >= 1901       -> 'very_long(>1901)'  (the label text says ">1901", but
                                              1901 itself is included)
    Any value matching none of the ranges (for example NaN, or a fractional
    value that falls between two buckets) yields 'you_tell_me'.
    """
    if total_words_count <= 600:
        return 'short(1-600)'

    # Closed ranges for the two middle buckets, checked in ascending order.
    bounded_buckets = (
        (601, 1100, 'medium(601-1100)'),
        (1101, 1900, 'long(1101-1900)'),
    )
    for lower, upper, label in bounded_buckets:
        if total_words_count >= lower and total_words_count <= upper:
            return label

    return 'very_long(>1901)' if total_words_count >= 1901 else 'you_tell_me'
    
# assign() returns a new frame, so this is safe even when tp2 is a slice of
# another frame, and re-running the cell gives the same result.
tp2 = tp2.assign(wordcount_class=tp2['total_word_count'].apply(classify_wordcount_class))

# Rows per length bucket, largest bucket first. Printed explicitly: only the last
# expression of a cell is displayed, so a bare expression here would be discarded.
wordcount_class_counts = (
    tp2.groupby(['wordcount_class'])
    .size()
    .reset_index(name='counts')
    .sort_values(by='counts', ascending=False)
)
print(wordcount_class_counts.to_string(index=False))

tp2.head(2)

In [None]:
import pandas as pd
import numpy as np
import math

def create_balanced_samples_flexible(
    df: pd.DataFrame,
    stratify_cols: list,
    samples_per_annotator: int,
    num_annotators: int,
    group_id_col: str = 'package_id',
    random_state: int = None
) -> pd.DataFrame:
    """
    Draw a stratified sample of ``df`` and split it into numbered annotator packages.

    Each stratum (unique combination of ``stratify_cols``) contributes rows in
    proportion to its share of ``df``. Rounding drift is corrected afterwards by
    topping up from rows not yet chosen, or by randomly trimming the pool. The
    pool is then shuffled and cut into consecutive packages.

    If more rows are requested than exist, ``samples_per_annotator`` is lowered
    to ``ceil(len(df) / num_annotators)`` and a warning is printed.

    Args:
        df: Source rows. Not modified; its index labels are ignored.
        stratify_cols: Column names that define the strata.
        samples_per_annotator: Target package size. A fractional value is
            accepted: package boundaries come from floor division, so sizes then
            differ by at most one row between packages.
        num_annotators: Number of packages.
        group_id_col: Name of the output column holding the 1-based package id.
        random_state: Seed forwarded to every ``DataFrame.sample`` call.

    Returns:
        A new DataFrame with a fresh 0..k-1 index, the sampled rows in shuffled
        order, and an integer ``group_id_col`` column. No row appears twice.

    Raises:
        ValueError: If a stratify column is missing, or if
            ``samples_per_annotator`` / ``num_annotators`` is not positive.
    """
    if not all(col in df.columns for col in stratify_cols):
        raise ValueError("One or more columns in stratify_cols are not in the DataFrame.")
    if samples_per_annotator <= 0 or num_annotators <= 0:
        raise ValueError("Samples per annotator and number of annotators must be positive.")

    # Work on a copy labelled 0..N-1. Row identity is tracked through the index
    # below, which is only sound when labels are unique; the caller's frame (and
    # whatever labels it carries) is left untouched.
    df = df.reset_index(drop=True)

    if df.empty:
        # Nothing to sample: return the empty frame with the package column present.
        df[group_id_col] = np.array([], dtype=int)
        return df

    total_samples_needed = samples_per_annotator * num_annotators

    if total_samples_needed > len(df):
        # ceil version — try to spread evenly until data runs out
        max_possible_per_annotator = math.ceil(len(df) / num_annotators)
        print(f"⚠️ WARNING: Not enough data for full request.")
        print(f"Requested {total_samples_needed} samples, but only {len(df)} available.")
        print(f"Adjusting 'samples_per_annotator' to {max_possible_per_annotator} (ceil logic).")
        samples_per_annotator = max_possible_per_annotator

    # Whole number of rows to draw. DataFrame.sample needs an integer n, and
    # samples_per_annotator may be fractional.
    total_samples_needed = math.floor(min(samples_per_annotator * num_annotators, len(df)))

    sampled_strata = []
    for _, group_df in df.groupby(stratify_cols):
        stratum_size = len(group_df)
        samples_from_stratum = round((stratum_size / len(df)) * total_samples_needed)
        samples_from_stratum = min(samples_from_stratum, stratum_size)

        if samples_from_stratum > 0:
            sampled_strata.append(group_df.sample(n=samples_from_stratum, random_state=random_state))

    # Keep the row labels (no ignore_index) so already-chosen rows can be told
    # apart from the rest. Every stratum may round down to zero, hence the fallback.
    master_pool_df = pd.concat(sampled_strata) if sampled_strata else df.iloc[:0]

    # Per-stratum rounding can leave the pool under or over the target.
    shortfall = total_samples_needed - len(master_pool_df)
    if shortfall > 0:
        remaining_df = df.drop(index=master_pool_df.index)
        additional_n = min(shortfall, len(remaining_df))
        if additional_n > 0:
            master_pool_df = pd.concat(
                [master_pool_df, remaining_df.sample(n=additional_n, random_state=random_state)]
            )
    elif shortfall < 0:
        master_pool_df = master_pool_df.sample(n=total_samples_needed, random_state=random_state)

    # Shuffle, then drop the working labels in favour of a clean 0..k-1 index.
    master_pool_df = master_pool_df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # Consecutive runs of rows form packages 1, 2, ...; floor division also handles
    # a fractional package size. Ids are capped so any overflow lands in the last package.
    package_numbers = (np.arange(len(master_pool_df)) // samples_per_annotator).astype(int) + 1
    master_pool_df[group_id_col] = np.minimum(package_numbers, num_annotators)

    return master_pool_df


In [None]:
# Sampling configuration: strata are the (domain, wordcount_class) combinations.
stratification_columns = ['domain', 'wordcount_class']
# 37.5 rows x 20 packages = 750 rows requested in total.
# NOTE(review): create_balanced_samples_flexible annotates samples_per_annotator as
# int. It assigns packages with floor division by this value, so a fractional size
# yields packages alternating between 38 and 37 rows — confirm that is intended.
samples_for_each_packages = 37.5
number_of_packages = 20

# Draw the stratified sample and assign each row a package_id (seeded for reproducibility).
annotator_dfs = create_balanced_samples_flexible(
    tp2,
    stratify_cols=stratification_columns,
    samples_per_annotator=samples_for_each_packages,
    num_annotators=number_of_packages,
    random_state=42
)

print("DataFrame with Annotator Group Assignments:")
# The frame itself is not printed here; only this header and a separator line are emitted.
print("-" * 30)

# Cell output: row count for each (wordcount_class, package_id) pair.
annotator_dfs.groupby(['wordcount_class','package_id']).size()

In [None]:
# Rows per package, listed in ascending package_id order.
(
    annotator_dfs
    .groupby(['package_id'])
    .size()
    .reset_index(name='counts')
    .sort_values(by=['package_id'], ascending=True)
)

In [None]:
# Rows per package, largest package first.
(
    annotator_dfs
    .groupby(['package_id'])
    .size()
    .reset_index(name='counts')
    .sort_values(by='counts', ascending=False)
)