In [None]:
import os
import re
import json
import regex
import shutil
from typing import Any, Tuple, Optional
from google.colab import drive

# drive.mount("/content/drive")

## Helper Functions for Word Count

In [None]:
def count_word_multilingual(text: str) -> Tuple[list, int]:
    """
    Roughly tokenize ``text`` and return the tokens together with their count.

    Supports mixed languages including English, Chinese (including Japanese Kanji),
    Spanish, Portuguese, French, German, Russian, Norwegian, and Japanese.

    Rules (tried in this order at every position):
    1) [0-9]+(?:\\.[0-9]+)* : A number, optionally with dotted parts ("3.14", "1.2.3"),
                              as a single token. A period that is not between digits is
                              punctuation and is not counted.
    2) \\p{Han} : A single Chinese character (including Kanji used in Japanese).
    3) \\p{Hiragana}+ : Consecutive Hiragana characters as a single token.
    4) \\p{Katakana}+ : Consecutive Katakana characters as a single token.
    5) \\p{Cyrillic}+ : Consecutive Cyrillic letters (Russian).
    6) \\p{Latin}+ : Consecutive Latin letters (including diacritics),
                     supporting English, Spanish, Portuguese, French, German, Norwegian, etc.

    Notes:
    - Each Chinese character (\\p{Han}) is treated as an individual token.
      For example, "你好" => ["你", "好"].
    - Consecutive characters from other scripts (e.g., "hello") are treated as a single token.
    - Symbols, punctuation and characters from other scripts are skipped.
    - Requires the third-party module `regex` (pip install regex),
      because the built-in `re` module does not support Unicode properties \\p{...}.

    Args:
        text: The text to tokenize.

    Returns:
        A ``(tokens, count)`` tuple: the list of matched token strings and its length.
    """
    pattern = (
        r'[0-9]+(?:\.[0-9]+)*'  # Digits with optional dotted parts; never a bare "."
        r'|\p{Han}'             # Single Chinese character
        r'|\p{Hiragana}+'       # Consecutive Hiragana characters
        r'|\p{Katakana}+'       # Consecutive Katakana characters
        r'|\p{Cyrillic}+'       # Consecutive Cyrillic letters
        r'|\p{Latin}+'          # Consecutive Latin letters (including diacritics)
    )
    # The numeric group is non-capturing so findall keeps returning whole matches.
    tokens = regex.findall(pattern, text)
    return tokens, len(tokens)

def sort_files(files_list):
    """
    Return the file names as a new list ordered by their leading integer.

    Names that do not begin with a digit sort after all numbered names. The sort is
    stable, so names with equal keys keep their relative input order.
    """
    prefix_pattern = re.compile(r"^(\d+)")  # capture 1+ digits at start

    def sort_key(name):
        found = prefix_pattern.match(name)
        # No numeric prefix: push the name to the end of the ordering.
        return float("inf") if found is None else int(found.group(1))

    ordered = list(files_list)
    ordered.sort(key=sort_key)
    return ordered

def _strip_analysis_label(text: str) -> str:
    """Trim ``text`` and drop a leading, case-insensitive "Analysis:" label."""
    return re.sub(r'^Analysis:\s*', '', text.strip(), flags=re.IGNORECASE).strip()


def split_by_analysis(
    instructions: str,
    prediction: str,
    record_id: Any
) -> Tuple[str, Optional[str], Any]:
    """
    Split a model's output into (analysis, result) based on markers.

    Specifically designed for the format:
    Analysis:
    [CoT reasoning]
    Result:
    [answer]

    The markers are tried in this order; the first one found in ``prediction`` wins:
    1. "</think>" (reasoning models): the text before it, with any "<think>" tags
       removed, is the analysis; the text after it is the result.
    2. "Result:": the text before it is the analysis, the text after it the result.
    3. A label taken from ``instructions`` (the text between "Result:" and the next
       colon): ``prediction`` is split at the first "<label>:".
    4. No marker: the whole prediction is the analysis and the result is ``None``.

    Args:
        instructions: The prompt given to the model; only used for step 3.
        prediction: The raw model output to split.
        record_id: Opaque identifier, returned unchanged.

    Returns:
        ``(analysis, result, record_id)``. ``analysis`` and ``result`` are stripped of
        surrounding whitespace, ``analysis`` has no leading "Analysis:" label, and
        ``result`` is ``None`` when no marker was found.
    """
    # 1: Split by "</think>" for reasoning models
    reasoning, closing_tag, answer = prediction.partition("</think>")
    if closing_tag:
        # Remove "<think>" before looking for the "Analysis:" prefix, so that the
        # prefix is recognised whether it comes before or after the opening tag.
        reasoning = re.sub(r'<think>', '', reasoning, flags=re.IGNORECASE)
        return _strip_analysis_label(reasoning), answer.strip(), record_id

    # 2: Split by "Result:" for non-reasoning models
    reasoning, result_marker, answer = prediction.partition("Result:")
    if result_marker:
        return _strip_analysis_label(reasoning), answer.strip(), record_id

    # 3: If "Result:" does not exist, derive the label from instructions
    match = re.search(r"Result:\s*([^:]+?):", instructions)  # find the label in "Result: [label]:..."
    label = match.group(1).strip() if match else ""
    if label:  # a blank label would degrade the marker to a bare ":"
        reasoning, label_marker, answer = prediction.partition(f"{label}:")
        if label_marker:
            return _strip_analysis_label(reasoning), answer.strip(), record_id

    # 4: Otherwise, return the whole output (prediction) as the analysis
    return prediction.strip(), None, record_id


def aug_cot_length(json_path):
    """
    Load a JSON list of records and add the token count of each record's CoT analysis.

    For every record, the analysis part of ``d["pred"]`` is extracted with
    ``split_by_analysis`` and tokenized with ``count_word_multilingual``; the number
    of tokens is stored under the key ``'CoT Length'``. Records with an empty analysis,
    a missing key, or any other processing error get ``'CoT Length' = 0`` and a
    message is printed.

    Args:
        json_path: Path to a JSON file whose top-level value is iterated as records
            carrying the keys "instruction", "pred" and "id".

    Returns:
        The loaded records, each augmented in memory with ``'CoT Length'``. The file
        at ``json_path`` is not modified.
    """
    # Read as UTF-8 explicitly instead of relying on the platform's default encoding.
    with open(json_path, 'r', encoding='utf-8') as f:
        json_content = json.load(f)

    for i, d in enumerate(json_content):
        try:
            analysis, _, _ = split_by_analysis(d["instruction"], d["pred"], d["id"])

            if analysis:
                _, num_tokens = count_word_multilingual(analysis)
                d['CoT Length'] = num_tokens
            else:
                # No analysis found, set to 0 or handle as needed
                d['CoT Length'] = 0
                print(f"Warning: No analysis found for record {i} (id: {d.get('id', 'unknown')})")

        except KeyError as e:
            print(f"Error: Missing key {e} in record {i}")
            d['CoT Length'] = 0
        except Exception as e:
            # Best effort: one malformed record must not abort the whole file.
            print(f"Error processing record {i}: {e}")
            d['CoT Length'] = 0

    return json_content

## Migrate the "Direct" Files

In [None]:
import subprocess

# For direct
def copy_files_with_rsync():
    """
    Copy every "<task>-direct-greedy-42.result.json" file to the destination tree.

    Walks ``<path>/<task_name>/<model>/`` (``path`` and ``destination_path`` are
    module-level globals) and copies each direct result file to
    ``<destination_path>/<task_name>/<model>/``. ``rsync`` is tried first; if it
    fails or cannot be launched, the file is copied with ``shutil.copy2`` instead.
    Missing source files and failed copies are reported with ``print``.
    """
    for task_name in os.listdir(path):
        task_path = os.path.join(path, task_name)
        if not os.path.isdir(task_path):
            # Stray files in the root would make os.listdir(task_path) raise.
            continue

        for model in os.listdir(task_path):
            model_path = os.path.join(task_path, model)

            direct_file = f"{task_name}-direct-greedy-42.result.json"
            direct_src_file = os.path.join(model_path, direct_file)
            dst_file = os.path.join(destination_path, task_name, model, direct_file)

            if not os.path.exists(direct_src_file):
                print(f"File not found: {direct_src_file}")
                continue

            # Ensure destination directory exists. This makes rsync's --mkpath
            # unnecessary, which older rsync releases do not accept anyway.
            os.makedirs(os.path.dirname(dst_file), exist_ok=True)

            try:
                # Arguments are passed as a list, so paths with spaces are safe.
                subprocess.run(['rsync', '-av', direct_src_file, dst_file], check=True)
            except (subprocess.CalledProcessError, OSError) as e:
                # OSError covers the case where the rsync binary cannot be started.
                print(f"Error syncing {direct_src_file}: {e}")
                try:
                    shutil.copy2(direct_src_file, dst_file)
                    print(f"Fallback copied: {dst_file}")
                except OSError as e2:
                    print(f"Fallback also failed: {e2}")

# NOTE(review): this cell only defines copy_files_with_rsync(); nothing here calls it,
# so "DONE" is printed without this cell having copied any files.
print("DONE")

DONE


## Migrate the CoT files and augment with CoT output length

In [None]:
import os
import json
import subprocess

# For CoT

# Root of the combined results; the copy functions walk it as <path>/<task_name>/<model>/.
path = '/content/drive/MyDrive/cot-analysis/combined-results'

# Root of the output tree; files are written to <destination_path>/<task_name>/<model>/.
destination_path = '/content/drive/MyDrive/cot-analysis/cot-length'

def copy_cot_files_with_rsync():
    """
    Write a CoT-length-augmented copy of every "<task>-cot-greedy-42.result.json" file.

    Walks ``<path>/<task_name>/<model>/`` (``path`` and ``destination_path`` are
    module-level globals), runs ``aug_cot_length`` on each CoT result file and dumps
    the augmented records as UTF-8 JSON to
    ``<destination_path>/<task_name>/<model>/<same file name>``.

    Despite the name, no rsync is involved: the destination file is written with
    ``json.dump``. Missing source files and per-file failures are reported with
    ``print`` and do not stop the run.
    """
    for task_name in os.listdir(path):
        task_path = os.path.join(path, task_name)
        if not os.path.isdir(task_path):
            # Stray files in the root would make os.listdir(task_path) raise.
            continue

        for model in os.listdir(task_path):
            model_path = os.path.join(task_path, model)

            cot_file = f"{task_name}-cot-greedy-42.result.json"
            cot_src_file = os.path.join(model_path, cot_file)  # Path to source file
            dst_file = os.path.join(destination_path, task_name, model, cot_file)  # Destination

            if not os.path.exists(cot_src_file):
                print(f"File not found: {cot_src_file}")
                continue

            try:
                # Process the file and get augmented JSON data
                augmented_cot_data = aug_cot_length(cot_src_file)  # json content

                # Ensure destination directory exists
                os.makedirs(os.path.dirname(dst_file), exist_ok=True)

                # Save the augmented JSON data to destination file
                with open(dst_file, 'w', encoding='utf-8') as f:
                    json.dump(augmented_cot_data, f, indent=4, ensure_ascii=False)

            except Exception as e:
                # Per-file boundary: report the failure and carry on with the next file.
                print(f"Error processing {cot_src_file}: {e}")

# Run the processing
# (despite its name, copy_cot_files_with_rsync() writes the files with json.dump
# and does not invoke rsync)
print("Starting CoT length analysis and file creation...")
copy_cot_files_with_rsync()
