In [18]:
import sqlite3
import pandas as pd
from transformers import AutoTokenizer

def analyze_db(db_file: str, table_name: str = 'dataset') -> None:
    """
    Print how many rows of the 'trace' column are empty or filled.

    Reads the 'trace' column of `table_name` from the SQLite database at
    `db_file` and prints: the total row count, the number of NULL rows, the
    number of empty rows (NULL, '' or whitespace-only), the number of
    non-empty rows, and the DataFrame index of every non-empty row (the
    0-based position of the row in query order).

    Args:
        db_file: Path of the SQLite database file.
        table_name: Table to read. The name is interpolated into the SQL
            text unchanged, so it must come from a trusted source.

    Raises:
        Any error raised by `pd.read_sql` (for example when the table or the
        'trace' column is missing). The connection is closed before the
        error propagates.
    """
    # Load table; try/finally guarantees the handle is released on failure.
    conn = sqlite3.connect(db_file)
    try:
        df = pd.read_sql(f"SELECT trace FROM {table_name}", conn)
    finally:
        conn.close()

    trace = df['trace']
    # Strip once and reuse. NULL entries stay missing after .str.strip(), so
    # they never compare equal to '' and are counted via isna() instead.
    stripped = trace.str.strip()

    # Metrics
    total_rows        = len(df)
    null_rows         = trace.isna().sum()
    whitespace_rows   = stripped.eq('').sum()         # '' and all-whitespace
    empty_rows        = null_rows + whitespace_rows
    non_empty_mask    = trace.notna() & (stripped != '')
    non_empty_rows    = non_empty_mask.sum()
    non_empty_indices = df[non_empty_mask].index.tolist()

    # Output
    print(f"Total rows:        {total_rows}")
    print(f"Null rows:         {null_rows}")
    print(f"Empty rows:        {empty_rows}")
    print(f"Non-empty rows:    {non_empty_rows}\n")
    print("Non-empty row indices:")
    print(non_empty_indices)

def merge_db(
    db1_file: str,
    db2_file: str,
    table_name: str = 'dataset',
    out_db_file: str = 'merged.db'
) -> None:
    """
    Merge the 'trace' column of two SQLite databases into a new database.

    Every column of `table_name` is taken from DB1; only 'trace' is merged
    with DB2, row by row:

    * if exactly one side is blank (NULL, '' or whitespace-only), the other
      side is used;
    * if both sides are blank, or both hold the same text, DB1's value is
      kept and no conflict is recorded;
    * otherwise the rows conflict: the trace with fewer tokens under the
      Qwen/Qwen3-0.6B tokenizer wins (DB1 on a tie) and the row index is
      reported.

    NULL traces are written back as '' because both columns are normalized
    with `fillna('')` before comparison. The result replaces `table_name` in
    `out_db_file`, and a short report is printed.

    Args:
        db1_file: Path of the database whose non-trace columns are kept.
        db2_file: Path of the database that only contributes 'trace'.
        table_name: Table read from both inputs and written to the output.
            Interpolated into the SQL text unchanged; must be trusted.
        out_db_file: Path of the database that receives the merged table.

    Raises:
        AssertionError: If the two tables do not have the same row count.
        Any error raised by `pd.read_sql` / `DataFrame.to_sql`; open
        connections are closed before the error propagates.
    """
    # load full df from DB1
    conn1 = sqlite3.connect(db1_file)
    try:
        df1 = pd.read_sql(f"SELECT * FROM {table_name}", conn1)
    finally:
        conn1.close()

    # load only trace from DB2
    conn2 = sqlite3.connect(db2_file)
    try:
        df2 = pd.read_sql(f"SELECT trace FROM {table_name}", conn2)
    finally:
        conn2.close()

    # Rows are paired purely by position in query order (no ORDER BY).
    # NOTE(review): assumes both tables list the same items in the same
    # order — confirm against how the databases were produced.
    # Raised explicitly so the check also runs under `python -O`.
    if len(df1) != len(df2):
        raise AssertionError("Row counts differ")

    t1 = df1['trace'].fillna('').astype(str)
    t2 = df2['trace'].fillna('').astype(str)

    # Loaded on first real conflict only: a conflict-free merge needs neither
    # the tokenizer files nor network access.
    tokenizer = None

    merged_traces = []
    conflicts = []

    for i, (a, b) in enumerate(zip(t1, t2)):
        empty1 = not a.strip()
        empty2 = not b.strip()
        if empty2:
            # DB2 has nothing to offer. This also covers "both blank", so two
            # blank cells that differ only in whitespace are not a conflict.
            choice = a
        elif empty1:
            choice = b
        elif a == b:
            choice = a
        else:
            # conflict: pick shorter in tokens
            if tokenizer is None:
                # NOTE(review): trust_remote_code=True allows code shipped in
                # the hub repository to run locally — confirm it is required.
                tokenizer = AutoTokenizer.from_pretrained(
                    "Qwen/Qwen3-0.6B", trust_remote_code=True
                )
            l1 = len(tokenizer(a).input_ids)
            l2 = len(tokenizer(b).input_ids)
            choice = a if l1 <= l2 else b
            conflicts.append(i)
        merged_traces.append(choice)

    # assign merged traces back to df1
    df1['trace'] = merged_traces

    # write merged DB
    conn_out = sqlite3.connect(out_db_file)
    try:
        df1.to_sql(table_name, conn_out, index=False, if_exists='replace')
    finally:
        conn_out.close()

    # report
    print(f"Total rows:      {len(df1)}")
    print(f"Conflicts found: {len(conflicts)}")
    print("Conflict indices:", conflicts)

def show_trace(db_file: str, idx: int, table_name: str = 'dataset') -> None:
    """
    Print one trace and its token count.

    Loads the 'trace' column of `table_name` from the SQLite database at
    `db_file`, prints the entry at row `idx` between two 80-dash separator
    lines, then prints how many tokens the Qwen/Qwen3-0.6B tokenizer
    produces for it. A NULL or otherwise falsy trace is shown as ''.

    Args:
        db_file: Path of the SQLite database file.
        idx: 0-based row position in query order; negative values are
            rejected rather than counted from the end.
        table_name: Table to read. Interpolated into the SQL text unchanged;
            must be trusted.

    Raises:
        IndexError: If the table is empty or `idx` is outside
            0..len(table)-1.
        Any error raised by `pd.read_sql`; the connection is closed before
        the error propagates.
    """
    # load; try/finally guarantees the handle is released on failure
    conn = sqlite3.connect(db_file)
    try:
        df = pd.read_sql(f"SELECT trace FROM {table_name}", conn)
    finally:
        conn.close()

    # bounds check; an empty table gets its own message instead of "(0–-1)"
    if len(df) == 0:
        raise IndexError(
            f"Index {idx} out of range (table '{table_name}' is empty)"
        )
    if idx < 0 or idx >= len(df):
        raise IndexError(f"Index {idx} out of range (0–{len(df)-1})")

    # get trace
    trace = df.at[idx, 'trace'] or ""

    sep = "-" * 80

    # output with separators
    print(f"Trace[{idx}]:")
    print(sep)
    print(trace)
    print(sep)

    # tokenize & count
    # NOTE(review): trust_remote_code=True allows code shipped in the hub
    # repository to run locally — confirm it is required for this model.
    tokenizer = AutoTokenizer.from_pretrained(
        "Qwen/Qwen3-0.6B", trust_remote_code=True
    )
    tokens = tokenizer(trace).input_ids
    print(f"Token count: {len(tokens)}")


In [24]:
# Paths of the two input databases and of the merged output database.
DB_FILE_1, DB_FILE_2, DB_FILE_MERGED = (
    'dataset_4qwen3_250609a_1.db',
    'dataset_4qwen3_250609a_2.db',
    'dataset_4qwen3_250609a_merged.db',
)

In [25]:
# Empty vs. non-empty trace counts for the first input database.
analyze_db(db_file=DB_FILE_1)

Total rows:        1817
Null rows:         0
Empty rows:        1782
Non-empty rows:    35

Non-empty row indices:
[0, 2, 5, 18, 35, 36, 41, 45, 50, 54, 59, 60, 61, 67, 70, 71, 72, 73, 76, 77, 78, 81, 84, 91, 258, 273, 279, 280, 297, 303, 305, 321, 325, 331, 337]


In [26]:
# Empty vs. non-empty trace counts for the second input database.
analyze_db(db_file=DB_FILE_2)

Total rows:        1817
Null rows:         0
Empty rows:        1785
Non-empty rows:    32

Non-empty row indices:
[502, 506, 509, 511, 514, 522, 527, 528, 531, 534, 536, 537, 552, 553, 560, 562, 750, 752, 758, 764, 770, 776, 778, 785, 789, 790, 793, 794, 798, 806, 807, 811]


In [27]:
# Merge both inputs into DB_FILE_MERGED; the table name stays at its default.
merge_db(db1_file=DB_FILE_1, db2_file=DB_FILE_2, out_db_file=DB_FILE_MERGED)

Total rows:      1817
Conflicts found: 0
Conflict indices: []


In [28]:
# Same summary for the merged database, to compare with the two inputs.
analyze_db(db_file=DB_FILE_MERGED)

Total rows:        1817
Null rows:         0
Empty rows:        1750
Non-empty rows:    67

Non-empty row indices:
[0, 2, 5, 18, 35, 36, 41, 45, 50, 54, 59, 60, 61, 67, 70, 71, 72, 73, 76, 77, 78, 81, 84, 91, 258, 273, 279, 280, 297, 303, 305, 321, 325, 331, 337, 502, 506, 509, 511, 514, 522, 527, 528, 531, 534, 536, 537, 552, 553, 560, 562, 750, 752, 758, 764, 770, 776, 778, 785, 789, 790, 793, 794, 798, 806, 807, 811]


In [29]:
# Print the merged trace stored at row 84 together with its token count.
show_trace(db_file=DB_FILE_MERGED, idx=84)

Trace[84]:
--------------------------------------------------------------------------------

Okay, let me try to work through this problem step by step. So the problem says Gary bought a large beverage and drank m/n of it, where m and n are coprime positive integers. Then, if he had purchased half as much and drunk twice as much, he would have wasted only 2/9 as much beverage. We need to find m + n.

First, let me make sure I understand the problem correctly. Gary originally bought a certain amount, let's say the total beverage is T. Then he drank m/n of T, so the amount wasted is T - m/n * T = T(1 - m/n). Then the problem says, if he had bought half as much, so T/2, and drank twice as much, which would be 2*(m/n)*T, then the wasted amount would be (T/2) - 2*(m/n)*T. And this wasted amount is said to be 2/9 as much as the original wasted amount. 

Wait, so the original wasted amount is T - T*(m/n) = T*(1 - m/n). The new wasted amount, after buying half and drinking twice, is (T/2) - 2*