In [12]:
import pandas as pd
import re
import numpy as np

def parse_graph_info_file(txt_path):
    """Parse a graph-complexity report into a DataFrame.

    The file is read line by line. A line of the form
    ``<easy|medium|hard>: <N> file...`` starts a difficulty section; every
    following line of the form ``<path> (score: <number>)`` is recorded under
    that section. Entry lines that appear before the first section header,
    and lines matching neither pattern, are ignored.

    Parameters
    ----------
    txt_path : str or path-like
        Location of the text report to read.

    Returns
    -------
    pandas.DataFrame
        One row per entry with columns ``file_path`` (str), ``difficulty``
        (str) and ``complexity`` (float). The three columns are present even
        when no entry was found, so downstream column lookups do not fail.

    Raises
    ------
    OSError
        If the file cannot be opened.
    ValueError
        If a captured score (digits and dots) is not a valid float.
    """
    columns = ['file_path', 'difficulty', 'complexity']
    records = []
    with open(txt_path, 'r') as f:
        current_difficulty = None
        for line in f:
            line = line.strip()
            # Detect difficulty section headers
            m = re.match(r'^(easy|medium|hard): \d+ file', line)
            if m:
                current_difficulty = m.group(1)
                continue
            # Detect entries with path and score
            m2 = re.match(r'^(.*?)\s*\(score:\s*([\d.]+)\)', line)
            if m2 and current_difficulty:
                records.append({
                    'file_path': m2.group(1).strip(),
                    'difficulty': current_difficulty,
                    'complexity': float(m2.group(2))
                })
    # Passing the column names explicitly keeps the schema stable for an
    # empty result; a bare DataFrame([]) would have no columns at all.
    return pd.DataFrame(records, columns=columns)

# Load the three complexity reports, in order: the full dataset, the
# candidate pool, and the sample that already exists.
full, cand, sample = (
    parse_graph_info_file(report_path)
    for report_path in (
        '../src/graph_complexity_info_3493.txt',
        '../src/graph_complexity_info_666.txt',
        '../src/graph_complexity_info_69.txt',
    )
)

# 1) Keep only the easy/medium rows of both the full dataset and the
#    existing sample (rows labelled hard are dropped).
full_keep = full.loc[full['difficulty'].isin(['easy', 'medium'])]
sample_keep = sample.loc[sample['difficulty'].isin(['easy', 'medium'])]

print("======== Full Dataset Statistics: ========")
print("full_keep shape:", full_keep.shape)
print("Existing full counts:\n", full_keep['difficulty'].value_counts())

print("======== Sample Dataset Statistics: ========")
print("sample_keep shape:", sample_keep.shape)

# 2) Per-difficulty row counts of the retained sample
counts = sample_keep['difficulty'].value_counts()
print("Existing sample counts:\n", counts)

# 3) Cohen's d between full and existing sample
def cohens_d(a, b):
    """Return Cohen's d of ``a`` against ``b`` and print both groups' stats.

    Both arguments must support ``len()``, ``.mean()`` and ``.var(ddof=1)``
    (the notebook passes pandas Series). The effect size is
    ``(mean(a) - mean(b)) / pooled_sd`` where the pooled standard deviation
    weights each sample variance by its degrees of freedom.

    Side effect: prints count, mean and variance of ``a`` under the heading
    "Full Data Statistics" and of ``b`` under "Sample Data Statistics".
    """
    nx = len(a)
    ny = len(b)
    # Sample (ddof=1) variances of each group.
    varx, vary = a.var(ddof=1), b.var(ddof=1)

    print("\n================= Data Statistics:================")
    print("\nFull Data Statistics:")
    print(f"  Count: {nx}")
    print(f"  Mean: {a.mean():.4f}")
    print(f"  Variance: {varx:.4f}")

    print("\nSample Data Statistics:")
    print(f"  Count: {ny}")
    print(f"  Mean: {b.mean():.4f}")
    print(f"  Variance: {vary:.4f}")

    # Pool the variances, each weighted by (n - 1), over the joint
    # degrees of freedom.
    weighted_var_sum = (nx-1)*varx + (ny-1)*vary
    pooled_sd = np.sqrt(weighted_var_sum / (nx+ny-2))
    return (a.mean() - b.mean()) / pooled_sd

print("============= After dropping hard =============")
print("Number of Full Dataset: ", len(full_keep['complexity']))
print("Number of Sample Dataset: ", len(sample_keep['complexity']))

d_initial = cohens_d(full_keep['complexity'], sample_keep['complexity'])
print("\nInitial Cohen's d:", d_initial)

# 4) Determine how many more easy/medium are needed to reach the targets
#    of 50 easy and 64 medium rows.
#    Clamp at zero: if the sample already meets or exceeds a target the
#    difference is negative, and DataFrame.head(-k) would return all but the
#    last k candidates instead of none.
needed_easy = max(0, 50 - counts.get('easy', 0))
needed_medium = max(0, 64 - counts.get('medium', 0))
print(f"\nNeed additional: {needed_easy} easy; {needed_medium} medium")

# 5) Filter candidate pool (exclude paths already present in the sample)
cand_filtered = cand[~cand['file_path'].isin(sample_keep['file_path'])]
cand_easy = cand_filtered[cand_filtered['difficulty']=='easy']
cand_med = cand_filtered[cand_filtered['difficulty']=='medium']

# 6) Sort candidates by closeness to full dataset mean complexity
#    ('dist' is the absolute difference from that mean)
full_mean = full_keep['complexity'].mean()
cand_easy = cand_easy.assign(dist=(cand_easy['complexity'] - full_mean).abs()).sort_values('dist')
cand_med = cand_med.assign(dist=(cand_med['complexity'] - full_mean).abs()).sort_values('dist')

# 7) Select the top needed from each (fewer if the pool is smaller)
selected_easy = cand_easy.head(needed_easy)
selected_med = cand_med.head(needed_medium)
selected = pd.concat([selected_easy, selected_med])

# 8) Output the directories of the selected entries
#    (everything before the last '/' of each file path)
print("\nSelected directories to add:")
for path in selected['file_path']:
    print(path.rsplit('/', 1)[0])

# 9) Verify new Cohen's d with the selected rows added to the sample
new_sample = pd.concat([sample_keep, selected])
d_new = cohens_d(full_keep['complexity'], new_sample['complexity'])
print("\nNew Cohen's d:", d_new)


full_keep shape: (3486, 3)
Existing full counts:
 difficulty
medium    1958
easy      1528
Name: count, dtype: int64
sample_keep shape: (68, 3)
Existing sample counts:
 difficulty
easy      34
medium    34
Name: count, dtype: int64
Number of Full Dataset:  3486
Number of Sample Dataset:  68


Full Data Statistics:
  Count: 3486
  Mean: 0.3510
  Variance: 0.0089

Sample Data Statistics:
  Count: 68
  Mean: 0.3046
  Variance: 0.0258

Initial Cohen's d: 0.48347261079009923

Need additional: 16 easy; 30 medium

Selected directories to add:
../cubicasa5k-666/high_quality_architectural/12482
../cubicasa5k-666/high_quality_architectural/11008
../cubicasa5k-666/high_quality_architectural/2528
../cubicasa5k-666/high_quality/10422
../cubicasa5k-666/high_quality_architectural/5028
../cubicasa5k-666/high_quality/5972
../cubicasa5k-666/high_quality_architectural/3933
../cubicasa5k-666/high_quality_architectural/11002
../cubicasa5k-666/high_quality_architectural/13202
../cubicasa5k-666/high_quality_

In [19]:
import pandas as pd
import re
import numpy as np

def parse_graph_info_file(txt_path):
    """Parse a graph-complexity report into a DataFrame.

    The file is read line by line. A line of the form
    ``<easy|medium|hard>: <N> file...`` starts a difficulty section; every
    following line of the form ``<path> (score: <number>)`` is recorded under
    that section. Entry lines that appear before the first section header,
    and lines matching neither pattern, are ignored.

    Parameters
    ----------
    txt_path : str or path-like
        Location of the text report to read.

    Returns
    -------
    pandas.DataFrame
        One row per entry with columns ``file_path`` (str), ``difficulty``
        (str) and ``complexity`` (float). The three columns are present even
        when no entry was found, so downstream column lookups do not fail.

    Raises
    ------
    OSError
        If the file cannot be opened.
    ValueError
        If a captured score (digits and dots) is not a valid float.
    """
    columns = ['file_path', 'difficulty', 'complexity']
    records = []
    with open(txt_path, 'r') as f:
        current_difficulty = None
        for line in f:
            line = line.strip()
            # Detect difficulty section headers
            m = re.match(r'^(easy|medium|hard): \d+ file', line)
            if m:
                current_difficulty = m.group(1)
                continue
            # Detect entries with path and score
            m2 = re.match(r'^(.*?)\s*\(score:\s*([\d.]+)\)', line)
            if m2 and current_difficulty:
                records.append({
                    'file_path': m2.group(1).strip(),
                    'difficulty': current_difficulty,
                    'complexity': float(m2.group(2))
                })
    # Passing the column names explicitly keeps the schema stable for an
    # empty result; a bare DataFrame([]) would have no columns at all.
    return pd.DataFrame(records, columns=columns)

# Load the three complexity reports, in order: the full dataset, the
# candidate pool, and the sample that already exists.
full, cand, sample = (
    parse_graph_info_file(report_path)
    for report_path in (
        '../src/graph_complexity_info_3493.txt',
        '../src/graph_complexity_info_666.txt',
        '../src/graph_complexity_info_69.txt',
    )
)

# 1) Keep only the easy/medium rows of both the full dataset and the
#    existing sample (rows labelled hard are dropped).
full_keep = full.loc[full['difficulty'].isin(['easy', 'medium'])]
sample_keep = sample.loc[sample['difficulty'].isin(['easy', 'medium'])]

print("======== Full Dataset Statistics: ========")
print("full_keep shape:", full_keep.shape)
print("Existing full counts:\n", full_keep['difficulty'].value_counts())

print("======== Sample Dataset Statistics: ========")
print("sample_keep shape:", sample_keep.shape)

# 2) Per-difficulty row counts of the retained sample
counts = sample_keep['difficulty'].value_counts()
print("Existing sample counts:\n", counts)

# 3) Cohen's d between full and existing sample
def cohens_d(a, b):
    """Return Cohen's d of ``a`` against ``b``.

    Both arguments must support ``len()``, ``.mean()`` and ``.var(ddof=1)``
    (the notebook passes pandas Series). The result is
    ``(mean(a) - mean(b)) / pooled_sd``, where the pooled standard deviation
    weights each sample variance by its degrees of freedom. The sign is
    positive when ``a`` has the larger mean.
    """
    size_a = len(a)
    size_b = len(b)

    # Sample (ddof=1) variances, each weighted by (n - 1).
    weighted_var_sum = (size_a-1)*a.var(ddof=1) + (size_b-1)*b.var(ddof=1)
    pooled_sd = np.sqrt(weighted_var_sum / (size_a+size_b-2))

    return (a.mean() - b.mean()) / pooled_sd

print("============= After dropping hard =============")
print("Number of Full Dataset: ", len(full_keep['complexity']))
print("Number of Sample Dataset: ", len(sample_keep['complexity']))

d_initial = cohens_d(full_keep['complexity'], sample_keep['complexity'])
print("\nInitial Cohen's d:", d_initial)

# 4) Determine how many more easy/medium are needed to reach the targets
#    of 50 easy and 64 medium rows (never negative).
needed_easy = max(0, 50 - counts.get('easy', 0))
needed_medium = max(0, 64 - counts.get('medium', 0))
print("=========================================")
print(f"Need additional: {needed_easy} easy; {needed_medium} medium")

# 5) Filter candidate pool (exclude paths already present in the sample)
cand_filtered = cand[~cand['file_path'].isin(sample_keep['file_path'])]
cand_easy = cand_filtered[cand_filtered['difficulty']=='easy']
cand_med  = cand_filtered[cand_filtered['difficulty']=='medium']

# 6) Sort candidates by closeness to full dataset mean complexity.
#    The greedy search below scans candidates in this order and only
#    replaces its best pick on a strict improvement, so the order decides
#    ties.
full_mean = full_keep['complexity'].mean()
cand_easy = cand_easy.assign(dist=(cand_easy['complexity'] - full_mean).abs())\
                     .sort_values('dist')
cand_med  = cand_med.assign(dist=(cand_med['complexity']  - full_mean).abs())\
                    .sort_values('dist')

# 7) Greedy selection to best reduce Cohen's d, stopping early if |d|<0.2
D_THRESHOLD = 0.2
full_scores = full_keep['complexity']

remaining = pd.concat([cand_easy, cand_med])
current   = sample_keep.copy()
# Picked rows are collected here and concatenated once at the end; growing
# a frame from an empty object-dtype DataFrame inside the loop can degrade
# column dtypes and emits a FutureWarning on recent pandas.
picked_rows = []
stop_flag = False

for diff, needed in [('easy', needed_easy), ('medium', needed_medium)]:
    for _ in range(needed):
        # compute current d and stop if already below threshold
        curr_d = abs(cohens_d(full_scores, current['complexity']))
        if curr_d < D_THRESHOLD:
            print(f"Stopping early: |d|={curr_d:.4f} < {D_THRESHOLD}")
            stop_flag = True
            break

        best_idx = None
        best_d   = curr_d

        # try each remaining candidate of this difficulty; only the
        # complexity column matters for d, so append just that one score
        pool_scores = remaining.loc[remaining['difficulty'] == diff, 'complexity']
        for idx, score in pool_scores.items():
            trial = pd.concat([current['complexity'], pd.Series([score])],
                              ignore_index=True)
            d = abs(cohens_d(full_scores, trial))
            if d < best_d:
                best_d, best_idx = d, idx

        if best_idx is None:
            # NOTE(review): this ends the whole selection, so any later
            # difficulty level is never tried — confirm that is intended.
            print(f"No more {diff} files improve Cohen's d; stopping.")
            stop_flag = True
            break

        # add the best one to current & remember it as selected
        best_row  = remaining.loc[[best_idx]]
        current   = pd.concat([current, best_row], ignore_index=True)
        picked_rows.append(best_row)
        remaining = remaining.drop(best_idx)

        # check again after adding
        if best_d < D_THRESHOLD:
            print(f"Reached |d|={best_d:.4f} < {D_THRESHOLD}; stopping early.")
            stop_flag = True
            break

    if stop_flag:
        break

if picked_rows:
    selected = pd.concat(picked_rows, ignore_index=True)
else:
    # Nothing picked: keep an empty frame with the sample's columns.
    selected = pd.DataFrame(columns=sample_keep.columns)

# 8) Output the directories of the selected entries
#    (everything before the last '/' of each file path)
print("\nSelected directories to add:")
for num, path in enumerate(selected['file_path'], start=1):
    print(num, ": ", path.rsplit('/', 1)[0])

# 9) Verify new Cohen's d with the selected rows added to the sample
if picked_rows:
    new_sample = pd.concat([sample_keep, selected], ignore_index=True)
else:
    new_sample = sample_keep.reset_index(drop=True)
new_scores = new_sample['complexity']
d_new = cohens_d(full_scores, new_scores)

print("\n================= Data Statistics:================")
print("\nFull Data Statistics:")
print(f"  Count: {len(full_scores)}")
print(f"  Mean: {full_scores.mean():.4f}")
print(f"  Variance: {full_scores.var(ddof=1):.4f}")

print("\nSample Data Statistics:")
print(f"  Count: {len(new_scores)}")
print(f"  Mean: {new_scores.mean():.4f}")
print(f"  Variance: {new_scores.var(ddof=1):.4f}")
print("\nNew Cohen's d:", d_new)


full_keep shape: (3486, 3)
Existing full counts:
 difficulty
medium    1958
easy      1528
Name: count, dtype: int64
sample_keep shape: (68, 3)
Existing sample counts:
 difficulty
easy      34
medium    34
Name: count, dtype: int64
Number of Full Dataset:  3486
Number of Sample Dataset:  68

Initial Cohen's d: 0.48347261079009923
Need additional: 16 easy; 30 medium
Reached |d|=0.1851 < 0.2; stopping early.

Selected directories to add:
1 :  ../cubicasa5k-666/high_quality_architectural/12482
2 :  ../cubicasa5k-666/high_quality_architectural/11008
3 :  ../cubicasa5k-666/high_quality_architectural/2528
4 :  ../cubicasa5k-666/high_quality/10422
5 :  ../cubicasa5k-666/high_quality_architectural/5028
6 :  ../cubicasa5k-666/high_quality/5972
7 :  ../cubicasa5k-666/high_quality_architectural/3933
8 :  ../cubicasa5k-666/high_quality_architectural/11002
9 :  ../cubicasa5k-666/high_quality_architectural/13202
10 :  ../cubicasa5k-666/high_quality_architectural/11004
11 :  ../cubicasa5k-666/high_qu