### dataset preprocessing

In [31]:
import pandas as pd
import os
import glob
import torch
import numpy as np
import re
from pointnet2_ops import pointnet2_utils  # FPS 사용

def _extract_tree_id_from_filename(csv_path: str) -> int:
    """Return the tree id encoded in a CSV file name.

    The first run of eight consecutive digits in the file stem is read as an
    integer and divided by 10000, which keeps its leading four digits
    (e.g. '06620000' -> 662).  When the stem contains no such run, every digit
    character of the stem is concatenated and the last eight are used instead.

    Raises:
        ValueError: if fewer than eight digits can be collected from the stem.
    """
    stem, _ext = os.path.splitext(os.path.basename(csv_path))  # e.g. '06620000'

    match = re.search(r'(\d{8})', stem)
    if match is not None:
        return int(match.group(1)) // 10000  # leading four digits

    # Fallback: no contiguous 8-digit run, so gather the scattered digits.
    digits = ''.join(filter(str.isdigit, stem))
    if len(digits) < 8:
        raise ValueError(f"파일명에서 8자리 숫자를 찾을 수 없음: {csv_path}")
    return int(digits[-8:]) // 10000  # leading four digits

def voxelize_and_downsample(input_dir, output_dir, voxel_size=1,
                            max_points_per_voxel=32768, start_index=1):
    """Split every point-cloud CSV in ``input_dir`` into per-voxel CSV files.

    Each input CSV is read without a header (delimiter auto-detected) and
    normalized to the columns (x, y, z, type); files with only three columns
    get type = -1.  Points are binned into cubic voxels of edge ``voxel_size``
    and each voxel is written to ``output_dir`` as
    ``99999999-<tree_id * 10000 + voxel_id, zero-padded to 8 digits>.csv``
    with no header and no index.  Voxels holding more than
    ``max_points_per_voxel`` points are reduced with furthest point sampling;
    if the sampling call fails the voxel is written unreduced.

    Args:
        input_dir: directory containing the source ``*.csv`` files.
        output_dir: directory receiving the per-voxel files (created if missing).
        voxel_size: voxel edge length, in the units of the coordinates.
        max_points_per_voxel: upper bound on points kept per voxel.
        start_index: 1-based position (in sorted file order) of the first file
            to process; earlier files are skipped.

    Raises:
        ValueError: if ``start_index`` is smaller than 1.
    """
    if start_index < 1:
        # start_index=0 would slice csv_files[-1:] and silently process only
        # the last file, so reject it explicitly.
        raise ValueError(f"start_index must be >= 1, got {start_index}")

    os.makedirs(output_dir, exist_ok=True)
    csv_files = sorted(glob.glob(os.path.join(input_dir, "*.csv")))
    print(f"Found {len(csv_files)} CSV files in {input_dir}.")
    csv_files = csv_files[start_index-1:]  # start_index is 1-based

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    voxel_cols = ['voxel_x', 'voxel_y', 'voxel_z']

    for csv_idx, csv_file in enumerate(csv_files, start=start_index):
        print(f"\nProcessing {csv_idx}: {csv_file}")

        # Tree id from the file name (06620000 -> 0662 -> 662)
        try:
            tree_id = _extract_tree_id_from_filename(csv_file)
        except Exception as e:
            print(f"[SKIP] tree_id 추출 실패: {csv_file} :: {e}")
            continue

        # ---- Read the CSV (no header, delimiter auto-detected) ----
        try:
            df = pd.read_csv(csv_file, header=None, sep=None, engine="python")
        except Exception as e:
            print(f"[SKIP] read_csv 실패: {csv_file} :: {e}")
            continue

        if df.shape[1] < 3:
            print(f"[SKIP] {csv_file}: 열이 3개 미만입니다({df.shape[1]}개).")
            continue

        # Normalize 3-column or 4+-column input to (x, y, z, type)
        if df.shape[1] == 3:
            df.columns = ['x', 'y', 'z']
            df['type'] = -1
        else:
            df = df.iloc[:, :4]
            df.columns = ['x', 'y', 'z', 'type']

        # A non-numeric coordinate (e.g. an unexpected header row) skips this
        # file instead of aborting the whole batch.
        try:
            df[['x','y','z']] = df[['x','y','z']].astype(np.float32)
        except (ValueError, TypeError) as e:
            print(f"[SKIP] {csv_file}: 좌표를 숫자로 변환할 수 없습니다 :: {e}")
            continue

        try:
            df['type'] = df['type'].astype(np.int32)
        except Exception as e:
            # Best effort: keep the points but report that labels were lost.
            print(f"[WARN] {csv_file}: type 열을 정수로 변환하지 못해 -1로 대체합니다 :: {e}")
            df['type'] = -1

        if df.empty:
            print(f"[SKIP] {csv_file}: 포인트가 없습니다.")
            continue

        # ---- Voxelization ----
        for axis in ('x', 'y', 'z'):
            df[f'voxel_{axis}'] = np.floor(df[axis] / voxel_size).astype(np.int64)

        # Grouping on the integer voxel coordinates with sort=True iterates the
        # voxels in ascending (voxel_x, voxel_y, voxel_z) order, so enumerating
        # the groups from 1 gives the per-file voxel ids 1..K without building
        # a Python tuple for every point.
        grouped = df.groupby(voxel_cols, sort=True)
        voxel_point_counts = grouped.size()
        print(f"File {csv_idx}: Max points in a voxel = {voxel_point_counts.max()}")
        print(f"File {csv_idx}: Min points in a voxel = {voxel_point_counts.min()}")

        n_voxels = len(voxel_point_counts)
        if n_voxels > 9999:
            # The file number reserves four digits for voxel_id; beyond that,
            # tree_id * 10000 + voxel_id runs into the next tree id's range.
            print(f"[WARN] {csv_file}: 복셀 수({n_voxels})가 9999를 초과하여 "
                  f"출력 파일 번호가 다른 tree_id와 충돌할 수 있습니다.")

        # ---- Write one file per voxel ----
        for voxel_id, (_voxel_key, group) in enumerate(grouped, start=1):
            # Final number: tree_id * 10000 + voxel_id, zero-padded to 8 digits
            # (stays 8 digits while tree_id <= 9999 and voxel_id <= 9999).
            i_val = int(tree_id) * 10000 + int(voxel_id)
            i_str = f"{i_val:08d}"
            file_name = f"99999999-{i_str}.csv"
            file_path = os.path.join(output_dir, file_name)

            grp = group.reset_index(drop=True)
            n = len(grp)
            if n > max_points_per_voxel:
                pts = torch.from_numpy(grp[['x','y','z']].to_numpy(np.float32)).unsqueeze(0).to(device).contiguous()
                try:
                    # NOTE(review): pointnet2_ops FPS presumably needs a CUDA
                    # tensor; on a CPU-only machine this is expected to raise
                    # and fall through to saving the voxel unreduced - confirm.
                    idx = pointnet2_utils.furthest_point_sample(pts, max_points_per_voxel)  # (1, M)
                    sel = idx.squeeze(0).long().cpu().numpy()
                    sampled = grp.iloc[sel]
                except Exception as e:
                    print(f"[WARN] FPS 실패(원본 저장): voxel_id={voxel_id}, N={n} :: {e}")
                    sampled = grp
            else:
                sampled = grp

            sampled[['x','y','z','type']].to_csv(file_path, index=False, header=False)

    print("\n[Done] Voxelization + (optional) FPS downsampling finished.")


# Example run: voxelize every CSV under input_dir into per-voxel CSVs in output_dir
input_dir = "/esail4/heeju/REGRESSION/Point-M2AE/data/06masked/22222222"
output_dir = "/esail4/heeju/REGRESSION/Point-M2AE/data/06masked_voxel/22222222"

voxelize_and_downsample(input_dir, output_dir, start_index=1)

Found 12 CSV files in /esail4/heeju/REGRESSION/Point-M2AE/data/06masked/22222222.

Processing 1: /esail4/heeju/REGRESSION/Point-M2AE/data/06masked/22222222/06620000.csv
File 1: Max points in a voxel = 284204
File 1: Min points in a voxel = 2528

Processing 2: /esail4/heeju/REGRESSION/Point-M2AE/data/06masked/22222222/06630000.csv
File 2: Max points in a voxel = 1154385
File 2: Min points in a voxel = 404

Processing 3: /esail4/heeju/REGRESSION/Point-M2AE/data/06masked/22222222/06640000.csv
File 3: Max points in a voxel = 729145
File 3: Min points in a voxel = 45

Processing 4: /esail4/heeju/REGRESSION/Point-M2AE/data/06masked/22222222/07670000.csv
File 4: Max points in a voxel = 35969
File 4: Min points in a voxel = 276

Processing 5: /esail4/heeju/REGRESSION/Point-M2AE/data/06masked/22222222/08620000.csv
File 5: Max points in a voxel = 36482
File 5: Min points in a voxel = 238

Processing 6: /esail4/heeju/REGRESSION/Point-M2AE/data/06masked/22222222/15320000.csv
File 6: Max points in 

In [4]:
import os
import pandas as pd

# Directory whose per-voxel CSV files are checked
target_dir = '/esail4/heeju/REGRESSION/Point-M2AE/data/06masked_voxel/33333333'

# Collect every CSV file name in that directory
csv_files = [f for f in os.listdir(target_dir) if f.endswith('.csv')]

# Delete files that hold 50 points or fewer
for file in csv_files:
    file_path = os.path.join(target_dir, file)
    try:
        # voxelize_and_downsample writes these files with header=False, so they
        # must be read with header=None; otherwise the first point is consumed
        # as column names and every row count comes out one too low.
        df = pd.read_csv(file_path, header=None)
    except Exception as e:
        print(f'Error reading {file}: {e}')
        continue

    if len(df) <= 50:
        try:
            os.remove(file_path)
        except OSError as e:
            # Report a failed delete as such rather than as a read error.
            print(f'Error deleting {file}: {e}')
        else:
            print(f'Deleted: {file}')


Deleted: 99999999-15430044.csv
Deleted: 99999999-15710050.csv
Deleted: 99999999-15710065.csv
Deleted: 99999999-22360015.csv
Deleted: 99999999-22360016.csv
Deleted: 99999999-22360037.csv
Deleted: 99999999-22360038.csv
Deleted: 99999999-22360039.csv
Deleted: 99999999-22360040.csv
Deleted: 99999999-22360042.csv


In [6]:
import os
import shutil

# Source directory (prefixed voxel CSVs) and destination directory
src_dir = '/esail4/heeju/REGRESSION/Point-M2AE/data/06masked_voxel/33333333'
dst_dir = '/esail4/heeju/REGRESSION/Point-M2AE/data/06masked_inf/33333333'

# Create the destination directory when it is missing
os.makedirs(dst_dir, exist_ok=True)

# Copy every '99999999-<id>.csv' file to '<id>.csv' in the destination
for filename in os.listdir(src_dir):
    is_prefixed_csv = filename.endswith('.csv') and filename.startswith('99999999-')
    if not is_prefixed_csv:
        continue

    # The id is the second '-'-separated segment with '.csv' removed
    second_segment = filename.split('-')[1]
    numeric_id = second_segment.replace('.csv', '')
    new_filename = f'{numeric_id}.csv'

    src_path = os.path.join(src_dir, filename)
    dst_path = os.path.join(dst_dir, new_filename)

    shutil.copyfile(src_path, dst_path)
    print(f'Copied: {filename} → {new_filename}')


Copied: 99999999-05530001.csv → 05530001.csv
Copied: 99999999-05530002.csv → 05530002.csv
Copied: 99999999-05530003.csv → 05530003.csv
Copied: 99999999-05530004.csv → 05530004.csv
Copied: 99999999-05530005.csv → 05530005.csv
Copied: 99999999-05530006.csv → 05530006.csv
Copied: 99999999-05530007.csv → 05530007.csv
Copied: 99999999-05530008.csv → 05530008.csv
Copied: 99999999-05530009.csv → 05530009.csv
Copied: 99999999-05530010.csv → 05530010.csv
Copied: 99999999-05530011.csv → 05530011.csv
Copied: 99999999-05530012.csv → 05530012.csv
Copied: 99999999-05530013.csv → 05530013.csv
Copied: 99999999-05530014.csv → 05530014.csv
Copied: 99999999-05530015.csv → 05530015.csv
Copied: 99999999-05530016.csv → 05530016.csv
Copied: 99999999-05530017.csv → 05530017.csv
Copied: 99999999-05530018.csv → 05530018.csv
Copied: 99999999-05530019.csv → 05530019.csv
Copied: 99999999-05530020.csv → 05530020.csv
Copied: 99999999-05530021.csv → 05530021.csv
Copied: 99999999-05530022.csv → 05530022.csv
Copied: 99

In [27]:
import os
import json
import random
from collections import defaultdict

# Directory holding the per-voxel CSV files (a single class directory here)
bl_dir = "/esail4/heeju/REGRESSION/Point-M2AE/data/segmentation2/00000000"

# Output directory for the JSON split lists
output_dir = "/esail4/heeju/REGRESSION/Point-M2AE/data/segmentation2/classified_json_grouped"
os.makedirs(output_dir, exist_ok=True)

# Fix the seed so the split can be reproduced on a rerun
random.seed(42)

# Collect all files, grouped by the first four characters of the file name
file_groups = defaultdict(list)

# os.listdir returns names in arbitrary order; sorting makes the group order,
# and therefore the seeded shuffle below, deterministic.
for file in sorted(os.listdir(bl_dir)):
    if not file.endswith(".csv"):
        continue
    group_key = file[:4]  # group by the first four characters
    path = f"segmentation2/00000000/{os.path.splitext(file)[0]}"
    file_groups[group_key].append(path)


# Shuffle the group keys
group_keys = list(file_groups.keys())
random.shuffle(group_keys)

# Split by group: first 70% train, next 10% test, next 10% val.
# The remaining ~10% of groups are in none of those three lists; the
# inference list contains every group.
n = len(group_keys)
train_keys = group_keys[: int(n * 0.7)]
test_keys  = group_keys[int(n * 0.7) : int(n * 0.8)]
val_keys   = group_keys[int(n * 0.8) : int(n * 0.9)]
inf_keys   = group_keys[:]

# Expand the group keys into flat file lists
train_paths = [path for key in train_keys for path in file_groups[key]]
test_paths  = [path for key in test_keys  for path in file_groups[key]]
val_paths   = [path for key in val_keys   for path in file_groups[key]]
inf_paths   = [path for key in inf_keys   for path in file_groups[key]]

# Write one list as JSON into output_dir
def save_json(data, filename):
    """Dump ``data`` as indented JSON to ``filename`` inside ``output_dir``."""
    with open(os.path.join(output_dir, filename), "w") as f:
        json.dump(data, f, indent=4)

# Save the four lists
save_json(train_paths, "leafwood_data_train.json")
save_json(test_paths,  "leafwood_data_test.json")
save_json(val_paths,   "leafwood_data_val.json")
save_json(inf_paths,   "leafwood_data_inference.json")

# Summary
print(f"Train groups : {len(train_keys)}")
print(f"Test  groups : {len(test_keys)}")
print(f"Val   groups : {len(val_keys)}")
print(f"Infer groups : {len(inf_keys)}")
print(f"Saved to     : {output_dir}")


Train groups : 19
Test  groups : 3
Val   groups : 3
Infer groups : 28
Saved to     : /esail4/heeju/REGRESSION/Point-M2AE/data/segmentation2/classified_json_grouped


### JSON split lists for two class directories

In [10]:
import os
import json
import random
from collections import defaultdict

# Fix the seed for reproducibility (change it if desired)
random.seed(42)

# Path settings
data_root = "/esail4/heeju/REGRESSION/Point-M2AE/data"
base_dir  = os.path.join(data_root, "06masked")       # dataset directory being split
class_dirs = ["22222222", "33333333"]                 # the two class folders
output_dir = os.path.join(base_dir, "classified_json_grouped")
os.makedirs(output_dir, exist_ok=True)

# Relative paths in the JSON start with the dataset directory name, taken from
# base_dir so the two cannot drift apart.
base_name = os.path.basename(base_dir)

# 1) Collect files and group them by the first four characters of the name
file_groups = defaultdict(list)  # key: first four characters, value: relative paths
total_files = 0

for cls in class_dirs:
    cls_dir = os.path.join(base_dir, cls)
    if not os.path.isdir(cls_dir):
        print(f"[WARN] 디렉토리 없음: {cls_dir}")
        continue

    # os.listdir returns names in arbitrary order; without sorting, the seed
    # above would not be enough to reproduce the same split.
    for file in sorted(os.listdir(cls_dir)):
        if not file.endswith(".csv"):
            continue
        total_files += 1
        group_key = file[:4]  # first four characters
        stem = os.path.splitext(file)[0]
        rel_path = f"{base_name}/{cls}/{stem}"
        file_groups[group_key].append(rel_path)

print(f"[INFO] 총 CSV 파일 수: {total_files}")
print(f"[INFO] 그룹 수(앞4자리): {len(file_groups)}")

# 2) Shuffle the group keys and split: first 70% train, next 10% test,
#    next 10% val.  The remaining ~10% of groups are in none of those three
#    lists; the inference list contains every group.
group_keys = list(file_groups.keys())
random.shuffle(group_keys)

n = len(group_keys)
train_keys = group_keys[: int(n * 0.7)]
test_keys  = group_keys[int(n * 0.7) : int(n * 0.8)]
val_keys   = group_keys[int(n * 0.8) : int(n * 0.9)]
inf_keys   = group_keys[:]  # all groups

# 3) Expand the group keys into flat path lists
train_paths = [p for k in train_keys for p in file_groups[k]]
test_paths  = [p for k in test_keys  for p in file_groups[k]]
val_paths   = [p for k in val_keys   for p in file_groups[k]]
inf_paths   = [p for k in inf_keys   for p in file_groups[k]]

# 4) JSON writer
def save_json(data, filename):
    """Dump ``data`` as indented JSON to ``filename`` inside ``output_dir``."""
    with open(os.path.join(output_dir, filename), "w") as f:
        json.dump(data, f, indent=4)

# 5) Save the four lists
save_json(train_paths, "leafwood_data_train.json")
save_json(test_paths,  "leafwood_data_test.json")
save_json(val_paths,   "leafwood_data_val.json")
save_json(inf_paths,   "leafwood_data_inference.json")

# 6) Summary
print(f"Train groups : {len(train_keys)}  | files: {len(train_paths)}")
print(f"Test  groups : {len(test_keys)}   | files: {len(test_paths)}")
print(f"Val   groups : {len(val_keys)}    | files: {len(val_paths)}")
print(f"Infer groups : {len(inf_keys)}    | files: {len(inf_paths)}")
print(f"Saved to     : {output_dir}")


[INFO] 총 CSV 파일 수: 24
[INFO] 그룹 수(앞4자리): 24
Train groups : 16  | files: 16
Test  groups : 3   | files: 3
Val   groups : 2    | files: 2
Infer groups : 24    | files: 24
Saved to     : /esail4/heeju/REGRESSION/Point-M2AE/data/06masked/classified_json_grouped


### merging

In [12]:
import os
import pandas as pd
from collections import defaultdict

# Input directory and the sub-directory that receives the merged files
src_dir = '/esail4/heeju/REGRESSION/Point-M2AE/segmentation2/comb1_result_csv_0826/00000001'
output_dir = os.path.join(src_dir, 'merged_noheader')
os.makedirs(output_dir, exist_ok=True)

# Bucket the CSV file names by their first four characters
grouped_files = defaultdict(list)

for filename in os.listdir(src_dir):
    if not filename.endswith('.csv'):
        continue
    grouped_files[filename[:4]].append(filename)

# Stack each bucket's files vertically, in sorted file-name order
for key, files in grouped_files.items():
    # The inputs carry no header row, hence header=None
    dfs = [pd.read_csv(os.path.join(src_dir, file), header=None)
           for file in sorted(files)]

    merged_df = pd.concat(dfs, axis=0, ignore_index=True)
    output_path = os.path.join(output_dir, f"{key}_merged.csv")
    # The merged file is written without a header row or index as well
    merged_df.to_csv(output_path, index=False, header=False)

    print(f"Saved: {output_path} with {len(merged_df)} rows")


Saved: /esail4/heeju/REGRESSION/Point-M2AE/segmentation2/comb1_result_csv_0826/00000001/merged_noheader/0553_merged.csv with 189715 rows
Saved: /esail4/heeju/REGRESSION/Point-M2AE/segmentation2/comb1_result_csv_0826/00000001/merged_noheader/0678_merged.csv with 1429084 rows
Saved: /esail4/heeju/REGRESSION/Point-M2AE/segmentation2/comb1_result_csv_0826/00000001/merged_noheader/0680_merged.csv with 1375773 rows
Saved: /esail4/heeju/REGRESSION/Point-M2AE/segmentation2/comb1_result_csv_0826/00000001/merged_noheader/0683_merged.csv with 1674596 rows
Saved: /esail4/heeju/REGRESSION/Point-M2AE/segmentation2/comb1_result_csv_0826/00000001/merged_noheader/0746_merged.csv with 416870 rows
Saved: /esail4/heeju/REGRESSION/Point-M2AE/segmentation2/comb1_result_csv_0826/00000001/merged_noheader/1019_merged.csv with 468835 rows
Saved: /esail4/heeju/REGRESSION/Point-M2AE/segmentation2/comb1_result_csv_0826/00000001/merged_noheader/1059_merged.csv with 1676077 rows
Saved: /esail4/heeju/REGRESSION/Point

In [36]:
import os
import pandas as pd

# Input directory (merged, headerless CSVs) and output directory
input_dir = '/esail4/heeju/REGRESSION/Point-M2AE/segmentation2/result_csv_eval0820_2/00000000/merged_noheader'
output_dir = '/esail4/heeju/PoinTr/data/EVAL_GT'
os.makedirs(output_dir, exist_ok=True)

# Process every CSV file in the input directory
for filename in os.listdir(input_dir):
    if not filename.endswith('.csv'):
        continue

    input_path = os.path.join(input_dir, filename)
    df = pd.read_csv(input_path, header=None)  # no header row

    # Keep only rows whose column index 4 (the fifth column) equals 0,
    # then keep the first three columns of those rows.
    zero_mask = df[4] == 0
    df_filtered = df[zero_mask].values
    print(df_filtered.shape)
    df_filtered = pd.DataFrame(df_filtered[:, 0:3])
    print(df_filtered.shape)

    # Write under the same file name in the output directory
    output_path = os.path.join(output_dir, filename)
    df_filtered.to_csv(output_path, index=False, header=False)

    print(f"Processed: {filename} → {len(df_filtered)} rows saved")


(918476, 5)
(918476, 3)
Processed: 01_merged.csv → 918476 rows saved
(691273, 5)
(691273, 3)
Processed: 02_merged.csv → 691273 rows saved
(537502, 5)
(537502, 3)
Processed: 03_merged.csv → 537502 rows saved
(297085, 5)
(297085, 3)
Processed: 12_merged.csv → 297085 rows saved
(472469, 5)
(472469, 3)
Processed: 13_merged.csv → 472469 rows saved
(1113211, 5)
(1113211, 3)
Processed: 04_merged.csv → 1113211 rows saved
(444659, 5)
(444659, 3)
Processed: 05_merged.csv → 444659 rows saved
(272101, 5)
(272101, 3)
Processed: 14_merged.csv → 272101 rows saved
(209886, 5)
(209886, 3)
Processed: 15_merged.csv → 209886 rows saved
(536082, 5)
(536082, 3)
Processed: 06_merged.csv → 536082 rows saved
(462240, 5)
(462240, 3)
Processed: 07_merged.csv → 462240 rows saved
(937111, 5)
(937111, 3)
Processed: 16_merged.csv → 937111 rows saved
(440570, 5)
(440570, 3)
Processed: 17_merged.csv → 440570 rows saved
(776846, 5)
(776846, 3)
Processed: 08_merged.csv → 776846 rows saved
(2089422, 5)
(2089422, 3)
Proce

In [4]:
import os
import json
import pandas as pd

# Path settings
json_path = "/esail4/heeju/REGRESSION/Point-M2AE/data/segmentation/classified_json_grouped/leafwood_data_inference.json"
base_dir = "/esail4/heeju/REGRESSION/Point-M2AE/data/"
save_dir = "/esail4/heeju/REGRESSION/Point-M2AE/data/forinference"

os.makedirs(save_dir, exist_ok=True)

# Read the list of relative paths (without the '.csv' extension)
with open(json_path, "r") as f:
    file_list = json.load(f)

# Group the paths by the first four characters of the file name
groups = {}
for path in file_list:
    filename = os.path.basename(path)  # e.g. "06180002"
    prefix = filename[:4]              # first four characters (e.g. "0618")
    groups.setdefault(prefix, []).append(path)

# Merge each group into one CSV
for prefix, paths in groups.items():
    dfs = []
    for p in paths:
        csv_path = os.path.join(base_dir, p + ".csv")  # actual CSV path
        if not os.path.exists(csv_path):
            print(f"Missing file: {csv_path}")
            continue
        try:
            # The files have no header row.  Without header=None the first
            # point of every file would become its column labels: that point
            # is lost on write (header=False) and, because the labels differ
            # per file, concat would misalign the columns and fill with NaN.
            df = pd.read_csv(csv_path, header=None)
        except Exception as e:
            print(f"Error reading {csv_path}: {e}")
            continue
        dfs.append(df)

    if dfs:
        merged = pd.concat(dfs, axis=0, ignore_index=True)  # stack vertically
        out_path = os.path.join(save_dir, f"{prefix}0000.csv")
        merged.to_csv(out_path, index=False, header=False)
        print(f"Saved: {out_path}")


Saved: /esail4/heeju/REGRESSION/Point-M2AE/data/forinference/09340000.csv
Saved: /esail4/heeju/REGRESSION/Point-M2AE/data/forinference/06180000.csv
Saved: /esail4/heeju/REGRESSION/Point-M2AE/data/forinference/05870000.csv
Saved: /esail4/heeju/REGRESSION/Point-M2AE/data/forinference/05530000.csv
Saved: /esail4/heeju/REGRESSION/Point-M2AE/data/forinference/05570000.csv
Saved: /esail4/heeju/REGRESSION/Point-M2AE/data/forinference/21610000.csv
Saved: /esail4/heeju/REGRESSION/Point-M2AE/data/forinference/16850000.csv
Saved: /esail4/heeju/REGRESSION/Point-M2AE/data/forinference/16890000.csv
Saved: /esail4/heeju/REGRESSION/Point-M2AE/data/forinference/04710000.csv
Saved: /esail4/heeju/REGRESSION/Point-M2AE/data/forinference/12910000.csv
Saved: /esail4/heeju/REGRESSION/Point-M2AE/data/forinference/01420000.csv
Saved: /esail4/heeju/REGRESSION/Point-M2AE/data/forinference/13000000.csv
Saved: /esail4/heeju/REGRESSION/Point-M2AE/data/forinference/15050000.csv
Saved: /esail4/heeju/REGRESSION/Point-

In [5]:
import os
import re
import json
import shutil
from pathlib import Path
from collections import defaultdict

# Path settings
base_dir = Path("/esail4/heeju/REGRESSION/Point-M2AE/data")
json_path = base_dir / "segmentation" / "classified_json_grouped" / "leafwood_data_inference.json"
forinf_dir = base_dir / "forinference"

# Safety check: both the JSON file and the forinference folder must exist
if not json_path.exists():
    raise FileNotFoundError(f"JSON이 없습니다: {json_path}")
if not forinf_dir.exists():
    raise FileNotFoundError(f"forinference 폴더가 없습니다: {forinf_dir}")

# 1) Read the JSON and build the mapping: 4-character prefix -> set of groups
with open(json_path, "r") as f:
    items = json.load(f)

prefix_to_groups = defaultdict(set)
pat = re.compile(r"^segmentation/([^/]+)/(\d{8})$")  # segmentation/<group>/<8digits>

for it in items:
    m = pat.match(it.strip())
    if m:
        group, eight = m.groups()
        prefix_to_groups[eight[:4]].add(group)
    # entries that do not match the expected format are ignored

# 2) Distribute the '<8digits>.csv' files found directly in forinference
csv_pat = re.compile(r"^(\d{8})\.csv$")
moved = copied = skipped = not_found_prefix = 0

for entry in sorted(forinf_dir.iterdir()):
    # Only regular files named '<8digits>.csv' are handled
    m = csv_pat.match(entry.name) if entry.is_file() else None
    if not m:
        continue

    eight = m.group(1)          # e.g. 06180000
    prefix = eight[:4]          # e.g. 0618
    groups = sorted(prefix_to_groups.get(prefix, []))

    if len(groups) == 0:
        print(f"[WARN] 매칭되는 그룹이 없어 스킵: {entry.name} (prefix={prefix})")
        not_found_prefix += 1
        continue

    # Make sure every target group folder exists
    group_dirs = [forinf_dir / g for g in groups]
    for gd in group_dirs:
        gd.mkdir(parents=True, exist_ok=True)

    # Move the file into the first group (alphabetically) ...
    first_dst = group_dirs[0] / entry.name
    try:
        # ... replacing any file already present at the destination
        if first_dst.exists():
            first_dst.unlink()
        shutil.move(str(entry), str(first_dst))
    except Exception as e:
        print(f"[ERROR] 이동 실패: {entry.name} → {groups[0]} :: {e}")
        skipped += 1
        continue
    else:
        moved += 1
        print(f"[MOVE] {entry.name} → {groups[0]}/{entry.name}")

    # ... and copy it from there into each remaining group
    for gdir in group_dirs[1:]:
        dst = gdir / first_dst.name
        try:
            if not dst.exists():
                shutil.copy2(str(first_dst), str(dst))
                copied += 1
                print(f"[COPY] {first_dst.name} → {gdir.name}/{dst.name}")
            else:
                # An existing copy is left untouched (not counted as skipped)
                print(f"[SKIP] 이미 존재: {gdir.name}/{dst.name}")
        except Exception as e:
            print(f"[ERROR] 복사 실패: {first_dst.name} → {gdir.name} :: {e}")
            skipped += 1

print("\n=== 요약 ===")
print(f"이동(MOVE): {moved}")
print(f"복사(COPY): {copied}")
print(f"스킵/오류(SKIP): {skipped}")
print(f"매칭 없는 prefix 파일: {not_found_prefix}")


[MOVE] 00180000.csv → 33333333/00180000.csv
[MOVE] 00360000.csv → 33333333/00360000.csv
[MOVE] 00390000.csv → 33333333/00390000.csv
[MOVE] 00630000.csv → 33333333/00630000.csv
[MOVE] 00820000.csv → 33333333/00820000.csv
[MOVE] 00890000.csv → 33333333/00890000.csv
[MOVE] 01080000.csv → 33333333/01080000.csv
[MOVE] 01140000.csv → 33333333/01140000.csv
[MOVE] 01230000.csv → 33333333/01230000.csv
[MOVE] 01260000.csv → 33333333/01260000.csv
[MOVE] 01320000.csv → 33333333/01320000.csv
[MOVE] 01420000.csv → 33333333/01420000.csv
[MOVE] 01510000.csv → 33333333/01510000.csv
[MOVE] 01600000.csv → 33333333/01600000.csv
[MOVE] 01620000.csv → 33333333/01620000.csv
[MOVE] 01770000.csv → 33333333/01770000.csv
[MOVE] 01800000.csv → 33333333/01800000.csv
[MOVE] 01960000.csv → 33333333/01960000.csv
[MOVE] 02010000.csv → 33333333/02010000.csv
[MOVE] 02630000.csv → 33333333/02630000.csv
[MOVE] 02640000.csv → 33333333/02640000.csv
[MOVE] 02680000.csv → 33333333/02680000.csv
[MOVE] 02920000.csv → 33333333/0

In [30]:
import os
import glob
import pandas as pd
import torch
import numpy as np
import random

# ----- 다운샘플 함수: 선택 인덱스도 함께 반환 -----
def point_cloud_down_with_indices(xyz, ratio, center_sigma):
    """Sample a subset of points, weighted by distance from a random center.

    A center point is picked at random from the cloud.  Each point's sampling
    weight is the softmax of (distance to center / center_sigma), so the weight
    grows with distance from the center.  ``max(1, int(N * ratio))`` points are
    then drawn without replacement.

    Both the Python and torch global RNGs are re-seeded with 42 on every call.

    Args:
        xyz: torch.Tensor of shape (N, 3) or (1, N, 3).
        ratio: fraction of the N points to keep.
        center_sigma: divisor applied to the distances before the softmax.

    Returns:
        (points, sampled_idx): the selected points as a contiguous (1, M, 3)
        tensor and the indices of those points in the input.
    """
    random.seed(42)
    torch.manual_seed(42)

    # Accept a batched (1, N, 3) tensor by dropping the leading axis
    points = xyz.squeeze(0) if xyz.dim() == 3 else xyz

    num_points = points.shape[0]
    num_keep = max(1, int(num_points * ratio))

    center_index = torch.randint(0, num_points, (1,)).item()
    center = points[center_index]
    dist_to_center = torch.linalg.norm(points - center, dim=1)

    # softmax is used (rather than exp / sum) for numerical stability
    weights = torch.softmax(dist_to_center / center_sigma, dim=0)
    sampled_idx = torch.multinomial(weights, num_keep, replacement=False)

    selected = points[sampled_idx].unsqueeze(0).contiguous()  # (1, M, 3)
    return selected, sampled_idx

# ----- Paths / hyperparameters -----
ROOT = "/esail4/heeju/REGRESSION/Point-M2AE/data"
FORINF = os.path.join(ROOT, "forinference")
OUTROOT = os.path.join(ROOT, "09masked")

RATIO = 0.6       # fraction of points to keep (passed as `ratio` to the downsampler)
CENTER_SIGMA = 0.5  # scale dividing the distance-to-center before the softmax weighting

GROUPS = ["22222222", "33333333"]
os.makedirs(OUTROOT, exist_ok=True)

def process_group(group_name: str):
    """Downsample every CSV of one group folder and write the result.

    Reads each ``*.csv`` under ``FORINF/<group_name>`` (no header, delimiter
    auto-detected), normalizes it to the columns (x, y, z, type) with
    type = -1 when the file has only three columns, selects points with
    ``point_cloud_down_with_indices`` using ``RATIO`` and ``CENTER_SIGMA``, and
    writes the selected (x, y, z, type) rows under the same file name in
    ``OUTROOT/<group_name>`` without header or index.

    Any error while handling a file is printed and the next file is processed.
    """
    src_dir = os.path.join(FORINF, group_name)
    dst_dir = os.path.join(OUTROOT, group_name)
    os.makedirs(dst_dir, exist_ok=True)

    csv_list = sorted(glob.glob(os.path.join(src_dir, "*.csv")))
    if not csv_list:
        print(f"[INFO] No CSVs in {src_dir}")
        return

    for csv_path in csv_list:
        try:
            # No header row; let pandas detect the delimiter
            df = pd.read_csv(csv_path, header=None, sep=None, engine="python")

            if df.shape[1] < 3:
                print(f"[SKIP] {csv_path}: 열이 3개 미만입니다({df.shape[1]}개).")
                continue

            # Normalize to (x, y, z, type)
            if df.shape[1] >= 4:
                df = df.iloc[:, :4]
                df.columns = ['x','y','z','type']
            else:
                df.columns = ['x','y','z']
                df['type'] = -1

            # Coordinates as float32; labels as int32 when they convert cleanly
            xyz_np = df[['x','y','z']].to_numpy(dtype=np.float32)
            try:
                type_np = df['type'].to_numpy(dtype=np.int32)
            except Exception:
                # Keep the labels in whatever dtype they were read as
                type_np = df['type'].to_numpy()

            pts = torch.from_numpy(xyz_np)

            # Downsample and get the indices of the selected points
            down, sampled_idx = point_cloud_down_with_indices(
                pts, ratio=RATIO, center_sigma=CENTER_SIGMA
            )
            down_np = down.squeeze(0).cpu().numpy()
            sel_idx = sampled_idx.cpu().numpy()

            # Build the output column by column.  Stacking the float32
            # coordinates and the integer labels into one array (np.hstack)
            # promotes everything to a common float dtype, which turns the
            # labels into floats; separate columns keep each dtype.
            out_df = pd.DataFrame(down_np)
            out_df[3] = type_np[sel_idx]

            fname = os.path.basename(csv_path)  # e.g. 06180000.csv
            out_path = os.path.join(dst_dir, fname)
            out_df.to_csv(out_path, index=False, header=False)
            print(f"[OK] {group_name} :: {fname} → {out_path} (N={len(df)}, M={len(out_df)})")

        except Exception as e:
            print(f"[ERROR] {csv_path}: {e}")

# Run the downsampling pass for every configured group folder
for g in GROUPS:
    process_group(g)


[OK] 22222222 :: 06620000.csv → /esail4/heeju/REGRESSION/Point-M2AE/data/09masked/22222222/06620000.csv (N=1800816, M=1080489)
[OK] 22222222 :: 06630000.csv → /esail4/heeju/REGRESSION/Point-M2AE/data/09masked/22222222/06630000.csv (N=6256194, M=3753716)
[OK] 22222222 :: 06640000.csv → /esail4/heeju/REGRESSION/Point-M2AE/data/09masked/22222222/06640000.csv (N=3564413, M=2138647)
[OK] 22222222 :: 07670000.csv → /esail4/heeju/REGRESSION/Point-M2AE/data/09masked/22222222/07670000.csv (N=763341, M=458004)
[OK] 22222222 :: 08620000.csv → /esail4/heeju/REGRESSION/Point-M2AE/data/09masked/22222222/08620000.csv (N=493387, M=296032)
[OK] 22222222 :: 15320000.csv → /esail4/heeju/REGRESSION/Point-M2AE/data/09masked/22222222/15320000.csv (N=568711, M=341226)
[OK] 22222222 :: 16270000.csv → /esail4/heeju/REGRESSION/Point-M2AE/data/09masked/22222222/16270000.csv (N=2377039, M=1426223)
[OK] 22222222 :: 17270000.csv → /esail4/heeju/REGRESSION/Point-M2AE/data/09masked/22222222/17270000.csv (N=2215852, M