In [1]:
import os
import shutil
import re


dir_list = [
    'results_tf_2_diabetes',
    'results_tf_3_diabetes',
    'results_tf_4_diabetes',
    'results_tf_5_diabetes',
    'results_tf_6_diabetes',
]

# Matches a YYYY-MM-DD date anywhere in a filename. CSVs whose names contain
# such a date are left where they are.
date_pattern = re.compile(r'\d{4}-\d{2}-\d{2}')


def copy_undated_csvs(dir_path, pattern=date_pattern):
    """Copy undated CSV files from the subdirectories of ``dir_path`` up into it.

    Every immediate subdirectory of ``dir_path`` is scanned. A file is copied
    to ``dir_path`` when its name ends with ``.csv`` and ``pattern`` does not
    match the name. A file whose name already exists in ``dir_path`` is never
    overwritten.

    Subdirectories and their files are visited in sorted name order, so when
    the same filename occurs in several subdirectories the copy from the
    first one (by name) is the one kept, and re-running gives the same result.

    Args:
        dir_path: Directory whose subdirectories are scanned and which
            receives the copies.
        pattern: Compiled regex; filenames it matches are skipped.

    Returns:
        List of filenames that were copied successfully, in copy order.
        Empty if ``dir_path`` is not an existing directory.
    """
    copied = []

    # A missing result directory should not abort the remaining directories.
    if not os.path.isdir(dir_path):
        print(f"Skipping {dir_path} (directory not found)")
        return copied

    # os.listdir returns entries in arbitrary order; sort for reproducibility.
    sub_dirs = sorted(
        d for d in os.listdir(dir_path)
        if os.path.isdir(os.path.join(dir_path, d))
    )

    for date_dir in sub_dirs:
        date_dir_path = os.path.join(dir_path, date_dir)

        for file in sorted(os.listdir(date_dir_path)):
            # Only CSV files without a date in their name are candidates.
            if not file.endswith('.csv') or pattern.search(file):
                continue

            src_file = os.path.join(date_dir_path, file)
            dst_file = os.path.join(dir_path, file)

            # First copy wins: never overwrite a file already in place.
            if os.path.exists(dst_file):
                print(f"Skipping {file} (already exists in destination)")
                continue

            print(f"Copying {file} from {date_dir} to {dir_path}")
            try:
                shutil.copy2(src_file, dst_file)
            except OSError as e:
                # Best effort: report the failure and carry on with the rest.
                print(f"Error copying {file}: {e}")
            else:
                copied.append(file)

    return copied


# Process each directory
for dir_path in dir_list:
    print(f"Processing {dir_path}...")
    copy_undated_csvs(dir_path)

print("Finished copying files")

Processing results_tf_2_diabetes...
Copying diabetes_train_n_seed5_constrained.csv from 2025-03-23_20-15-00 to results_tf_2_diabetes
Copying diabetes_train_n_seed0.csv from 2025-03-24_00-25-18 to results_tf_2_diabetes
Copying diabetes_cp_n_seed7.csv from 2025-03-24_06-40-36 to results_tf_2_diabetes
Copying diabetes_cp_n_seed6_constrained.csv from 2025-03-23_22-20-03 to results_tf_2_diabetes
Copying diabetes_train_n_seed4.csv from 2025-03-24_04-35-30 to results_tf_2_diabetes
Copying diabetes_cp_n_seed0.csv from 2025-03-24_00-25-16 to results_tf_2_diabetes
Copying diabetes_train_n_seed3.csv from 2025-03-24_02-30-25 to results_tf_2_diabetes
Copying diabetes_train_n_seed4_constrained.csv from 2025-03-23_20-14-59 to results_tf_2_diabetes
Copying diabetes_cp_n_seed0_constrained.csv from 2025-03-23_16-04-45 to results_tf_2_diabetes
Copying diabetes_ring_n_seed5_constrained.csv from 2025-03-23_20-15-02 to results_tf_2_diabetes
Copying diabetes_cp_n_seed8.csv from 2025-03-24_08-45-41 to results

In [4]:
import os
import pandas as pd


dir_list = [
    'results_tf_2_diabetes',
    'results_tf_3_diabetes',
    'results_tf_4_diabetes',
    'results_tf_5_diabetes',
    'results_tf_6_diabetes',
]

# Expected file grid: one CSV per (type, seed, constrained?) combination.
types = ["cp", "ring", "train"]
seeds = list(range(10))  # 0 to 9
constrained = [True, False]  # With and without _constrained suffix


def expected_filename(file_type, seed, is_constrained):
    """Return the CSV filename expected for one (type, seed, constrained) cell."""
    suffix = "_constrained" if is_constrained else ""
    return f"diabetes_{file_type}_n_seed{seed}{suffix}.csv"


# Built once: the grid is the same for every directory.
expected_by_type = {
    t: [expected_filename(t, s, c) for s in seeds for c in constrained]
    for t in types
}
expected_files = [f for t in types for f in expected_by_type[t]]


def list_csv_files(dir_path):
    """Return the sorted names of regular ``*.csv`` files directly in ``dir_path``.

    Subdirectories are not descended into. If ``dir_path`` is not an existing
    directory a warning is printed and an empty list is returned.
    """
    if not os.path.isdir(dir_path):
        print(f"WARNING: {dir_path} is not a directory; "
              f"treating every expected file as missing")
        return []
    return sorted(
        f for f in os.listdir(dir_path)
        if f.endswith('.csv') and os.path.isfile(os.path.join(dir_path, f))
    )


def analyze_directory(dir_path):
    """Print a missing-file report for ``dir_path`` and return its availability table.

    Args:
        dir_path: Directory expected to contain the files in ``expected_files``.

    Returns:
        DataFrame with one row per (type, seed) and the columns ``type``,
        ``seed``, ``constrained`` and ``non_constrained``; the two boolean
        columns are True when the corresponding file exists. A directory that
        does not exist yields a table that is False everywhere, so callers
        that index results by position stay aligned with ``dir_list``.
    """
    print(f"\n{'='*80}")
    print(f"ANALYZING DIRECTORY: {dir_path}")
    print(f"{'='*80}")

    # CSV files directly in the directory (date subdirectories are excluded).
    existing_files = list_csv_files(dir_path)
    existing = set(existing_files)  # set: O(1) membership tests below

    missing_by_type = {
        t: [f for f in expected_by_type[t] if f not in existing]
        for t in types
    }
    missing_files = [f for t in types for f in missing_by_type[t]]
    # CSVs that are present but not part of the expected grid. They are
    # included in the "Found files" count, so list them explicitly.
    unexpected_files = sorted(existing.difference(expected_files))

    df = pd.DataFrame([
        {
            "type": t,
            "seed": s,
            "constrained": expected_filename(t, s, True) in existing,
            "non_constrained": expected_filename(t, s, False) in existing,
        }
        for t in types
        for s in seeds
    ])

    # Display summary
    print(f"Directory: {dir_path}")
    print(f"Total expected files: {len(expected_files)}")
    print(f"Found files: {len(existing_files)}")
    print(f"Missing files: {len(missing_files)}")
    if unexpected_files:
        print(f"Unexpected CSV files (not in the expected set): {len(unexpected_files)}")
        for f in unexpected_files:
            print(f"  - {f}")

    # Display missing files by type
    print("\nMissing files by type:")
    for t in types:
        print(f"\n{t.upper()} - Missing {len(missing_by_type[t])} files:")
        for f in missing_by_type[t]:
            print(f"  - {f}")

    # Display in table format
    print("\nAvailability table (True = file exists, False = file missing):")
    print(df.pivot(index="seed", columns="type"))

    return df


# One availability table per directory, in dir_list order.
result_df_list = []

for dir_path in dir_list:
    result_df_list.append(analyze_directory(dir_path))
    print()


ANALYZING DIRECTORY: results_tf_2_diabetes
Directory: results_tf_2_diabetes
Total expected files: 60
Found files: 57
Missing files: 3

Missing files by type:

CP - Missing 2 files:
  - diabetes_cp_n_seed2.csv
  - diabetes_cp_n_seed9_constrained.csv

RING - Missing 1 files:
  - diabetes_ring_n_seed4.csv

TRAIN - Missing 0 files:

Availability table (True = file exists, False = file missing):
     constrained             non_constrained             
type          cp  ring train              cp   ring train
seed                                                     
0           True  True  True            True   True  True
1           True  True  True            True   True  True
2           True  True  True           False   True  True
3           True  True  True            True   True  True
4           True  True  True            True  False  True
5           True  True  True            True   True  True
6           True  True  True            True   True  True
7           True  True  T

In [None]:
# Availability table for dir_list[0] ('results_tf_2_diabetes').
result_df_list[0]

Unnamed: 0,type,seed,constrained,non_constrained
0,cp,0,True,True
1,cp,1,True,True
2,cp,2,True,False
3,cp,3,True,True
4,cp,4,True,True
5,cp,5,True,True
6,cp,6,True,True
7,cp,7,True,True
8,cp,8,True,True
9,cp,9,False,True


In [7]:
# Availability table for dir_list[1] ('results_tf_3_diabetes').
result_df_list[1]

Unnamed: 0,type,seed,constrained,non_constrained
0,cp,0,True,True
1,cp,1,True,True
2,cp,2,True,True
3,cp,3,True,True
4,cp,4,True,True
5,cp,5,True,True
6,cp,6,True,True
7,cp,7,True,True
8,cp,8,True,True
9,cp,9,True,True


In [8]:
# Availability table for dir_list[2] ('results_tf_4_diabetes').
result_df_list[2]

Unnamed: 0,type,seed,constrained,non_constrained
0,cp,0,False,True
1,cp,1,False,True
2,cp,2,True,False
3,cp,3,True,False
4,cp,4,False,False
5,cp,5,False,False
6,cp,6,True,False
7,cp,7,True,False
8,cp,8,True,False
9,cp,9,True,False


In [10]:
# Availability table for dir_list[3] ('results_tf_5_diabetes').
result_df_list[3]

Unnamed: 0,type,seed,constrained,non_constrained
0,cp,0,False,False
1,cp,1,False,False
2,cp,2,True,True
3,cp,3,True,True
4,cp,4,True,True
5,cp,5,True,True
6,cp,6,False,True
7,cp,7,True,True
8,cp,8,True,False
9,cp,9,True,True


In [11]:
# Availability table for dir_list[4] ('results_tf_6_diabetes').
result_df_list[4]

Unnamed: 0,type,seed,constrained,non_constrained
0,cp,0,False,True
1,cp,1,True,True
2,cp,2,False,False
3,cp,3,True,True
4,cp,4,False,False
5,cp,5,True,True
6,cp,6,True,False
7,cp,7,True,True
8,cp,8,False,False
9,cp,9,True,True


In [16]:
import os
import pandas as pd

# A CSV with this many data rows or fewer is reported as "small".
minimum_num = 50

dir_list = [
    'results_tf_2_diabetes',
    'results_tf_3_diabetes',
    'results_tf_4_diabetes',
    'results_tf_5_diabetes',
    'results_tf_6_diabetes',
]


def find_small_files(dir_path, max_rows=minimum_num):
    """Find CSV files directly in ``dir_path`` with at most ``max_rows`` data rows.

    Each ``*.csv`` file is read with ``pd.read_csv`` and the length of the
    resulting DataFrame is used as its row count. A completely empty file,
    which ``pd.read_csv`` rejects with ``EmptyDataError``, is counted as
    0 rows so it is reported as small rather than as a read error. Any other
    read failure is printed and the file is skipped.

    Args:
        dir_path: Directory to scan (subdirectories are not descended into).
        max_rows: Inclusive row-count threshold.

    Returns:
        List of ``(filename, row_count)`` tuples in sorted filename order.
        Empty if ``dir_path`` is not an existing directory.
    """
    small_files = []

    # A missing directory should not abort the checks for the others.
    if not os.path.isdir(dir_path):
        print(f"Directory not found: {dir_path}")
        return small_files

    # Sorted so the report order does not depend on os.listdir ordering.
    csv_files = sorted(
        f for f in os.listdir(dir_path)
        if f.endswith('.csv') and os.path.isfile(os.path.join(dir_path, f))
    )

    for file in csv_files:
        file_path = os.path.join(dir_path, file)
        try:
            row_count = len(pd.read_csv(file_path))
        except pd.errors.EmptyDataError:
            # Zero-byte / column-less file: nothing to parse, so 0 rows.
            row_count = 0
        except Exception as e:
            # Deliberately broad: one unreadable file must not stop the scan.
            print(f"Error reading {file}: {str(e)}")
            continue

        if row_count <= max_rows:
            small_files.append((file, row_count))
            print(f"Small file found: {file} - Only {row_count} rows")

    return small_files


for dir_path in dir_list:
    print(f"\n{'='*80}")
    print(f"CHECKING FILE SIZES IN: {dir_path}")
    print(f"{'='*80}")

    small_files = find_small_files(dir_path, minimum_num)

    # Summary
    if small_files:
        print(f"\nFound {len(small_files)} small files in {dir_path}:")
        for file, count in small_files:
            print(f"  - {file}: {count} rows")
    else:
        print(f"\nNo small files (<={minimum_num} rows) found in {dir_path}")


CHECKING FILE SIZES IN: results_tf_2_diabetes
Small file found: diabetes_cp_n_seed9.csv - Only 49 rows

Found 1 small files in results_tf_2_diabetes:
  - diabetes_cp_n_seed9.csv: 49 rows

CHECKING FILE SIZES IN: results_tf_3_diabetes

No small files (<=50 rows) found in results_tf_3_diabetes

CHECKING FILE SIZES IN: results_tf_4_diabetes

No small files (<=50 rows) found in results_tf_4_diabetes

CHECKING FILE SIZES IN: results_tf_5_diabetes

No small files (<=50 rows) found in results_tf_5_diabetes

CHECKING FILE SIZES IN: results_tf_6_diabetes

No small files (<=50 rows) found in results_tf_6_diabetes


# After filtering (フィルター後)

In [17]:
import os
import pandas as pd


dir_list = [
    'results_tf_2_diabetes',
    'results_tf_3_diabetes',
    'results_tf_4_diabetes',
    'results_tf_5_diabetes',
    'results_tf_6_diabetes',
]

# Expected file grid: one CSV per (type, seed, constrained?) combination.
types = ["cp", "ring", "train"]
seeds = list(range(10))  # 0 to 9
constrained = [True, False]  # With and without _constrained suffix


def expected_filename(file_type, seed, is_constrained):
    """Return the CSV filename expected for one (type, seed, constrained) cell."""
    suffix = "_constrained" if is_constrained else ""
    return f"diabetes_{file_type}_n_seed{seed}{suffix}.csv"


# Built once: the grid is the same for every directory.
expected_by_type = {
    t: [expected_filename(t, s, c) for s in seeds for c in constrained]
    for t in types
}
expected_files = [f for t in types for f in expected_by_type[t]]


def list_csv_files(dir_path):
    """Return the sorted names of regular ``*.csv`` files directly in ``dir_path``.

    Subdirectories are not descended into. If ``dir_path`` is not an existing
    directory a warning is printed and an empty list is returned.
    """
    if not os.path.isdir(dir_path):
        print(f"WARNING: {dir_path} is not a directory; "
              f"treating every expected file as missing")
        return []
    return sorted(
        f for f in os.listdir(dir_path)
        if f.endswith('.csv') and os.path.isfile(os.path.join(dir_path, f))
    )


def analyze_directory(dir_path):
    """Print a missing-file report for ``dir_path`` and return its availability table.

    Args:
        dir_path: Directory expected to contain the files in ``expected_files``.

    Returns:
        DataFrame with one row per (type, seed) and the columns ``type``,
        ``seed``, ``constrained`` and ``non_constrained``; the two boolean
        columns are True when the corresponding file exists. A directory that
        does not exist yields a table that is False everywhere, so callers
        that index results by position stay aligned with ``dir_list``.
    """
    print(f"\n{'='*80}")
    print(f"ANALYZING DIRECTORY: {dir_path}")
    print(f"{'='*80}")

    # CSV files directly in the directory (date subdirectories are excluded).
    existing_files = list_csv_files(dir_path)
    existing = set(existing_files)  # set: O(1) membership tests below

    missing_by_type = {
        t: [f for f in expected_by_type[t] if f not in existing]
        for t in types
    }
    missing_files = [f for t in types for f in missing_by_type[t]]
    # CSVs that are present but not part of the expected grid. They are
    # included in the "Found files" count, so list them explicitly.
    unexpected_files = sorted(existing.difference(expected_files))

    df = pd.DataFrame([
        {
            "type": t,
            "seed": s,
            "constrained": expected_filename(t, s, True) in existing,
            "non_constrained": expected_filename(t, s, False) in existing,
        }
        for t in types
        for s in seeds
    ])

    # Display summary
    print(f"Directory: {dir_path}")
    print(f"Total expected files: {len(expected_files)}")
    print(f"Found files: {len(existing_files)}")
    print(f"Missing files: {len(missing_files)}")
    if unexpected_files:
        print(f"Unexpected CSV files (not in the expected set): {len(unexpected_files)}")
        for f in unexpected_files:
            print(f"  - {f}")

    # Display missing files by type
    print("\nMissing files by type:")
    for t in types:
        print(f"\n{t.upper()} - Missing {len(missing_by_type[t])} files:")
        for f in missing_by_type[t]:
            print(f"  - {f}")

    # Display in table format
    print("\nAvailability table (True = file exists, False = file missing):")
    print(df.pivot(index="seed", columns="type"))

    return df


# One availability table per directory, in dir_list order.
result_df_list = []

for dir_path in dir_list:
    result_df_list.append(analyze_directory(dir_path))
    print()


ANALYZING DIRECTORY: results_tf_2_diabetes
Directory: results_tf_2_diabetes
Total expected files: 60
Found files: 56
Missing files: 4

Missing files by type:

CP - Missing 3 files:
  - diabetes_cp_n_seed2.csv
  - diabetes_cp_n_seed9_constrained.csv
  - diabetes_cp_n_seed9.csv

RING - Missing 1 files:
  - diabetes_ring_n_seed4.csv

TRAIN - Missing 0 files:

Availability table (True = file exists, False = file missing):
     constrained             non_constrained             
type          cp  ring train              cp   ring train
seed                                                     
0           True  True  True            True   True  True
1           True  True  True            True   True  True
2           True  True  True           False   True  True
3           True  True  True            True   True  True
4           True  True  True            True  False  True
5           True  True  True            True   True  True
6           True  True  True            True   True  Tr

In [24]:
# Write the five availability tables to CSV: the table at position idx of
# result_df_list is saved as diabetes_<idx + 2>.csv, without the index column.
for idx in range(5):
    result_df_list[idx].to_csv(f'diabetes_{idx + 2}.csv', index=False)

In [19]:
# Availability table for dir_list[1] ('results_tf_3_diabetes').
result_df_list[1]

Unnamed: 0,type,seed,constrained,non_constrained
0,cp,0,True,True
1,cp,1,True,True
2,cp,2,True,True
3,cp,3,True,True
4,cp,4,True,True
5,cp,5,True,True
6,cp,6,True,True
7,cp,7,True,True
8,cp,8,True,True
9,cp,9,True,True


In [20]:
# Availability table for dir_list[2] ('results_tf_4_diabetes').
result_df_list[2]

Unnamed: 0,type,seed,constrained,non_constrained
0,cp,0,False,True
1,cp,1,False,True
2,cp,2,True,False
3,cp,3,True,False
4,cp,4,False,False
5,cp,5,False,False
6,cp,6,True,False
7,cp,7,True,False
8,cp,8,True,False
9,cp,9,True,False


In [21]:
# Availability table for dir_list[3] ('results_tf_5_diabetes').
result_df_list[3]

Unnamed: 0,type,seed,constrained,non_constrained
0,cp,0,False,False
1,cp,1,False,False
2,cp,2,True,True
3,cp,3,True,True
4,cp,4,True,False
5,cp,5,True,True
6,cp,6,False,False
7,cp,7,True,True
8,cp,8,True,False
9,cp,9,True,True


In [22]:
# Availability table for dir_list[4] ('results_tf_6_diabetes').
result_df_list[4]

Unnamed: 0,type,seed,constrained,non_constrained
0,cp,0,False,True
1,cp,1,True,True
2,cp,2,False,False
3,cp,3,True,True
4,cp,4,False,False
5,cp,5,True,True
6,cp,6,True,False
7,cp,7,True,True
8,cp,8,False,False
9,cp,9,True,True
