In [None]:
import os
import pandas as pd
import shutil

In [None]:
# Root folder where the slides are stored and under which the new sub-folders are created
base_dir = '/home/mxn2498/projects/PathologySearchComparison/DATA/DATABASE'

# Metadata CSV describing the slides (point this at your own file)
csv_file_path = 'sampled_metadata.csv'

# Load the metadata table into a DataFrame
df = pd.read_csv(filepath_or_buffer=csv_file_path)

# primary_site value (as it appears in the CSV) -> name of the top-level folder
primary_site_mapping = dict([
    ("Brain", "brain"),
    ("Breast", "breast"),
    ("Liver and intrahepatic bile ducts", "liver"),
    ("Bronchus and lung", "lung"),
    ("Colon", "colon"),
])

# project_name value (as it appears in the CSV) -> short code used as the sub-folder name
project_name_mapping = {
    full_name: short_code
    for full_name, short_code in (
        ("Glioblastoma Multiforme", "GBM"),
        ("Brain Lower Grade Glioma", "LGG"),
        ("Breast Invasive Carcinoma", "BRCA"),
        ("Lung Adenocarcinoma", "LUAD"),
        ("Lung Squamous Cell Carcinoma", "LUSC"),
        ("Colon Adenocarcinoma", "COAD"),
        ("Liver Hepatocellular Carcinoma", "LIHC"),
        ("Cholangiocarcinoma", "CHOL"),
    )
}

In [None]:
# Ensure every <base_dir>/<site>/<project> destination directory exists.
# NOTE(review): this builds the full site x project grid, so combinations that
# never occur together (e.g. brain/BRCA) are created as empty folders — confirm
# whether that is intended before narrowing it.
for dir_name in primary_site_mapping.values():
    dir_path = os.path.join(base_dir, dir_name)
    # exist_ok=True replaces the separate os.path.exists() check: re-running the
    # cell is safe, there is no check-then-create race, and a regular file
    # sitting at the path raises instead of being silently ignored.
    os.makedirs(dir_path, exist_ok=True)
    for subdir in project_name_mapping.values():
        subdir_path = os.path.join(dir_path, subdir)
        os.makedirs(subdir_path, exist_ok=True)

# Move every slide listed in the CSV from <base_dir>/<id>/<file_name> to
# <base_dir>/<site>/<project>/<file_name>, then delete the emptied id folders.
#
# Id folders are removed only AFTER all moves have finished: deleting a folder
# right after its first file is moved would destroy any other listed file that
# lives in the same folder before its own row is processed.
# If the run is interrupted, the not-yet-removed id folders are simply left behind.
id_dirs_to_remove = set()
moved_count = 0
skipped_unroutable = 0
skipped_missing = 0

row_values = zip(df['primary_site'], df['project_name'], df['id'], df['file_name'])
for primary_site, project_name, slide_id, file_name in row_values:
    primary_site_dir = primary_site_mapping.get(primary_site)
    project_subdir = project_name_mapping.get(project_name)
    # Skip rows whose site/project is not in the mappings, and rows whose
    # file name is missing (pandas yields a float NaN, which os.path.join rejects).
    if not (primary_site_dir and project_subdir and isinstance(file_name, str)):
        skipped_unroutable += 1
        continue

    source_path = os.path.join(base_dir, str(slide_id), file_name)
    target_dir = os.path.join(base_dir, primary_site_dir, project_subdir)
    target_path = os.path.join(target_dir, file_name)

    if not os.path.exists(source_path):
        skipped_missing += 1
        continue

    # Create the destination on demand so this cell does not depend on the
    # folder having been prepared beforehand.
    os.makedirs(target_dir, exist_ok=True)
    shutil.move(source_path, target_path)
    moved_count += 1
    id_dirs_to_remove.add(os.path.dirname(source_path))

# Delete the id folders whether or not they are empty (any leftover content in
# them is discarded), but never base_dir itself should an id resolve to it.
base_dir_real = os.path.realpath(base_dir)
for id_dir_path in id_dirs_to_remove:
    if os.path.realpath(id_dir_path) == base_dir_real:
        continue
    shutil.rmtree(id_dir_path, ignore_errors=True)

print("Moved {} file(s); skipped {} row(s) with an unmapped site/project or no file name, "
      "and {} row(s) whose source file was not found.".format(
          moved_count, skipped_unroutable, skipped_missing))