In [1]:
import os
import csv
import shutil
import pandas as pd
from tqdm.auto import tqdm
import config

In [2]:
def combine_sessions_append(session_base_dir, aggregate_image_dir, aggregate_csv_path):
    """
    Combine per-session recordings into one aggregate image folder and CSV.

    Every sub-directory of ``session_base_dir`` whose name starts with
    ``session_`` is a candidate session; the directory name is used as the
    ``session_id``. Sessions whose id already appears in the ``session_id``
    column of ``aggregate_csv_path`` are skipped, so the function can be
    re-run as new sessions are recorded.

    For each new session, ``<session>/data.csv`` is read and every row whose
    image exists is copied to ``aggregate_image_dir`` as
    ``<session_id>_<original filename>``. The matching rows
    (``session_id``, ``image_path``, ``timestamp``, ``action``) are appended
    to the aggregate CSV, or written with a header when the CSV does not
    exist yet or is empty.

    A session is skipped (with a warning) when ``data.csv`` or ``images`` is
    missing, when ``data.csv`` is unreadable or empty, or when it lacks one of
    the ``image_path`` / ``timestamp`` / ``action`` columns. A single row is
    skipped when its ``image_path`` is blank, its image file is missing, or
    the copy fails.

    Args:
        session_base_dir: Directory that contains the ``session_*`` folders.
        aggregate_image_dir: Destination folder for the renamed images
            (created if needed).
        aggregate_csv_path: Aggregate CSV to create or append to.

    Returns:
        None. Progress, warnings and errors are reported with ``print``.
    """
    # Columns every session data.csv must provide; they are read per row below.
    required_columns = ('image_path', 'timestamp', 'action')

    os.makedirs(aggregate_image_dir, exist_ok=True)

    all_data = []

    try:
        session_dirs = sorted(
            d for d in os.listdir(session_base_dir)
            if os.path.isdir(os.path.join(session_base_dir, d)) and d.startswith('session_')
        )
    except FileNotFoundError:
        print(f"Error: Base session directory not found: {session_base_dir}")
        return

    print(f"Found {len(session_dirs)} sessions to check from '{session_base_dir}'.")

    # --- Determine which sessions are already processed (if CSV exists) ---
    existing_sessions = set()
    file_exists = os.path.exists(aggregate_csv_path)
    if file_exists:
        try:
            print(f"Reading existing sessions from: {aggregate_csv_path}")
            existing_df = pd.read_csv(aggregate_csv_path)
            if 'session_id' in existing_df.columns:
                existing_sessions = set(existing_df['session_id'].unique())
            print(f"Found {len(existing_sessions)} existing sessions.")
        except pd.errors.EmptyDataError:
            print(f"Warning: Existing CSV '{aggregate_csv_path}' is empty.")
            file_exists = False  # Treat as if it doesn't exist for writing header
        except Exception as e:
            # Deliberate best effort: without the existing ids every session
            # looks new, so the append below may duplicate rows.
            print(f"Error reading existing aggregate CSV: {e}. Will attempt to proceed, but caution advised.")

    # --- Process only new sessions ---
    sessions_to_process = [s for s in session_dirs if s not in existing_sessions]
    print(f"Found {len(sessions_to_process)} new sessions to process.")

    if not sessions_to_process:
        print("No new sessions to add. Exiting.")
        return

    for session_name in tqdm(sessions_to_process, desc="Processing New Sessions"):
        session_path = os.path.join(session_base_dir, session_name)
        session_csv = os.path.join(session_path, 'data.csv')
        session_img_dir = os.path.join(session_path, 'images')

        if not os.path.exists(session_csv) or not os.path.exists(session_img_dir):
            print(f"Warning: Skipping session {session_name}, missing data.csv or images directory.")
            continue

        try:
            df = pd.read_csv(session_csv)
        except Exception as e:
            print(f"Warning: Error reading {session_csv}, skipping session {session_name}. Error: {e}")
            continue

        if df.empty:
            print(f"Warning: Skipping session {session_name}, data.csv is empty.")
            continue

        # A malformed session must not abort the whole run with a KeyError
        # after images from earlier sessions have already been copied.
        missing_columns = [c for c in required_columns if c not in df.columns]
        if missing_columns:
            print(f"Warning: Skipping session {session_name}, data.csv is missing column(s): {', '.join(missing_columns)}.")
            continue

        print(f"Processing session: {session_name}, {len(df)} entries.")

        for index, row in tqdm(df.iterrows(), total=len(df), desc=f"  Processing {session_name}", leave=False):
            original_relative_path = row['image_path']

            # A blank image_path cell is parsed as NaN (a float), which
            # os.path.join cannot handle.
            if not isinstance(original_relative_path, str) or not original_relative_path:
                print(f"  Warning: Row {index} of {session_name} has no image_path, skipping.")
                continue

            original_absolute_path = os.path.join(session_path, original_relative_path)
            original_filename = os.path.basename(original_relative_path)

            if not os.path.exists(original_absolute_path):
                print(f"  Warning: Image not found, skipping: {original_absolute_path}")
                continue

            # Prefix with the session id so files from different sessions
            # cannot collide in the shared image folder.
            new_filename = f"{session_name}_{original_filename}"
            new_relative_path = os.path.join('images', new_filename)
            new_absolute_path = os.path.join(aggregate_image_dir, new_filename)

            try:
                # Skip the copy when the target is already there (e.g. from
                # an earlier, interrupted run).
                if not os.path.exists(new_absolute_path):
                    shutil.copy2(original_absolute_path, new_absolute_path)
            except Exception as e:
                print(f"  Error copying image {original_absolute_path} to {new_absolute_path}. Skipping. Error: {e}")
                continue

            all_data.append({
                'session_id': session_name,
                'image_path': new_relative_path,
                'timestamp': row['timestamp'],
                'action': row['action']
            })

    # --- Write new data (if any) ---
    if not all_data:
        print("\nNo new valid data found in session directories to add.")
        return

    new_df_to_write = pd.DataFrame(all_data, columns=['session_id', 'image_path', 'timestamp', 'action'])

    try:
        if file_exists:
            # Append to existing file without header
            print(f"Appending {len(new_df_to_write)} new entries to {aggregate_csv_path}")
            new_df_to_write.to_csv(aggregate_csv_path, mode='a', header=False, index=False)
        else:
            # Write new file with header
            print(f"Creating new aggregate file {aggregate_csv_path} with {len(new_df_to_write)} entries.")
            new_df_to_write.to_csv(aggregate_csv_path, mode='w', header=True, index=False)

        # Optional: Print total count after adding
        final_df = pd.read_csv(aggregate_csv_path)
        print(f"\nAggregate data saved. Total entries now: {len(final_df)}")

    except Exception as e:
        print(f"\nError writing aggregated CSV file to {aggregate_csv_path}. Error: {e}")


In [3]:
def gather_new_sessions_only(session_base_dir, processed_csv_path, new_image_dir, new_csv_path):
    """Collect sessions that are not yet listed in ``processed_csv_path``.

    The new sessions are written to a separate aggregate: images are copied to
    ``new_image_dir`` as ``<session_id>_<original filename>`` and the rows are
    written to ``new_csv_path``. This is useful for incremental training before
    permanently adding the sessions to the full dataset.

    Any previous ``new_csv_path`` is deleted first, so the file only ever
    describes the current run; it is not created when no rows were collected.

    A session is skipped when ``data.csv`` or ``images`` is missing, when
    ``data.csv`` cannot be read, or when it lacks the ``image_path`` or
    ``action`` column. A row is skipped when its ``image_path`` is blank, the
    image file is missing, or the copy fails. A missing ``timestamp`` column is
    tolerated and stored as an empty string.

    Args:
        session_base_dir: Directory that contains the ``session_*`` folders.
        processed_csv_path: CSV whose ``session_id`` column lists the sessions
            that are already part of the full dataset.
        new_image_dir: Destination folder for the copied images (created if
            needed).
        new_csv_path: CSV to write the newly collected rows to.

    Returns:
        list: Sorted names of all session directories not listed in
        ``processed_csv_path`` (including sessions that were skipped while
        collecting), or ``[]`` when ``session_base_dir`` does not exist.
    """
    # Columns read unconditionally for every row; timestamp is optional.
    required_columns = ('image_path', 'action')

    os.makedirs(new_image_dir, exist_ok=True)
    if os.path.exists(new_csv_path):
        os.remove(new_csv_path)

    existing_sessions = set()
    if os.path.exists(processed_csv_path):
        try:
            df_existing = pd.read_csv(processed_csv_path)
            if 'session_id' in df_existing.columns:
                existing_sessions = set(df_existing['session_id'].unique())
        except Exception as exc:
            print(f"Error reading processed CSV {processed_csv_path}: {exc}")

    try:
        session_dirs = sorted(
            d for d in os.listdir(session_base_dir)
            if os.path.isdir(os.path.join(session_base_dir, d)) and d.startswith('session_')
        )
    except FileNotFoundError:
        print(f"Base session directory not found: {session_base_dir}")
        return []

    sessions_to_process = [s for s in session_dirs if s not in existing_sessions]
    print(f"Found {len(sessions_to_process)} new sessions to collect.")

    all_rows = []
    for session_name in tqdm(sessions_to_process, desc="Collecting New Sessions"):
        session_path = os.path.join(session_base_dir, session_name)
        session_csv = os.path.join(session_path, 'data.csv')
        session_img_dir = os.path.join(session_path, 'images')
        if not os.path.exists(session_csv) or not os.path.exists(session_img_dir):
            print(f"Skipping {session_name}, missing data.csv or images")
            continue
        try:
            df = pd.read_csv(session_csv)
        except Exception as exc:
            print(f"Error reading {session_csv}: {exc}")
            continue

        # One malformed session must not abort the whole collection with a
        # KeyError.
        missing_columns = [c for c in required_columns if c not in df.columns]
        if missing_columns:
            print(f"Skipping {session_name}, data.csv is missing column(s): {', '.join(missing_columns)}")
            continue

        for _, row in df.iterrows():
            orig_rel = row['image_path']
            # A blank image_path cell is parsed as NaN (a float), which
            # os.path.join cannot handle.
            if not isinstance(orig_rel, str) or not orig_rel:
                continue
            orig_abs = os.path.join(session_path, orig_rel)
            if not os.path.exists(orig_abs):
                continue

            # Prefix with the session id so files from different sessions
            # cannot collide in the shared image folder.
            new_filename = f"{session_name}_{os.path.basename(orig_rel)}"
            new_rel = os.path.join('images', new_filename)
            new_abs = os.path.join(new_image_dir, new_filename)
            if not os.path.exists(new_abs):
                try:
                    shutil.copy2(orig_abs, new_abs)
                except Exception as exc:
                    print(f"Could not copy {orig_abs}: {exc}")
                    continue
            all_rows.append({'session_id': session_name,
                             'image_path': new_rel,
                             'timestamp': row.get('timestamp', ''),
                             'action': row['action']})

    if all_rows:
        pd.DataFrame(all_rows).to_csv(new_csv_path, index=False)
        print(f"Wrote {len(all_rows)} entries to {new_csv_path}")
    else:
        print("No new session data found.")
    return sessions_to_process


# In[3]:



# Aggregate the hold-out session recordings into the hold-out image folder/CSV.
# NOTE(review): these are machine-specific absolute paths; this call only
# works on the workstation that has this directory layout.
combine_sessions_append(
    session_base_dir=r'C:\Projects\jetbot-diffusion-world-model-kong-finder-aux\jetbot_session_data_two_actions_holdout_laundry',
    aggregate_image_dir=r'C:\Projects\jetbot-diffusion-world-model-kong-finder-aux\jetbot_data_two_actions_holdout\images',
    aggregate_csv_path=r'C:\Projects\jetbot-diffusion-world-model-kong-finder-aux\jetbot_data_two_actions_holdout\holdout.csv',
)

In [4]:
# Aggregate the sessions under config.SESSION_DATA_DIR into the dataset at
# config.IMAGE_DIR / config.CSV_PATH when this code runs as the main module.
if __name__ == '__main__':
    combine_sessions_append(
        session_base_dir=config.SESSION_DATA_DIR,
        aggregate_image_dir=config.IMAGE_DIR,
        aggregate_csv_path=config.CSV_PATH,
    )

Found 150 sessions to check from 'C:\Projects\jetbot-diffusion-world-model-kong-finder-aux\jetbot_livingroom_session_data_single_position'.
Reading existing sessions from: C:\Projects\jetbot-diffusion-world-model-kong-finder-aux\jetbot_data_two_actions_single_position\livingroom_data_incremental_test.csv
Found 74 existing sessions.
Found 76 new sessions to process.


Processing New Sessions:   0%|          | 0/76 [00:00<?, ?it/s]

Processing session: session_20250728_092449, 2309 entries.


  Processing session_20250728_092449:   0%|          | 0/2309 [00:00<?, ?it/s]

Processing session: session_20250728_093124, 2245 entries.


  Processing session_20250728_093124:   0%|          | 0/2245 [00:00<?, ?it/s]

Processing session: session_20250728_100126, 2182 entries.


  Processing session_20250728_100126:   0%|          | 0/2182 [00:00<?, ?it/s]

Processing session: session_20250728_100916, 2173 entries.


  Processing session_20250728_100916:   0%|          | 0/2173 [00:00<?, ?it/s]

Processing session: session_20250728_101056, 2266 entries.


  Processing session_20250728_101056:   0%|          | 0/2266 [00:00<?, ?it/s]

Processing session: session_20250728_101239, 2221 entries.


  Processing session_20250728_101239:   0%|          | 0/2221 [00:00<?, ?it/s]

Processing session: session_20250728_101420, 2143 entries.


  Processing session_20250728_101420:   0%|          | 0/2143 [00:00<?, ?it/s]

Processing session: session_20250728_101556, 2286 entries.


  Processing session_20250728_101556:   0%|          | 0/2286 [00:00<?, ?it/s]

Processing session: session_20250728_101754, 2252 entries.


  Processing session_20250728_101754:   0%|          | 0/2252 [00:00<?, ?it/s]

Processing session: session_20250728_101937, 2206 entries.


  Processing session_20250728_101937:   0%|          | 0/2206 [00:00<?, ?it/s]

Processing session: session_20250728_102113, 2265 entries.


  Processing session_20250728_102113:   0%|          | 0/2265 [00:00<?, ?it/s]

Processing session: session_20250728_102312, 2145 entries.


  Processing session_20250728_102312:   0%|          | 0/2145 [00:00<?, ?it/s]

Processing session: session_20250728_102510, 2226 entries.


  Processing session_20250728_102510:   0%|          | 0/2226 [00:00<?, ?it/s]

Processing session: session_20250728_102655, 2175 entries.


  Processing session_20250728_102655:   0%|          | 0/2175 [00:00<?, ?it/s]

Processing session: session_20250728_102831, 2353 entries.


  Processing session_20250728_102831:   0%|          | 0/2353 [00:00<?, ?it/s]

Processing session: session_20250728_103013, 2257 entries.


  Processing session_20250728_103013:   0%|          | 0/2257 [00:00<?, ?it/s]

Processing session: session_20250728_103416, 2199 entries.


  Processing session_20250728_103416:   0%|          | 0/2199 [00:00<?, ?it/s]

Processing session: session_20250728_103555, 2162 entries.


  Processing session_20250728_103555:   0%|          | 0/2162 [00:00<?, ?it/s]

Processing session: session_20250728_103738, 2238 entries.


  Processing session_20250728_103738:   0%|          | 0/2238 [00:00<?, ?it/s]

Processing session: session_20250728_104642, 2196 entries.


  Processing session_20250728_104642:   0%|          | 0/2196 [00:00<?, ?it/s]

Processing session: session_20250728_104815, 2286 entries.


  Processing session_20250728_104815:   0%|          | 0/2286 [00:00<?, ?it/s]

Processing session: session_20250728_104955, 2125 entries.


  Processing session_20250728_104955:   0%|          | 0/2125 [00:00<?, ?it/s]

Processing session: session_20250728_105137, 2173 entries.


  Processing session_20250728_105137:   0%|          | 0/2173 [00:00<?, ?it/s]

Processing session: session_20250728_105312, 2282 entries.


  Processing session_20250728_105312:   0%|          | 0/2282 [00:00<?, ?it/s]

Processing session: session_20250728_105607, 2187 entries.


  Processing session_20250728_105607:   0%|          | 0/2187 [00:00<?, ?it/s]

Processing session: session_20250728_105736, 2212 entries.


  Processing session_20250728_105736:   0%|          | 0/2212 [00:00<?, ?it/s]

Processing session: session_20250728_105911, 2194 entries.


  Processing session_20250728_105911:   0%|          | 0/2194 [00:00<?, ?it/s]

Processing session: session_20250728_110040, 2215 entries.


  Processing session_20250728_110040:   0%|          | 0/2215 [00:00<?, ?it/s]

Processing session: session_20250728_110211, 2187 entries.


  Processing session_20250728_110211:   0%|          | 0/2187 [00:00<?, ?it/s]

Processing session: session_20250728_110347, 2112 entries.


  Processing session_20250728_110347:   0%|          | 0/2112 [00:00<?, ?it/s]

Processing session: session_20250728_110525, 2133 entries.


  Processing session_20250728_110525:   0%|          | 0/2133 [00:00<?, ?it/s]

Processing session: session_20250728_110701, 2265 entries.


  Processing session_20250728_110701:   0%|          | 0/2265 [00:00<?, ?it/s]

Processing session: session_20250728_110835, 2233 entries.


  Processing session_20250728_110835:   0%|          | 0/2233 [00:00<?, ?it/s]

Processing session: session_20250728_111007, 2289 entries.


  Processing session_20250728_111007:   0%|          | 0/2289 [00:00<?, ?it/s]

Processing session: session_20250728_111318, 2326 entries.


  Processing session_20250728_111318:   0%|          | 0/2326 [00:00<?, ?it/s]

Processing session: session_20250728_111457, 2066 entries.


  Processing session_20250728_111457:   0%|          | 0/2066 [00:00<?, ?it/s]

Processing session: session_20250728_111625, 2109 entries.


  Processing session_20250728_111625:   0%|          | 0/2109 [00:00<?, ?it/s]

Processing session: session_20250728_111809, 2240 entries.


  Processing session_20250728_111809:   0%|          | 0/2240 [00:00<?, ?it/s]

Processing session: session_20250728_113416, 2245 entries.


  Processing session_20250728_113416:   0%|          | 0/2245 [00:00<?, ?it/s]

Processing session: session_20250728_113550, 2221 entries.


  Processing session_20250728_113550:   0%|          | 0/2221 [00:00<?, ?it/s]

Processing session: session_20250728_113747, 2200 entries.


  Processing session_20250728_113747:   0%|          | 0/2200 [00:00<?, ?it/s]

Processing session: session_20250728_113920, 2122 entries.


  Processing session_20250728_113920:   0%|          | 0/2122 [00:00<?, ?it/s]

Processing session: session_20250728_114050, 2215 entries.


  Processing session_20250728_114050:   0%|          | 0/2215 [00:00<?, ?it/s]

Processing session: session_20250728_114224, 2311 entries.


  Processing session_20250728_114224:   0%|          | 0/2311 [00:00<?, ?it/s]

Processing session: session_20250728_114402, 2169 entries.


  Processing session_20250728_114402:   0%|          | 0/2169 [00:00<?, ?it/s]

Processing session: session_20250728_114534, 2222 entries.


  Processing session_20250728_114534:   0%|          | 0/2222 [00:00<?, ?it/s]

Processing session: session_20250728_114747, 2428 entries.


  Processing session_20250728_114747:   0%|          | 0/2428 [00:00<?, ?it/s]

Processing session: session_20250728_114934, 2180 entries.


  Processing session_20250728_114934:   0%|          | 0/2180 [00:00<?, ?it/s]

Processing session: session_20250728_115105, 2114 entries.


  Processing session_20250728_115105:   0%|          | 0/2114 [00:00<?, ?it/s]

Processing session: session_20250728_115235, 2263 entries.


  Processing session_20250728_115235:   0%|          | 0/2263 [00:00<?, ?it/s]

Processing session: session_20250728_121458, 2113 entries.


  Processing session_20250728_121458:   0%|          | 0/2113 [00:00<?, ?it/s]

Processing session: session_20250728_121626, 2282 entries.


  Processing session_20250728_121626:   0%|          | 0/2282 [00:00<?, ?it/s]

Processing session: session_20250728_121807, 2282 entries.


  Processing session_20250728_121807:   0%|          | 0/2282 [00:00<?, ?it/s]

Processing session: session_20250728_121943, 2312 entries.


  Processing session_20250728_121943:   0%|          | 0/2312 [00:00<?, ?it/s]

Processing session: session_20250728_122127, 2129 entries.


  Processing session_20250728_122127:   0%|          | 0/2129 [00:00<?, ?it/s]

Processing session: session_20250728_122307, 2274 entries.


  Processing session_20250728_122307:   0%|          | 0/2274 [00:00<?, ?it/s]

Processing session: session_20250728_122757, 2105 entries.


  Processing session_20250728_122757:   0%|          | 0/2105 [00:00<?, ?it/s]

Processing session: session_20250728_123028, 2320 entries.


  Processing session_20250728_123028:   0%|          | 0/2320 [00:00<?, ?it/s]

Processing session: session_20250728_124432, 2299 entries.


  Processing session_20250728_124432:   0%|          | 0/2299 [00:00<?, ?it/s]

Processing session: session_20250728_124721, 2276 entries.


  Processing session_20250728_124721:   0%|          | 0/2276 [00:00<?, ?it/s]

Processing session: session_20250728_124953, 2194 entries.


  Processing session_20250728_124953:   0%|          | 0/2194 [00:00<?, ?it/s]

Processing session: session_20250728_125447, 2268 entries.


  Processing session_20250728_125447:   0%|          | 0/2268 [00:00<?, ?it/s]

Processing session: session_20250728_130114, 2333 entries.


  Processing session_20250728_130114:   0%|          | 0/2333 [00:00<?, ?it/s]

Processing session: session_20250728_130339, 2289 entries.


  Processing session_20250728_130339:   0%|          | 0/2289 [00:00<?, ?it/s]

Processing session: session_20250728_130546, 2180 entries.


  Processing session_20250728_130546:   0%|          | 0/2180 [00:00<?, ?it/s]

Processing session: session_20250728_131003, 2246 entries.


  Processing session_20250728_131003:   0%|          | 0/2246 [00:00<?, ?it/s]

Processing session: session_20250728_131153, 2202 entries.


  Processing session_20250728_131153:   0%|          | 0/2202 [00:00<?, ?it/s]

Processing session: session_20250728_131444, 2271 entries.


  Processing session_20250728_131444:   0%|          | 0/2271 [00:00<?, ?it/s]

Processing session: session_20250728_131644, 2207 entries.


  Processing session_20250728_131644:   0%|          | 0/2207 [00:00<?, ?it/s]

Processing session: session_20250728_131951, 2234 entries.


  Processing session_20250728_131951:   0%|          | 0/2234 [00:00<?, ?it/s]

Processing session: session_20250728_132155, 2283 entries.


  Processing session_20250728_132155:   0%|          | 0/2283 [00:00<?, ?it/s]

Processing session: session_20250728_132339, 2203 entries.


  Processing session_20250728_132339:   0%|          | 0/2203 [00:00<?, ?it/s]

Processing session: session_20250728_132538, 2276 entries.


  Processing session_20250728_132538:   0%|          | 0/2276 [00:00<?, ?it/s]

Processing session: session_20250728_132816, 2286 entries.


  Processing session_20250728_132816:   0%|          | 0/2286 [00:00<?, ?it/s]

Processing session: session_20250728_133215, 2238 entries.


  Processing session_20250728_133215:   0%|          | 0/2238 [00:00<?, ?it/s]

Processing session: session_20250728_133353, 2240 entries.


  Processing session_20250728_133353:   0%|          | 0/2240 [00:00<?, ?it/s]

Appending 169165 new entries to C:\Projects\jetbot-diffusion-world-model-kong-finder-aux\jetbot_data_two_actions_single_position\livingroom_data_incremental_test.csv

Aggregate data saved. Total entries now: 332134
