In [None]:
#EDIT
# import pandas as pd
import os
from datetime import datetime, timedelta

# Helper: Convert filename to datetime based on day
def filename_to_datetime(filename, day):
    """Parse the capture timestamp encoded in an image filename.

    The expected filename layout depends on the recording day:

    * day 1 -- six digits ``HHMMSS`` (e.g. ``173710`` = 17:37:10); the date
      is fixed to 2019-07-24.
    * day 2 -- ``YYYY-MM-DD HHMMSS`` in the first 17 characters; anything
      after that is ignored.
    * day 3 -- ``YYYYMMDD_HHMMSS`` in characters 4..18, i.e. after a
      four-character prefix; anything after that is ignored.

    A trailing ``.jpg`` / ``.png`` extension is removed first, in any letter
    case, so the helper accepts every name that the callers' case-insensitive
    extension filter lets through.

    Args:
        filename: Image file name, with or without its extension.
        day: Recording day selector (1, 2 or 3).

    Returns:
        A naive ``datetime`` for the moment encoded in the name.

    Raises:
        ValueError: If ``day`` is not 1, 2 or 3, or the name does not match
            the layout expected for that day.
    """
    original_filename = filename

    # Strip only a real trailing image extension; text in the middle of the
    # name is left alone.
    stem, extension = os.path.splitext(filename)
    if extension.lower() in ('.jpg', '.png'):
        filename = stem

    if day == 1:
        # For day 1: filename is HHMMSS format (e.g., "173710" = 17:37:10)
        if len(filename) != 6 or not filename.isdigit():
            raise ValueError(f"Day 1 filename should be 6 digits (HHMMSS), got: '{filename}' from '{original_filename}'")
        time_str = f"{filename[:2]}:{filename[2:4]}:{filename[4:6]}"
        return datetime.strptime(f"2019-07-24 {time_str}", "%Y-%m-%d %H:%M:%S")

    if day == 2:
        # For day 2: filename format is "2019-10-19 HHMMSS" (17 chars total)
        if len(filename) < 17:
            raise ValueError(f"Day 2 filename too short (expected ≥17 chars), got: '{filename}' from '{original_filename}'")
        date_time_part = filename[:17]
        try:
            return datetime.strptime(date_time_part, "%Y-%m-%d %H%M%S")
        except ValueError as err:
            raise ValueError(f"Day 2 filename format not recognized: '{date_time_part}' from '{original_filename}'") from err

    if day == 3:
        # For day 3: YYYYMMDD_HHMMSS starts at position 4 of the filename
        if len(filename) < 19:
            raise ValueError(f"Day 3 filename too short (expected ≥19 chars), got: '{filename}' from '{original_filename}'")
        date_time_part = filename[4:19]
        try:
            return datetime.strptime(date_time_part, "%Y%m%d_%H%M%S")
        except ValueError as err:
            raise ValueError(f"Day 3 filename format not recognized: '{date_time_part}' from '{original_filename}'") from err

    raise ValueError("Invalid day number")

# Helper: build one output row from a sensor reading and the image it belongs to
def _sensor_row_to_record(sensor_row, image_filename, day):
    """Return the output dict for one sensor reading paired with an image.

    Column names are looked up in lower case first and then in capitalised
    form (e.g. ``pm2.5`` then ``PM2.5``); a column missing in both spellings
    yields ``None``. Day 1 records carry no temperature/humidity fields.
    Key insertion order defines the column order of the saved CSV.
    """
    def field(name, alt_name):
        return sensor_row.get(name, sensor_row.get(alt_name, None))

    record = {
        'pm2.5': field('pm2.5', 'PM2.5'),
        'pm10': field('pm10', 'PM10'),
        'time': sensor_row['time'],
    }
    if day != 1:
        # Day 2 and 3: include all fields
        record['temperature'] = field('temperature', 'Temperature')
        record['humidity'] = field('humidity', 'Humidity')
    record['location'] = field('location', 'Location')
    record['image'] = image_filename
    return record


# Main function: image → sensor rows
def match_image_to_sensors(day, sensor_csv_path, image_folder, output_csv_path):
    """Assign every sensor reading to an image and save the pairs as CSV.

    Images are ordered by the timestamp parsed from their filename. Each
    image owns the half-open interval from its own timestamp up to (but not
    including) the next image's timestamp; the last image owns everything
    from its timestamp onwards, and readings taken before the first image
    are assigned to the first image. One output row is written per sensor
    reading.

    Relies on ``pd`` (pandas) and ``filename_to_datetime`` being available
    in the enclosing namespace.

    Args:
        day: Recording day (1, 2 or 3); selects the filename layout and
            whether temperature/humidity columns are emitted (not for day 1).
        sensor_csv_path: CSV with the sensor readings. The first column whose
            name contains "time" or "date" (case-insensitive) is used as the
            timestamp.
        image_folder: Directory scanned for ``.jpg`` / ``.png`` files.
        output_csv_path: Destination of the matched CSV.

    Returns:
        None. Progress is printed; nothing is written when there is no time
        column, no image, no parseable image name, or no matched reading.
    """
    print(f"📎 Matching Day {day}...")

    # Load sensor data
    df = pd.read_csv(sensor_csv_path)
    print(f"📊 CSV columns: {list(df.columns)}")

    # Find the time column (could be 'time', 'timestamp', 'datetime', etc.)
    time_col = next(
        (col for col in df.columns if 'time' in col.lower() or 'date' in col.lower()),
        None,
    )
    if time_col is None:
        print("❌ No time column found in CSV. Available columns:", list(df.columns))
        return

    print(f"📅 Using time column: '{time_col}'")
    df['time'] = pd.to_datetime(df[time_col])

    # Load and convert image filenames
    filenames = sorted(f for f in os.listdir(image_folder) if f.lower().endswith(('.jpg', '.png')))
    if not filenames:
        print("❌ No image files found in folder!")
        return

    print(f"📷 Found {len(filenames)} total image files in folder")
    print(f"📂 First 10 filenames: {filenames[:10]}")

    image_times = []
    valid_filenames = []
    failed_files = []

    for f in filenames:
        try:
            img_time = filename_to_datetime(f, day)
        except ValueError as e:
            # Unparseable names are reported and skipped, not fatal.
            print(f"⚠️ Skipping file {f}: {e}")
            failed_files.append((f, str(e)))
            continue
        image_times.append(img_time)
        valid_filenames.append(f)
        print(f"🕐 {f} → {img_time}")  # Debug: show the conversion

    print(f"✅ Successfully parsed {len(valid_filenames)} images")
    if failed_files:
        print(f"❌ Failed to parse {len(failed_files)} files:")
        for fname, error in failed_files:
            print(f"   - {fname}: {error}")

    if not valid_filenames:
        print("❌ No valid image files found!")
        return

    image_df = pd.DataFrame({'filename': valid_filenames, 'image_time': image_times})
    image_df = image_df.sort_values('image_time').reset_index(drop=True)

    print(f"📅 Image time range: {image_df['image_time'].min()} to {image_df['image_time'].max()}")
    print(f"📅 Sensor time range: {df['time'].min()} to {df['time'].max()}")

    ordered_names = list(image_df['filename'])
    ordered_times = list(image_df['image_time'])

    # First, handle sensor data that comes BEFORE the first image (assign to first image)
    early_sensor_data = df[df['time'] < ordered_times[0]]
    print(f"📊 Found {len(early_sensor_data)} sensor readings before first image")

    result_rows = [
        _sensor_row_to_record(sensor_row, ordered_names[0], day)
        for _, sensor_row in early_sensor_data.iterrows()
    ]

    # For each image, define its time period (from this image time until next image time)
    last_position = len(ordered_times) - 1
    for position, (image_filename, image_time) in enumerate(zip(ordered_names, ordered_times)):
        in_window = df['time'] >= image_time
        if position < last_position:
            # Not the last image - stop just before the next image's time.
            in_window = in_window & (df['time'] < ordered_times[position + 1])
        # Last image - keeps ALL remaining sensor data from this point forward.
        sensor_matches = df[in_window]

        print(f"🖼️ Image {image_filename} ({image_time}): {len(sensor_matches)} sensor readings")

        # Create a row for each sensor reading in this time period
        result_rows.extend(
            _sensor_row_to_record(sensor_row, image_filename, day)
            for _, sensor_row in sensor_matches.iterrows()
        )

    # Create final dataframe
    if not result_rows:
        print("❌ No matches found! Check if time ranges overlap.")
        return

    # Sort by time to maintain chronological order
    result_df = pd.DataFrame(result_rows).sort_values('time').reset_index(drop=True)

    # Debug: Show some sample matches
    print("\n📋 Sample matches:")
    print("Time Range | Image | Sensor Data")
    print("-" * 50)
    for _, img in image_df.head(5).iterrows():
        matches = result_df[result_df['image'] == img['filename']]
        if matches.empty:
            continue
        first_match = matches.iloc[0]
        last_match = matches.iloc[-1]
        print(f"{img['image_time'].strftime('%H:%M:%S')} | {img['filename']} | {len(matches)} readings ({first_match['time'].strftime('%H:%M:%S')} to {last_match['time'].strftime('%H:%M:%S')})")

    # Save final matched file
    result_df.to_csv(output_csv_path, index=False)
    print(f"\n✅ Saved: {output_csv_path} with {len(result_df)} matched rows.")
    print(f"📊 Matched {len(result_df)} total sensor readings to {len(image_df)} images.")

# Helper: build the single output row for one image and its closest sensor reading
def _closest_reading_record(closest_sensor, image_time, image_filename, day):
    """Return the output dict for an image paired with its closest reading.

    Column names are looked up in lower case first and then in capitalised
    form; a column missing in both spellings yields ``None``. The record's
    ``time`` is the image time, not the sensor time. Day 1 records carry no
    temperature/humidity fields.
    """
    def field(name, alt_name):
        return closest_sensor.get(name, closest_sensor.get(alt_name, None))

    record = {
        'pm2.5': field('pm2.5', 'PM2.5'),
        'pm10': field('pm10', 'PM10'),
        'time': image_time,  # Use image time as reference
    }
    if day != 1:
        record['temperature'] = field('temperature', 'Temperature')
        record['humidity'] = field('humidity', 'Humidity')
    record['location'] = field('location', 'Location')
    record['image'] = image_filename
    return record


# Alternative function if you want each image to appear only once with aggregated sensor data
def match_image_to_sensors_aggregated(day, sensor_csv_path, image_folder, output_csv_path,
                                      time_window=timedelta(minutes=20)):
    """Write one row per image, using the sensor reading closest in time.

    Despite the name, no averaging is performed: for every image the single
    reading with the smallest absolute time difference is chosen among the
    readings within ``time_window`` on either side of the image time. Images
    with no reading inside that window are omitted from the output.

    Relies on ``pd`` (pandas) and ``filename_to_datetime`` being available
    in the enclosing namespace.

    Args:
        day: Recording day (1, 2 or 3); selects the filename layout and
            whether temperature/humidity columns are emitted (not for day 1).
        sensor_csv_path: CSV with the sensor readings. The first column whose
            name contains "time" or "date" (case-insensitive) is used as the
            timestamp.
        image_folder: Directory scanned for ``.jpg`` / ``.png`` files.
        output_csv_path: Destination of the matched CSV.
        time_window: Maximum allowed distance between image time and sensor
            time. Defaults to 20 minutes.

    Returns:
        None. Nothing is written when there is no time column, no image, no
        parseable image name, or no image with a reading inside its window.
    """
    print(f"📎 Matching Day {day} (Aggregated)...")

    # Load sensor data
    df = pd.read_csv(sensor_csv_path)

    # Find the time column
    time_col = next(
        (col for col in df.columns if 'time' in col.lower() or 'date' in col.lower()),
        None,
    )
    if time_col is None:
        print("❌ No time column found in CSV. Available columns:", list(df.columns))
        return

    df['time'] = pd.to_datetime(df[time_col])

    # Load and convert image filenames
    filenames = sorted(f for f in os.listdir(image_folder) if f.lower().endswith(('.jpg', '.png')))
    if not filenames:
        print("❌ No image files found in folder!")
        return

    image_times = []
    valid_filenames = []

    for f in filenames:
        try:
            img_time = filename_to_datetime(f, day)
        except ValueError as e:
            # Unparseable names are reported and skipped, not fatal.
            print(f"⚠️ Skipping file {f}: {e}")
            continue
        image_times.append(img_time)
        valid_filenames.append(f)

    if not valid_filenames:
        print("❌ No valid image files found!")
        return

    image_df = pd.DataFrame({'filename': valid_filenames, 'image_time': image_times})
    image_df = image_df.sort_values('image_time').reset_index(drop=True)

    # For each image, pick the closest sensor reading inside the window
    result_rows = []
    for image_filename, image_time in zip(image_df['filename'], image_df['image_time']):
        sensor_matches = df[
            (df['time'] >= image_time - time_window) &
            (df['time'] <= image_time + time_window)
        ]
        if sensor_matches.empty:
            continue

        # Positional argmin, so the lookup does not depend on index labels.
        offsets = (sensor_matches['time'] - image_time).abs()
        closest_sensor = sensor_matches.iloc[offsets.argmin()]
        result_rows.append(
            _closest_reading_record(closest_sensor, image_time, image_filename, day)
        )

    # An empty result has no 'time' column to sort on, so stop before building it.
    if not result_rows:
        print("❌ No matches found! Check if time ranges overlap.")
        return

    # Sort by time to maintain chronological order
    result_df = pd.DataFrame(result_rows).sort_values('time').reset_index(drop=True)

    # Save final matched file
    result_df.to_csv(output_csv_path, index=False)
    print(f"✅ Saved: {output_csv_path} with {len(result_df)} matched rows.")
    print(f"📊 Matched {len(result_df)} images to sensor data.")

In [29]:
# Run the per-reading matcher once for each recording day, in day order.
# All paths are absolute locations under /Users/Shai.
_match_jobs = [
    {
        "day": 1,
        "sensor_csv_path": "/Users/Shai/OneDrive/Documents/THESIS/thesis-airq/data_matching/merged_tabular_data/merged_7_24_data.csv",
        "image_folder": "/Users/Shai/OneDrive/Desktop/THESIS_data/images/724 images",
        "output_csv_path": "/Users/Shai/OneDrive/Documents/THESIS/thesis-airq/data_matching/matched/7_24_matched.csv",
    },
    {
        "day": 2,
        "sensor_csv_path": "/Users/Shai/OneDrive/Documents/THESIS/thesis-airq/data_matching/merged_tabular_data/merged_10_19_data.csv",
        "image_folder": "/Users/Shai/OneDrive/Desktop/THESIS_data/images/1019 images",
        "output_csv_path": "/Users/Shai/OneDrive/Documents/THESIS/thesis-airq/data_matching/matched/10_19_matched.csv",
    },
    {
        "day": 3,
        "sensor_csv_path": "/Users/Shai/OneDrive/Documents/THESIS/thesis-airq/data_matching/merged_tabular_data/merged_11_10_data.csv",
        "image_folder": "/Users/Shai/OneDrive/Desktop/THESIS_data/images/1110 images",
        "output_csv_path": "/Users/Shai/OneDrive/Documents/THESIS/thesis-airq/data_matching/matched/11_10_matched.csv",
    },
]

for _job in _match_jobs:
    match_image_to_sensors(**_job)

📎 Matching Day 1...
📊 CSV columns: ['pm2.5', 'pm10', 'time', 'location']
📅 Using time column: 'time'
📷 Found 36 total image files in folder
📂 First 10 filenames: ['152338.JPG', '152342.JPG', '152348.JPG', '152418.JPG', '152422.JPG', '152510.JPG', '152512.JPG', '152604.JPG', '152838.JPG', '162506.JPG']
🕐 152338.JPG → 2019-07-24 15:23:38
🕐 152342.JPG → 2019-07-24 15:23:42
🕐 152348.JPG → 2019-07-24 15:23:48
🕐 152418.JPG → 2019-07-24 15:24:18
🕐 152422.JPG → 2019-07-24 15:24:22
🕐 152510.JPG → 2019-07-24 15:25:10
🕐 152512.JPG → 2019-07-24 15:25:12
🕐 152604.JPG → 2019-07-24 15:26:04
🕐 152838.JPG → 2019-07-24 15:28:38
🕐 162506.JPG → 2019-07-24 16:25:06
🕐 162524.JPG → 2019-07-24 16:25:24
🕐 162636.JPG → 2019-07-24 16:26:36
🕐 162710.JPG → 2019-07-24 16:27:10
🕐 162734.JPG → 2019-07-24 16:27:34
🕐 162802.JPG → 2019-07-24 16:28:02
🕐 162808.JPG → 2019-07-24 16:28:08
🕐 170224.JPG → 2019-07-24 17:02:24
🕐 170228.JPG → 2019-07-24 17:02:28
🕐 170238.JPG → 2019-07-24 17:02:38
🕐 170256.JPG → 2019-07-24 17:02: