In [13]:
import pandas as pd
import requests
import os
from tqdm import tqdm
import time


In [15]:
import os  # imported here because the environment lookups below run before this cell's main import block

# --- Configuration ---
# Machine-specific locations. Each one can be overridden with an environment
# variable so the notebook runs on another machine without editing the code;
# when the variable is unset the original hard-coded value is used.
INPUT_CSV = os.environ.get(
    'KNITTING_INPUT_CSV',
    '/Users/kristenvinh/Documents/Github_repos/machine_learning_knitting/EDA/Final Sweaters.csv',
)
OUTPUT_DIR = os.environ.get('KNITTING_OUTPUT_DIR', '/Volumes/Extreme Pro/knitting_photos')
FAILED_LOG = os.environ.get('KNITTING_FAILED_LOG', 'failed_downloads.csv')  # File to log failed attempts

import pandas as pd
import requests
import os
from tqdm import tqdm
import time
import numpy as np
# --- Download settings ---
# Target number of CSV rows processed per batch.
BATCH_SIZE = 100

# Substring of the HTTP Content-Type header -> extension used for the saved file.
MIME_TYPE_MAP = dict([
    ('image/jpeg', '.jpg'),
    ('image/png', '.png'),
    ('image/gif', '.gif'),
])

# Extensions probed by the resume logic to decide whether an ID was already
# downloaded. Taken from MIME_TYPE_MAP so both always list the same extensions.
POSSIBLE_EXTENSIONS = list(MIME_TYPE_MAP.values())

# --- Main Script ---
def _existing_download(sweater_id):
    """Return the path of an already-saved image for ``sweater_id``, or None if there is none."""
    for ext in POSSIBLE_EXTENSIONS:
        candidate = os.path.join(OUTPUT_DIR, f"{sweater_id}{ext}")
        if os.path.exists(candidate):
            return candidate
    return None


def _extension_for(content_type):
    """Return the MIME_TYPE_MAP extension whose MIME string occurs in ``content_type``, or None."""
    for mime, ext in MIME_TYPE_MAP.items():
        if mime in content_type:
            return ext
    return None


def _write_atomically(output_path, payload):
    """Write ``payload`` to ``output_path`` via a temporary file and an atomic rename."""
    # The ".part" suffix is not in POSSIBLE_EXTENSIONS, so a half-written file
    # is never mistaken for a finished download by the resume check.
    partial_path = f"{output_path}.part"
    try:
        with open(partial_path, 'wb') as f:
            f.write(payload)
        os.replace(partial_path, output_path)
    except BaseException:
        # Also covers KeyboardInterrupt: drop the partial file, then re-raise.
        try:
            os.remove(partial_path)
        except OSError:
            pass  # best-effort cleanup; the original error is what matters
        raise


def download_batch(df_chunk, chunk_num):
    """Download the images referenced by one chunk of the sweater table.

    Rows whose image already exists in OUTPUT_DIR (under any extension in
    POSSIBLE_EXTENSIONS) are skipped, so an interrupted run can be resumed.

    Args:
        df_chunk: DataFrame with an 'ID' column (used as the file name) and a
            'Photo' column (the image URL).
        chunk_num: Batch number, used only in the progress-bar label.

    Returns:
        A list of dicts with keys 'sweater_id', 'image_url' and 'error', one
        per row that could not be downloaded.

    Raises:
        OSError: if writing an image to OUTPUT_DIR fails. Request errors and
            unknown content types are recorded in the returned list instead.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    batch_failures = []

    # One Session per batch so connections to the same host are reused
    # instead of being re-established for every image.
    with requests.Session() as session:
        session.headers.update(headers)

        for _, row in tqdm(df_chunk.iterrows(), total=df_chunk.shape[0], desc=f"Batch {chunk_num}"):
            sweater_id = row['ID']
            image_url = row['Photo']

            # Resume logic: skip IDs that already have a file on disk.
            if _existing_download(sweater_id) is not None:
                continue

            # An empty CSV cell is read as NaN; record it without issuing a request.
            if not isinstance(image_url, str) or not image_url.strip():
                batch_failures.append({'sweater_id': sweater_id, 'image_url': image_url, 'error': 'Missing image URL'})
                continue

            try:
                response = session.get(image_url, timeout=20)
                if response.status_code != 200:
                    raise requests.exceptions.HTTPError(f"HTTP Status {response.status_code}")

                content_type = response.headers.get('content-type', '').lower()
                file_extension = _extension_for(content_type)
                if not file_extension:
                    raise TypeError(f"Unknown image Content-Type: {content_type}")

                output_path = os.path.join(OUTPUT_DIR, f"{sweater_id}{file_extension}")
                _write_atomically(output_path, response.content)

                # Short pause after each successful download.
                time.sleep(0.1)

            except (requests.exceptions.RequestException, TypeError) as e:
                # HTTPError is a RequestException, so it is covered here too.
                batch_failures.append({'sweater_id': sweater_id, 'image_url': image_url, 'error': str(e)})

    return batch_failures

def run_batch_downloader():
    """Read INPUT_CSV, download every image in batches and log the failures.

    The table is split into consecutive slices of at most BATCH_SIZE rows and
    each slice is passed to ``download_batch``. FAILED_LOG is rewritten after
    every batch that reports failures, so an interrupted run still leaves a
    log of the failures from the batches that completed.

    Returns:
        None. Progress and the final summary are printed. If the input file
        is missing or lacks the 'ID' / 'Photo' columns, an error is printed
        and nothing is downloaded.
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    try:
        df = pd.read_csv(INPUT_CSV)
    except FileNotFoundError:
        print(f"Error: Input file '{INPUT_CSV}' not found.")
        return

    # Fail early with a clear message instead of a KeyError mid-download.
    missing_columns = [col for col in ('ID', 'Photo') if col not in df.columns]
    if missing_columns:
        print(f"Error: Input file '{INPUT_CSV}' is missing required column(s): {missing_columns}")
        return

    # Plain positional slicing: avoids the deprecated np.array_split-on-DataFrame
    # path and yields an empty list (rather than an error) for an empty table.
    list_of_dfs = [df.iloc[start:start + BATCH_SIZE] for start in range(0, len(df), BATCH_SIZE)]

    print(f"Loaded {len(df)} total image links, splitting into {len(list_of_dfs)} batches.")
    all_failed_downloads = []

    for chunk_number, chunk_df in enumerate(list_of_dfs, start=1):
        print(f"\n--- Processing chunk {chunk_number} of {len(list_of_dfs)} ---")
        failures = download_batch(chunk_df, chunk_number)
        if failures:
            all_failed_downloads.extend(failures)
            print(f"⚠️  {len(failures)} failures in this batch.")
            # Persist now so the log survives an interrupt or crash later on.
            pd.DataFrame(all_failed_downloads).to_csv(FAILED_LOG, index=False)

    if all_failed_downloads:
        print(f"\n--- Process Complete ---")
        print(f"{len(all_failed_downloads)} total images failed. See '{FAILED_LOG}'.")
    else:
        print("\n--- Process Complete ---")
        print("All images downloaded successfully!")

# Start the download when this code is executed directly.
if "__main__" == __name__:
    run_batch_downloader()

  df = pd.read_csv(INPUT_CSV)
  return bound(*args, **kwds)


Loaded 26022 total image links, splitting into 261 batches.

--- Processing chunk 1 of 261 ---


Batch 1: 100%|██████████| 100/100 [00:01<00:00, 87.40it/s]


⚠️  8 failures in this batch.

--- Processing chunk 2 of 261 ---


Batch 2: 100%|██████████| 100/100 [00:01<00:00, 87.04it/s]


⚠️  8 failures in this batch.

--- Processing chunk 3 of 261 ---


Batch 3: 100%|██████████| 100/100 [00:02<00:00, 37.67it/s]


⚠️  18 failures in this batch.

--- Processing chunk 4 of 261 ---


Batch 4: 100%|██████████| 100/100 [00:02<00:00, 46.05it/s]


⚠️  16 failures in this batch.

--- Processing chunk 5 of 261 ---


Batch 5: 100%|██████████| 100/100 [00:01<00:00, 67.02it/s]


⚠️  10 failures in this batch.

--- Processing chunk 6 of 261 ---


Batch 6: 100%|██████████| 100/100 [00:00<00:00, 140.09it/s]


⚠️  5 failures in this batch.

--- Processing chunk 7 of 261 ---


Batch 7: 100%|██████████| 100/100 [00:00<00:00, 332.68it/s]


⚠️  2 failures in this batch.

--- Processing chunk 8 of 261 ---


Batch 8: 100%|██████████| 100/100 [00:00<00:00, 4049.22it/s]



--- Processing chunk 9 of 261 ---


Batch 9: 100%|██████████| 100/100 [00:00<00:00, 608.19it/s]


⚠️  1 failures in this batch.

--- Processing chunk 10 of 261 ---


Batch 10: 100%|██████████| 100/100 [00:02<00:00, 37.82it/s]


⚠️  17 failures in this batch.

--- Processing chunk 11 of 261 ---


Batch 11: 100%|██████████| 100/100 [00:04<00:00, 24.78it/s]


⚠️  29 failures in this batch.

--- Processing chunk 12 of 261 ---


Batch 12: 100%|██████████| 100/100 [00:01<00:00, 51.10it/s]


⚠️  13 failures in this batch.

--- Processing chunk 13 of 261 ---


Batch 13: 100%|██████████| 100/100 [00:01<00:00, 62.06it/s]


⚠️  10 failures in this batch.

--- Processing chunk 14 of 261 ---


Batch 14: 100%|██████████| 100/100 [00:01<00:00, 92.45it/s]


⚠️  8 failures in this batch.

--- Processing chunk 15 of 261 ---


Batch 15: 100%|██████████| 100/100 [00:02<00:00, 48.62it/s]


⚠️  14 failures in this batch.

--- Processing chunk 16 of 261 ---


Batch 16: 100%|██████████| 100/100 [00:29<00:00,  3.33it/s]


⚠️  9 failures in this batch.

--- Processing chunk 17 of 261 ---


Batch 17: 100%|██████████| 100/100 [00:29<00:00,  3.37it/s]


⚠️  13 failures in this batch.

--- Processing chunk 18 of 261 ---


Batch 18: 100%|██████████| 100/100 [00:29<00:00,  3.37it/s]


⚠️  16 failures in this batch.

--- Processing chunk 19 of 261 ---


Batch 19: 100%|██████████| 100/100 [00:27<00:00,  3.61it/s]


⚠️  25 failures in this batch.

--- Processing chunk 20 of 261 ---


Batch 20: 100%|██████████| 100/100 [00:30<00:00,  3.33it/s]


⚠️  12 failures in this batch.

--- Processing chunk 21 of 261 ---


Batch 21:   8%|▊         | 8/100 [00:02<00:27,  3.36it/s]


KeyboardInterrupt: 