In [8]:
import os
import numpy as np
import sys
import pandas as pd
import requests
from dotenv import load_dotenv
import time

# Read the local .env file so its entries become visible through os.environ.
load_dotenv()

# Root address of the Ravelry API.
BASE_URL = "https://api.ravelry.com"

# API credentials taken from the environment; each is None when its variable
# is not set.
RAVELRY_ACCESS_KEY, RAVELRY_PERSONAL_KEY = (
    os.environ.get(variable_name)
    for variable_name in ('RAVELRY_ACCESS_KEY', 'RAVELRY_PERSONAL_KEY')
)

In [9]:
# Confirm that both credentials were found WITHOUT echoing their values:
# anything printed here is stored in the notebook's saved output and travels
# with the file whenever it is shared or committed.
print(f"Access Key Loaded: {'[REDACTED]' if RAVELRY_ACCESS_KEY else '<missing>'}")
print(f"Personal Key Loaded: {'[REDACTED]' if RAVELRY_PERSONAL_KEY else '<missing>'}")

Access Key Loaded: [REDACTED]
Personal Key Loaded: [REDACTED]


In [10]:
def extract_categories(category_data):
    """
    Collects the category names found along a category's chain of parents.

    Args:
        category_data: A category dictionary, or a list of them. When a
            non-empty list is given, only its first element is walked. A
            category dictionary may hold a 'name' and a nested 'parent'
            dictionary of the same shape.

    Returns:
        list: The unique names in the order they are encountered, starting
        with the given category and moving up through each 'parent'. An
        empty list is returned for any other kind of input (None, an empty
        list, a string, ...).
    """
    if isinstance(category_data, list) and category_data:
        # Only the first entry of a list is used as the starting point.
        current_level = category_data[0]
    elif isinstance(category_data, dict):
        current_level = category_data
    else:
        return []  # Nothing usable to walk

    # A dict keeps insertion order, so it de-duplicates the names while
    # preserving the leaf-to-root order. A set would hand them back in an
    # arbitrary order that can differ from one run to the next.
    ordered_names = {}
    while isinstance(current_level, dict):
        if 'name' in current_level:
            ordered_names[current_level['name']] = None
        current_level = current_level.get('parent')

    return list(ordered_names)

In [11]:
def extract_permalinks(attribute_list):
    """
    Extracts the 'permalink' value from each dictionary in a list.

    Args:
        attribute_list: A list whose elements are expected to be
            dictionaries. Elements that are not dictionaries, or that have
            no 'permalink' key, are skipped.

    Returns:
        list: The 'permalink' values in their original order, or an empty
        list when the input is not a list.
    """
    if not isinstance(attribute_list, list):
        return []  # Return empty list if input is not a list

    # The isinstance check matters: `'permalink' in item` raises on None and
    # silently becomes a substring test on strings.
    return [
        item['permalink']
        for item in attribute_list
        if isinstance(item, dict) and 'permalink' in item
    ]

In [12]:
def get_pattern_details(pattern_id, timeout=30):
    """
    Fetches one pattern from the Ravelry API and flattens the fields of interest.

    A 0.5 second pause is taken before every request as a simple rate limit,
    since this function is called once per row of a larger loop.

    Args:
        pattern_id: Identifier placed into the ``/patterns/{id}.json`` URL.
        timeout: Seconds to wait on the server before giving up. Without a
            timeout a stalled connection would block the whole run.

    Returns:
        dict: A record with the keys 'Craft', 'Attributes', 'Gauge',
        'Difficulty Average', 'Downloadable', 'Gauge Divisor',
        'Gauge Pattern', 'Pattern Type', 'Yarn Weight', 'Projects Count',
        'Rating Average', 'Sizes Available' and 'Photos'. If the request
        fails or the body is not valid JSON, a message is printed and a
        placeholder record with the same keys is returned instead.
    """
    # --- RATE LIMITING ---
    time.sleep(0.5)

    details_url = f"{BASE_URL}/patterns/{pattern_id}.json"

    try:
        response = requests.get(
            details_url,
            auth=(RAVELRY_ACCESS_KEY, RAVELRY_PERSONAL_KEY),
            timeout=timeout,
        )
        response.raise_for_status()
        details_data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers an unparseable body on requests versions whose
        # JSON error is not a RequestException.
        print(f"Could not fetch data for pattern ID {pattern_id}: {e}")
        return _empty_pattern_details()

    # Tolerate a missing or null 'pattern' entry: every field then falls back
    # to its default below.
    pattern_data = details_data.get('pattern') if isinstance(details_data, dict) else None
    if not isinstance(pattern_data, dict):
        pattern_data = {}

    return {
        # Nested objects may be absent or JSON null, so they are read through
        # a helper that yields None rather than raising.
        'Craft': _nested_field(pattern_data, 'craft', 'name'),
        'Attributes': extract_permalinks(pattern_data.get('pattern_attributes', [])),
        'Gauge': pattern_data.get('gauge', None),
        'Difficulty Average': pattern_data.get('difficulty_average', None),
        'Downloadable': pattern_data.get('downloadable', False),
        'Gauge Divisor': pattern_data.get('gauge_divisor', None),
        'Gauge Pattern': pattern_data.get('gauge_pattern', None),
        'Pattern Type': _nested_field(pattern_data, 'pattern_type', 'permalink'),
        'Yarn Weight': _nested_field(pattern_data, 'yarn_weight', 'name'),
        'Projects Count': pattern_data.get('projects_count', 0),
        'Rating Average': pattern_data.get('rating_average', None),
        'Sizes Available': pattern_data.get('sizes_available', ""),
        'Photos': pattern_data.get('photos', [])
    }


def _nested_field(record, key, field):
    """Returns record[key][field], or None when either level is missing or not a dict."""
    inner = record.get(key)
    return inner.get(field) if isinstance(inner, dict) else None


def _empty_pattern_details():
    """Returns the placeholder record (same keys as a real one) used when a fetch fails."""
    return {
        'Craft': None, 'Attributes': [], 'Gauge': None,
        'Difficulty Average': None, 'Downloadable': False, 'Gauge Divisor': None,
        'Gauge Pattern': None, 'Pattern Type': None, 'Yarn Weight': None,
        'Projects Count': 0, 'Rating Average': None, 'Sizes Available': "",
        'Photos': []
    }

def add_details_to_df(df):
    """
    Fetches pattern details for every row and appends them as new columns.

    get_pattern_details is called once per value of df['ID'], in row order,
    and each key of the returned dictionaries becomes one new column.

    Args:
        df: DataFrame with an 'ID' column holding the pattern IDs.

    Returns:
        A new DataFrame: df joined on its index with the detail columns.
        The input frame itself is not modified.
    """
    # Collect plain dictionaries first and build the frame once; this avoids
    # creating a pandas Series per row and also yields a proper (empty)
    # DataFrame when df has no rows, so the join below still works.
    records = [get_pattern_details(pattern_id) for pattern_id in df['ID']]
    details = pd.DataFrame(records, index=df.index)

    # Join the newly created columns back to the original DataFrame
    return df.join(details)

In [13]:

# Load your main CSV file
sweaters_df = pd.read_csv("sweaters_v3.csv")

# --- Define Chunking Parameters ---
# Target rows per chunk. The chunk count below is rounded down, so each chunk
# holds at least this many rows whenever the file has that many in total.
chunk_size = 500
output_dir = "processed_chunks"  # A folder to store intermediate files

# Create the output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# At least one chunk is always requested: a file shorter than chunk_size would
# otherwise ask np.array_split for zero sections, which raises ValueError.
n_chunks = max(1, len(sweaters_df) // chunk_size)

# Split row positions rather than the DataFrame itself. The boundaries are the
# same as splitting the frame into n_chunks sections, so chunk files written by
# an earlier run still line up with the same rows when the job is resumed.
row_groups = np.array_split(np.arange(len(sweaters_df)), n_chunks)
list_of_dfs = [sweaters_df.iloc[rows] for rows in row_groups]

In [14]:
# Loop through each chunk, process it, and save it.
# The run is resumable: a chunk whose output file already exists is skipped.
for chunk_number, chunk_df in enumerate(list_of_dfs, start=1):

    # Define the expected output path for this chunk
    output_path = os.path.join(output_dir, f"sweaters_chunk_{chunk_number}.csv")

    # Check if this chunk's output file already exists
    if os.path.exists(output_path):
        print(f"⏩ Chunk {chunk_number} already processed. Skipping.")
        continue  # Move to the next iteration of the loop

    print(f"--- Processing chunk {chunk_number} of {len(list_of_dfs)} ---")

    # Write to a temporary name first and rename once the file is complete.
    # If the run is interrupted mid-write, no file exists under output_path,
    # so a truncated chunk can never be mistaken for a finished one on resume.
    partial_path = output_path + ".partial"

    try:
        # Fetch details for this chunk
        processed_chunk = add_details_to_df(chunk_df)

        # Save the processed chunk to its own CSV file
        processed_chunk.to_csv(partial_path, index=False)
        os.replace(partial_path, output_path)

        print(f"✅ Successfully saved chunk {chunk_number} to {output_path}")

    except Exception as e:
        print(f"❌ An error occurred on chunk {chunk_number}: {e}")
        print("Stopping process. Please check the error and re-run the script to resume.")
        break  # Stop the script if an error occurs
else:
    # Runs only when the loop finished without hitting `break`, so the
    # success message is not shown after a failure.
    print("\n--- All chunks processed! ---")

⏩ Chunk 1 already processed. Skipping.
⏩ Chunk 2 already processed. Skipping.
⏩ Chunk 3 already processed. Skipping.
⏩ Chunk 4 already processed. Skipping.
⏩ Chunk 5 already processed. Skipping.
⏩ Chunk 6 already processed. Skipping.
⏩ Chunk 7 already processed. Skipping.
⏩ Chunk 8 already processed. Skipping.
⏩ Chunk 9 already processed. Skipping.
⏩ Chunk 10 already processed. Skipping.
⏩ Chunk 11 already processed. Skipping.
⏩ Chunk 12 already processed. Skipping.
⏩ Chunk 13 already processed. Skipping.
⏩ Chunk 14 already processed. Skipping.
⏩ Chunk 15 already processed. Skipping.
⏩ Chunk 16 already processed. Skipping.
⏩ Chunk 17 already processed. Skipping.
⏩ Chunk 18 already processed. Skipping.
⏩ Chunk 19 already processed. Skipping.
⏩ Chunk 20 already processed. Skipping.
⏩ Chunk 21 already processed. Skipping.
⏩ Chunk 22 already processed. Skipping.
⏩ Chunk 23 already processed. Skipping.
⏩ Chunk 24 already processed. Skipping.
⏩ Chunk 25 already processed. Skipping.
⏩ Chunk 2