In [1]:
import pandas as pd
import dask.dataframe as dd
import s3fs # s3fs is typically pre-installed in SageMaker Data Science kernels
import os

In [9]:
# S3 URI of the partitioned Parquet dataset: a directory-style prefix that
# holds the individual part.N.parquet files. Every later cell reads this name,
# so point it at a different prefix here to analyse another dataset.
s3_parquet_path = (
    "s3://test-hupd-parquet-s3/2004.parquet/"
)

In [10]:
print(f"Attempting to load Parquet data from S3: {s3_parquet_path}")

# --- Verify S3 Access and List Files (Optional but Recommended) ---
# Cheap sanity check before handing the path to Dask: if the prefix cannot be
# listed, the Parquet read would fail too, with a less readable error.
print("\n--- Verifying S3 Access ---")
try:
    # No credentials are passed explicitly, so s3fs relies on the ambient AWS
    # credential chain (on SageMaker Studio, presumably the attached IAM role).
    fs = s3fs.S3FileSystem()

    # Only the S3 calls live inside the try block, so the except branch below
    # reports genuine access/path failures and nothing else.
    s3_files = fs.ls(s3_parquet_path)
except Exception as e:
    # Deliberately best-effort: report the problem and let the notebook continue.
    print(f"ERROR: Could not list S3 files. This indicates a permission or path issue: {e}")
    print("Please ensure:")
    print("1. The S3 path is correct and points to a valid bucket/folder.")
    print("2. The IAM Role attached to your SageMaker Studio user has 's3:ListBucket' and 's3:GetObject' permissions for this bucket.")
    # If this fails, go back and re-check the IAM role permissions and role ARN for your user profile in the SageMaker Studio console.
else:
    if not s3_files:
        # Report the empty case on its own instead of after a "Successfully
        # listed 0 items" line, which read as a contradiction.
        print("Warning: No files found in the specified S3 path. Please double-check your path.")
    else:
        print(f"Successfully listed {len(s3_files)} items in {s3_parquet_path}:")
        for f in s3_files[:5]:  # Print first 5 files for brevity
            print(f"- {f}")

Attempting to load Parquet data from S3: s3://test-hupd-parquet-s3/2004.parquet/

--- Verifying S3 Access ---
Successfully listed 416 items in s3://test-hupd-parquet-s3/2004.parquet/:
- test-hupd-parquet-s3/2004.parquet/part.0.parquet
- test-hupd-parquet-s3/2004.parquet/part.1.parquet
- test-hupd-parquet-s3/2004.parquet/part.10.parquet
- test-hupd-parquet-s3/2004.parquet/part.100.parquet
- test-hupd-parquet-s3/2004.parquet/part.101.parquet


In [11]:
# --- Load Data with Dask (Recommended for your ~4GB dataset) ---
# Dask builds a lazy, partitioned DataFrame, so the dataset does not have to
# fit into memory at once and already-partitioned Parquet maps onto it directly.
print("\n--- Loading with Dask DataFrame ---")
try:
    # Dask can directly read a directory containing multiple Parquet part files from S3.
    # This call is lazy: it builds the task graph and schema, not the data itself.
    ddf = dd.read_parquet(s3_parquet_path)

    print(f"Dask DataFrame created successfully with {ddf.npartitions} partitions.")
    print("Dask DataFrame Schema:")
    print(ddf.dtypes)

    print("\nFirst 5 rows (this triggers a small computation):")
    print(ddf.head())  # .head() brings a small sample to local memory

    # Example Dask computation: get the total number of rows.
    # len(ddf) is used instead of ddf.shape[0].compute(): shape[0] is derived
    # from the total element count (size / number of columns), which on some
    # Dask versions forces every column of every partition to be read, while
    # len() only needs row counts. Both return the same integer.
    # NOTE: a dashboard link is only available when a dask.distributed Client
    # has been created; none is created in the cells visible here, so this runs
    # on Dask's default local scheduler without a dashboard.
    print("\nCounting total rows (this triggers a computation across all partitions):")
    total_rows = len(ddf)
    print(f"Total rows in dataset: {total_rows}")

    # Example: Calculate summary statistics for a numeric column (replace 'your_numeric_column' with an actual column name)
    # print("\nDescriptive statistics for a numeric column:")
    # if 'your_numeric_column' in ddf.columns:
    #     stats = ddf['your_numeric_column'].describe().compute()
    #     print(stats)
    # else:
    #     print("Skipping numeric column stats: 'your_numeric_column' not found.")

except Exception as e:
    # Deliberately best-effort: print the failure with troubleshooting hints
    # rather than aborting the notebook. Note that `ddf` / `total_rows` may be
    # undefined afterwards, so later cells that use them will fail.
    print(f"\nERROR: Dask failed to load or process data: {e}")
    print("This could be due to:")
    print("- Incorrect S3 path or file format.")
    print("- Insufficient memory (even Dask needs some overhead; consider a larger instance if persistent crashes occur).")
    print("- Issues with the Parquet file integrity.")


--- Loading with Dask DataFrame ---
Dask DataFrame created successfully with 416 partitions.
Dask DataFrame Schema:
application_number      string[pyarrow]
publication_number      string[pyarrow]
title                   string[pyarrow]
decision                string[pyarrow]
date_produced           string[pyarrow]
date_published          string[pyarrow]
main_cpc_label          string[pyarrow]
cpc_labels                       object
main_ipcr_label         string[pyarrow]
ipcr_labels                      object
patent_number           string[pyarrow]
filing_date             string[pyarrow]
patent_issue_date       string[pyarrow]
abandon_date            string[pyarrow]
uspc_class              string[pyarrow]
uspc_subclass           string[pyarrow]
examiner_id             string[pyarrow]
examiner_name_last      string[pyarrow]
examiner_name_first     string[pyarrow]
examiner_name_middle    string[pyarrow]
inventor_list                    object
abstract                string[pyarrow]
sum