In [3]:
!pip install kaggle
!pip install azure-storage-file-datalake

Collecting azure-storage-file-datalake
  Downloading azure_storage_file_datalake-12.20.0-py3-none-any.whl.metadata (16 kB)
Downloading azure_storage_file_datalake-12.20.0-py3-none-any.whl (263 kB)
Installing collected packages: azure-storage-file-datalake
Successfully installed azure-storage-file-datalake-12.20.0


In [5]:
import os
from pathlib import Path
from kaggle.api.kaggle_api_extended import KaggleApi
from azure.storage.filedatalake import DataLakeServiceClient

In [7]:
def download_and_upload_dataset(connection_string=None, container_name="raw-data", base_dir='./Olist'):
    """Download the Olist e-commerce dataset from Kaggle and upload its CSVs to ADLS Gen2.

    The dataset ``olistbr/brazilian-ecommerce`` is downloaded and unzipped into
    ``<base_dir>/raw-data``. Every ``*.csv`` found there is uploaded to the root
    of the ``container_name`` file system, using the shorter name from the
    rename map when one exists and the original file name otherwise.
    Existing remote files with the same name are overwritten.

    Args:
        connection_string: Azure Storage connection string. When omitted, the
            ``AZURE_STORAGE_CONNECTION_STRING`` environment variable is used,
            so the secret never has to be written into the notebook.
        container_name: Target ADLS Gen2 file system (container).
        base_dir: Local directory under which the ``raw-data`` download
            folder is created.

    Raises:
        ValueError: If no connection string is supplied and the environment
            variable is unset or empty.
        Exception: Any error raised during the upload is printed and then
            re-raised, so a failed run is not mistaken for a successful one.

    Kaggle credentials must already be configured for ``KaggleApi.authenticate()``.
    """
    # Resolve and validate the secret first: failing here is cheaper than
    # failing after the whole dataset has been downloaded.
    connection_string = connection_string or os.environ.get("AZURE_STORAGE_CONNECTION_STRING")
    if not connection_string:
        raise ValueError(
            "No Azure connection string provided: pass connection_string=... or "
            "set the AZURE_STORAGE_CONNECTION_STRING environment variable."
        )

    # Authenticate with Kaggle
    api = KaggleApi()
    api.authenticate()

    # Local folders
    raw_data_dir = os.path.join(base_dir, 'raw-data')
    os.makedirs(raw_data_dir, exist_ok=True)

    print(f"Downloading dataset to {raw_data_dir}...")
    api.dataset_download_files(dataset='olistbr/brazilian-ecommerce', path=raw_data_dir, unzip=True)

    # File renaming map; files not listed here keep their original name.
    rename_map = {
        "olist_customers_dataset.csv": "customers.csv",
        "olist_geolocation_dataset.csv": "geolocation.csv",
        "olist_order_items_dataset.csv": "order_items.csv",
        "olist_order_payments_dataset.csv": "order_payment.csv",
        "olist_order_reviews_dataset.csv": "order_reviews.csv",
        "olist_orders_dataset.csv": "orders.csv",
        "olist_products_dataset.csv": "products.csv",
        "olist_sellers_dataset.csv": "sellers.csv"
    }

    try:
        # ADLS Gen2 upload
        service_client = DataLakeServiceClient.from_connection_string(connection_string)
        file_system_client = service_client.get_file_system_client(file_system=container_name)

        for file_path in Path(raw_data_dir).glob('*.csv'):
            original_name = file_path.name
            new_name = rename_map.get(original_name, original_name)

            print(f"Uploading {original_name} as {new_name}...")

            file_client = file_system_client.get_file_client(new_name)
            with open(file_path, "rb") as data:
                # upload_data streams from the open handle instead of loading
                # the whole file into memory; overwrite=True replaces any
                # existing remote file of the same name.
                file_client.upload_data(data, overwrite=True)

            print(f"Uploaded as {new_name}")

        print("All files uploaded successfully!")

    except Exception as e:
        print(f"ADLS Gen2 upload error: {str(e)}")
        # Re-raise so the cell fails visibly; the first error already stopped
        # the remaining uploads, so the container may be only partly populated.
        raise

# Entry point: run the download-and-upload pipeline when this code executes as
# the main module, which is the case when the notebook cell is run.
if __name__ == "__main__":
    download_and_upload_dataset()

Downloading dataset to ./Olist/raw-data...
Dataset URL: https://www.kaggle.com/datasets/olistbr/brazilian-ecommerce
Uploading olist_sellers_dataset.csv as sellers.csv...
Uploaded as sellers.csv
Uploading product_category_name_translation.csv as product_category_name_translation.csv...
Uploaded as product_category_name_translation.csv
Uploading olist_orders_dataset.csv as orders.csv...
Uploaded as orders.csv
Uploading olist_order_items_dataset.csv as order_items.csv...
Uploaded as order_items.csv
Uploading olist_customers_dataset.csv as customers.csv...
Uploaded as customers.csv
Uploading olist_geolocation_dataset.csv as geolocation.csv...
Uploaded as geolocation.csv
Uploading olist_order_payments_dataset.csv as order_payment.csv...
Uploaded as order_payment.csv
Uploading olist_order_reviews_dataset.csv as order_reviews.csv...
Uploaded as order_reviews.csv
Uploading olist_products_dataset.csv as products.csv...
Uploaded as products.csv
All files uploaded successfully!
