### Script Overview

This notebook extracts a dataset from Kaggle, serializes it to an in-memory CSV, and uploads it to Azure Blob Storage. Configuration is read from environment variables, and the notebook supports both local and Databricks environments.

In [None]:
import io
import os
import kagglehub
from pathlib import Path
from dotenv import load_dotenv
from kagglehub import KaggleDatasetAdapter
from azure.storage.blob import BlobServiceClient


In [None]:
# Environment variables loader that works both locally and in Databricks.
# Locate the repository root by walking upwards from the current working
# directory. The working directory itself is included in the search so the
# notebook also works when it is launched from the repository root.
absolute_path = Path(os.getcwd()).absolute()

repo_path = next(
    (
        candidate
        for candidate in (absolute_path, *absolute_path.parents)
        if str(candidate).endswith("spark-pipeline")
    ),
    None,
)

# Fail with an actionable message instead of a NameError on `repo_path` below.
if repo_path is None:
    raise FileNotFoundError(
        f"Could not find a 'spark-pipeline' directory at or above {absolute_path}; "
        "run this notebook from inside the repository."
    )

enviroment_variable_path = os.path.join(repo_path, "config", ".env")

# load_dotenv does not raise when the file is absent; variables that are still
# missing afterwards surface as KeyError when they are read from os.environ.
load_dotenv(enviroment_variable_path)

In [None]:
# Environment variables required by this notebook.
# os.environ is indexed directly (not .get) so a missing variable raises
# KeyError immediately instead of propagating None into later cells.
KAGGLE_FILE, KAGGLE_HANDLE = (
    os.environ[name] for name in ("KAGGLE_FILE", "KAGGLE_HANDLE")
)

AZURE_BLOB_STORAGE_CONTAINER, AZURE_STORAGE_ACCOUNT_CONNECTION_STRING = (
    os.environ[name]
    for name in (
        "AZURE_BLOB_STORAGE_CONTAINER",
        "AZURE_STORAGE_ACCOUNT_CONNECTION_STRING",
    )
)


In [None]:
# Extracting dataset from KaggleHub
# `kagglehub.load_dataset` is a deprecated alias of `kagglehub.dataset_load`.
# Prefer the current name and fall back to the old one so the notebook keeps
# working on kagglehub releases that predate the rename.
load_kaggle_dataset = getattr(kagglehub, "dataset_load", None) or kagglehub.load_dataset

df = load_kaggle_dataset(
    adapter=KaggleDatasetAdapter.PANDAS,
    handle=KAGGLE_HANDLE,
    path=KAGGLE_FILE,
)

# Serialize the DataFrame to CSV in an in-memory buffer (nothing touches disk).
# NOTE(review): to_csv writes the DataFrame index as an extra leading column by
# default, so the uploaded file may differ from the Kaggle original. Confirm
# whether downstream readers rely on that column before passing index=False.
csv_data = io.BytesIO()
df.to_csv(csv_data)
csv_data.seek(0)  # rewind so the buffer can also be consumed as a stream

In [None]:
# Build the Azure clients: first the account-level service client from the
# connection string, then a client scoped to the target container.
# Note: despite its name, `blob_client` holds a BlobServiceClient.
blob_client = BlobServiceClient.from_connection_string(
    AZURE_STORAGE_ACCOUNT_CONNECTION_STRING
)
container_client = blob_client.get_container_client(
    AZURE_BLOB_STORAGE_CONTAINER
)

In [None]:
# Upload the serialized CSV to the container, using the Kaggle file name as the
# blob name. overwrite=True replaces any blob that already has that name.
csv_payload = csv_data.getvalue()
container_client.upload_blob(
    name=KAGGLE_FILE,
    data=csv_payload,
    overwrite=True,
)