# Sample datasets

Reference documentation: https://docs.databricks.com/aws/en/discover/databricks-datasets

## Unity Catalog datasets

In [0]:
%sql
-- Preview up to 10 rows of samples.nyctaxi.trips.
-- There is no ORDER BY, so which rows come back is not guaranteed.
SELECT * FROM samples.nyctaxi.trips LIMIT 10

In [0]:
%sql
-- List the tables available in the samples.tpch schema.
SHOW TABLES IN samples.tpch

## Databricks datasets (databricks-datasets) mounted to DBFS

In [0]:
# List the top-level entries under /databricks-datasets and render them as a table.
display(dbutils.fs.ls('/databricks-datasets'))

In [0]:
# List the contents of the flights dataset folder.
display(dbutils.fs.ls('dbfs:/databricks-datasets/flights/'))

In [0]:
# Peek at the start of airport-codes-na.txt; as the cell's last expression,
# the returned text is shown as the cell output.
dbutils.fs.head('dbfs:/databricks-datasets/flights/airport-codes-na.txt')

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit
import re

# Obtain the Spark session for this notebook (getOrCreate returns the
# existing session when one is already running).
spark = SparkSession.builder.getOrCreate()

# Markers of the dataset formats this notebook looks for: two file
# extensions plus the directory name that identifies a Delta table.
target_extensions = [
    ".csv",
    ".parquet",
    "_delta_log",
]

def find_first_matching_file(folder_path, list_dir=None):
    """Search ``folder_path`` depth-first for the first Delta table, CSV or Parquet file.

    Each folder is inspected as a whole before any of its files are
    considered: if it contains a ``_delta_log`` sub-directory the folder
    itself is reported as a Delta table, so the Parquet data files inside a
    Delta table are never mistaken for a standalone Parquet dataset,
    regardless of the order in which the listing returns entries.

    Args:
        folder_path: Root path at which the search starts.
        list_dir: Optional callable taking a path and returning an iterable
            of entries exposing ``.name``, ``.path`` and ``.isDir()``.
            Defaults to ``dbutils.fs.ls``.

    Returns:
        A ``(format, path)`` tuple where ``format`` is ``"delta"``, ``"csv"``
        or ``"parquet"``. For ``"delta"`` the path is the table folder; for
        the others it is the matching file. ``(None, None)`` if nothing
        matches.
    """
    if list_dir is None:
        # Resolved at call time so the function can be defined (and unit
        # tested) where the notebook-provided `dbutils` global is missing.
        list_dir = dbutils.fs.ls

    pending = [folder_path]
    while pending:
        current_path = pending.pop()
        try:
            items = list(list_dir(current_path))
        except Exception:
            # Best-effort scan: skip folders that cannot be listed (e.g.
            # permission denied) while still letting KeyboardInterrupt and
            # SystemExit propagate, which a bare `except:` would swallow.
            continue

        # Check for the Delta marker before looking at any file so the
        # result does not depend on listing order.
        if any(
            item.isDir() and item.name.rstrip("/") == "_delta_log"
            for item in items
        ):
            return ("delta", current_path)

        for item in items:
            if item.isDir():
                pending.append(item.path)
            elif item.path.endswith(".csv"):
                return ("csv", item.path)
            elif item.path.endswith(".parquet"):
                return ("parquet", item.path)
    return (None, None)

# Get all top-level folders in /databricks-datasets/
top_level = [f for f in dbutils.fs.ls("/databricks-datasets/") if f.isDir()]

# Search each folder for a sample file of a desired format; folders with no
# match are left out of the results.
results = []
for entry in top_level:
    fmt, sample_path = find_first_matching_file(entry.path)
    if fmt:
        results.append((entry.name, fmt, sample_path))

# Create the DataFrame with an explicit schema. Given only column names,
# Spark has to infer the types from the rows, which raises an error when
# `results` is empty (e.g. no folder was readable or nothing matched).
df = spark.createDataFrame(
    results,
    "dataset STRING, format STRING, sample_path STRING",
)
display(df)
