In [4]:
from bootcamp.databricks_compat import display, dbutils, spark
from data.config import DATA_DIR

In [5]:
%fs ls /databricks-datasets/nyctaxi/tripdata/yellow/

path,name,size,modificationTime
dbfs:/home/jovyan/work/data/databricks-datasets/nyctaxi/tripdata/yellow/yellow_tripdata_sample.tsv.gz,yellow_tripdata_sample.tsv.gz,81887950,1765285617688


In [6]:
nytaxi_df = spark.read.format("csv") \
    .option("header", "true") \
    .option("sep", "\t") \
    .option("inferSchema", "true") \
    .option("codec", "gzip") \
    .load(f"{DATA_DIR}/databricks-datasets/nyctaxi/tripdata/yellow/yellow_tripdata_sample.tsv.gz")

In [7]:
nytaxi_df.count()

1000660

In [8]:
nytaxi_df.show()

+----------+---------+-----------+-------------------+------------+-------------------+------------------+------------+------------------+------------------+------------------+------------------+---------------+------------------+-----------+-----+-------+----------+------------+---------+---------------------+------------+------------+---------+------+-------+--------+-------------------+--------------+---------------+-------------+-----------------+-----------------+--------------+--------------------+-----------+--------------------+---------------+----------------+--------------+------------------+------------------+---------------+--------------------+------------+
|   trip_id|vendor_id|pickup_date|    pickup_datetime|dropoff_date|   dropoff_datetime|store_and_fwd_flag|rate_code_id|  pickup_longitude|   pickup_latitude| dropoff_longitude|  dropoff_latitude|passenger_count|     trip_distance|fare_amount|extra|mta_tax|tip_amount|tolls_amount|ehail_fee|improvement_surcharge|total_amo

In [10]:
nytaxi_df.write.format("delta").mode("overwrite").saveAsTable("nytaxi_yellow_trips")

## Delta Cache (`spark.databricks.io.cache.enabled`)

**This is a Databricks-proprietary feature** — it does NOT work in open-source Apache Spark or Delta Lake.

On Databricks clusters, Delta Cache:
- Uses local NVMe SSD storage on worker nodes
- Automatically caches remote Parquet/Delta files from cloud storage (S3/ADLS/GCS)
- Is transparent (no code changes needed)

### Caching alternatives for local Spark:

**DataFrame caching:**
```python
df.cache()  # or df.persist(StorageLevel.MEMORY_AND_DISK)
df.unpersist()  # when done
```

**SQL table caching:**
```sql
CACHE TABLE my_table
UNCACHE TABLE my_table
```

**Delta optimizations (improves read performance):**
```sql
OPTIMIZE my_table
OPTIMIZE my_table ZORDER BY (column1, column2)
```

Since local development reads from local disk (not remote cloud storage), standard Spark caching provides the best performance improvement for repeated operations.


In [12]:
%%sql

WITH vendors AS (
  SELECT DISTINCT vendor_id
  FROM nytaxi_yellow_trips
)
SELECT COUNT(*) AS distinct_vendor_count
FROM vendors

distinct_vendor_count
2


In [14]:
%%sql

WITH vendors AS (
  SELECT DISTINCT vendor_id
  FROM nytaxi_yellow_trips
)
SELECT COUNT(*) AS distinct_vendor_count
FROM vendors

distinct_vendor_count
2


In [0]:
distinct_vendor_count = nytaxi_df.select("vendor_name").distinct().count()


In [0]:
distinct_vendor_count

Out[16]: 3

In [0]:
distinct_vendor_count = nytaxi_df.select("vendor_name").distinct().count()
