# Lance-Spark Getting Started

This notebook demonstrates how to use Lance with Apache Spark for reading and writing Lance datasets.

## 1. Initialize Spark Session

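The Docker container provides the Spark environment. The cell below obtains the Spark session, registers two Lance catalogs — `lance_dir` (directory namespace) and `lance_glue` (Glue namespace) — and selects `lance_glue` as the default catalog.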

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import pandas as pd

# Catalog settings applied to the session builder. Two Lance namespace
# catalogs are registered: `lance_dir` and `lance_glue`.
lance_catalog_conf = {
    # Directory namespace
    "spark.sql.catalog.lance_dir": "com.lancedb.lance.spark.LanceNamespaceSparkCatalog",
    "spark.sql.catalog.lance_dir.impl": "dir",
    "spark.sql.catalog.lance_dir.root": "s3://lance-warehouse/dir_ns",
    "spark.sql.catalog.lance_dir.storage.endpoint": "http://minio:9000",
    "spark.sql.catalog.lance_dir.storage.aws_allow_http": "true",
    "spark.sql.catalog.lance_dir.storage.access_key_id": "admin",
    "spark.sql.catalog.lance_dir.storage.secret_access_key": "password",
    # Glue namespace
    "spark.sql.catalog.lance_glue": "com.lancedb.lance.spark.LanceNamespaceSparkCatalog",
    "spark.sql.catalog.lance_glue.impl": "glue",
    "spark.sql.catalog.lance_glue.root": "s3://lance-warehouse/glue_ns",
    "spark.sql.catalog.lance_glue.access_key_id": "xyz",
    "spark.sql.catalog.lance_glue.secret_access_key": "abc",
    "spark.sql.catalog.lance_glue.region": "us-east-1",
    "spark.sql.catalog.lance_glue.storage.endpoint": "http://minio:9000",
    "spark.sql.catalog.lance_glue.storage.aws_allow_http": "true",
    "spark.sql.catalog.lance_glue.storage.access_key_id": "admin",
    "spark.sql.catalog.lance_glue.storage.secret_access_key": "password",
}

# Apply every setting, then create the session (getOrCreate() returns the
# active session instead of a new one when a session already exists).
session_builder = SparkSession.builder
for conf_key, conf_value in lance_catalog_conf.items():
    session_builder = session_builder.config(conf_key, conf_value)
spark = session_builder.getOrCreate()

# Make lance_glue the default catalog, then select its `default` namespace
spark.sql("set spark.sql.defaultCatalog=lance_glue")
spark.sql("use default")

# List the catalogs known to the session as a sanity check
spark.sql("SHOW CATALOGS").show()

## 2. Create Sample Data

In [None]:
# Column names for the sample rows, in tuple order
columns = ["id", "name", "age", "department", "salary"]

# Five example employee records
data = [
    (1, "Alice", 25, "Engineering", 75000),
    (2, "Bob", 30, "Marketing", 65000),
    (3, "Charlie", 35, "Sales", 70000),
    (4, "Diana", 28, "Engineering", 80000),
    (5, "Eve", 32, "HR", 60000),
]

# Build the DataFrame and preview it
df = spark.createDataFrame(data, schema=columns)
df.show()

## 3. Create Table

In [None]:
# IF NOT EXISTS keeps this cell re-runnable: if the table is still present
# (e.g. an earlier run stopped before the cleanup cell), the statement is a
# no-op instead of raising a "table already exists" error.
# The trailing `;` suppresses the DataFrame repr in the cell output.
spark.sql("CREATE TABLE IF NOT EXISTS employees (id INT, name STRING, age INT, department STRING, salary INT)");

## 4. Show Tables

In [None]:
# List the tables visible in the current catalog/namespace
tables_listing = spark.sql("SHOW TABLES")
tables_listing.show()

## 5. Describe Table

In [None]:
# show() truncates cell values to 20 characters by default, which would cut
# off long values in the extended description; truncate=False prints them in full.
spark.sql("DESCRIBE TABLE EXTENDED employees").show(truncate=False)

## 6. Write Data

In [None]:
# Append the sample DataFrame's rows to the `employees` table
employees_writer = df.writeTo("employees")
employees_writer.append()

## 7. Simple Read

In [None]:
# Load the `employees` table as a DataFrame and print its rows
employees_df = spark.read.table("employees")
employees_df.show()

## 8. Query Lance Table using SQL

In [None]:
# Per-department head count and average salary, highest average first
department_stats_sql = """
    SELECT department, 
           COUNT(*) as employee_count,
           AVG(salary) as avg_salary
    FROM employees
    GROUP BY department
    ORDER BY avg_salary DESC
"""

# Run the aggregation against the Lance table and print the result
result = spark.sql(department_stats_sql)
result.show()

## 9. Append More Data to Lance Table

In [None]:
# Two additional employee records, in the same column order as `columns`
new_data = [
    (6, "Frank", 29, "Engineering", 77000),
    (7, "Grace", 31, "Marketing", 68000),
]
new_df = spark.createDataFrame(new_data, schema=columns)

# Add the new rows to the existing `employees` table
new_df.writeTo("employees").append()

# Show the two rows with the highest ids to confirm the append
latest_rows = spark.sql("SELECT * FROM employees ORDER BY id DESC LIMIT 2")
latest_rows.show()

## 10. Cleanup

In [None]:
# Remove the demo table; IF EXISTS makes this a no-op when it is already gone
drop_employees_sql = "DROP TABLE IF EXISTS employees"
spark.sql(drop_employees_sql)