In [None]:
from spark_init import create_spark_session

In [None]:
# Testing Local Databricks Stack
# This notebook demonstrates the integration of the components in our local Databricks-like environment.

# Import required libraries
from pyspark.sql import SparkSession
# import mlflow
# import mlflow.spark
from delta import *
# import pandas as pd
# import polars as pl
# import numpy as np
# from sklearn.model_selection import train_test_split
# from sklearn.ensemble import RandomForestRegressor

In [None]:
# Build the Spark session through the project helper imported from spark_init.
spark = create_spark_session()

# Report success together with the Spark version the session is running.
print(
    "Spark session created successfully!",
    f"Spark version: {spark.version}",
    sep="\n",
)

In [None]:
# Small in-memory fixture: three (id, value) rows used as a smoke-test DataFrame.
data = [
    (1, "test1"),
    (2, "test2"),
    (3, "test3"),
]
df = spark.createDataFrame(data, schema=["id", "value"])

In [None]:
# Run an action on the sample DataFrame so Spark actually executes a job;
# the fixture above supplies three rows.
df.count()

In [None]:
# Test Delta Lake
# Round-trip a small DataFrame through a Delta table stored in MinIO (s3a://).

# Create sample data
data = [(1, "test1"), (2, "test2"), (3, "test3")]
df = spark.createDataFrame(data, ["id", "value"])

# Write to Delta Lake format in MinIO.
# The format is named explicitly: without .format(...), Spark falls back to
# spark.sql.sources.default (parquet unless the session overrides it), and the
# later query against delta.`s3a://delta/test-table` needs a real Delta table.
df.write.format("delta").mode("overwrite").save("s3a://delta/test-table")

# Read the table back as Delta and display it to confirm the round trip.
df_read = spark.read.format("delta").load("s3a://delta/test-table")
df_read.show()

In [None]:
# Test MLflow integration
# Trains a throwaway model on random data and logs it under "test-experiment".
# NOTE(review): the imports for mlflow, numpy (np) and RandomForestRegressor are
# commented out in the import cell of this notebook, so this cell raises
# NameError on a fresh kernel until they are restored.
mlflow.set_experiment("test-experiment")

with mlflow.start_run():
    # Synthetic regression problem: 100 samples with 4 features, random targets.
    X = np.random.rand(100, 4)
    y = np.random.rand(100)

    # fit() returns the estimator itself, so construction and training chain.
    model = RandomForestRegressor(n_estimators=100).fit(X, y)

    # Record the hyperparameter as stored on the estimator, then the model.
    mlflow.log_param("n_estimators", model.n_estimators)
    mlflow.sklearn.log_model(model, "model")

In [None]:
# Test Spark SQL against the Delta table
# Addresses the table by its storage path using the delta.`<path>` syntax.
# This is a plain spark.sql() call, not a %%sql cell magic.
query = "SELECT * FROM delta.`s3a://delta/test-table`"
result = spark.sql(query)
result.show()

In [None]:
# Test Polars integration
# Spark -> pandas -> Polars: collect the Delta read-back and wrap it in Polars.
# NOTE(review): `pl` is only imported in a commented-out line of the import
# cell, so this cell raises NameError on a fresh kernel until that import is restored.
pandas_df = df_read.toPandas()
polars_df = pl.from_pandas(pandas_df)

# Print a heading followed by the frame's text representation.
print("Polars DataFrame:", polars_df, sep="\n")

In [None]:
# Shut down the Spark session now that all checks have run; cells that use
# `spark` need the session to be created again before they can be re-run.
spark.stop()