DatabricksSession class's remote()

In [4]:
from databricks.connect import DatabricksSession
from utils import secrets

# Connect to a remote Databricks cluster. Each connection field (host, token,
# cluster_id) is looked up by name through secrets.retrieve() instead of being
# written inline; the f-string renders whatever retrieve() returns as text.
spark = DatabricksSession.builder.remote(
    **{
        field: f'{secrets.retrieve(field)}'
        for field in ("host", "token", "cluster_id")
    }
).getOrCreate()

In [5]:
# Load the NYC taxi trips sample table and print its first five rows.
df = spark.table("samples.nyctaxi.trips")
df.show(n=5)

+--------------------+---------------------+-------------+-----------+----------+-----------+
|tpep_pickup_datetime|tpep_dropoff_datetime|trip_distance|fare_amount|pickup_zip|dropoff_zip|
+--------------------+---------------------+-------------+-----------+----------+-----------+
| 2016-02-14 16:52:13|  2016-02-14 17:16:04|         4.94|       19.0|     10282|      10171|
| 2016-02-04 18:44:19|  2016-02-04 18:46:00|         0.28|        3.5|     10110|      10110|
| 2016-02-17 17:13:57|  2016-02-17 17:17:55|          0.7|        5.0|     10103|      10023|
| 2016-02-18 10:36:07|  2016-02-18 10:41:45|          0.8|        6.0|     10022|      10017|
| 2016-02-22 14:14:41|  2016-02-22 14:31:52|         4.51|       17.0|     10110|      10282|
+--------------------+---------------------+-------------+-----------+----------+-----------+
only showing top 5 rows



In [15]:
from databricks.connect import DatabricksSession
from pyspark.sql.types import StructType, StructField, StringType, DateType, IntegerType
from datetime import date

# Get or create a Databricks Connect session from the "vscode" configuration profile.
spark = DatabricksSession.builder.profile("vscode").getOrCreate()

# Schema for the temperature readings: one (name, type) pair per column,
# every column declared non-nullable.
schema = StructType([
    StructField(column_name, column_type, False)
    for column_name, column_type in (
        ('AirportCode', StringType()),
        ('Date', DateType()),
        ('TempHighF', IntegerType()),
        ('TempLowF', IntegerType()),
    )
])

# Literal sample rows, laid out as
# [airport code, date, high temperature, low temperature].
data = [
    ['BLI', date(2021, 4, 3), 52, 43],
    ['BLI', date(2021, 4, 2), 50, 38],
    ['BLI', date(2021, 4, 1), 52, 41],
    ['PDX', date(2021, 4, 3), 64, 45],
    ['PDX', date(2021, 4, 2), 61, 41],
    ['PDX', date(2021, 4, 1), 66, 39],
    ['SEA', date(2021, 4, 3), 57, 43],
    ['SEA', date(2021, 4, 2), 54, 39],
    ['SEA', date(2021, 4, 1), 56, 41],
]

# Build the Spark DataFrame from the rows above using the explicit schema.
temps = spark.createDataFrame(data, schema=schema)


In [16]:

# Save the temperature DataFrame as a table in the Databricks main catalog.

# Make main.default the session's current catalog and schema (by convention).
spark.sql("USE main.default")
# Drop any earlier copy of the table so the write below does not collide with it.
spark.sql("DROP TABLE IF EXISTS main.default.temps_table")
temps.write.saveAsTable("main.default.temps_table")

In [25]:
# Import pyplot in this cell: the notebook's other pyplot import sits in a
# later cell, so on a fresh kernel run top-to-bottom `plt` would be undefined here.
import matplotlib.pyplot as plt

# Show the names of the style sheets that can be passed to plt.style.use().
plt.style.available

['Solarize_Light2',
 '_classic_test_patch',
 '_mpl-gallery',
 '_mpl-gallery-nogrid',
 'bmh',
 'classic',
 'dark_background',
 'fast',
 'fivethirtyeight',
 'ggplot',
 'grayscale',
 'seaborn-v0_8',
 'seaborn-v0_8-bright',
 'seaborn-v0_8-colorblind',
 'seaborn-v0_8-dark',
 'seaborn-v0_8-dark-palette',
 'seaborn-v0_8-darkgrid',
 'seaborn-v0_8-deep',
 'seaborn-v0_8-muted',
 'seaborn-v0_8-notebook',
 'seaborn-v0_8-paper',
 'seaborn-v0_8-pastel',
 'seaborn-v0_8-poster',
 'seaborn-v0_8-talk',
 'seaborn-v0_8-ticks',
 'seaborn-v0_8-white',
 'seaborn-v0_8-whitegrid',
 'tableau-colorblind10']

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# Apply the whitegrid style sheet to the figures drawn from here on.
plt.style.use('seaborn-v0_8-whitegrid')

# Run the query on the Databricks cluster: all rows except airport BLI, dated
# after 2021-04-01, ordered by descending high temperature. The unqualified
# table name is resolved against the session's current catalog/schema.
df_temps = spark.sql(
    "SELECT * FROM temps_table "
    "WHERE AirportCode != 'BLI' AND Date > '2021-04-01' "
    "GROUP BY AirportCode, Date, TempHighF, TempLowF "
    "ORDER BY TempHighF DESC"
)

# Bring the result into pandas and draw high/low bars per row; the trailing
# semicolon keeps the Axes repr out of the cell output.
temps_pdf = df_temps.toPandas()
temps_pdf.plot(
    kind='bar',
    x='AirportCode',
    y=['TempHighF', 'TempLowF'],
    figsize=(6, 3),
    title='High and Low Temperatures by Airport Code and Date',
);

In [55]:
# Clean up by deleting the demo table from the Databricks main catalog.
# IF EXISTS keeps this cell safe to re-run after the table is already gone, and
# the fully qualified name (the same one used when the table was created) means
# the statement does not depend on the session's current catalog/schema.
spark.sql('DROP TABLE IF EXISTS main.default.temps_table')