- Databricks documentation: [link](https://docs.databricks.com/aws/en/getting-started/dataframes)
- อธิบายคร่าวๆ(TH Ver.) : [link](https://docs.google.com/document/d/1j_pvp97Zi-EklHmFmac8nPTjyC2I1uYm4jN6-HToDls/edit?usp=sharing)

In [0]:
%sql
-- Create the volume that the Python cells below download into and read from.
-- The name is fully qualified so it always matches the catalog/schema/volume
-- values set in the configuration cell, whatever the notebook's current
-- catalog and schema happen to be.
CREATE VOLUME IF NOT EXISTS workspace.default.tutorial

In [0]:
# Unity Catalog location and source-file settings shared by the cells below.
catalog = "workspace"
schema = "default"
volume = "tutorial"
download_url = "https://health.data.ny.gov/api/views/jxy9-yhdk/rows.csv"
file_name = "rows.csv"
table_name = "baby"

# Derived locations: the volume's directory path and the catalog.schema
# prefix used when naming tables.
path_volume = f"/Volumes/{catalog}/{schema}/{volume}"
path_table = f"{catalog}.{schema}"

print(path_table)   # catalog.schema prefix
print(path_volume)  # /Volumes/<catalog>/<schema>/<volume>

In [0]:
# Copy the CSV from its public URL into the volume, stored under file_name.
dbutils.fs.cp(download_url, f"{path_volume}/{file_name}")

In [0]:
# One hand-written row; its fields follow the order of the schema string below.
data = [[2021, "test", "Albany", "M", 42]]
# Column names as a list. The createDataFrame call below does not read this
# variable: it takes both names and types from the DDL schema string.
columns = ["Year", "First_Name", "County", "Sex", "Count"]

df1 = spark.createDataFrame(
    data,
    schema="Year int, First_Name STRING, County STRING, Sex STRING, Count int",
)

# display() is specific to Databricks notebooks and gives a richer rendering;
# df1.show() is the plain Apache Spark DataFrame API alternative.
display(df1)

In [0]:
# Load the downloaded CSV from the volume. The first row supplies the column
# names and the column types are inferred from the data.
df_csv = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .option("sep", ",")
    .csv(f"{path_volume}/{file_name}")
)
display(df_csv)


In [0]:
# Print both schemas (CSV first, then the hand-built row) so they can be compared.
for frame in (df_csv, df1):
    frame.printSchema()

In [0]:
# Rename the CSV's "First Name" column to "First_Name", the name used by df1.
df_csv = df_csv.withColumnRenamed("First Name", "First_Name")
# printSchema must be called: the bare attribute only evaluates to the bound
# method and never prints the schema.
df_csv.printSchema()

In [0]:
# Append the CSV rows to the one-row DataFrame. union() matches columns by
# position, not by name, so it relies on both DataFrames listing their columns
# in the same order.
df = df1.union(df_csv)
display(df)
# Call printSchema() (with parentheses) so the combined schema is actually printed.
df.printSchema()

In [0]:
# Keep only rows whose Count exceeds 50, written as a SQL predicate string.
display(df.filter("Count > 50"))

In [0]:
# Same selection as the filter() cell, expressed through where().
display(df.where("Count > 50"))

In [0]:
from pyspark.sql.functions import desc

# Show names with their counts, largest Count first.
display(
    df.select("First_Name", "Count")
      .sort(desc("Count"))
)

In [0]:
# Female names from 2009 with a Count above 100, largest Count first.
# The three chained filters are combined with AND, one condition per step.
subsetDF = (
    df.filter(df["Year"] == 2009)
      .filter(df["Count"] > 100)
      .filter(df["Sex"] == "F")
      .select("First_Name", "County", "Count")
      .orderBy(desc("Count"))
)
display(subsetDF)

In [0]:
# Count the rows of the combined DataFrame; as the last expression in the cell,
# its value is what the notebook shows as the cell result.
df.count()

In [0]:
# Persist the combined DataFrame as <catalog>.<schema>.<table_name>,
# replacing the table's contents if it already exists.
df.write.saveAsTable(f"{path_table}.{table_name}", mode="overwrite")

In [0]:
# Write the DataFrame as JSON files into a json_data/ directory of the volume,
# replacing any previous output. The location is derived from path_volume so it
# follows the catalog/schema/volume settings instead of repeating them.
df.write.format("json").mode("overwrite").save(f"dbfs:{path_volume}/json_data/")


In [0]:
# Read the JSON output back from the volume. The location is derived from
# path_volume rather than hard-coded, and .json() already selects the JSON
# reader, so no separate format("json") call is needed.
display(spark.read.json(f"dbfs:{path_volume}/json_data/"))

In [0]:
# Count the rows stored in the JSON output. count() returns a plain Python int
# rather than a DataFrame, so the value is printed instead of being passed to
# display(). The location is derived from path_volume rather than hard-coded.
print(spark.read.json(f"dbfs:{path_volume}/json_data/").count())

In [0]:
# Project Count next to an upper-cased copy of County, using SQL expression strings.
display(
    df.selectExpr(
        "Count",
        "upper(County) as big_name",
    )
)


In [0]:
from pyspark.sql.functions import expr

# Same idea as selectExpr, but through select(): a plain column name mixed with
# a SQL expression wrapped in expr().
display(
    df.select(
        "Count",
        expr("lower(County) as little_name"),
    )
)


In [0]:
# Query the saved table through Spark SQL; the table name is assembled from
# the configuration variables.
display(
    spark.sql(f"SELECT * FROM {path_table}.{table_name}")
)
