In [0]:
print("Hello, Databricks!")
data = [("Alice", 34), ("Bob", 45), ("Cathy", 29)]
columns = ["Name", "Age"]

df = spark.createDataFrame(data, columns)
df.show()


Hello, Databricks!
+-----+---+
| Name|Age|
+-----+---+
|Alice| 34|
|  Bob| 45|
|Cathy| 29|
+-----+---+



In [0]:
df_filtered = df.filter(df["Age"] > 30)
df_filtered.show()


+-----+---+
| Name|Age|
+-----+---+
|Alice| 34|
|  Bob| 45|
+-----+---+



In [0]:
csv_path = "/FileStore/shared_uploads/letebrhan@gmail.com/weather_climate_data_4006312.csv"  # change to your path
df_csv = spark.read.option("header", "true").csv(csv_path)
df_csv.show(5)


+-----------+--------------------+--------+----------+---------+----------+-----+---------------+----+---------------+----+---------------+----+---------------+----+---------------+----+---------------+-----+---------------+-----+---------------+----+---------------+----+---------------+-----+---------------+----+---------------+----+---------------+-----+---------------+
|    STATION|                NAME|LATITUDE| LONGITUDE|ELEVATION|      DATE| AWND|AWND_ATTRIBUTES|PGTM|PGTM_ATTRIBUTES|PRCP|PRCP_ATTRIBUTES|TAVG|TAVG_ATTRIBUTES|TMAX|TMAX_ATTRIBUTES|TMIN|TMIN_ATTRIBUTES| WDF2|WDF2_ATTRIBUTES| WDF5|WDF5_ATTRIBUTES|WSF2|WSF2_ATTRIBUTES|WSF5|WSF5_ATTRIBUTES| WT01|WT01_ATTRIBUTES|WT02|WT02_ATTRIBUTES|WT03|WT03_ATTRIBUTES| WT08|WT08_ATTRIBUTES|
+-----------+--------------------+--------+----------+---------+----------+-----+---------------+----+---------------+----+---------------+----+---------------+----+---------------+----+---------------+-----+---------------+-----+---------------+----

In [0]:
csv_path = "/FileStore/shared_uploads/letebrhan@gmail.com/hw_people_200-2.csv"  # change to your path
df_csv = spark.read.option("header", "true").csv(csv_path)
df_csv.show(5)



+-----+----------------+-----------------+
|Index| Height(Inches)"| "Weight(Pounds)"|
+-----+----------------+-----------------+
|    1|           65.78|           112.99|
|    2|           71.52|           136.49|
|    3|           69.40|           153.03|
|    4|           68.22|           142.34|
|    5|           67.79|           144.30|
+-----+----------------+-----------------+
only showing top 5 rows



In [0]:
df = spark.read.option("header", True).csv(csv_path)
cleaned_df = df.toDF(*[c.strip().replace('"', '') for c in df.columns])
cleaned_df.createOrReplaceTempView("people_csv")
spark.sql("SELECT * FROM people_csv WHERE `Height(Inches)` > 70").show()


+-----+--------------+--------------+
|Index|Height(Inches)|Weight(Pounds)|
+-----+--------------+--------------+
|    2|         71.52|        136.49|
|   16|         71.09|        140.00|
|   19|         71.23|        137.90|
|   35|         71.80|        140.10|
|   93|         71.49|        140.61|
|  132|         71.23|        130.70|
|  135|         71.10|        128.14|
|  139|         73.90|        151.39|
|  155|         72.44|        136.74|
|  159|         72.02|        138.78|
|  175|         73.83|        139.30|
|  200|         71.39|       127.88 |
+-----+--------------+--------------+



In [0]:
df = spark.read.option("header", "true").csv(csv_path, inferSchema=True)
print(df.columns)


['Index', ' Height(Inches)"', ' "Weight(Pounds)"']


In [0]:
import re

# Clean column names manually
cleaned_col_names = [re.sub(r'[^A-Za-z0-9_]', '', c.replace(' ', '_')) for c in df.columns]
df_clean = df.toDF(*cleaned_col_names)
df_clean.show(7)

from pyspark.sql.functions import col

df_transformed = df_clean.filter(col("_HeightInches") > 70)
df_transformed = df_transformed.withColumn("Height_cm", col("_HeightInches") * 2.54)

df_transformed.write.mode("overwrite").saveAsTable("tall_people")
df_transformed.show(4)

+-----+-------------+-------------+
|Index|_HeightInches|_WeightPounds|
+-----+-------------+-------------+
|    1|        65.78|       112.99|
|    2|        71.52|       136.49|
|    3|         69.4|       153.03|
|    4|        68.22|       142.34|
|    5|        67.79|        144.3|
|    6|         68.7|        123.3|
|    7|         69.8|       141.49|
+-----+-------------+-------------+
only showing top 7 rows

+-----+-------------+-------------+---------+
|Index|_HeightInches|_WeightPounds|Height_cm|
+-----+-------------+-------------+---------+
|    2|        71.52|       136.49| 181.6608|
|    8|        70.01|       136.46| 177.8254|
|   16|        71.09|        140.0| 180.5686|
|   19|        71.23|        137.9| 180.9242|
+-----+-------------+-------------+---------+
only showing top 4 rows



In [0]:

df = spark.read.option("header", "true").csv("dbfs:/FileStore/shared_uploads/letebrhan@gmail.com/sales_data.csv", inferSchema=True)
df.show(5)


+------+--------------+----------+----------+--------------+-----------+---------------+---------+-------------+---------------+----------+-----------+------+---------------+---------------+------------+--------------------+--------+--------+--------+--------+
|Row ID|      Order ID|Order Date| Ship Date|     Ship Mode|Customer ID|  Customer Name|  Segment|      Country|           City|     State|Postal Code|Region|     Product ID|       Category|Sub-Category|        Product Name|   Sales|Quantity|Discount|  Profit|
+------+--------------+----------+----------+--------------+-----------+---------------+---------+-------------+---------------+----------+-----------+------+---------------+---------------+------------+--------------------+--------+--------+--------+--------+
|     1|CA-2016-152156|2016-11-08|2016-11-11|  Second Class|   CG-12520|    Claire Gute| Consumer|United States|      Henderson|  Kentucky|      42420| South|FUR-BO-10001798|      Furniture|   Bookcases|Bush Somerset 

In [0]:
from pyspark.sql.functions import sum

# Total sales by category
df.groupBy("Category").agg(sum("Sales").alias("TotalSales")).orderBy("TotalSales", ascending=False).show()
+


+---------------+-----------------+
|       Category|       TotalSales|
+---------------+-----------------+
|     Technology|835900.0669999964|
|      Furniture|733046.8612999996|
|Office Supplies|703502.9280000031|
+---------------+-----------------+



In [0]:
import re

# Sanitize column names to remove invalid characters
cleaned_cols = [re.sub(r'[^A-Za-z0-9_]', '', c.replace(' ', '_')) for c in df.columns]
df_clean = df.toDF(*cleaned_cols)

df_clean.write.format("delta").mode("overwrite").save("/tmp/sales_data_delta")


In [0]:
df_delta = spark.read.format("delta").load("/tmp/sales_data_delta")
df_delta.show(5)


+------+--------------+----------+----------+--------------+-----------+---------------+---------+-------------+---------------+----------+-----------+------+---------------+---------------+-----------+--------------------+--------+--------+--------+--------+
|Row_ID|      Order_ID|Order_Date| Ship_Date|     Ship_Mode|Customer_ID|  Customer_Name|  Segment|      Country|           City|     State|Postal_Code|Region|     Product_ID|       Category|SubCategory|        Product_Name|   Sales|Quantity|Discount|  Profit|
+------+--------------+----------+----------+--------------+-----------+---------------+---------+-------------+---------------+----------+-----------+------+---------------+---------------+-----------+--------------------+--------+--------+--------+--------+
|     1|CA-2016-152156|2016-11-08|2016-11-11|  Second Class|   CG-12520|    Claire Gute| Consumer|United States|      Henderson|  Kentucky|      42420| South|FUR-BO-10001798|      Furniture|  Bookcases|Bush Somerset Col.

In [0]:
spark.conf.set("fs.s3a.access.key", "AKIAZM4TVHFBZKGH22OP")
spark.conf.set("fs.s3a.secret.key", "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")
spark.conf.set("fs.s3a.endpoint", "s3.amazonaws.com")

df_s3 = spark.read.option("header", "true").csv("s3a://dataengineering-glue-dmo-2025/hw_people_200.csv")
df_s3.show(5)
# dbfs:/FileStore/shared_uploads/letebrhan@gmail.com/sales_data-1.csv

+-----+---------------+--------------+
|Index| Height(Inches)|Weight(Pounds)|
+-----+---------------+--------------+
|    1|          65.78|        112.99|
|    2|          71.52|        136.49|
|    3|           69.4|        153.03|
|    4|          68.22|        142.34|
|    5|          67.79|         144.3|
+-----+---------------+--------------+
only showing top 5 rows

