### **Bronze layer**

In [0]:
from pyspark.sql.types import *
from pyspark.sql import *
from pyspark.sql import functions as F

# Explicit schema for the Yelp business JSON file. Every field is declared
# nullable; `attributes` and `hours` are read as string-to-string maps.
business_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ("business_id", StringType()),
        ("name", StringType()),
        ("address", StringType()),
        ("city", StringType()),
        ("state", StringType()),
        ("postal_code", StringType()),
        ("latitude", DoubleType()),
        ("longitude", DoubleType()),
        ("stars", DoubleType()),
        ("review_count", IntegerType()),
        ("is_open", IntegerType()),
        ("attributes", MapType(StringType(), StringType())),
        ("categories", StringType()),
        ("hours", MapType(StringType(), StringType())),
    ]
])

# Read the raw business JSON using the explicit schema defined above.
business_bronze = (
    spark.read
    .schema(business_schema)
    .json("/Volumes/workspace/default/yelp-reviews/yelp_academic_dataset_business.json")
)

# Persist the rows unchanged as the bronze Delta table, replacing prior contents.
(
    business_bronze.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .saveAsTable("bronzebusiness")
)

### **Silver layer**

In [0]:
from pyspark.sql import *

# Load the bronze table; the profiling cells that follow all read from `src`.
src = spark.read.table("bronzebusiness")
bronze_row_total = src.count()
print("Row count:", bronze_row_total)


In [0]:
# Columns whose null counts are reported.
key_cols = [
    "business_id", "name", "city", "state", "postal_code", "stars",
    "review_count", "latitude", "longitude", "is_open", "categories", "hours",
]

# One "<column>_nulls" aggregate per key column: 1 for a null value, 0 otherwise, summed.
null_counters = [
    F.sum(F.col(column_name).isNull().cast("int")).alias(f"{column_name}_nulls")
    for column_name in key_cols
]
total_rows = F.count(F.lit(1)).alias("row_count")

# Single-row result: the null counters followed by the overall row count.
missing_stats = src.select(null_counters + [total_rows])
missing_stats.display()

In [0]:
# business_id values that occur more than once, most repeated first.
rows_per_business_id = src.groupBy("business_id").count()
dupes = (
    rows_per_business_id
    .where(F.col("count") > 1)
    .orderBy(F.desc("count"))
)
print("Duplicate business_id rows:", dupes.count())
dupes.display()

In [0]:
from pyspark.sql.functions import col, round

numeric_cols = ["stars", "review_count", "latitude", "longitude"]

# summary() returns its statistics as strings, so each one is cast to double
# before being rounded to two decimals.
summaries = src.select(*numeric_cols).summary("count", "min", "max", "mean", "stddev")
rounded_stats = [
    F.round(F.col(column_name).cast("double"), 2).alias(column_name)
    for column_name in numeric_cols
]
rounded = summaries.select("summary", *rounded_stats)
rounded.display()


In [0]:
# Number of businesses per state, largest first.
top_states = (
    src.groupBy("state")
       .count()
       .orderBy(F.desc("count"))
)
top_states.display()

In [0]:
# Number of businesses per city, largest first; only the first 20 rows are printed.
top_cities = (
    src.groupBy("city")
       .count()
       .orderBy(F.desc("count"))
)
print("Top cities:")
top_cities.show(20, truncate=False)

In [0]:
# Cleaned category list: split the comma-separated string, trim every token and
# drop tokens that are empty after trimming. NULL input stays NULL.
category_array = F.expr(
    "filter(transform(split(categories, ','), x -> trim(x)), x -> x != '')"
)

# One row per (business, category). explode_outer keeps businesses whose list is
# NULL or empty, giving them a single row with a NULL category.
cats = src.withColumn("category", F.explode_outer(category_array))

# A business has no usable categories when the column is NULL or when nothing
# survives the cleaning above (blank string, or only separators/whitespace).
# The previous `size(split(...)) == 0` test could never be true because split()
# always returns at least one element for non-null input.
cat_missing = src.filter(
    F.col("categories").isNull() | (F.size(category_array) == 0)
)
print("Rows with missing/empty categories:", cat_missing.count())

# count("category") ignores NULLs, so businesses without categories get 0.
cat_per_business = (
    cats.groupBy("business_id")
        .agg(F.count("category").alias("category_count"))
)
cat_per_business.describe("category_count").display()

top_categories = cats.groupBy("category").count().orderBy(F.col("count").desc())
print("Top categories:")
top_categories.show(20, truncate=False)


In [0]:
# How often each key appears in the `attributes` map. explode_outer also emits
# one row with a NULL attr_key for rows whose key list is NULL or empty.
attribute_key_rows = src.select(
    F.explode_outer(F.map_keys("attributes")).alias("attr_key")
)
attr_keys = (
    attribute_key_rows
    .groupBy("attr_key")
    .count()
    .orderBy(F.desc("count"))
)
print("Attribute keys:")
attr_keys.display()

In [0]:
# Frequency of each raw value stored under the "BusinessAcceptsCreditCards" key of the attributes map.
credit_cards_acceptance_counts = (
    src.select(F.col("attributes").getItem("BusinessAcceptsCreditCards").alias("BusinessAcceptsCreditCards"))
       .groupBy("BusinessAcceptsCreditCards")
       .count()
       .orderBy(F.desc("count"))
)
credit_cards_acceptance_counts.show(truncate=False)

In [0]:
parking_schema = "struct<garage:boolean,street:boolean,validated:boolean,lot:boolean,valet:boolean>"

# The raw BusinessParking text uses Python-literal conventions (single quotes,
# True/False/None), so it is rewritten into JSON before parsing. A leading
# `u` string prefix (u'key') is stripped first, as the HairSpecializesIn cell
# of this notebook already does; otherwise such values are not valid JSON and
# from_json yields NULLs for them. Order matters: the prefix must go before
# the quotes are converted.
bp_json_expr = F.col("attributes")["BusinessParking"]
for pattern, replacement in (
    (r"\bu'", "'"),
    ("'", '"'),
    (r"\bTrue\b", "true"),
    (r"\bFalse\b", "false"),
    (r"\bNone\b", "null"),
):
    bp_json_expr = F.regexp_replace(bp_json_expr, pattern, replacement)

parking = (
    src
    .withColumn("bp_raw", F.col("attributes")["BusinessParking"])
    .withColumn("bp_json", bp_json_expr)
    .withColumn("bp", F.from_json("bp_json", parking_schema))
)

# Unpivot the five flags into (k, v) rows and count each flag/value pair.
parking_counts = (
    parking
    .selectExpr(
        "stack(5, "
        "'garage', bp.garage, "
        "'street', bp.street, "
        "'validated', bp.validated, "
        "'lot', bp.lot, "
        "'valet', bp.valet) as (k, v)"
    )
    .groupBy("k","v")
    .count()
    .orderBy("k", F.col("count").desc())
)

parking_counts.display()

In [0]:
# Frequency of each raw value stored under the "RestaurantsPriceRange2" key of the attributes map.
restaurants_price_range2_counts = (
    src.select(F.col("attributes").getItem("RestaurantsPriceRange2").alias("RestaurantsPriceRange2"))
       .groupBy("RestaurantsPriceRange2")
       .count()
       .orderBy(F.desc("count"))
)
restaurants_price_range2_counts.show(truncate=False)

In [0]:
# Frequency of each raw value stored under the "BikeParking" key of the attributes map.
bike_parking_counts = (
    src.select(F.col("attributes").getItem("BikeParking").alias("BikeParking"))
       .groupBy("BikeParking")
       .count()
       .orderBy(F.desc("count"))
)
bike_parking_counts.show(truncate=False)

In [0]:
# Frequency of each raw value stored under the "RestaurantsTakeOut" key of the attributes map.
restaurant_take_out_counts = (
    src.select(F.col("attributes").getItem("RestaurantsTakeOut").alias("RestaurantsTakeOut"))
       .groupBy("RestaurantsTakeOut")
       .count()
       .orderBy(F.desc("count"))
)
restaurant_take_out_counts.show(truncate=False)

In [0]:
# Frequency of each raw value stored under the "WiFi" key of the attributes map.
wifi_counts = (
    src.select(F.col("attributes").getItem("WiFi").alias("WiFi"))
       .groupBy("WiFi")
       .count()
       .orderBy(F.desc("count"))
)
wifi_counts.show(truncate=False)

In [0]:
# Frequency of each raw value stored under the "RestaurantsDelivery" key of the attributes map.
delivery_counts = (
    src.select(F.col("attributes").getItem("RestaurantsDelivery").alias("RestaurantsDelivery"))
       .groupBy("RestaurantsDelivery")
       .count()
       .orderBy(F.desc("count"))
)
delivery_counts.show(truncate=False)

In [0]:
# Frequency of each raw value stored under the "GoodForKids" key of the attributes map.
good_for_kids_counts = (
    src.select(F.col("attributes").getItem("GoodForKids").alias("GoodForKids"))
       .groupBy("GoodForKids")
       .count()
       .orderBy(F.desc("count"))
)
good_for_kids_counts.show(truncate=False)

In [0]:
# Frequency of each raw value stored under the "OutdoorSeating" key of the attributes map.
outdoor_seating_counts = (
    src.select(F.col("attributes").getItem("OutdoorSeating").alias("OutdoorSeating"))
       .groupBy("OutdoorSeating")
       .count()
       .orderBy(F.desc("count"))
)
outdoor_seating_counts.show(truncate=False)

In [0]:
# Frequency of each raw value stored under the "RestaurantsReservations" key of the attributes map.
reservations_counts = (
    src.select(F.col("attributes").getItem("RestaurantsReservations").alias("RestaurantsReservations"))
       .groupBy("RestaurantsReservations")
       .count()
       .orderBy(F.desc("count"))
)
reservations_counts.show(truncate=False)

In [0]:
# Frequency of each raw value stored under the "HasTV" key of the attributes map.
has_tv_counts = (
    src.select(F.col("attributes").getItem("HasTV").alias("HasTV"))
       .groupBy("HasTV")
       .count()
       .orderBy(F.desc("count"))
)
has_tv_counts.show(truncate=False)

In [0]:
ambience_schema = """
struct<
  touristy:boolean,
  hipster:boolean,
  romantic:boolean,
  divey:boolean,
  intimate:boolean,
  trendy:boolean,
  upscale:boolean,
  classy:boolean,
  casual:boolean
>
"""

# The raw Ambience text uses Python-literal conventions (single quotes,
# True/False/None), so it is rewritten into JSON before parsing. A leading
# `u` string prefix (u'key') is stripped first, as the HairSpecializesIn cell
# of this notebook already does; otherwise such values are not valid JSON and
# from_json yields NULLs for them. Order matters: the prefix must go before
# the quotes are converted.
amb_json_expr = F.col("attributes")["Ambience"]
for pattern, replacement in (
    (r"\bu'", "'"),
    ("'", '"'),
    (r"\bTrue\b", "true"),
    (r"\bFalse\b", "false"),
    (r"\bNone\b", "null"),
):
    amb_json_expr = F.regexp_replace(amb_json_expr, pattern, replacement)

amb = (
    src
    .withColumn("amb_raw", F.col("attributes")["Ambience"])
    .withColumn("amb_json", amb_json_expr)
    .withColumn("amb", F.from_json("amb_json", ambience_schema))
)

# Unpivot the nine flags into one (k, v) row per business and flag.
amb_kv = amb.selectExpr(
    "business_id",
    "stack(9, "
    "'touristy', amb.touristy, "
    "'hipster', amb.hipster, "
    "'romantic', amb.romantic, "
    "'divey', amb.divey, "
    "'intimate', amb.intimate, "
    "'trendy', amb.trendy, "
    "'upscale', amb.upscale, "
    "'classy', amb.classy, "
    "'casual', amb.casual) as (k, v)"
)

# v is a nullable boolean (see ambience_schema), so NULL / True / False are the
# only classes that can occur. "NULL" covers a missing attribute, a missing
# key, an explicit None and text that could not be parsed.
amb_classified = amb_kv.select(
    "k",
    F.when(F.col("v").isNull(), "NULL")
     .when(F.col("v"), "True")
     .otherwise("False")
     .alias("value_class")
)

amb_counts = (
    amb_classified
    .groupBy("k", "value_class")
    .count()
    .orderBy("k", "value_class")
)

amb_counts.show(100, truncate=False)

In [0]:
# Frequency of each raw value stored under the "RestaurantsGoodForGroups" key of the attributes map.
restaurants_good_for_groups_counts = (
    src.select(F.col("attributes").getItem("RestaurantsGoodForGroups").alias("RestaurantsGoodForGroups"))
       .groupBy("RestaurantsGoodForGroups")
       .count()
       .orderBy(F.desc("count"))
)
restaurants_good_for_groups_counts.show(truncate=False)

In [0]:
# Frequency of each raw value stored under the "Alcohol" key of the attributes map.
alcohol_counts = (
    src.select(F.col("attributes").getItem("Alcohol").alias("Alcohol"))
       .groupBy("Alcohol")
       .count()
       .orderBy(F.desc("count"))
)
alcohol_counts.show(truncate=False)

In [0]:
# Frequency of each raw value stored under the "ByAppointmentOnly" key of the attributes map.
by_appointment_only_counts = (
    src.select(F.col("attributes").getItem("ByAppointmentOnly").alias("ByAppointmentOnly"))
       .groupBy("ByAppointmentOnly")
       .count()
       .orderBy(F.desc("count"))
)
by_appointment_only_counts.show(truncate=False)

In [0]:
# Frequency of each raw value stored under the "Caters" key of the attributes map.
caters_counts = (
    src.select(F.col("attributes").getItem("Caters").alias("Caters"))
       .groupBy("Caters")
       .count()
       .orderBy(F.desc("count"))
)
caters_counts.show(truncate=False)

In [0]:
# Frequency of each raw value stored under the "RestaurantsAttire" key of the attributes map.
restaurants_attire_counts = (
    src.select(F.col("attributes").getItem("RestaurantsAttire").alias("RestaurantsAttire"))
       .groupBy("RestaurantsAttire")
       .count()
       .orderBy(F.desc("count"))
)
restaurants_attire_counts.show(truncate=False)

In [0]:
# Frequency of each raw value stored under the "NoiseLevel" key of the attributes map.
noise_level_counts = (
    src.select(F.col("attributes").getItem("NoiseLevel").alias("NoiseLevel"))
       .groupBy("NoiseLevel")
       .count()
       .orderBy(F.desc("count"))
)
noise_level_counts.show(truncate=False)

In [0]:
gfm_schema = """
struct<
  dessert:boolean,
  latenight:boolean,
  lunch:boolean,
  dinner:boolean,
  brunch:boolean,
  breakfast:boolean
>
"""

# The raw GoodForMeal text uses Python-literal conventions (single quotes,
# True/False/None), so it is rewritten into JSON before parsing. A leading
# `u` string prefix (u'key') is stripped first, as the HairSpecializesIn cell
# of this notebook already does; otherwise such values are not valid JSON and
# from_json yields NULLs for them. Order matters: the prefix must go before
# the quotes are converted.
gfm_json_expr = F.col("attributes")["GoodForMeal"]
for pattern, replacement in (
    (r"\bu'", "'"),
    ("'", '"'),
    (r"\bTrue\b", "true"),
    (r"\bFalse\b", "false"),
    (r"\bNone\b", "null"),
):
    gfm_json_expr = F.regexp_replace(gfm_json_expr, pattern, replacement)

gfm = (
    src
    .withColumn("gfm_raw", F.col("attributes")["GoodForMeal"])
    .withColumn("gfm_json", gfm_json_expr)
    .withColumn("gfm", F.from_json("gfm_json", gfm_schema))
)

# Unpivot the six flags into one (k, v) row per business and flag.
gfm_kv = gfm.selectExpr(
    "business_id",
    "stack(6, "
    "'dessert', gfm.dessert, "
    "'latenight', gfm.latenight, "
    "'lunch', gfm.lunch, "
    "'dinner', gfm.dinner, "
    "'brunch', gfm.brunch, "
    "'breakfast', gfm.breakfast) as (k, v)"
)

# v is a nullable boolean (see gfm_schema), so NULL / True / False are the
# only classes that can occur. "NULL" covers a missing attribute, a missing
# key, an explicit None and text that could not be parsed.
gfm_classified = gfm_kv.select(
    "k",
    F.when(F.col("v").isNull(), "NULL")
     .when(F.col("v"), "True")
     .otherwise("False")
     .alias("value_class")
)

gfm_counts = (
    gfm_classified
    .groupBy("k", "value_class")
    .count()
    .orderBy("k", "value_class")
)

gfm_counts.show(100, truncate=False)

In [0]:
# Frequency of each raw value stored under the "WheelchairAccessible" key of the attributes map.
wheelchair_accessible_counts = (
    src.select(F.col("attributes").getItem("WheelchairAccessible").alias("WheelchairAccessible"))
       .groupBy("WheelchairAccessible")
       .count()
       .orderBy(F.desc("count"))
)
wheelchair_accessible_counts.show(truncate=False)

In [0]:
# Frequency of each raw value stored under the "RestaurantsTableService" key of the attributes map.
restaurants_table_service_counts = (
    src.select(F.col("attributes").getItem("RestaurantsTableService").alias("RestaurantsTableService"))
       .groupBy("RestaurantsTableService")
       .count()
       .orderBy(F.desc("count"))
)
restaurants_table_service_counts.show(truncate=False)

In [0]:
# Frequency of each raw value stored under the "DogsAllowed" key of the attributes map.
dogs_allowed_counts = (
    src.select(F.col("attributes").getItem("DogsAllowed").alias("DogsAllowed"))
       .groupBy("DogsAllowed")
       .count()
       .orderBy(F.desc("count"))
)
dogs_allowed_counts.show(truncate=False)

In [0]:
# Frequency of each raw value stored under the "BusinessAcceptsBitcoin" key of the attributes map.
business_accepts_bitcoin_counts = (
    src.select(F.col("attributes").getItem("BusinessAcceptsBitcoin").alias("BusinessAcceptsBitcoin"))
       .groupBy("BusinessAcceptsBitcoin")
       .count()
       .orderBy(F.desc("count"))
)
business_accepts_bitcoin_counts.show(truncate=False)

In [0]:
# Frequency of each raw value stored under the "HappyHour" key of the attributes map.
happy_hour_counts = (
    src.select(F.col("attributes").getItem("HappyHour").alias("HappyHour"))
       .groupBy("HappyHour")
       .count()
       .orderBy(F.desc("count"))
)
happy_hour_counts.show(truncate=False)

In [0]:
# Frequency of each raw value stored under the "DriveThru" key of the attributes map.
drive_thru_counts = (
    src.select(F.col("attributes").getItem("DriveThru").alias("DriveThru"))
       .groupBy("DriveThru")
       .count()
       .orderBy(F.desc("count"))
)
drive_thru_counts.show(truncate=False)

In [0]:
music_schema = """
struct<
  dj:boolean,
  background_music:boolean,
  no_music:boolean,
  jukebox:boolean,
  live:boolean,
  video:boolean,
  karaoke:boolean
>
"""

# The raw Music text uses Python-literal conventions (single quotes,
# True/False/None), so it is rewritten into JSON before parsing. A leading
# `u` string prefix (u'key') is stripped first, as the HairSpecializesIn cell
# of this notebook already does; otherwise such values are not valid JSON and
# from_json yields NULLs for them. Order matters: the prefix must go before
# the quotes are converted.
music_json_expr = F.col("attributes")["Music"]
for pattern, replacement in (
    (r"\bu'", "'"),
    ("'", '"'),
    (r"\bTrue\b", "true"),
    (r"\bFalse\b", "false"),
    (r"\bNone\b", "null"),
):
    music_json_expr = F.regexp_replace(music_json_expr, pattern, replacement)

music = (
    src
    .withColumn("music_raw", F.col("attributes")["Music"])
    .withColumn("music_json", music_json_expr)
    .withColumn("music_struct", F.from_json("music_json", music_schema))
)

# Unpivot the seven flags into one (k, v) row per business and flag.
music_kv = music.selectExpr(
    "business_id",
    "stack(7, "
    "'dj', music_struct.dj, "
    "'background_music', music_struct.background_music, "
    "'no_music', music_struct.no_music, "
    "'jukebox', music_struct.jukebox, "
    "'live', music_struct.live, "
    "'video', music_struct.video, "
    "'karaoke', music_struct.karaoke) as (k, v)"
)

# v is a nullable boolean (see music_schema), so NULL / True / False are the
# only classes that can occur. "NULL" covers a missing attribute, a missing
# key, an explicit None and text that could not be parsed.
music_classified = music_kv.select(
    "k",
    F.when(F.col("v").isNull(), "NULL")
     .when(F.col("v"), "True")
     .otherwise("False")
     .alias("value_class")
)

music_counts = (
    music_classified
    .groupBy("k", "value_class")
    .count()
    .orderBy("k", "value_class")
)

music_counts.show(100, truncate=False)

In [0]:
# Frequency of each raw value stored under the "AcceptsInsurance" key of the attributes map.
accepts_insurance_counts = (
    src.select(F.col("attributes").getItem("AcceptsInsurance").alias("AcceptsInsurance"))
       .groupBy("AcceptsInsurance")
       .count()
       .orderBy(F.desc("count"))
)
accepts_insurance_counts.show(truncate=False)

In [0]:
bestnights_schema = """
struct<
  monday:boolean,
  tuesday:boolean,
  wednesday:boolean,
  thursday:boolean,
  friday:boolean,
  saturday:boolean,
  sunday:boolean
>
"""

# The raw BestNights text uses Python-literal conventions (single quotes,
# True/False/None), so it is rewritten into JSON before parsing. A leading
# `u` string prefix (u'key') is stripped first, as the HairSpecializesIn cell
# of this notebook already does; otherwise such values are not valid JSON and
# from_json yields NULLs for them. Order matters: the prefix must go before
# the quotes are converted.
bn_json_expr = F.col("attributes")["BestNights"]
for pattern, replacement in (
    (r"\bu'", "'"),
    ("'", '"'),
    (r"\bTrue\b", "true"),
    (r"\bFalse\b", "false"),
    (r"\bNone\b", "null"),
):
    bn_json_expr = F.regexp_replace(bn_json_expr, pattern, replacement)

bn = (
    src
    .withColumn("bn_raw", F.col("attributes")["BestNights"])
    .withColumn("bn_json", bn_json_expr)
    .withColumn("bn_struct", F.from_json("bn_json", bestnights_schema))
)

# Unpivot the seven weekday flags into one (k, v) row per business and day.
bn_kv = bn.selectExpr(
    "business_id",
    "stack(7, "
    "'monday', bn_struct.monday, "
    "'tuesday', bn_struct.tuesday, "
    "'wednesday', bn_struct.wednesday, "
    "'thursday', bn_struct.thursday, "
    "'friday', bn_struct.friday, "
    "'saturday', bn_struct.saturday, "
    "'sunday', bn_struct.sunday) as (k, v)"
)

# v is a nullable boolean (see bestnights_schema), so NULL / True / False are
# the only classes that can occur. "NULL" covers a missing attribute, a
# missing key, an explicit None and text that could not be parsed.
bn_classified = bn_kv.select(
    "k",
    F.when(F.col("v").isNull(), "NULL")
     .when(F.col("v"), "True")
     .otherwise("False")
     .alias("value_class")
)

bn_counts = (
    bn_classified
    .groupBy("k", "value_class")
    .count()
    .orderBy("k", "value_class")
)

bn_counts.show(100, truncate=False)

In [0]:
# Frequency of each raw value stored under the "CoatCheck" key of the attributes map.
coat_check_counts = (
    src.select(F.col("attributes").getItem("CoatCheck").alias("CoatCheck"))
       .groupBy("CoatCheck")
       .count()
       .orderBy(F.desc("count"))
)
coat_check_counts.show(truncate=False)

In [0]:
# Frequency of each raw value stored under the "GoodForDancing" key of the attributes map.
good_for_dancing_counts = (
    src.select(F.col("attributes").getItem("GoodForDancing").alias("GoodForDancing"))
       .groupBy("GoodForDancing")
       .count()
       .orderBy(F.desc("count"))
)
good_for_dancing_counts.show(truncate=False)

In [0]:
# Frequency of each raw value stored under the "Smoking" key of the attributes map.
smoking_counts = (
    src.select(F.col("attributes").getItem("Smoking").alias("Smoking"))
       .groupBy("Smoking")
       .count()
       .orderBy(F.desc("count"))
)
smoking_counts.show(truncate=False)

In [0]:
# Frequency of each raw value stored under the "BYOB" key of the attributes map.
byob_counts = (
    src.select(F.col("attributes").getItem("BYOB").alias("BYOB"))
       .groupBy("BYOB")
       .count()
       .orderBy(F.desc("count"))
)
byob_counts.show(truncate=False)

In [0]:
# Frequency of each raw value stored under the "Corkage" key of the attributes map.
corkage_counts = (
    src.select(F.col("attributes").getItem("Corkage").alias("Corkage"))
       .groupBy("Corkage")
       .count()
       .orderBy(F.desc("count"))
)
corkage_counts.show(truncate=False)

In [0]:
# Frequency of each raw value stored under the "BYOBCorkage" key of the attributes map.
byob_corkage_counts = (
    src.select(F.col("attributes").getItem("BYOBCorkage").alias("BYOBCorkage"))
       .groupBy("BYOBCorkage")
       .count()
       .orderBy(F.desc("count"))
)
byob_corkage_counts.show(truncate=False)

In [0]:
hair_schema = """
struct<
  straightperms:boolean,
  coloring:boolean,
  extensions:boolean,
  africanamerican:boolean,
  curly:boolean,
  kids:boolean,
  perms:boolean,
  asian:boolean
>
"""

# Rewrite the raw HairSpecializesIn text into JSON, applying the replacements
# in this exact order: strip u' prefixes, switch to double quotes, then map
# True/False/None onto their JSON spellings.
hair_json_expr = F.col("hair_raw")
for pattern, replacement in (
    ("u'", "'"),
    ("'", '"'),
    (r"\bTrue\b", "true"),
    (r"\bFalse\b", "false"),
    (r"\bNone\b", "null"),
):
    hair_json_expr = F.regexp_replace(hair_json_expr, pattern, replacement)

hair = (
    src
    .withColumn("hair_raw", F.col("attributes")["HairSpecializesIn"])
    .withColumn("hair_json", hair_json_expr)
    .withColumn("hair_struct", F.from_json("hair_json", hair_schema))
)

# Unpivot the eight flags into one (k, v) row per business and flag.
hair_flag_pairs = ", ".join(
    f"'{flag}', hair_struct.{flag}"
    for flag in (
        "straightperms", "coloring", "extensions", "africanamerican",
        "curly", "kids", "perms", "asian",
    )
)
hair_kv = hair.selectExpr("business_id", f"stack(8, {hair_flag_pairs}) as (k, v)")

# Label every value; the first matching condition wins.
flag_value = F.col("v")
hair_classified = hair_kv.select(
    "k",
    F.when(flag_value.isNull(), "NULL")
     .when(flag_value == True, "True")
     .when(flag_value == False, "False")
     .when(F.lower(flag_value.cast("string")) == "none", "None")
     .otherwise("Other")
     .alias("value_class"),
)

hair_counts = (
    hair_classified
    .groupBy("k", "value_class")
    .count()
    .orderBy("k", "value_class")
)

hair_counts.show(100, truncate=False)

In [0]:
# Frequency of each raw value stored under the "AgesAllowed" key of the attributes map.
ages_allowed_counts = (
    src.select(F.col("attributes").getItem("AgesAllowed").alias("AgesAllowed"))
       .groupBy("AgesAllowed")
       .count()
       .orderBy(F.desc("count"))
)
ages_allowed_counts.show(truncate=False)

In [0]:
# Frequency of each raw value stored under the "Open24Hours" key of the attributes map.
open_24_hours_counts = (
    src.select(F.col("attributes").getItem("Open24Hours").alias("Open24Hours"))
       .groupBy("Open24Hours")
       .count()
       .orderBy(F.desc("count"))
)
open_24_hours_counts.show(truncate=False)

In [0]:
diet_schema = """
struct<
  `dairy-free`:boolean,
  `gluten-free`:boolean,
  vegan:boolean,
  kosher:boolean,
  halal:boolean,
  `soy-free`:boolean,
  vegetarian:boolean
>
"""

# The raw DietaryRestrictions text uses Python-literal conventions (single
# quotes, True/False/None), so it is rewritten into JSON before parsing. A
# leading `u` string prefix (u'key') is stripped first, as the
# HairSpecializesIn cell of this notebook already does; otherwise such values
# are not valid JSON and from_json yields NULLs for them. Order matters: the
# prefix must go before the quotes are converted.
diet_json_expr = F.col("attributes")["DietaryRestrictions"]
for pattern, replacement in (
    (r"\bu'", "'"),
    ("'", '"'),
    (r"\bTrue\b", "true"),
    (r"\bFalse\b", "false"),
    (r"\bNone\b", "null"),
):
    diet_json_expr = F.regexp_replace(diet_json_expr, pattern, replacement)

diet = (
    src
    .withColumn("diet_raw", F.col("attributes")["DietaryRestrictions"])
    .withColumn("diet_json", diet_json_expr)
    .withColumn("diet_struct", F.from_json("diet_json", diet_schema))
)

# Unpivot the seven flags into one (k, v) row per business and flag.
# Hyphenated field names must be back-quoted in SQL expressions.
diet_kv = diet.selectExpr(
    "business_id",
    "stack(7, "
    "'dairy-free', diet_struct.`dairy-free`, "
    "'gluten-free', diet_struct.`gluten-free`, "
    "'vegan', diet_struct.vegan, "
    "'kosher', diet_struct.kosher, "
    "'halal', diet_struct.halal, "
    "'soy-free', diet_struct.`soy-free`, "
    "'vegetarian', diet_struct.vegetarian) as (k, v)"
)

# v is a nullable boolean (see diet_schema), so NULL / True / False are the
# only classes that can occur. "NULL" covers a missing attribute, a missing
# key, an explicit None and text that could not be parsed.
diet_classified = diet_kv.select(
    "k",
    F.when(F.col("v").isNull(), "NULL")
     .when(F.col("v"), "True")
     .otherwise("False")
     .alias("value_class")
)

diet_counts = (
    diet_classified
    .groupBy("k", "value_class")
    .count()
    .orderBy("k", "value_class")
)

diet_counts.show(100, truncate=False)

In [0]:
# Frequency of each raw value stored under the "RestaurantsCounterService" key of the attributes map.
restaurants_counter_service_counts = (
    src.select(F.col("attributes").getItem("RestaurantsCounterService").alias("RestaurantsCounterService"))
       .groupBy("RestaurantsCounterService")
       .count()
       .orderBy(F.desc("count"))
)
restaurants_counter_service_counts.show(truncate=False)

In [0]:
# Number of businesses that have an hours map at all.
hours_presence = src.filter(F.col("hours").isNotNull()).count()
print("Rows with hours populated:", hours_presence)

# One row per (business, day). explode_outer keeps businesses without hours,
# giving them a single row with NULL day/range.
hours_by_day = src.select(
    "business_id",
    F.explode_outer("hours").alias("day", "range"),
)

# Ranges that do not look like HH:MM-HH:MM. Rows with a NULL range are not
# counted here because the negated rlike evaluates to NULL for them.
# NOTE(review): the pattern requires exactly two digits per component, so
# unpadded values such as "8:0-18:30" would be reported as malformed -
# confirm this strictness is intended for this dataset.
hours_bad = hours_by_day.filter(~F.col("range").rlike(r"^\d{2}:\d{2}-\d{2}:\d{2}$"))
print("Malformed hours rows:", hours_bad.count())

# Split each range on "-" into its opening and closing part.
hours_exploded = (
    hours_by_day
    .withColumn("open_time", F.split("range", "-")[0])
    .withColumn("close_time", F.split("range", "-")[1])
)
# display() was being called with show()-style arguments (20, truncate=False);
# limit the preview explicitly instead of relying on them.
hours_exploded.limit(20).display()

In [0]:
# Data-quality checks as (issue label, failing condition) pairs. A condition
# that evaluates to NULL (for example a range test on a NULL value) does not
# flag the row.
issue_checks = [
    ("null_business_id", F.col("business_id").isNull()),
    ("null_name", F.col("name").isNull()),
    ("stars_out_of_range", ~F.col("stars").between(0, 5)),
    ("negative_review_count", F.col("review_count") < 0),
    ("bad_lat_lon", ~F.col("latitude").between(-90, 90) | ~F.col("longitude").between(-180, 180)),
    ("missing_categories", F.col("categories").isNull() | (F.trim("categories") == "")),
    ("missing_hours", F.col("hours").isNull()),
]

# Every check is evaluated independently so a row failing several checks
# contributes to each of them. A single when-chain would record only the first
# matching issue and understate the counts of the later checks. Each check
# yields its label or NULL; the array is exploded and the NULLs are dropped,
# leaving one row per (business_id, issue).
quality_flags = src.select(
    "business_id",
    F.explode(
        F.array(*[F.when(failed, F.lit(label)) for label, failed in issue_checks])
    ).alias("issue"),
).filter(F.col("issue").isNotNull())

issues_summary = quality_flags.groupBy("issue").count().orderBy(F.col("count").desc())
print("Quality issues summary:")
issues_summary.show(truncate=False)

In [0]:
from pyspark.sql.functions import split

# Silver table: keep rows that have both a business_id and a name, add a parsed
# category list, and keep one row per business_id.
# categories_array uses the same parsing as the category profiling cell:
# split on commas, trim every token, drop empty tokens. A plain split(",")
# leaves leading spaces on every element after the first. NULL categories
# stay NULL.
# Functions are referenced through `F` so this cell does not depend on bare
# names imported by an earlier cell.
business_silver = (
    src
    .filter(F.col("business_id").isNotNull() & F.col("name").isNotNull())
    .withColumn(
        "categories_array",
        F.expr("filter(transform(split(categories, ','), x -> trim(x)), x -> x != '')"),
    )
    .dropDuplicates(["business_id"])
)

(
    business_silver.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("silverbusiness")
)