In [14]:
from pyspark.sql import SparkSession, functions as f

In [15]:
# Build (or reuse) the Spark session shared by every cell in this notebook.
spark = (
    SparkSession.builder.appName("Preprocessing_Yellow")
    # Render a DataFrame as a table when it is the last expression of a cell.
    .config("spark.sql.repl.eagerEval.enabled", True)
    .config("spark.sql.parquet.cacheMetadata", "true")
    # Use UTC as the session time zone; each option is set exactly once.
    .config("spark.sql.session.timeZone", "Etc/UTC")
    # Resource sizing for the driver and executors.
    .config("spark.driver.memory", "3g")
    .config("spark.executor.memory", "4g")
    .config("spark.executor.instances", "2")
    .config("spark.executor.cores", "2")
    .getOrCreate()
)

In [16]:
# Read the merchant description records from parquet.
descriptions_path = "../../../data/insights/pre_insights/descriptions.parquet/"
descriptions = spark.read.parquet(descriptions_path)

In [17]:
# Sanity check: how many merchants carry more than one distinct description?
distinct_descriptions = f.countDistinct("merchant_description").alias("description_count")
tags_count = descriptions.groupBy("merchant_abn").agg(distinct_descriptions)
multi_description_merchants = tags_count.filter(tags_count["description_count"] > 1)
print(multi_description_merchants.count())



0


                                                                                

In [18]:
# Collapse to one row per merchant, keeping a single description per ABN.
first_description = f.first("merchant_description").alias("merchant_description")
descriptions_agg = descriptions.groupBy("merchant_abn").agg(first_description)

In [19]:
# Count how many merchants share each description.
merchants_per_description = f.count("merchant_abn").alias(
    "number_of_merchants_with_description"
)
descriptions_count = descriptions_agg.groupBy("merchant_description").agg(
    merchants_per_description
)

In [20]:
# Display the descriptions from least to most common.
descriptions_count.orderBy("number_of_merchants_with_description", ascending=True)

                                                                                

merchant_description,number_of_merchants_with_description
"jewelry, watch, c...",91
art dealers and g...,112
telecom,125
antique shops - s...,129
"equipment, tool, ...",134
"hobby, toy and ga...",142
"opticians, optica...",151
motor vehicle sup...,151
lawn and garden s...,153
"stationery, offic...",161


In [21]:
# Keyword lists used to bucket merchant descriptions into segments.
# A description belongs to a segment when it contains any of that segment's
# keywords as a substring.
tech_and_electronics = [
    "computer",
    "digital",
    "television",
    "telecom",
]
retail_and_novelty = [
    "newspaper",
    "novelty",
    "hobby",
    "shoe",
    "instruments",
    "bicycle",
    "craft",
    "office",
]
garden_and_furnishings = [
    "florists",
    "furniture",
    "garden",
    "tent",
]
antiques_and_jewellery = [
    "galleries",
    "antique",
    "jewelry",
]
specialized_services = [
    "health",
    "motor",
    "opticians",
]

In [22]:
def segment(description):
    """Map a merchant description to a coarse business segment.

    The description is checked against each keyword list in a fixed order and
    the first segment with a keyword occurring as a substring wins, so the
    order of the groups below matters when a description matches several.

    Args:
        description: Merchant description text, or None (a null column value
            arrives as None when this function runs as a Spark UDF).

    Returns:
        The name of the first matching segment, or "other" when no keyword
        matches or the description is None.
    """
    # `keyword in None` raises TypeError, which would fail the whole Spark job
    # on a single null description; treat a missing description as unmatched.
    if description is None:
        return "other"

    keyword_groups = [
        ("tech_and_electronics", tech_and_electronics),
        ("retail_and_novelty", retail_and_novelty),
        ("garden_and_furnishings", garden_and_furnishings),
        ("antiques_and_jewellery", antiques_and_jewellery),
        ("specialized_services", specialized_services),
    ]
    # The loop variable is deliberately not called `segment`, so it does not
    # shadow this function's own name.
    for segment_name, keywords in keyword_groups:
        if any(keyword in description for keyword in keywords):
            return segment_name
    return "other"

In [23]:
from pyspark.sql.types import StringType

# Wrap the Python classifier as a Spark UDF that returns a string column.
segment_udf = f.udf(segment, returnType=StringType())

In [24]:
# Attach the segment label derived from each merchant's description.
segment_column = segment_udf(descriptions_agg["merchant_description"])
descriptions_agg = descriptions_agg.withColumn("segment", segment_column)

In [25]:
# Persist the per-merchant descriptions and segments, replacing any prior output.
agg_writer = descriptions_agg.write.mode("overwrite")
agg_writer.parquet("../../../data/insights/agg_insight_data/descriptions_agg.parquet")

                                                                                

In [26]:
# Stop the Spark session once processing is finished.
spark.stop()

### Insights
- There are 25 different descriptions across all merchants