In [16]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import Window

# Obtain the Spark session via the builder's getOrCreate().
spark = (
    SparkSession.builder
    .getOrCreate()
)

In [21]:
# Inspect the column names, data types and extended metadata of the orders table.
describe_orders_sql = "DESCRIBE TABLE EXTENDED dev.silver.olist_orders_dataset"
df = spark.sql(describe_orders_sql)
df.show(truncate=False)  # truncate=False keeps long cell values fully visible

+-----------------------------+------------------------------------------+-------+
|col_name                     |data_type                                 |comment|
+-----------------------------+------------------------------------------+-------+
|order_id                     |string                                    |NULL   |
|customer_id                  |string                                    |NULL   |
|order_status                 |string                                    |NULL   |
|order_purchase_timestamp     |timestamp                                 |NULL   |
|order_approved_at            |timestamp                                 |NULL   |
|order_delivered_carrier_date |timestamp                                 |NULL   |
|order_delivered_customer_date|timestamp                                 |NULL   |
|order_estimated_delivery_date|timestamp                                 |NULL   |
|                             |                                          |       |
|# M

In [11]:
# Load the geolocation table as a Spark DataFrame and preview its rows.
geolocation_table_name = "dev.silver.olist_geolocation_dataset"
geolocation = spark.read.table(geolocation_table_name)
geolocation.show()

+---------------------------+-------------------+-------------------+----------------+-----------------+
|geolocation_zip_code_prefix|    geolocation_lat|    geolocation_lng|geolocation_city|geolocation_state|
+---------------------------+-------------------+-------------------+----------------+-----------------+
|                       1020|-23.551247650297142|-46.628331542472104|       sao paulo|               SP|
|                       1015|-23.548031800450076| -46.63357615958799|       sao paulo|               SP|
|                       1033| -23.54077242862274| -46.63658627751788|       são paulo|               SP|
|                       1122|-23.529995958364616| -46.64040623812689|       sao paulo|               SP|
|                       1103|-23.539337475993985| -46.62899422217096|       são paulo|               SP|
|                       1154| -23.52897674715792| -46.65591334248948|       sao paulo|               SP|
|                       1130| -23.52849984179699| -46.6

In [14]:
from pyspark.sql import functions as F
from pyspark.sql import Window

# 1. Count how often each city name appears for every zip code prefix.
city_counts = (
    geolocation.groupBy("geolocation_zip_code_prefix", "geolocation_city")
    .count()
)

# 2. Keep the most frequent city name per zip code prefix.
# The city name is a secondary sort key so that ties on `count` are broken
# deterministically; ordering by count alone lets row_number() pick an
# arbitrary city among the tied ones, which can differ between runs.
window_spec = (
    Window.partitionBy("geolocation_zip_code_prefix")
    .orderBy(F.desc("count"), F.asc("geolocation_city"))
)
top_cities = (
    city_counts.withColumn("rank", F.row_number().over(window_spec))
    .filter("rank = 1")
    .drop("count", "rank")
)
top_cities.show()


+---------------------------+----------------+
|geolocation_zip_code_prefix|geolocation_city|
+---------------------------+----------------+
|                       1001|       sao paulo|
|                       1002|       sao paulo|
|                       1003|       sao paulo|
|                       1004|       sao paulo|
|                       1005|       sao paulo|
|                       1006|       sao paulo|
|                       1007|       sao paulo|
|                       1008|       sao paulo|
|                       1009|       sao paulo|
|                       1010|       sao paulo|
|                       1011|       sao paulo|
|                       1012|       sao paulo|
|                       1013|       sao paulo|
|                       1014|       sao paulo|
|                       1015|       sao paulo|
|                       1016|       sao paulo|
|                       1017|       sao paulo|
|                       1018|       sao paulo|
|            

In [None]:

# 3. Replace the city of every geolocation row with the top city of its zip code.
# The lookup's city column is renamed before the join. Joining on the key name
# keeps BOTH frames' "geolocation_city" columns, withColumn() does not collapse
# them, and drop("top.geolocation_city") is a no-op because drop() treats the
# string as a literal column name -- so any later reference to
# "geolocation_city" would be ambiguous.
top_city_lookup = top_cities.withColumnRenamed(
    "geolocation_city", "top_geolocation_city"
)
new_geolocation = (
    geolocation
    .join(top_city_lookup, on="geolocation_zip_code_prefix", how="left")
    .withColumn(
        "geolocation_city",
        # Fall back to the row's own city when the zip code has no lookup entry.
        F.coalesce(F.col("top_geolocation_city"), F.col("geolocation_city")),
    )
    .drop("top_geolocation_city")
)

# 4. Average latitude/longitude per (city, zip code prefix).
city_location_means = (
    new_geolocation
    .groupBy("geolocation_city", "geolocation_zip_code_prefix")
    .agg(
        F.mean("geolocation_lat").alias("geolocation_lat"),
        F.mean("geolocation_lng").alias("geolocation_lng"),
    )
)

# 5. Remove duplicate rows once the per-point latitude/longitude are dropped.
deduplicated = (
    new_geolocation
    .drop("geolocation_lat", "geolocation_lng")
    .dropDuplicates()
)

# 6. Attach the averaged coordinates.
# Both sides are already unique on their rows (dropDuplicates on the left,
# groupBy keys on the right), so no further de-duplication is needed.
final_geolocation = deduplicated.join(
    city_location_means,
    on=["geolocation_city", "geolocation_zip_code_prefix"],
    how="left",
)

# 7. Spot-check a single zip code prefix.
final_geolocation.filter(F.col("geolocation_zip_code_prefix") == 1001).show()

# 8. Verify that every zip code prefix of the source table is still present.
unique_zip_original = geolocation.select("geolocation_zip_code_prefix").distinct().count()
unique_zip_final = final_geolocation.select("geolocation_zip_code_prefix").distinct().count()

print("보존 여부:", unique_zip_original == unique_zip_final)


In [None]:
# pandas version of the zip-code consolidation: pick the most frequent city per
# zip code prefix, overwrite each row's city with it, then average the
# coordinates per (city, zip code prefix).
import pandas as pd

# NOTE(review): `geolocation` is loaded as a Spark DataFrame earlier in this
# notebook, so it is collected to pandas here; a frame that is already pandas
# is used as-is. Confirm the table is small enough to collect to the driver.
geolocation_pd = geolocation.toPandas() if hasattr(geolocation, "toPandas") else geolocation

# Occurrences of every city name within each zip code prefix
# (Series indexed by (zip code prefix, city)).
city_name_count_by_zip = (
    geolocation_pd.groupby("geolocation_zip_code_prefix")["geolocation_city"].value_counts()
)

# idxmax() yields the (zip code prefix, city) pair with the highest count per
# zip code prefix; keep only the city part.
top_pair = city_name_count_by_zip.groupby(level=0).idxmax()
top_city_per_zip = top_pair.map(lambda t: t[1]).sort_index()

# Replace each row's city with its zip code's top city, keeping the original
# city where the lookup has no entry.
geolocation_top_city = geolocation_pd.assign(
    geolocation_city=geolocation_pd["geolocation_zip_code_prefix"]
    .map(top_city_per_zip)
    .fillna(geolocation_pd["geolocation_city"])
)

# Mean latitude/longitude per (city, zip code prefix).
city_location_means = (
    geolocation_top_city
    .groupby(["geolocation_city", "geolocation_zip_code_prefix"])[["geolocation_lat", "geolocation_lng"]]
    .mean()
    .reset_index()
)

# Unique rows once the per-point coordinates are removed.
deduplicated_geolocation = (
    geolocation_top_city
    .drop(columns=["geolocation_lat", "geolocation_lng"])
    .drop_duplicates()
)

# Attach the averaged coordinates (and the city) to each zip code prefix / state pair.
new_geolocation = pd.merge(
    deduplicated_geolocation[["geolocation_zip_code_prefix", "geolocation_state"]],
    city_location_means,
    on="geolocation_zip_code_prefix",
    how="left",
).drop_duplicates()

# Spot-check a single zip code prefix.
print(new_geolocation[new_geolocation["geolocation_zip_code_prefix"] == 1001])

# True when every zip code prefix of the source data is still present.
new_geolocation["geolocation_zip_code_prefix"].nunique() == geolocation_pd["geolocation_zip_code_prefix"].nunique()

NameError: name 'geolocation' is not defined