In [1]:
import pandas as pd
import os
from unidecode import unidecode
import numpy as np
from pathlib import Path

In [2]:
REDEFINED_DIR = "../../../downloads/olist_redefined"
STREAM_DST = Path(os.path.join(REDEFINED_DIR, 'stream'))
CDC_DST = Path(os.path.join(REDEFINED_DIR, 'cdc'))
os.makedirs(STREAM_DST, exist_ok=True)
os.makedirs(CDC_DST, exist_ok=True)

In [3]:
geolocations = pd.read_csv("../../../downloads/olist/olist_geolocation_dataset.csv")
geolocationssss = geolocations.drop_duplicates()
geolocations.shape

(1000163, 5)

In [4]:
geolocations.columns = [col.replace('geolocation_', '') for col in geolocations.columns]
geolocations = geolocations.rename(columns={'zip_code_prefix': 'zip_code'})

In [5]:
def get_max_count_cities_by_zip_code(geolocations: pd.DataFrame) -> pd.DataFrame:
    """
    Return the most frequent  and state per ZIP code.
    - Normalize Portuguese accents to unify mixed  names.
    - Stable tie-breaking (by city name alphabetically).
    """
    # Copy to avoid modifying the original
    geo = geolocations.copy()
    # Accent normalization (lowercased for consistency)
    geo['geolocation_city'] = geo['city'].apply(
        lambda x: unidecode(str(x)).lower() if pd.notna(x) else x
    )
    # Count occurrences per ZIP/city/state
    counts = (
        geo.groupby(['zip_code', 'city', 'state'])
        .size()
        .reset_index(name='count')
    )
    # Select most frequent per ZIP, stable sort on ties
    max_counts = (
        counts.sort_values(
            ['zip_code', 'count', 'city'],
            ascending=[True, False, True]
        )
        .groupby('zip_code')
        .head(1)
        [['zip_code', 'city', 'state']]
    )
    return max_counts

def replace_city_state_with_max(geolocations: pd.DataFrame, max_count_cities: pd.DataFrame) -> pd.DataFrame:
    """
    Replace city/state in geolocation with the most frequent per ZIP code.
    """
    geo = geolocations.drop(columns=['city', 'state']).merge(
        max_count_cities,
        on='zip_code',
        how='left'
    )
    geo['city'] = geo['city'].fillna('unknown')
    geo['state'] = geo['state'].fillna('unknown')
    geo['city'] = geo['city'].apply(lambda x: x.lower() if x != 'unknown' else x)
    return geo

def aggregate_lat_lng(new_geolocations: pd.DataFrame, group, method: str = 'mean') -> pd.DataFrame:
    """
    Aggregate latitude and longitude by ZIP/city/state using mean or median.
    """
    agg_func = 'mean' if method == 'mean' else 'median'
    return (
        new_geolocations.groupby(group)
        .agg({
            'lat': agg_func,
            'lng': agg_func
        })
        .reset_index()
    )

def check_data_integrity(original: pd.DataFrame, final: pd.DataFrame, new_geo: pd.DataFrame,
                         lat_range=(-33, 5), lng_range=(-74, -34),
                         unique_change_threshold=0.1):
    """
    Check data integrity:
    - Unique value preservation threshold
    - Null city/state
    - Latitude/longitude range validation
    - 현업 추가: 0 division 방지, 로그 강화 (Grafana 연계 추천)
    """
    # Unique counts
    unique_comparison = pd.DataFrame({
        'Metric': ['Zip Code', 'Cities', 'States'],
        'Original': [
            original['zip_code'].nunique(),
            original['city'].nunique(),
            original['state'].nunique()
        ],
        'Final': [
            final['zip_code'].nunique(),
            final['city'].nunique(),
            final['state'].nunique()
        ]
    })
    unique_comparison['Change Ratio'] = (
        (unique_comparison['Final'] - unique_comparison['Original']).abs() / 
        unique_comparison['Original'].replace(0, 1)  # 0 division 방지
    )
    # Null check
    null_city = new_geo['city'].isna().sum()
    null_state = new_geo['state'].isna().sum()
    # Lat/Lng range check
    lat_out = final[
        (final['lat'] < lat_range[0]) | (final['lat'] > lat_range[1])
    ].shape[0]
    lng_out = final[
        (final['lng'] < lng_range[0]) | (final['lng'] > lng_range[1])
    ].shape[0]
    # Reporting (현업: 로그 포맷 강화, Superset 대시보드 연계 추천)
    print("=== Unique Value Change ===")
    print(unique_comparison.to_string(index=False), "\n")
    print(f"Null count in city: {null_city}")
    print(f"Null count in state: {null_state}\n")
    print(f"Latitude out-of-range rows: {lat_out}")
    print(f"Longitude out-of-range rows: {lng_out}\n")
    # Integrity decision
    if (unique_comparison['Change Ratio'] > unique_change_threshold).any() or null_city > 0 or null_state > 0 or lat_out > 0 or lng_out > 0:
        print("[WARNING] Data integrity issue detected. Please correct before saving to Iceberg.")
    else:
        print("[INFO] Data integrity OK. Safe to proceed.")


In [6]:
# `preprocessed geolocation`

# Step 1: Get most frequent city/state per ZIP
max_count_cities = get_max_count_cities_by_zip_code(geolocations)
# Step 2: Replace original city/state
new_geolocations = replace_city_state_with_max(geolocations, max_count_cities)
new_geolocations = new_geolocations[['zip_code', 'lat', 'lng', 'state', 'city']]
new_geolocations.sort_values(['zip_code', 'state', 'city'], inplace=True)
# new_geolocations.to_csv(f"{BATCH_DST}/geolocation.tsv", index=False, sep='\t')
# Step 3: Aggregate lat/lng (mean or median)
mean_location_zip_code = aggregate_lat_lng(new_geolocations, ['zip_code', 'city', 'state'], method='mean')  # median 추천 가능
# Step 4: Data integrity check
check_data_integrity(geolocations, mean_location_zip_code, new_geolocations)


=== Unique Value Change ===
  Metric  Original  Final  Change Ratio
Zip Code     19015  19015      0.000000
  Cities      8011   5829      0.272375
  States        27     27      0.000000 

Null count in city: 0
Null count in state: 0

Latitude out-of-range rows: 12
Longitude out-of-range rows: 10



In [7]:
# `mean geolocation by zip code`
# 원본에서 동일 `zip_code`가 여러 개인데, 세부 주소가 없어서 특정할 수 없으므로 평균으로 대체하여 배치 데이터로 사용한다.
mean_location_zip_code.sort_values(['zip_code', 'state', 'city'], inplace=True)
mean_location_zip_code = mean_location_zip_code[['zip_code', 'lat', 'lng', 'state', 'city']]
mean_location_zip_code.to_csv(f"{CDC_DST}/geolocation.tsv", index=False, sep='\t')

In [8]:
# `mean geolocations by city`
mean_location_city = aggregate_lat_lng(new_geolocations, ['city', 'state'], method='mean')
mean_location_city.sort_values(['state', 'city', 'lat', 'lng'], inplace=True)
mean_location_city = mean_location_city[['lat', 'lng', 'state', 'city']]
# mean_location_city.to_csv(f"{CDC_DST}/mean_location_city.tsv", index=False, sep='\t')