In [1]:
import os

import pandas as pd

from common.loader import *

In [2]:
# Load the geolocation table through the project's loader helper.
# `get_bronze_df` and `OlistFileName` are not defined in this notebook; they
# presumably arrive via the `common.loader` star import — TODO confirm.
# NOTE(review): the shape shown in the cell output appears to be printed by
# `get_bronze_df` itself, since a bare assignment displays nothing.
geolocation = get_bronze_df(OlistFileName.GEOLOCATION)

(1000163, 5)


In [3]:
# Number of distinct values in each column; compared with the row count this
# shows how many rows share the same zip prefix, city or state.
geolocation.nunique()

geolocation_zip_code_prefix     19015
geolocation_lat                717360
geolocation_lng                717613
geolocation_city                 8011
geolocation_state                  27
dtype: int64

In [4]:
# For every zip prefix, count how often each city spelling occurs.
# The result is a Series indexed by (zip prefix, city).
city_name_count_by_zip = (
    geolocation
    .groupby('geolocation_zip_code_prefix')['geolocation_city']
    .value_counts()
)
city_name_count_by_zip

geolocation_zip_code_prefix  geolocation_city
1001                         sao paulo            8
                             são paulo            3
1002                         sao paulo            5
                             são paulo            1
1003                         sao paulo           10
                                                 ..
99965                        agua santa           6
99970                        ciriaco             15
                             ciríaco              1
99980                        david canabarro     21
99990                        muliterno            2
Name: count, Length: 27907, dtype: int64

In [5]:
# Within each zip prefix (index level 0) find the index label carrying the
# largest count; that label is a (zip prefix, city) tuple.
top_pair = (
    city_name_count_by_zip
    .groupby(level=0)
    .idxmax()
)
# Keep only the city half of each tuple -> Series: zip prefix -> dominant city.
top_city_per_zip = top_pair.map(lambda zip_and_city: zip_and_city[1])
top_city_per_zip.sort_index()

geolocation_zip_code_prefix
1001           sao paulo
1002           sao paulo
1003           sao paulo
1004           sao paulo
1005           sao paulo
              ...       
99960            charrua
99965         agua santa
99970            ciriaco
99980    david canabarro
99990          muliterno
Name: count, Length: 19015, dtype: object

In [6]:
# Normalise city spellings.
# `top_city_per_zip` is a Series keyed by zip prefix whose values are the most
# frequent city for that prefix. Every row takes the city looked up for its
# prefix; rows whose prefix has no entry keep their original city.
# `assign` returns a new frame, so `geolocation` itself is left unchanged.
new_geolocation = geolocation.assign(
    geolocation_city=lambda frame: (
        frame['geolocation_zip_code_prefix']
        .map(top_city_per_zip)
        .fillna(frame['geolocation_city'])
    )
)

In [7]:
# Average latitude/longitude for each (city, zip prefix) pair, flattened back
# into ordinary columns.
city_location_means = (
    new_geolocation
    .groupby(["geolocation_city", "geolocation_zip_code_prefix"])
    [['geolocation_lat', 'geolocation_lng']]
    .mean()
    .reset_index()
)
city_location_means.head()

Unnamed: 0,geolocation_city,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng
0,abadia de goias,75345,-16.767161,-49.438178
1,abadia dos dourados,38540,-18.477752,-47.406319
2,abadiania,72940,-16.193718,-48.709452
3,abaete,35620,-19.158364,-45.446897
4,abaetetuba,68440,-1.723644,-48.881349


In [8]:
# Discard the coordinates and keep one row per distinct combination of the
# remaining columns (zip prefix, city, state).
deduplicated_geolocation = (
    new_geolocation
    .drop(['geolocation_lat', 'geolocation_lng'], axis='columns')
    .drop_duplicates()
)
deduplicated_geolocation.head()

Unnamed: 0,geolocation_zip_code_prefix,geolocation_city,geolocation_state
0,1037,sao paulo,SP
1,1046,sao paulo,SP
3,1041,sao paulo,SP
4,1035,sao paulo,SP
5,1012,sao paulo,SP


In [9]:
# Build the final lookup table: the (zip prefix, state) pairs from the
# de-duplicated frame, joined to the city name and mean coordinates.
# `validate='many_to_one'` makes the merge raise if `city_location_means`
# ever holds more than one row for a prefix, which would otherwise silently
# multiply rows in the result.
# NOTE(review): a prefix that appears under more than one state still yields
# one row per (prefix, state) pair — decide whether that is intended.
new_geolocation = (
    deduplicated_geolocation[['geolocation_zip_code_prefix', 'geolocation_state']]
    .merge(
        city_location_means,
        on='geolocation_zip_code_prefix',
        how='left',
        validate='many_to_one',
    )
    .drop_duplicates()  # chained instead of inplace=True; same rows, same index
)
# Spot-check a single prefix.
new_geolocation[new_geolocation['geolocation_zip_code_prefix'] == 1001]

Unnamed: 0,geolocation_zip_code_prefix,geolocation_state,geolocation_city,geolocation_lat,geolocation_lng
37,1001,SP,sao paulo,-23.550227,-46.634039


In [10]:
# Summary statistics of the original rows for prefix 1001; the mean lat/lng
# here is what the aggregated table should report for that prefix.
geolocation.loc[geolocation['geolocation_zip_code_prefix'].eq(1001)].describe()

Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng
count,11.0,11.0,11.0
mean,1001.0,-23.550227,-46.634039
std,0.0,0.000686,0.000228
min,1001.0,-23.551427,-46.63441
25%,1001.0,-23.55057,-46.634135
50%,1001.0,-23.549951,-46.634027
75%,1001.0,-23.549779,-46.633957
max,1001.0,-23.549292,-46.633559


In [11]:
# Sanity check before saving. Comparing distinct counts alone cannot notice a
# prefix that occupies several rows, so require both:
#   1. every prefix appears on exactly one row of the cleaned table, and
#   2. the cleaned table holds exactly the prefixes present in the input.
# A False here means the table is not a one-row-per-prefix lookup.
(
    new_geolocation['geolocation_zip_code_prefix'].is_unique
    and set(new_geolocation['geolocation_zip_code_prefix'])
    == set(geolocation['geolocation_zip_code_prefix'])
)

True

In [12]:
# Write the cleaned table to SILVER_DIR as CSV, without the index column.
# Create the target directory first so the write does not fail when the
# directory has not been created yet (e.g. on a fresh checkout).
os.makedirs(SILVER_DIR, exist_ok=True)
save_path = os.path.join(SILVER_DIR, 'geolocation.csv')
new_geolocation.to_csv(save_path, index=False)