# Data cleaning for olist_sellers_datasets
The dataset was previously extracted from Kaggle and load in Azure Datalake. This python will is to extract the csv files from the Azure Datalake, cleaning the data and load the clean data to Azure Datalake under "transform-data"

In [1]:
# Install package that will be need to provide necessary tools to interact with Azure Data Lake Storage Gen2
!pip install azure-storage-file-datalake
!pip install pyarrow



In [10]:
import os
from pathlib import Path
from azure.storage.filedatalake import DataLakeServiceClient
import pandas as pd
from io import StringIO
from io import BytesIO
import IPython
from IPython.display import display, HTML
import re
import pyarrow as pa
import pyarrow.parquet as pq
import unicodedata



In [11]:
#Setup the azure connection 
connection_string = # <-- input the connection string here
container_name = "raw-data"
file_path = "geolocation.csv"

#Authenticate the connection
service_client = DataLakeServiceClient.from_connection_string(connection_string)

#Get file system and file client
file_system_client = service_client.get_file_system_client(file_system=container_name)
file_client = file_system_client.get_file_client(file_path)

#Download file contents from the raw-data container
download = file_client.download_file()
downloaded_bytes = download.readall()

# Convert to pandas DataFrame
csv_data = downloaded_bytes.decode("utf-8")
geolocations = pd.read_csv(StringIO(csv_data))

# Print the df that contains sellers.csv data
print(geolocations.head(10000))

      geolocation_zip_code_prefix  geolocation_lat  geolocation_lng  \
0                            1037       -23.545621       -46.639292   
1                            1046       -23.546081       -46.644820   
2                            1046       -23.546129       -46.642951   
3                            1041       -23.544392       -46.639499   
4                            1035       -23.541578       -46.641607   
...                           ...              ...              ...   
9995                         1319       -23.551121       -46.641907   
9996                         1307       -23.553756       -46.653169   
9997                         1306       -23.553267       -46.650932   
9998                         1317       -23.554134       -46.638626   
9999                         1315       -23.552704       -46.640620   

     geolocation_city geolocation_state  
0           sao paulo                SP  
1           sao paulo                SP  
2           sao paulo

In [12]:
# show the schema of the files
geolocations.dtypes

geolocation_zip_code_prefix      int64
geolocation_lat                float64
geolocation_lng                float64
geolocation_city                object
geolocation_state               object
dtype: object

In [13]:

display(HTML(geolocations.head(100).to_html(index=False)))

geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state
1037,-23.545621,-46.639292,sao paulo,SP
1046,-23.546081,-46.64482,sao paulo,SP
1046,-23.546129,-46.642951,sao paulo,SP
1041,-23.544392,-46.639499,sao paulo,SP
1035,-23.541578,-46.641607,sao paulo,SP
1012,-23.547762,-46.635361,são paulo,SP
1047,-23.546273,-46.641225,sao paulo,SP
1013,-23.546923,-46.634264,sao paulo,SP
1029,-23.543769,-46.634278,sao paulo,SP
1011,-23.54764,-46.636032,sao paulo,SP


#### Casting Column to String in Pandas

To cast a column to string type in pandas (when it's being automatically read as an integer), and to add leading zeros to maintain the correct format (like zip codes),

In [14]:
geolocations["geolocation_zip_code_prefix"] = geolocations["geolocation_zip_code_prefix"].astype(str).str.zfill(5)
display(HTML(geolocations.head(10).to_html(index=False)))

geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state
1037,-23.545621,-46.639292,sao paulo,SP
1046,-23.546081,-46.64482,sao paulo,SP
1046,-23.546129,-46.642951,sao paulo,SP
1041,-23.544392,-46.639499,sao paulo,SP
1035,-23.541578,-46.641607,sao paulo,SP
1012,-23.547762,-46.635361,são paulo,SP
1047,-23.546273,-46.641225,sao paulo,SP
1013,-23.546923,-46.634264,sao paulo,SP
1029,-23.543769,-46.634278,sao paulo,SP
1011,-23.54764,-46.636032,sao paulo,SP


### Data loading and initial analysis
The scripts starts by loading 'geolocation.csv" data from a CSV file into a pandas DataFrame. It then performs an exploratory analysis, including

* Total numbers of records (row)
* Numbers of columns
* Unique geolocation_lat and geolocation_lng
* Missing values

In [15]:
# Print dataset shape
print(f"\nDataset contains {geolocations.shape[0]:,} records and {geolocations.shape[1]} columns.")

# Count unique seller cities and states
unique_cities = geolocations['geolocation_city'].nunique()
unique_states = geolocations['geolocation_state'].nunique()
print(f"Number of unique cities: {unique_cities}")
print(f"Number of unique states: {unique_states}")


Dataset contains 1,000,163 records and 5 columns.
Number of unique cities: 8011
Number of unique states: 27


#### Check missing values and duplicate records

In [16]:
# Count missing values in each column
missing_values = geolocations.isnull().sum()

# Calculate percentage of missing values
missing_pct = (missing_values / len(geolocations)) * 100

# Combine into a DataFrame 
missing_report = pd.DataFrame({
    'Missing Values': missing_values,
    'Percentage (%)': missing_pct
    })

# Check for duplicate records 
duplicate_geolocation_count = geolocations.duplicated(
    subset=['geolocation_zip_code_prefix', 'geolocation_lat', 'geolocation_lng', 'geolocation_city', 'geolocation_state'],
    keep=False
).sum()

# Show only columns with missing values
print("\nMissing Values Analysis:")
print(missing_report[missing_report['Missing Values'] > 0])
print(f"Number of duplicate geolocation records: {duplicate_geolocation_count}")


Missing Values Analysis:
Empty DataFrame
Index: []
Number of duplicate geolocation records: 390005


### Data Cleaning Process
The dataset undergoes detail cleaning through following steps

#### a) Drop Duplicates longitude and latitude

#### b) Basic String Cleaning:
* Trims leading/trailing whitespace from all string columns
#### c) Enhance City Name Cleaning:
* Removes numeric values and special charactes
* Standardizes city name by
* Splitting entries on common delimiters
* Converting special characters (e.g., replacing "são" with "sao"
#### d) State Name Standardise
* Converts all state names to uppercase format

In [17]:
# a) Drop Duplicates row

# Count the original number of rows
initial_count=len(geolocations)

# Drop duplicates 
geolocations = geolocations.drop_duplicates()

# Count new number of rows
final_count = len(geolocations)

# calculate how many row was drop
dropped_count = initial_count - final_count

print(f"The initial rows in geolocation dataset : {initial_count}")
print(f"The finial count rows in geolocation dataset : {final_count}")
print(f"Number of duplicates rows were dropped : {dropped_count}")

The initial rows in geolocation dataset : 1000163
The finial count rows in geolocation dataset : 738332
Number of duplicates rows were dropped : 261831


In [18]:
# b) basic string cleaning
# Trims leading/trailing whitespace from all strings columns

geolocations['geolocation_city'] = geolocations['geolocation_city'].astype(str).str.strip()
geolocations['geolocation_state'] = geolocations['geolocation_state'].astype(str).str.strip()

In [19]:
# c) Enhance city name cleaning process
# d) Standarise geolocation state name to uppercase format

# c) Enhanced city name cleaning with special character normalization

def clean_city_name(city):
    if pd.isnull(city):
        return city

    # Normalize unicode characters (e.g. ü → u, superscript ³ → nothing)
    city = unicodedata.normalize('NFKD', city)
    city = city.encode('ASCII', 'ignore').decode('utf-8')

    # Lowercase, strip spaces
    city = city.lower().strip()

    # Remove digits and non-word characters (keep letters, hyphens, spaces)
    city = re.sub(r'[^a-z\s\-]', '', city)

    # Optional: keep only part before delimiter if needed
    city = re.split(r'[,;]', city)[0].strip()

    return city

# Apply to your dataframe
geolocations['geolocation_city'] = geolocations['geolocation_city'].astype(str).apply(clean_city_name)


# d) Standardize geolocation state name to uppercase
geolocations['geolocation_state'] = geolocations['geolocation_state'].str.upper()

display(HTML(geolocations.head(50).to_html(index=False)))


geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state
1037,-23.545621,-46.639292,sao paulo,SP
1046,-23.546081,-46.64482,sao paulo,SP
1046,-23.546129,-46.642951,sao paulo,SP
1041,-23.544392,-46.639499,sao paulo,SP
1035,-23.541578,-46.641607,sao paulo,SP
1012,-23.547762,-46.635361,sao paulo,SP
1047,-23.546273,-46.641225,sao paulo,SP
1013,-23.546923,-46.634264,sao paulo,SP
1029,-23.543769,-46.634278,sao paulo,SP
1011,-23.54764,-46.636032,sao paulo,SP


In [20]:
# Portuguese-aware title casing
def custom_title(text):
    exceptions = {'de', 'da', 'do', 'das', 'dos', 'e'}
    special_cases = {"d'oeste": "d'Oeste", "d'alianca": "d'Alianca"}
    words = text.lower().split()
    result_words = []
    for i, word in enumerate(words):
        if word in special_cases:
            result_words.append(special_cases[word])
            continue
        if '-' in word:
            subwords = word.split('-')
            new_subwords = [
                sub.capitalize() if (j == 0 or sub not in exceptions) else sub
                for j, sub in enumerate(subwords)
            ]
            result_words.append('-'.join(new_subwords))
        else:
            result_words.append(word.capitalize() if i == 0 or word not in exceptions else word)
    return ' '.join(result_words)

geolocations['geolocation_city'] = geolocations['geolocation_city'].apply(custom_title)

In [21]:
display(HTML(geolocations.head(50).to_html(index=False)))

geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state
1037,-23.545621,-46.639292,Sao Paulo,SP
1046,-23.546081,-46.64482,Sao Paulo,SP
1046,-23.546129,-46.642951,Sao Paulo,SP
1041,-23.544392,-46.639499,Sao Paulo,SP
1035,-23.541578,-46.641607,Sao Paulo,SP
1012,-23.547762,-46.635361,Sao Paulo,SP
1047,-23.546273,-46.641225,Sao Paulo,SP
1013,-23.546923,-46.634264,Sao Paulo,SP
1029,-23.543769,-46.634278,Sao Paulo,SP
1011,-23.54764,-46.636032,Sao Paulo,SP


#### Show the top 10 most frequent cities that purchasing from ecommerce site

In [22]:
# Total number of rows
total_records = len(geolocations)

# Group by final cleaned city column and count the occurrences
city_freq = geolocations['geolocation_city'].value_counts().reset_index()
city_freq.columns = ['geolocation_city', 'count']

# Calculate the percentage
city_freq['percentage'] = (city_freq['count'] / total_records * 100).round(2)

# Show top 10 cities with highest occurrences
top10city = city_freq.head(10)
HTML(top10city.to_html(index=False, border=1, classes='table table-striped'))


geolocation_city,count,percentage
Sao Paulo,99646,13.5
Rio de Janeiro,35177,4.76
Belo Horizonte,19474,2.64
Curitiba,11263,1.53
Brasilia,8791,1.19
Porto Alegre,8702,1.18
Salvador,8084,1.09
Guarulhos,7411,1.0
Sao Bernardo do Campo,5915,0.8
Santo Andre,5723,0.78


Check null value

In [23]:
# Count nulls and blank strings ("") in each relevant column
cols_to_check = [
    'geolocation_zip_code_prefix',
    'geolocation_lat',
    'geolocation_lng',
    'geolocation_city',
    'geolocation_state'
]

null_counts = geolocations[cols_to_check].isnull().sum()
blank_counts = (geolocations[cols_to_check] == '').sum()

result_null = pd.DataFrame({
    'null_count': null_counts,
    'blank_count': blank_counts
})

result_null

Unnamed: 0,null_count,blank_count
geolocation_zip_code_prefix,0,0
geolocation_lat,0,0
geolocation_lng,0,0
geolocation_city,0,0
geolocation_state,0,0


#### Check total rows in geolocation clean DataFrame before upload

In [24]:
print(f'Total row in geolocation_clean DataFrame: {len(geolocations):,}')

Total row in geolocation_clean DataFrame: 738,332


#### Save cleaned dataset to local as csv file

In [25]:
geolocations.to_csv("cleaned_geolocations.csv", index=False)
print("The CSV files has saved locally as 'cleaned_geolocations.csv'")

The CSV files has saved locally as 'cleaned_geolocations.csv'


#### Save cleaned dataset to Azure Data Lake (transform_data) as parquet file

In [26]:
# Convert to in-memory parquet
buffer = BytesIO()
pq.write_table(pa.Table.from_pandas(geolocations), buffer)
buffer.seek(0)
data_bytes = buffer.getvalue()

# Azure connection config
connection_string = "DefaultEndpointsProtocol=https;AccountName=jde06dark;AccountKey=5dPJ4+PvxUP5h+GnoaiXU0+I288QJ8qFQSlMV6V8FJvXdgR02ut4Acg/8vC+7FrJQtrsP66a7W6x+AStovZzaw==;EndpointSuffix=core.windows.net"
container_name_clean = "transform-data"
file_path_clean = "cleaned_geolocations.parquet"

# Connect to ADLS
service_client = DataLakeServiceClient.from_connection_string(connection_string)
fs_client = service_client.get_file_system_client(container_name_clean)

# Delete existing file if exists
try:
    fs_client.delete_file(file_path_clean)
except:
    pass    #ignore if the file not found

# Upload new parquet file
file_client = fs_client.create_file(file_path_clean)
file_client.append_data(data_bytes, offset=0, length=len(data_bytes))
file_client.flush_data(len(data_bytes))

print(f" Parquet file uploaded to ADLS under '{container_name_clean}/{file_path_clean}'")
props = file_client.get_file_properties()
print(f" Final Size: {props['size']} bytes")

 Parquet file uploaded to ADLS under 'transform-data/cleaned_geolocations.parquet'
 Final Size: 17304694 bytes


#### Verify row count after upload

In [27]:
# Download file from Azure
file_client = fs_client.get_file_client(file_path_clean)
download = file_client.download_file()
data = download.readall()

# Load into DataFrame
buffer = BytesIO(data)
table = pq.read_table(buffer)
df_parquet = table.to_pandas()

# Print the row count
print(f" Rows in uploaded Parquet file: {len(df_parquet):,}")

 Rows in uploaded Parquet file: 738,332
