# Data Cleaning for olist_product_category_name datasets

The dataset was previously extracted from Kaggle and loaded into Azure Data Lake. This Python notebook extracts the CSV file from Azure Data Lake, cleans the data, and loads the cleaned data back to Azure Data Lake under the "transform-data" container.

In [1]:
# Install the packages this notebook needs: azure-storage-file-datalake provides
# the client used to talk to Azure Data Lake Storage Gen2, and pyarrow is used
# to write and read the Parquet file.
!pip install azure-storage-file-datalake
!pip install pyarrow



In [2]:
import os
from pathlib import Path
from azure.storage.filedatalake import DataLakeServiceClient
import pandas as pd
from io import StringIO
from io import BytesIO
import IPython
from IPython.display import display, HTML
import pyarrow as pa
import pyarrow.parquet as pq

In [3]:
# Set up the Azure connection.
# The connection string embeds the storage account key, so it is read from the
# AZURE_STORAGE_CONNECTION_STRING environment variable instead of being typed
# into the notebook. A missing variable raises KeyError right here, before any
# network call is attempted.
connection_string = os.environ["AZURE_STORAGE_CONNECTION_STRING"]
container_name = "raw-data"
file_path = "product_category_name_translation.csv"

# Authenticate the connection
service_client = DataLakeServiceClient.from_connection_string(connection_string)

# Get the file system (container) client and the client for the raw CSV file
file_system_client = service_client.get_file_system_client(file_system=container_name)
file_client = file_system_client.get_file_client(file_path)

# Download the file contents from the raw-data container
download = file_client.download_file()
downloaded_bytes = download.readall()

# Decode the bytes as UTF-8 text and parse it into a pandas DataFrame
csv_data = downloaded_bytes.decode("utf-8")
product_category = pd.read_csv(StringIO(csv_data))

# Print up to the first 100 rows of the loaded product category data
print(product_category.head(100))

            product_category_name product_category_name_english
0                    beleza_saude                 health_beauty
1          informatica_acessorios         computers_accessories
2                      automotivo                          auto
3                 cama_mesa_banho                bed_bath_table
4                moveis_decoracao               furniture_decor
..                            ...                           ...
66                         flores                       flowers
67             artes_e_artesanato         arts_and_craftmanship
68                fraldas_higiene           diapers_and_hygiene
69  fashion_roupa_infanto_juvenil     fashion_childrens_clothes
70             seguros_e_servicos         security_and_services

[71 rows x 2 columns]


In [4]:
# Show the schema of the loaded DataFrame: each column name with its pandas dtype.
# Left as a bare trailing expression so the notebook renders the result.
product_category.dtypes

product_category_name            object
product_category_name_english    object
dtype: object

In [5]:
# Render up to the first 100 raw rows as an HTML table, without the index column
raw_preview_html = product_category.head(100).to_html(index=False)
display(HTML(raw_preview_html))

product_category_name,product_category_name_english
beleza_saude,health_beauty
informatica_acessorios,computers_accessories
automotivo,auto
cama_mesa_banho,bed_bath_table
moveis_decoracao,furniture_decor
esporte_lazer,sports_leisure
perfumaria,perfumery
utilidades_domesticas,housewares
telefonia,telephony
relogios_presentes,watches_gifts


### Data Loading and initial analysis

The script starts by loading the "product_category_name_translation.csv" data from a CSV file into a pandas DataFrame. It then performs an exploratory analysis including:
* Total number of records (rows)
* Number of columns
* Number of unique product category names
* Missing values

In [6]:
# Report the dataset dimensions (rows are formatted with a thousands separator)
row_count, column_count = product_category.shape
print(f"\nDataset contains {row_count:,} records and {column_count} columns,")

# Count the distinct English product category names
english_names = product_category['product_category_name_english']
unique_product_category_name = english_names.nunique()
print(f" Number of unique product category name: {unique_product_category_name}")


Dataset contains 71 records and 2 columns,
 Number of unique product category name: 71


#### Check missing values and duplicate records

In [7]:
# Per-column count of missing values
missing_values = product_category.isna().sum()

# The same counts expressed as a percentage of all rows
missing_pct = missing_values.div(len(product_category)).mul(100)

# Put counts and percentages side by side, one row per column of the dataset
missing_report = pd.concat(
    [
        missing_values.rename('Missing Values'),
        missing_pct.rename('Percentage (%)'),
    ],
    axis=1,
)

# Count every row whose English category name occurs more than once
# (keep=False flags all members of a duplicate group, not just the repeats)
duplicate_flags = product_category.duplicated(
    subset=['product_category_name_english'],
    keep=False,
)
duplicate_product_category_count = duplicate_flags.sum()

# Report only the columns that actually have missing values, then the duplicates
has_missing = missing_report['Missing Values'].gt(0)
print("\n Missing Values Analysis:")
print(missing_report.loc[has_missing])
print(f"Number of duplicate product category name records: {duplicate_product_category_count}")


 Missing Values Analysis:
Empty DataFrame
Index: []
Number of duplicate product category name records: 0


#### Data Cleaning Process

Following the initial analysis, the following data cleaning will be done:
A) Replace "_" with a space in both columns
B) Use the title function to capitalize the first letter of each word

In [8]:
# Apply the same clean-up to both name columns: underscores become spaces,
# then each word is title-cased.
for name_column in ("product_category_name", "product_category_name_english"):
    spaced_names = product_category[name_column].str.replace("_", " ")
    product_category[name_column] = spaced_names.str.title()



In [9]:
# Render up to the first 100 cleaned rows as an HTML table, without the index column
cleaned_preview_html = product_category.head(100).to_html(index=False)
display(HTML(cleaned_preview_html))

product_category_name,product_category_name_english
Beleza Saude,Health Beauty
Informatica Acessorios,Computers Accessories
Automotivo,Auto
Cama Mesa Banho,Bed Bath Table
Moveis Decoracao,Furniture Decor
Esporte Lazer,Sports Leisure
Perfumaria,Perfumery
Utilidades Domesticas,Housewares
Telefonia,Telephony
Relogios Presentes,Watches Gifts


#### Check total rows in the cleaned product_category DataFrame before upload

In [10]:
# Row count of the cleaned DataFrame, printed with a thousands separator
cleaned_row_total = len(product_category)
print(f"Total row in the cleaned product_category DataFrame: {cleaned_row_total:,}")

Total row in the cleaned product_category DataFrame: 71


#### Save cleaned dataset to local storage in csv file

In [11]:
# Write the cleaned DataFrame to a local CSV file without the index column.
# The path is held in one variable so the confirmation message always names
# the file that was actually written.
local_csv_path = "cleaned_product_category_name.csv"
product_category.to_csv(local_csv_path, index=False)
print(f"The CSV files has save locally as '{local_csv_path}'")

The CSV files has save locally as 'clean_product_category_name.csv'


#### Save cleaned dataset to Azure Data Lake (transform-data) as parquet file

In [12]:
# Serialize the cleaned DataFrame to Parquet entirely in memory
buffer = BytesIO()
pq.write_table(pa.Table.from_pandas(product_category), buffer)
parquet_bytes = buffer.getvalue()

# Azure connection config.
# SECURITY: the connection string contains the storage account key and must
# never be hard-coded in the notebook. It is read from the
# AZURE_STORAGE_CONNECTION_STRING environment variable; a missing variable
# raises KeyError before anything is uploaded. Any account key that was ever
# committed with this notebook should be treated as compromised and rotated.
connection_string = os.environ["AZURE_STORAGE_CONNECTION_STRING"]
container_name_clean = "transform-data"
file_path_clean = "clean_product_category_name.parquet"

# Connect to ADLS
service_client = DataLakeServiceClient.from_connection_string(connection_string)
fs_client = service_client.get_file_system_client(container_name_clean)

# Upload the Parquet file, replacing any existing file at the same path.
# overwrite=True makes a separate delete step (and any exception swallowing
# around it) unnecessary, so real failures surface instead of being hidden.
file_client = fs_client.get_file_client(file_path_clean)
file_client.upload_data(parquet_bytes, overwrite=True)

print(f" Parquet file uploaded to ADLS under '{container_name_clean}/{file_path_clean}'")

 Parquet file uploaded to ADLS under 'transform-data/clean_product_category_name.parquet'


#### Verify row count after upload

In [13]:
# Download the uploaded Parquet file back from Azure
file_client = fs_client.get_file_client(file_path_clean)
download = file_client.download_file()
data = download.readall()

# Load the downloaded bytes into a DataFrame
buffer = BytesIO(data)
table = pq.read_table(buffer)
df_parquet = table.to_pandas()

# Print the row count with a thousands separator.
# The ":," format spec has to sit inside the braces to be applied; outside
# them it is printed literally after the number.
print(f" Rows in uploaded Parquet file: {len(df_parquet):,}")


 Rows in uploaded Parquet file: 71:,
