In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns

import sys
# Make the parent of the current working directory importable so that the
# `superstore_sales` package imported below can be resolved.
# NOTE(review): assumes the notebook is started from a direct sub-folder of the
# project root - confirm against the repository layout.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))

# Guarded so that re-running this cell does not keep appending duplicate
# entries to sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

from superstore_sales.config import CLEANED_DATA_DIR, RAW_DATA_FILE
import superstore_sales.custom_funcs as cf

# Location of the cleaned dataset inside CLEANED_DATA_DIR.
parquet_file = os.path.join(CLEANED_DATA_DIR, "SuperStoreOrders_clean.parquet")

# Raw data: read the source CSV, decoding it as ISO-8859-1.
df_raw = pd.read_csv(filepath_or_buffer=RAW_DATA_FILE, encoding="ISO-8859-1")

# Cleaned data: read the parquet file resolved above.
df_clean = pd.read_parquet(path=parquet_file)

# Comparison of **Raw data** vs **Cleaned data**

In [2]:
# Compare the general info of both frames under the given display names.
cf.compare_info(df_raw, df_clean, df1_name="Raw Data", df2_name="Cleaned Data")

# Report the memory usage of both frames.
cf.compare_memory_usage(df_raw, df_clean)

# The remaining helpers return a result that is printed under its own heading.
# Each heading is printed before its helper is called, so anything the helper
# emits itself still appears below the matching heading.
for heading, compare in (
    ("\nMissing Values Comparison:", cf.compare_missing_values),
    ("\nData Type Changes:", cf.compare_dtypes),
):
    print(heading)
    print(compare(df_raw, df_clean))

Unnamed: 0,Raw Data,Non-Null Count,Cleaned Data,Non-Null Count.1
Row ID,int64,9994.0,object,9994
Order ID,object,9994.0,category,9994
Order Date,object,9994.0,datetime64[ns],9994
Ship Date,object,9994.0,datetime64[ns],9994
Ship Mode,object,9994.0,category,9994
Customer ID,object,9994.0,category,9994
Customer Name,object,9994.0,object,9994
Segment,object,9994.0,category,9994
Country,object,9994.0,category,9994
City,object,9994.0,object,9994



Memory Usage (MB):
Raw Data: 10.86 MB
Cleaned Data: 5.20 MB

Missing Values Comparison:
Empty DataFrame
Columns: [Raw Data, Cleaned Data]
Index: []

Data Type Changes:
                   Raw Data     Cleaned Data
Category             object         category
Country              object         category
Customer ID          object         category
Order Date           object   datetime64[ns]
Order ID             object         category
Postal Code           int64         category
Product ID dup          NaN             bool
Product ID updated      NaN         category
Region               object         category
Row ID                int64           object
Segment              object         category
Ship Date            object   datetime64[ns]
Ship Mode            object         category
Shipping Duration       NaN  timedelta64[ns]
State                object         category
Sub-Category         object         category


# Summary
Summary of Data Cleaning Improvements:
- Converted categorical columns to 'category' dtype to reduce memory usage; overall, the cleaned data uses 5.20 MB versus 10.86 MB for the raw data (about 52% less).
- Transformed date columns to proper datetime format.
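- Missing values: the comparison above is empty and both datasets show 9994 non-null entries in the columns listed, so no missing values were found and nothing had to be removed or replaced with defaults.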
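- Optimised data structure for efficient analysis: identifier-like columns are no longer stored as integers (`Row ID` is now `object`, `Postal Code` is now `category`), and the cleaned data adds the columns `Product ID dup` (bool), `Product ID updated` (category) and `Shipping Duration` (timedelta64[ns]).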