In [1]:
import pandas as pd
import numpy as np

In [2]:
# Locations of the three raw CSV inputs, relative to the working directory.
# Replace each value with the actual file path before running the notebook.
users_path, transactions_path, products_path = (
    "users.csv",
    "transactions.csv",
    "products.csv",
)

In [3]:
# Load the raw Users data and take a first look at it
users = pd.read_csv(users_path)

# Initial Dataset Details
print("Initial Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() emits a stray "None" line.
users.info()
print("\nSummary Statistics:")
print(users.describe(include='all'))
print("\nSample Data:")
print(users.head())

Initial Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   ID            100000 non-null  object
 1   CREATED_DATE  100000 non-null  object
 2   BIRTH_DATE    96325 non-null   object
 3   STATE         95188 non-null   object
 4   LANGUAGE      69492 non-null   object
 5   GENDER        94108 non-null   object
dtypes: object(6)
memory usage: 4.6+ MB
None

Summary Statistics:
                              ID               CREATED_DATE  \
count                     100000                     100000   
unique                    100000                      99942   
top     5ef3b4f17053ab141787697d  2023-01-12 18:30:15.000 Z   
freq                           1                          2   

                       BIRTH_DATE  STATE LANGUAGE  GENDER  
count                       96325  95188    69492   94108  
unique                 

In [4]:
# Data Cleaning: handle missing values
# Gaps in the three categorical columns are filled with the literal string "NA".
# NOTE(review): pandas.read_csv treats "NA" as a missing value by default, so
# these placeholders may load back as NaN from the saved CSV unless the reader
# passes keep_default_na=False — confirm how the cleaned file is consumed.
users[['STATE', 'LANGUAGE', 'GENDER']] = users[['STATE', 'LANGUAGE', 'GENDER']].fillna("NA")

In [5]:
# Count rows whose ID repeats the ID of an earlier row
duplicate_count = users['ID'].duplicated().sum()
print(f"\nNumber of duplicate rows based on 'ID': {duplicate_count}")


Number of duplicate rows based on 'ID': 0


In [6]:
# Keep only the first row seen for each ID
users = users.drop_duplicates(subset='ID', keep='first')

In [7]:
# Standardize Columns
# Lower-case LANGUAGE and GENDER so one value is not split across casings.
# Entries equal to the "NA" missing-value placeholder are kept as "NA":
# lower-casing them would produce "na" and leave these two columns with a
# different placeholder than STATE.
users['LANGUAGE'] = users['LANGUAGE'].str.lower().where(users['LANGUAGE'].ne("NA"), "NA")
users['GENDER'] = users['GENDER'].str.lower().where(users['GENDER'].ne("NA"), "NA")

In [8]:
# Parse both date columns; values that cannot be parsed become NaT.
# tz_localize(None) then removes the timezone from the parsed values.
users = users.assign(
    CREATED_DATE=pd.to_datetime(users['CREATED_DATE'], errors='coerce').dt.tz_localize(None),
    BIRTH_DATE=pd.to_datetime(users['BIRTH_DATE'], errors='coerce').dt.tz_localize(None),
)

In [9]:
# Handle Missing Dates with a Placeholder Date
# Missing or invalid dates are replaced with a placeholder date
# ("1900-01-01" in the calls that follow).

def replace_invalid_dates(date_column, placeholder_date):
    """Return ``date_column`` as timezone-naive datetimes with gaps filled.

    Parameters
    ----------
    date_column : pandas.Series
        Date strings or datetimes. Values that cannot be parsed are treated
        as missing.
    placeholder_date : str or datetime-like
        Value (parsed with ``pd.to_datetime``) written wherever the parsed
        column is NaT.

    Returns
    -------
    pandas.Series
        A new Series of timezone-naive datetimes without NaT values. The
        Series passed in is not modified.
    """
    # errors='coerce' turns unparseable entries into NaT instead of raising
    parsed = pd.to_datetime(date_column, errors='coerce', utc=False)
    # Drop any timezone before filling, so the naive placeholder is never
    # mixed into a tz-aware column (a no-op when the column is already naive).
    parsed = parsed.dt.tz_localize(None)
    # Non-inplace fillna: an inplace fill can write through to the caller's
    # data when the input is already a datetime column.
    return parsed.fillna(pd.to_datetime(placeholder_date))

# Fill any missing or invalid CREATED_DATE / BIRTH_DATE with the placeholder date
# NOTE(review): rows carrying 1900-01-01 presumably need excluding from any
# age or tenure calculation — confirm with downstream users.
for date_col in ('CREATED_DATE', 'BIRTH_DATE'):
    users[date_col] = replace_invalid_dates(users[date_col], '1900-01-01')

In [10]:
# Post-Cleaning Details
print("\nCleaned Dataset Info:")
# info() prints its own report and returns None; calling it directly avoids
# the stray "None" line that print(users.info()) produces.
users.info()
print("\nCleaned Sample Data:")
print(users.head())


Cleaned Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   ID            100000 non-null  object        
 1   CREATED_DATE  100000 non-null  datetime64[ns]
 2   BIRTH_DATE    100000 non-null  datetime64[ns]
 3   STATE         100000 non-null  object        
 4   LANGUAGE      100000 non-null  object        
 5   GENDER        100000 non-null  object        
dtypes: datetime64[ns](2), object(4)
memory usage: 4.6+ MB
None

Cleaned Sample Data:
                         ID        CREATED_DATE          BIRTH_DATE STATE  \
0  5ef3b4f17053ab141787697d 2020-06-24 20:17:54 2000-08-11 00:00:00    CA   
1  5ff220d383fcfc12622b96bc 2021-01-03 19:53:55 2001-09-24 04:00:00    PA   
2  6477950aa55bb77a0e27ee10 2023-05-31 18:42:18 1994-10-28 00:00:00    FL   
3  658a306e99b40f103b63ccf8 2023-12-26 01:46:22 1900-01-01 00:00:0

The `Users` dataset was cleaned in several steps to ensure its quality and consistency. Missing values in the `STATE`, `LANGUAGE`, and `GENDER` columns were replaced with "NA". Duplicate records were checked on the `ID` column and the count was logged (none were found in this run); a de-duplication step on `ID` was still applied to guarantee uniqueness. The `LANGUAGE` and `GENDER` columns were standardized by converting their text to lowercase. The date columns, `CREATED_DATE` and `BIRTH_DATE`, were converted to datetime format and their timezone information was removed to avoid inconsistencies. Missing or invalid dates were then replaced with a placeholder date ("1900-01-01") using a custom function, so that no null dates remain; a row carrying this placeholder should be read as "date unknown". Finally, the dataset's structure and a sample of rows were reviewed after cleaning to confirm that the changes were applied correctly.

In [11]:
# Write the cleaned Users table to disk; index=False leaves the row index out
users.to_csv(path_or_buf="cleaned_users.csv", index=False)
print("\nCleaned dataset saved as 'cleaned_users.csv'")


Cleaned dataset saved as 'cleaned_users.csv'


In [12]:
# Load the raw Transactions data and take a first look at it
transactions = pd.read_csv(transactions_path)

# Initial Dataset Details
print("Initial Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() emits a stray "None" line.
transactions.info()
print("\nSummary Statistics:")
print(transactions.describe(include='all'))
print("\nSample Data:")
print(transactions.head())

Initial Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   RECEIPT_ID      50000 non-null  object 
 1   PURCHASE_DATE   50000 non-null  object 
 2   SCAN_DATE       50000 non-null  object 
 3   STORE_NAME      50000 non-null  object 
 4   USER_ID         50000 non-null  object 
 5   BARCODE         44238 non-null  float64
 6   FINAL_QUANTITY  50000 non-null  object 
 7   FINAL_SALE      50000 non-null  object 
dtypes: float64(1), object(7)
memory usage: 3.1+ MB
None

Summary Statistics:
                                  RECEIPT_ID PURCHASE_DATE  \
count                                  50000         50000   
unique                                 24440            89   
top     bedac253-2256-461b-96af-267748e6cecf    2024-06-15   
freq                                      12           774   
mean                            

In [13]:
# Handle Missing Values
# Categorical columns: gaps become the literal string "NA"
transactions[['STORE_NAME', 'USER_ID']] = transactions[['STORE_NAME', 'USER_ID']].fillna("NA")

# Numeric columns:
#   FINAL_SALE / FINAL_QUANTITY - parsed to numbers; unparseable or missing -> 0
#   BARCODE                     - missing -> 0, then stored as int64
transactions = transactions.assign(
    FINAL_SALE=pd.to_numeric(transactions['FINAL_SALE'], errors='coerce').fillna(0),
    FINAL_QUANTITY=pd.to_numeric(transactions['FINAL_QUANTITY'], errors='coerce').fillna(0),
    BARCODE=transactions['BARCODE'].fillna(0).astype("int64"),
)

In [14]:
# Function to replace invalid dates with a placeholder
def replace_invalid_dates(date_column, placeholder_date):
    """Return ``date_column`` as timezone-naive datetimes with gaps filled.

    Parameters
    ----------
    date_column : pandas.Series
        Date strings or datetimes. Values that cannot be parsed are treated
        as missing.
    placeholder_date : str or datetime-like
        Value (parsed with ``pd.to_datetime``) written wherever the parsed
        column is NaT.

    Returns
    -------
    pandas.Series
        A new Series of timezone-naive datetimes without NaT values. The
        Series passed in is not modified.
    """
    # errors='coerce' turns unparseable entries into NaT instead of raising
    parsed = pd.to_datetime(date_column, errors='coerce')
    # Remove the timezone BEFORE filling: the placeholder is timezone-naive,
    # and filling a tz-aware column with it can fail or degrade the column to
    # object dtype, after which the .dt accessor is no longer available.
    parsed = parsed.dt.tz_localize(None)
    # Non-inplace fillna so the caller's data is never written through to
    return parsed.fillna(pd.to_datetime(placeholder_date))

# Convert PURCHASE_DATE and SCAN_DATE: missing or invalid values get the
# placeholder date and any timezone is removed
for date_col in ('PURCHASE_DATE', 'SCAN_DATE'):
    transactions[date_col] = replace_invalid_dates(transactions[date_col], '1900-01-01')

# Verify changes: column dtypes, then a peek at the two converted date columns
print(transactions.dtypes)
print(transactions.loc[:, ['PURCHASE_DATE', 'SCAN_DATE']].head())

RECEIPT_ID                object
PURCHASE_DATE     datetime64[ns]
SCAN_DATE         datetime64[ns]
STORE_NAME                object
USER_ID                   object
BARCODE                    int64
FINAL_QUANTITY           float64
FINAL_SALE               float64
dtype: object
  PURCHASE_DATE               SCAN_DATE
0    2024-08-21 2024-08-21 14:19:06.539
1    2024-07-20 2024-07-20 09:50:24.206
2    2024-08-18 2024-08-19 15:38:56.813
3    2024-06-18 2024-06-19 11:03:37.468
4    2024-07-04 2024-07-05 15:56:43.549


In [15]:
# Verify changes: print the dtype of every transactions column
# (repeats the dtype check already printed in the previous cell)
print(transactions.dtypes)

RECEIPT_ID                object
PURCHASE_DATE     datetime64[ns]
SCAN_DATE         datetime64[ns]
STORE_NAME                object
USER_ID                   object
BARCODE                    int64
FINAL_QUANTITY           float64
FINAL_SALE               float64
dtype: object


In [16]:
# Check for Duplicate Rows
# Number of rows that repeat an earlier row in every column
duplicate_row_count = transactions.duplicated(keep='first').sum()
print(duplicate_row_count)

335


In [17]:
# Remove fully duplicated rows, keeping the first occurrence of each
transactions = transactions.drop_duplicates(keep='first')

In [18]:
#  Post-Cleaning Details
print("\nCleaned Dataset Info:")
# info() prints its own report and returns None; calling it directly avoids
# the stray "None" line that print(transactions.info()) produces.
transactions.info()
print("\nCleaned Sample Data:")
print(transactions.head())


Cleaned Dataset Info:
<class 'pandas.core.frame.DataFrame'>
Index: 49665 entries, 0 to 49999
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   RECEIPT_ID      49665 non-null  object        
 1   PURCHASE_DATE   49665 non-null  datetime64[ns]
 2   SCAN_DATE       49665 non-null  datetime64[ns]
 3   STORE_NAME      49665 non-null  object        
 4   USER_ID         49665 non-null  object        
 5   BARCODE         49665 non-null  int64         
 6   FINAL_QUANTITY  49665 non-null  float64       
 7   FINAL_SALE      49665 non-null  float64       
dtypes: datetime64[ns](2), float64(2), int64(1), object(3)
memory usage: 3.4+ MB
None

Cleaned Sample Data:
                             RECEIPT_ID PURCHASE_DATE               SCAN_DATE  \
0  0000d256-4041-4a3e-adc4-5623fb6e0c99    2024-08-21 2024-08-21 14:19:06.539   
1  0001455d-7a92-4a7b-a1d2-c747af1c8fd3    2024-07-20 2024-07-20 09:50:24.206   
2  

The `Transactions` dataset was cleaned in several key steps to ensure consistency and accuracy. Missing values in the categorical columns (`STORE_NAME` and `USER_ID`) were replaced with "NA". The numeric columns (`FINAL_SALE` and `FINAL_QUANTITY`) were converted to numeric values, and any missing or unparseable entries were set to 0. The `BARCODE` column was handled similarly: missing values were filled with 0 and the column was converted to integers. A custom function was used to replace invalid or missing dates in the `PURCHASE_DATE` and `SCAN_DATE` columns with a placeholder date ("1900-01-01") and to remove any timezone information. This was done so that the data would not pose any issues when stored or queried in MySQL servers, where timezone inconsistencies can cause errors. Additionally, 335 fully duplicated rows were identified and removed, leaving 49,665 unique records. After cleaning, the data types and structure were verified to confirm that the dataset was properly formatted and free from inconsistencies.

In [19]:
# Write the cleaned Transactions table to disk; index=False leaves the row index out
transactions.to_csv(path_or_buf="cleaned_transactions.csv", index=False)
print("\nCleaned dataset saved as 'cleaned_transactions.csv'")


Cleaned dataset saved as 'cleaned_transactions.csv'


In [20]:
# Load the raw Products data and take a first look at it
products = pd.read_csv(products_path)

# Initial Dataset Details
print("Initial Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() emits a stray "None" line.
products.info()
print("\nSummary Statistics:")
print(products.describe(include='all'))
print("\nSample Data:")
print(products.head())

Initial Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 845552 entries, 0 to 845551
Data columns (total 7 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   CATEGORY_1    845441 non-null  object 
 1   CATEGORY_2    844128 non-null  object 
 2   CATEGORY_3    784986 non-null  object 
 3   CATEGORY_4    67459 non-null   object 
 4   MANUFACTURER  619078 non-null  object 
 5   BRAND         619080 non-null  object 
 6   BARCODE       841527 non-null  float64
dtypes: float64(1), object(6)
memory usage: 45.2+ MB
None

Summary Statistics:
               CATEGORY_1 CATEGORY_2        CATEGORY_3 CATEGORY_4  \
count              845441     844128            784986      67459   
unique                 27        121               344        127   
top     Health & Wellness      Candy  Confection Candy  Lip Balms   
freq               512695     121036             56965       9737   
mean                  NaN        NaN            

In [21]:
# Handle Missing Values
# Column groups, also reused by later cells
categorical_columns = ['CATEGORY_1', 'CATEGORY_2', 'CATEGORY_3', 'CATEGORY_4', 'MANUFACTURER', 'BRAND']
numeric_columns = ['BARCODE']

# One fillna call with a per-column mapping:
# "NA" for gaps in categorical columns, 0 for gaps in numeric columns
products = products.fillna({
    **dict.fromkeys(categorical_columns, "NA"),
    **dict.fromkeys(numeric_columns, 0),
})

In [22]:
# Check for Duplicate Rows
# Number of rows that repeat an earlier row in every column
products_duplicate_row_count = products.duplicated(keep='first').sum()
print(products_duplicate_row_count)

215


In [23]:
# Standardize Data Types
# Convert BARCODE with a single vectorized pd.to_numeric call rather than
# invoking it once per row through .apply. Values that cannot be parsed become
# NaN, NaN becomes 0, and the column is stored as int64.
products['BARCODE'] = pd.to_numeric(products['BARCODE'], errors='coerce').fillna(0).astype("int64")
print(products['BARCODE'].head())

0    796494407820
1     23278011028
2    461817824225
3     35000466815
4    806810850459
Name: BARCODE, dtype: int64


In [24]:
# Cast every categorical column to str in one vectorized call
products[categorical_columns] = products[categorical_columns].astype(str)

In [25]:
# Check for Duplicate Rows
# Recount full-row duplicates now that the column dtypes are standardized
products_duplicate_row_count = products.duplicated(keep='first').sum()
print(products_duplicate_row_count)

215


In [26]:
# Remove fully duplicated rows, keeping the first occurrence of each
products = products.drop_duplicates(keep='first')

In [27]:
#  Post-Cleaning Details
print("\nCleaned Dataset Info:")
# info() prints its own report and returns None; calling it directly avoids
# the stray "None" line that print(products.info()) produces.
products.info()
print("\nCleaned Sample Data:")
print(products.head())


Cleaned Dataset Info:
<class 'pandas.core.frame.DataFrame'>
Index: 845337 entries, 0 to 845551
Data columns (total 7 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   CATEGORY_1    845337 non-null  object
 1   CATEGORY_2    845337 non-null  object
 2   CATEGORY_3    845337 non-null  object
 3   CATEGORY_4    845337 non-null  object
 4   MANUFACTURER  845337 non-null  object
 5   BRAND         845337 non-null  object
 6   BARCODE       845337 non-null  int64 
dtypes: int64(1), object(6)
memory usage: 51.6+ MB
None

Cleaned Sample Data:
          CATEGORY_1              CATEGORY_2                   CATEGORY_3  \
0  Health & Wellness           Sexual Health  Conductivity Gels & Lotions   
1             Snacks           Puffed Snacks         Cheese Curls & Puffs   
2  Health & Wellness               Hair Care        Hair Care Accessories   
3  Health & Wellness               Oral Care                   Toothpaste   
4  Health & Wellness  

The `Products` dataset was cleaned in several key steps to ensure consistency and accuracy. Missing values in the categorical columns (`CATEGORY_1`, `CATEGORY_2`, `CATEGORY_3`, `CATEGORY_4`, `MANUFACTURER`, and `BRAND`) were replaced with "NA", while missing values in the numeric column (`BARCODE`) were filled with 0. `BARCODE` was then converted to a numeric format and stored as `int64`, with any value that could not be parsed also set to 0. The categorical columns were standardized by ensuring that all values were stored as strings. The dataset was checked for duplicate rows, and 215 fully duplicated records were identified and removed to ensure uniqueness, leaving 845,337 rows. After cleaning, the dataset was verified for consistency and structure, confirming that it was properly formatted and free from issues such as missing values and duplicates.

In [28]:
# Write the cleaned Products table to disk; index=False leaves the row index out
products.to_csv(path_or_buf="cleaned_products.csv", index=False)
print("\nCleaned dataset saved as 'cleaned_products.csv'")


Cleaned dataset saved as 'cleaned_products.csv'
