
# Bakery Sales Data Preprocessing
This notebook performs data preprocessing for the Bakery Sales dataset, including:
- Loading and inspecting the datasets (`sales` and `prices`)
- Cleaning data by handling missing values and duplicates
- Feature engineering for analysis and chatbot development
- Saving the cleaned data for subsequent use

---


In [None]:

# Import necessary libraries
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns


## Step 1: Load and Inspect Data

In [None]:

# Define file paths (adjust paths based on project folder structure)
sales_file = './data/raw/bakery_sales.csv'
prices_file = './data/raw/bakery_prices.csv'

# Load the datasets
sales_df = pd.read_csv(sales_file)
prices_df = pd.read_csv(prices_file)

# Display basic info and first rows of both datasets.
# DataFrame.info() writes its report straight to stdout and returns None,
# so it is called on its own; wrapping it in print() would emit a stray "None".
print("Sales Data:")
sales_df.info()
print()
print(sales_df.head(), "\n")

print("Prices Data:")
prices_df.info()
print()
print(prices_df.head())


## Step 2: Data Cleaning

In [None]:

# Standardize column names first (strip whitespace, lower-case, spaces -> '_')
# so that every later lookup -- including 'datetime' below -- works even when
# the raw CSV headers differ in case or padding.
sales_df.columns = sales_df.columns.str.strip().str.lower().str.replace(' ', '_')
prices_df.columns = prices_df.columns.str.strip().str.lower().str.replace(' ', '_')

# Drop rows with any missing value, remove exact duplicate rows, and convert
# the datetime column to pandas datetime type. The frames are rebound rather
# than mutated in place; .assign() works on a fresh copy, which avoids
# SettingWithCopyWarning on the filtered result.
sales_df = (
    sales_df
    .dropna()
    .drop_duplicates()
    .assign(datetime=lambda frame: pd.to_datetime(frame['datetime']))
)
prices_df = prices_df.dropna().drop_duplicates()

# Display cleaned data summary.
# DataFrame.info() prints its own report and returns None, so it is not
# wrapped in print() (that would output a stray "None").
print("Cleaned Sales Data:")
sales_df.info()
print()
print("Cleaned Prices Data:")
prices_df.info()


## Step 3: Feature Engineering

In [None]:

# Attach price information to each sales row (left join on 'item_name'),
# then derive per-row revenue and the weekday name of the sale.
merged_df = (
    sales_df
    .merge(prices_df, how='left', on='item_name')
    .assign(
        revenue=lambda frame: frame['quantity'] * frame['price'],
        day_of_week=lambda frame: frame['datetime'].dt.day_name(),
    )
)

# Total revenue per weekday, ordered from lowest to highest total
revenue_by_weekday = merged_df.groupby('day_of_week')['revenue'].sum()
daily_sales = revenue_by_weekday.sort_values()


## Step 4: Save Processed Data

In [None]:

# Create output directories if they don't exist
output_dir = './data/processed/'
os.makedirs(output_dir, exist_ok=True)

# Save cleaned and processed data
merged_df.to_csv(os.path.join(output_dir, 'cleaned_bakery_data.csv'), index=False)

print("Processed data saved to:", output_dir)
