# Sales Data Preprocessing Notebook

This notebook contains comprehensive data preprocessing steps for the sales data dashboard.

## Overview
- Load and explore sales data
- Handle missing values and duplicates
- Normalize categories and extract time features
- Save cleaned data for dashboard consumption


In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
# Suppress every warning category for the rest of the session to keep cell output compact.
warnings.filterwarnings('ignore')

# Lift pandas' display limits (column count, console width, per-cell text width)
# in a single call; set_option accepts alternating option-name/value pairs.
pd.set_option(
    'display.max_columns', None,
    'display.width', None,
    'display.max_colwidth', None,
)

print("Libraries imported successfully!")


## 1. Load and Explore Data


In [None]:
# Read the raw sales CSV into the working DataFrame `df`
df = pd.read_csv('../data/raw/sales_data.csv')

# Orientation summary: dimensions and column names, emitted as one print
overview_lines = [
    f"Dataset shape: {df.shape}",
    f"Columns: {list(df.columns)}",
    "\nFirst 5 rows:",
]
print("\n".join(overview_lines))

# Keep the preview as the cell's last expression so it renders as a rich table
df.head()


In [None]:
# Basic data information: structure, dtypes, missing values, and duplicate count
print("Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None,
# so it is called directly; wrapping it in print() would emit a stray "None".
df.info()
print("\nData Types:")
print(df.dtypes)
print("\nMissing Values:")
print(df.isnull().sum())
print("\nDuplicate Rows:")
print(f"Total duplicates: {df.duplicated().sum()}")


## 2. Data Cleaning and Preprocessing


In [None]:
# Work on a copy so the loaded frame `df` stays untouched and this cell can be re-run safely
df_clean = df.copy()

# Remove exact duplicate rows
df_clean = df_clean.drop_duplicates()
print(f"After removing duplicates: {df_clean.shape}")

# Handle missing values
print("\nMissing values before cleaning:")
print(df_clean.isnull().sum())

# Fill missing text fields with explicit placeholder labels
placeholder_by_column = {
    'CUSTOMERNAME': 'Unknown Customer',
    'COUNTRY': 'Unknown Country',
    'PRODUCTLINE': 'Unknown Product',
    'STATUS': 'Unknown Status',
}
for text_column, placeholder in placeholder_by_column.items():
    df_clean[text_column] = df_clean[text_column].fillna(placeholder)

# For numeric columns, fill with the median.
# Coerce to numeric FIRST: computing the median on a column that still holds
# non-numeric text can fail, and values that only become NaN when coerced
# later would otherwise never be filled.
for numeric_column in ('SALES', 'QUANTITYORDERED'):
    numeric_values = pd.to_numeric(df_clean[numeric_column], errors='coerce')
    df_clean[numeric_column] = numeric_values.fillna(numeric_values.median())

print("\nMissing values after cleaning:")
print(df_clean.isnull().sum())


In [None]:
# Convert ORDERDATE to datetime; values that cannot be parsed become NaT
raw_order_dates = df_clean['ORDERDATE']
parsed_order_dates = pd.to_datetime(raw_order_dates, errors='coerce')

# Surface silently coerced values: rows that had a value before parsing but are NaT after.
# Nothing is printed when every date parses, so output on clean data is unchanged.
unparseable_count = int((parsed_order_dates.isna() & raw_order_dates.notna()).sum())
if unparseable_count:
    print(f"Warning: {unparseable_count} ORDERDATE value(s) could not be parsed and were set to NaT")

df_clean['ORDERDATE'] = parsed_order_dates

# Extract time features (rows whose ORDERDATE is NaT get missing values here)
order_dt = df_clean['ORDERDATE'].dt
df_clean['YEAR'] = order_dt.year
df_clean['MONTH'] = order_dt.month
df_clean['QUARTER'] = order_dt.quarter
df_clean['DAY_OF_WEEK'] = order_dt.day_name()
df_clean['MONTH_NAME'] = order_dt.month_name()

print("Time features extracted successfully!")
print(f"Date range: {df_clean['ORDERDATE'].min()} to {df_clean['ORDERDATE'].max()}")


In [None]:
# Normalize text categories: trim surrounding whitespace, then title-case each value.
# NOTE(review): title-casing turns all-caps values such as 'USA' into 'Usa' — confirm this is intended.
for text_column in ('PRODUCTLINE', 'COUNTRY', 'CUSTOMERNAME', 'STATUS'):
    df_clean[text_column] = df_clean[text_column].str.strip().str.title()

# Force numeric dtypes; entries that cannot be parsed as numbers become NaN
for numeric_column in ('SALES', 'QUANTITYORDERED', 'ORDERNUMBER'):
    df_clean[numeric_column] = pd.to_numeric(df_clean[numeric_column], errors='coerce')

print("Data normalization completed!")


## 3. Data Analysis and Visualization
