# Sales Performance Dashboard Analysis
## Data Science Assignment

This notebook demonstrates the creation of a comprehensive sales performance dashboard using the Online Retail Dataset. We'll follow data visualization best practices and implement the 4C Principles:
- Clear
- Concise
- Captivating
- Clean

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Plot styling: reset matplotlib to its default style, then apply
# seaborn's "whitegrid" theme explicitly.
plt.style.use('default')
sns.set_theme(style="whitegrid")

# Shared palette of blues, ordered from darkest to lightest; the charts
# below pick their colors from this list.
colors = [
    '#2E86C1',
    '#3498DB',
    '#5DADE2',
    '#85C1E9',
    '#AED6F1',
]

## 1. Data Loading and Cleaning

In [None]:
# Load the raw transactions from the Excel workbook in the working directory
df = pd.read_excel('Online Retail Data Set.xlsx')

# Display basic information about the dataset
print("Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# it is called directly; wrapping it in print() would add a stray "None" line.
df.info()
print("\nFirst few rows:")
# Last expression of the cell: the notebook renders the first five rows
df.head()

In [None]:
# Data cleaning steps
def clean_data(df):
    """Return a cleaned copy of the transactions with derived sales columns.

    Rows with a missing value in any column are dropped, as are rows whose
    ``Quantity`` or ``UnitPrice`` is zero or negative. Three columns are then
    set on the result:

    * ``TotalAmount`` -- ``Quantity * UnitPrice``
    * ``InvoiceDate`` -- parsed with ``pd.to_datetime``
    * ``Month`` -- the monthly period of ``InvoiceDate``

    Args:
        df: DataFrame with at least ``Quantity``, ``UnitPrice`` and
            ``InvoiceDate`` columns.

    Returns:
        A new DataFrame; the frame passed in is not modified.
    """
    # Remove rows with missing values
    df = df.dropna()

    # Keep only strictly positive quantities and prices. The explicit copy
    # makes the result an independent frame, so the column assignments below
    # are not chained assignments on a filtered selection.
    is_valid_sale = (df['Quantity'] > 0) & (df['UnitPrice'] > 0)
    df = df.loc[is_valid_sale].copy()

    # Add TotalAmount column
    df['TotalAmount'] = df['Quantity'] * df['UnitPrice']

    # Convert InvoiceDate to datetime
    df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])

    # Monthly period (year and month) used for the time-based aggregations
    df['Month'] = df['InvoiceDate'].dt.to_period('M')

    return df

# Run the cleaning step on a copy of the raw frame and report the resulting size
df_clean = df.copy().pipe(clean_data)
print("Shape after cleaning:", df_clean.shape)

## 2. Monthly Sales Trends

In [None]:
# Total revenue per calendar month; the Period labels are turned into plain
# strings (e.g. "2011-01") before being handed to plotly as x values.
monthly_sales = (
    df_clean
    .groupby('Month')['TotalAmount']
    .sum()
    .reset_index()
    .assign(Month=lambda frame: frame['Month'].astype(str))
)

# Line chart with point markers, one point per month
fig = go.Figure(data=[
    go.Scatter(
        x=monthly_sales['Month'],
        y=monthly_sales['TotalAmount'],
        mode='lines+markers',
        name='Monthly Sales',
        line={'color': colors[0], 'width': 3},
        marker={'size': 8},
    )
])

# Single-series chart, so the legend is switched off
fig.update_layout(
    title='Monthly Sales Trends',
    xaxis_title='Month',
    yaxis_title='Total Sales',
    template='plotly_white',
    showlegend=False,
)

fig.show()

## 3. Best-Selling Products Analysis

In [None]:
# Calculate top 10 best-selling products: total units and total revenue per
# product description, ranked by revenue (highest first)
top_products = (
    df_clean
    .groupby('Description')[['Quantity', 'TotalAmount']]
    .sum()
    .sort_values('TotalAmount', ascending=False)
    .head(10)
)

# Create visualization using plotly
fig = go.Figure()
fig.add_trace(go.Bar(
    x=top_products['TotalAmount'],
    y=top_products.index,
    orientation='h',
    marker_color=colors[1]
))

fig.update_layout(
    title='Top 10 Best-Selling Products by Revenue',
    xaxis_title='Total Revenue',
    yaxis_title='Product',
    # Plotly lays out the categories of a horizontal bar chart from the bottom
    # up, which would put the #1 product at the bottom. Reversing the axis
    # keeps `top_products` in ranking order and shows the best seller on top.
    yaxis_autorange='reversed',
    template='plotly_white',
    height=600
)

fig.show()

## 4. Sales by Country

In [None]:
# Total revenue per country, ordered from the smallest to the largest total
country_sales = (
    df_clean
    .groupby('Country')['TotalAmount']
    .sum()
    .sort_values(ascending=True)
)

# Horizontal bar chart: one bar per country
fig = go.Figure(data=[
    go.Bar(
        x=country_sales.values,
        y=country_sales.index,
        orientation='h',
        marker_color=colors[2],
    )
])

fig.update_layout(
    title='Sales by Country',
    xaxis_title='Total Sales',
    yaxis_title='Country',
    template='plotly_white',
    height=600,
)

fig.show()

## 5. Product Categories Analysis

In [None]:
# Function to extract categories
# Keyword -> category label. Built once at module level instead of on every
# call, since extract_category is applied to each row of the dataset.
# Keywords are tried in insertion order and the first one found wins.
_CATEGORY_KEYWORDS = {
    'VINTAGE': 'Vintage Items',
    'GARDEN': 'Garden Accessories',
    'CHRISTMAS': 'Christmas Items',
    'METAL': 'Metal Signs & Decor',
    'WOOD': 'Wooden Items',
    'GLASS': 'Glassware',
    'PAPER': 'Paper Products',
    'CERAMIC': 'Ceramic Items',
}


def extract_category(description):
    """Map a product description to a coarse category label.

    The description is converted with ``str()`` and upper-cased, then checked
    against each keyword as a plain substring (so ``'WOOD'`` also matches
    ``'WOODEN'``). When several keywords occur, the one listed first in the
    keyword table decides the category.

    Args:
        description: Product description; any value is accepted because it is
            passed through ``str()`` first (``None`` and NaN therefore fall
            through to ``'Other'``).

    Returns:
        The category label of the first matching keyword, or ``'Other'`` when
        no keyword occurs in the description.
    """
    text = str(description).upper()
    for keyword, label in _CATEGORY_KEYWORDS.items():
        if keyword in text:
            return label
    return 'Other'

# Tag every transaction with the category derived from its description
df_clean['Category'] = df_clean['Description'].apply(extract_category)

# Total revenue per category, in ascending order
category_sales = (
    df_clean
    .groupby('Category')['TotalAmount']
    .sum()
    .sort_values(ascending=True)
)

# Donut chart (pie with a 30% hole) of the revenue share per category
fig = go.Figure()
fig.add_trace(go.Pie(
    labels=category_sales.index,
    values=category_sales.values,
    hole=0.3,
    marker_colors=colors,
))

fig.update_layout(
    title='Sales Distribution by Product Category',
    template='plotly_white',
)

fig.show()

## 6. Customer Behavior Analysis

In [None]:
# Create customer cohorts
def create_cohort(df):
    """Return a copy of ``df`` with cohort columns added for each transaction.

    Two columns are added:

    * ``CohortMonth`` -- monthly period of the customer's earliest
      ``InvoiceDate`` in ``df``.
    * ``CohortIndex`` -- whole calendar months between ``CohortMonth`` and the
      month of the transaction (0 for transactions in the cohort month).

    Args:
        df: DataFrame with a ``CustomerID`` column and a datetime
            ``InvoiceDate`` column.

    Returns:
        A new DataFrame with the two columns added. The input is left
        untouched, so passing a filtered selection of another frame is safe.
    """
    # Work on a private copy instead of writing columns into the caller's
    # frame, which at the call site is a boolean-filtered selection.
    df = df.copy()

    invoice_date = df['InvoiceDate']
    first_purchase = df.groupby('CustomerID')['InvoiceDate'].transform('min')

    df['CohortMonth'] = first_purchase.dt.to_period('M')

    # Month difference via vectorized year/month arithmetic; this yields the
    # same integers as subtracting the monthly periods row by row.
    year_diff = invoice_date.dt.year - first_purchase.dt.year
    month_diff = invoice_date.dt.month - first_purchase.dt.month
    df['CohortIndex'] = (year_diff * 12 + month_diff).astype('int64')

    return df

# Attach cohort columns to every transaction that has a customer id
df_cohort = create_cohort(df_clean.loc[df_clean['CustomerID'].notna()])

# Number of distinct customers per (cohort month, months since first purchase)
cohort_data = (
    df_cohort
    .groupby(['CohortMonth', 'CohortIndex'])['CustomerID']
    .nunique()
    .reset_index()
)

# One row per cohort month, one column per cohort index
cohort_table = cohort_data.pivot(
    index='CohortMonth',
    columns='CohortIndex',
    values='CustomerID',
)

# Retention rate: each row divided by the value in its first column
cohort_sizes = cohort_table.iloc[:, 0]
retention_table = cohort_table.div(cohort_sizes, axis=0)

# Heatmap of retention rates, annotated with the percentage in each cell
fig = go.Figure(
    data=go.Heatmap(
        z=retention_table.values,
        x=retention_table.columns,
        y=retention_table.index.astype(str),
        colorscale='RdYlBu',
        text=np.round(retention_table.values * 100, 1),
        texttemplate='%{text}%',
        textfont={"size": 10},
        hoverongaps=False,
    )
)

fig.update_layout(
    title='Customer Cohort Retention Analysis',
    xaxis_title='Cohort Index (Months)',
    yaxis_title='Cohort Month',
    template='plotly_white',
)

fig.show()

## Summary

This dashboard provides comprehensive insights into the online retail business:

1. **Monthly Sales Trends**:
   - Clear visualization of temporal patterns
   - Identification of peak sales periods

2. **Best-Selling Products**:
   - Top revenue-generating items
   - Product performance analysis

3. **Geographic Analysis**:
   - Sales distribution by country
   - Market penetration insights

4. **Product Categories**:
   - Category-wise sales breakdown
   - Portfolio mix analysis

5. **Customer Behavior**:
   - Cohort analysis for retention
   - Customer lifecycle patterns

The visualizations follow the 4C principles:
- **Clear**: Easy-to-understand charts
- **Clean**: Consistent styling and colors
- **Concise**: Focused on key metrics
- **Captivating**: Interactive and visually appealing