<a href="https://colab.research.google.com/github/lamyse1/Data-Engineering-Projects/blob/main/DE_Graded_Project_Week1_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Working Towards The DE Project: My Dashboard**

# **1. Set Up the Sales Dataset:**

In [26]:
import pandas as pd

# Raw store-sales CSV hosted in the project's GitHub repository
csv_url = (
    "https://raw.githubusercontent.com/lamyse1/Data-Engineering-Projects/main/"
    "D.E.%20Graded%20Project.%20Store%20Sales%20Data.csv"
)

# Download the file and parse it into a DataFrame
sales_data = pd.read_csv(filepath_or_buffer=csv_url)

# Show the first five rows as a quick sanity check
print(sales_data.head(5))


       date  product ID  sales amount   store location 
0  1/1/2013           1             13                1
1  1/2/2013           1             11                1
2  1/3/2013           1             14                1
3  1/4/2013           1             13                1
4  1/5/2013           1             10                1


In [27]:
# Column labels first (printed on one line together with their caption)
print("Column Names:", sales_data.columns)

# Remaining checks, each as (caption, zero-argument callable producing the report):
#   - first rows, missing values per column, numeric summary statistics, dtypes
inspection_steps = [
    ("\nFirst 5 Rows:", sales_data.head),
    ("\nMissing Values Per Column:", lambda: sales_data.isnull().sum()),
    ("\nBasic Statistics:", sales_data.describe),
    ("\nData Types:", lambda: sales_data.dtypes),
]

# Print every caption followed by its report, in the order listed above
for caption, build_report in inspection_steps:
    print(caption)
    print(build_report())


Column Names: Index(['date', 'product ID', 'sales amount ', 'store location '], dtype='object')

First 5 Rows:
       date  product ID  sales amount   store location 
0  1/1/2013           1             13                1
1  1/2/2013           1             11                1
2  1/3/2013           1             14                1
3  1/4/2013           1             13                1
4  1/5/2013           1             10                1

Missing Values Per Column:
date               0
product ID         0
sales amount       0
store location     0
dtype: int64

Basic Statistics:
          product ID  sales amount   store location 
count  913000.000000  913000.000000    913000.000000
mean       25.500000      52.250287         5.500000
std        14.430878      28.801144         2.872283
min         1.000000       0.000000         1.000000
25%        13.000000      30.000000         3.000000
50%        25.500000      47.000000         5.500000
75%        38.000000      70.000000   

In [28]:
# The raw headers carry trailing spaces (e.g. 'sales amount '); trim the
# whitespace around every column label before using the names as keys.
stripped_column_names = sales_data.columns.str.strip()
sales_data.columns = stripped_column_names

# Show the cleaned labels
print("Updated Column Names:", sales_data.columns)


Updated Column Names: Index(['date', 'product ID', 'sales amount', 'store location'], dtype='object')


In [29]:
# Lookup table translating the numeric store codes (1-10) into city names
store_mapping = {
    1: "New York",
    2: "Los Angeles",
    3: "Chicago",
    4: "Houston",
    5: "Phoenix",
    6: "Philadelphia",
    7: "San Antonio",
    8: "San Diego",
    9: "Dallas",
    10: "San Jose"
}

# Translate the codes only while the column is still numeric. Series.map()
# yields NaN for every value that is not a key of the dict, so re-running this
# cell on the already-translated city names would otherwise overwrite the
# whole column with NaN.
if pd.api.types.is_numeric_dtype(sales_data["store location"]):
    sales_data["store location"] = sales_data["store location"].map(store_mapping)

# Confirm mapping applied correctly
print("\nMapped Store Locations:")
print(sales_data[["store location"]].drop_duplicates())  # Display unique locations to verify



Mapped Store Locations:
      store location
0           New York
1826     Los Angeles
3652         Chicago
5478         Houston
7304         Phoenix
9130    Philadelphia
10956    San Antonio
12782      San Diego
14608         Dallas
16434       San Jose


# **Perform Basic Sales Analysis (Grouping & Aggregations)**

In [30]:
# 1. Grand total of the "sales amount" column over every row of the dataset
sales_amount_column = sales_data["sales amount"]
total_sales = sales_amount_column.sum()
print("\nTotal Sales (All Locations): {}".format(total_sales))


Total Sales (All Locations): 47704512


In [31]:
# 2. Total sales per store location, returned as a flat two-column frame
sales_by_location = (
    sales_data
    .groupby("store location")["sales amount"]
    .sum()
    .reset_index()
)
print("\nTotal Sales by Store Location:")
print(sales_by_location)


Total Sales by Store Location:
  store location  sales amount
0        Chicago       5435144
1         Dallas       5025976
2        Houston       5012639
3    Los Angeles       6120128
4       New York       4315603
5   Philadelphia       3627670
6        Phoenix       3631016
7    San Antonio       3320009
8      San Diego       5856169
9       San Jose       5360158


In [32]:
# 3. Total sales per product ID, returned as a flat two-column frame
sales_by_product = (
    sales_data
    .groupby("product ID")["sales amount"]
    .sum()
    .reset_index()
)
print("\nTotal Sales by Product:")
print(sales_by_product)


Total Sales by Product:
    product ID  sales amount
0            1        401384
1            2       1069564
2            3        669087
3            4        401907
4            5        335230
5            6       1068281
6            7       1068777
7            8       1405108
8            9        938379
9           10       1337133
10          11       1271925
11          12       1271534
12          13       1539621
13          14       1071531
14          15       1607442
15          16        468480
16          17        602486
17          18       1538876
18          19        736892
19          20        867641
20          21        736190
21          22       1469971
22          23        534979
23          24       1205975
24          25       1473334
25          26        869981
26          27        402628
27          28       1604713
28          29       1271240
29          30        736554
30          31       1070845
31          32        803107
32          33    

In [33]:
# 4. Best-selling store location: idxmax() returns the row label of the
#    largest total, which .loc then uses to pull that row out of the summary.
top_store_row_label = sales_by_location["sales amount"].idxmax()
best_selling_store = sales_by_location.loc[top_store_row_label]

print("\nBest-Selling Store Location:")
print(f"{best_selling_store['store location']} with total sales of {best_selling_store['sales amount']}")



Best-Selling Store Location:
Los Angeles with total sales of 6120128


In [34]:
# Total sales per product ID (one row per product)
sales_by_product = (
    sales_data
    .groupby("product ID")["sales amount"]
    .sum()
    .reset_index()
)

# 5. Best-selling product: locate the row label of the largest total, then
#    fetch that row from the summary.
top_product_row_label = sales_by_product["sales amount"].idxmax()
best_selling_product = sales_by_product.loc[top_product_row_label]

print("\nBest-Selling Product:")
print(f"Product {best_selling_product['product ID']} with total sales of {best_selling_product['sales amount']}")



Best-Selling Product:
Product 15 with total sales of 1607442


In [35]:
# 6. Descriptive statistics (count, mean, std, min, quartiles, max) of the sales amounts
sales_amount_summary = sales_data["sales amount"].describe()
print("\nSales Amount Statistics:")
print(sales_amount_summary)


Sales Amount Statistics:
count    913000.000000
mean         52.250287
std          28.801144
min           0.000000
25%          30.000000
50%          47.000000
75%          70.000000
max         231.000000
Name: sales amount, dtype: float64


# **2. Create a Basic Visualization with Dash:**

In [36]:
!pip install dash
!pip install jupyter-dash
!pip install plotly




In [46]:
import dash
from dash import dcc, html
import plotly.express as px
from jupyter_dash import JupyterDash
import pandas as pd
from dash.dependencies import Input, Output

In [47]:
# Total sales per store location, used as the data source for the charts below
sales_by_location = (
    sales_data
    .groupby("store location")["sales amount"]
    .sum()
    .reset_index()
)


In [48]:
# Bar chart of total sales per store location (following the example on the
# exercise requirements sheet): one bar per location, bar height = total sales.
bar_chart_settings = dict(
    x="store location",
    y="sales amount",
    title="Sales by Store Location",
)
fig = px.bar(sales_by_location, **bar_chart_settings)


In [50]:

# Create the Dash application object
app = dash.Dash(__name__)

# Store sales: put a single "All Locations" row (grand total over all stores)
# in front of the per-store totals.
grand_total_by_location = sales_by_location["sales amount"].sum()
all_locations = pd.DataFrame(
    {"store location": ["All Locations"], "sales amount": [grand_total_by_location]}
)
sales_by_location_with_all = pd.concat(
    [all_locations, sales_by_location],
    ignore_index=True,
)

# Product sales: same idea, with an "All Products" row in front of the
# per-product totals.
grand_total_by_product = sales_by_product["sales amount"].sum()
all_products = pd.DataFrame(
    {"product ID": ["All Products"], "sales amount": [grand_total_by_product]}
)
sales_by_product_with_all = pd.concat(
    [all_products, sales_by_product],
    ignore_index=True,
)

# Dropdown entries: one per row of the "with all" frames, so the synthetic
# "All Locations" / "All Products" entry comes first in each list.
store_dropdown_options = [
    {"label": location_name, "value": location_name}
    for location_name in sales_by_location_with_all["store location"]
]
product_dropdown_options = [
    {"label": str(product_id), "value": product_id}
    for product_id in sales_by_product_with_all["product ID"]
]

# Page layout: a title followed by two sections, each made of a heading,
# a dropdown selector and the graph that the matching callback fills in.
app.layout = html.Div([
    html.H1("Sales Dashboard"),

    # Section 1: Store Location Sales
    html.H3("Sales by Store Location"),
    dcc.Dropdown(
        id="store-selector",
        options=store_dropdown_options,
        value="All Locations",
        clearable=False,
    ),
    dcc.Graph(id="sales-location-chart"),

    # Section 2: Product Sales
    html.H3("Sales by Product"),
    dcc.Dropdown(
        id="product-selector",
        options=product_dropdown_options,
        value="All Products",
        clearable=False,
    ),
    dcc.Graph(id="sales-product-chart"),
])

# Callback: redraw the store chart whenever the store dropdown value changes
@app.callback(
    Output("sales-location-chart", "figure"),
    [Input("store-selector", "value")]
)
def update_store_chart(selected_store):
    """Build the bar chart shown in the "Sales by Store Location" section.

    Args:
        selected_store: Current value of the "store-selector" dropdown.

    Returns:
        A Plotly bar figure of "sales amount" per "store location". For
        "All Locations" it plots the combined frame (the grand-total bar
        followed by every store); otherwise only the selected store's row.
    """
    if selected_store == "All Locations":
        chart_rows = sales_by_location_with_all
    else:
        is_selected = sales_by_location["store location"] == selected_store
        chart_rows = sales_by_location[is_selected]

    return px.bar(
        chart_rows,
        x="store location",
        y="sales amount",
        title=f"Sales for {selected_store}",
    )

# Callback: redraw the product chart whenever the product dropdown value changes
@app.callback(
    Output("sales-product-chart", "figure"),
    [Input("product-selector", "value")]
)
def update_product_chart(selected_product):
    """Build the bar chart shown in the "Sales by Product" section.

    Args:
        selected_product: Current value of the "product-selector" dropdown.

    Returns:
        A Plotly bar figure of "sales amount" per "product ID". For
        "All Products" it plots the combined frame (the grand-total bar
        followed by every product); otherwise only the selected product's row.
    """
    if selected_product == "All Products":
        chart_rows = sales_by_product_with_all
    else:
        is_selected = sales_by_product["product ID"] == selected_product
        chart_rows = sales_by_product[is_selected]

    return px.bar(
        chart_rows,
        x="product ID",
        y="sales amount",
        title=f"Sales for {selected_product}",
    )

# Run the Dash server.
# app.run() is the supported entry point; app.run_server() was deprecated and
# has been removed in Dash 3, so it fails on a freshly installed, unpinned Dash.
if __name__ == '__main__':
    app.run(debug=True)


<IPython.core.display.Javascript object>