In [1]:
import pandas as pd 
import plotly.express as px

In [2]:
# Load the supermarket sales CSV from its S3-hosted URL into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer="https://intro-to-python-asdaf.s3.ap-southeast-2.amazonaws.com/supermarket_sales.csv"
)

In [3]:
# Preview the first five rows to check the columns and value formats.
df.head(n=5)

Unnamed: 0,Invoice ID,Branch,City,Customer type,Gender,Product line,Unit price,Quantity,Tax 5%,Total,Date,Time,Payment,cogs,gross margin percentage,gross income,Rating
0,750-67-8428,A,Yangon,Member,Female,Health and beauty,74.69,7,26.1415,548.9715,1/5/2019,13:08,Ewallet,522.83,4.761905,26.1415,9.1
1,226-31-3081,C,Naypyitaw,Normal,Female,Electronic accessories,15.28,5,3.82,80.22,3/8/2019,10:29,Cash,76.4,4.761905,3.82,9.6
2,631-41-3108,A,Yangon,Normal,Male,Home and lifestyle,46.33,7,16.2155,340.5255,3/3/2019,13:23,Credit card,324.31,4.761905,16.2155,7.4
3,123-19-1176,A,Yangon,Member,Male,Health and beauty,58.22,8,23.288,489.048,1/27/2019,20:33,Ewallet,465.76,4.761905,23.288,8.4
4,373-73-7910,A,Yangon,Normal,Male,Sports and travel,86.31,7,30.2085,634.3785,2/8/2019,10:37,Ewallet,604.17,4.761905,30.2085,5.3


# Group By

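Aggregate data by the values in a column: rows that share the same value are grouped together and summarised into a single row.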

### How?

1. Group your data by a specific column by using the `.groupby()` function. 
2. Aggregate data using one of the Pandas aggregation functions below: 
    - `count()` – Number of non-null observations
    - `sum()` – Sum of values
    - `mean()` – Mean of values
    - `median()` – Arithmetic median of values
    - `min()` – Minimum
    - `max()` – Maximum
    - `mode()` – Mode (grouped objects have no `.mode()` method; use e.g. `.agg(pd.Series.mode)` instead)
    - `std()` – Standard deviation
    - `var()` – Variance
3. Optionally, use the `.reset_index()` function to turn the grouping column from the index back into a regular column. 

In [19]:
# Step 1: split the rows into groups keyed by city.
# Without an aggregation this evaluates to a DataFrameGroupBy object, not a table.
df.groupby(by=["City"])

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000015573049780>

In [20]:
# 2. Use groupby with an aggregation function e.g. sum()
# numeric_only=True restricts the sum to the numeric columns. pandas >= 2.0 no
# longer drops text columns (Invoice ID, Branch, Gender, ...) automatically, so a
# bare .sum() would also try to sum those string columns.
df.groupby(["City"]).sum(numeric_only=True)

Unnamed: 0_level_0,Unit price,Quantity,Tax 5%,Total,cogs,gross margin percentage,gross income,Rating
City,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Mandalay,18478.88,1820,5057.032,106197.672,101140.64,1580.952381,5057.032,2263.6
Naypyitaw,18567.76,1831,5265.1765,110568.7065,105303.53,1561.904762,5265.1765,2319.9
Yangon,18625.49,1859,5057.1605,106200.3705,101143.21,1619.047619,5057.1605,2389.2


In [21]:
# 3. Use groupby with an aggregation function e.g. sum() and reset_index()
# numeric_only=True restricts the sum to the numeric columns; pandas >= 2.0 no
# longer drops the text columns automatically.
# reset_index() then moves "City" out of the index and into an ordinary column.
df.groupby(["City"]).sum(numeric_only=True).reset_index()

Unnamed: 0,City,Unit price,Quantity,Tax 5%,Total,cogs,gross margin percentage,gross income,Rating
0,Mandalay,18478.88,1820,5057.032,106197.672,101140.64,1580.952381,5057.032,2263.6
1,Naypyitaw,18567.76,1831,5265.1765,110568.7065,105303.53,1561.904762,5265.1765,2319.9
2,Yangon,18625.49,1859,5057.1605,106200.3705,101143.21,1619.047619,5057.1605,2389.2


In [22]:
# save the full example to a new dataframe
# numeric_only=True restricts the sum to the numeric columns (pandas >= 2.0 no
# longer drops text columns automatically), so the saved frame holds one row per
# city with "City" as a regular column plus the summed numeric columns.
df_groupby_city = (
    df.groupby(["City"])
    .sum(numeric_only=True)
    .reset_index()
)
df_groupby_city

Unnamed: 0,City,Unit price,Quantity,Tax 5%,Total,cogs,gross margin percentage,gross income,Rating
0,Mandalay,18478.88,1820,5057.032,106197.672,101140.64,1580.952381,5057.032,2263.6
1,Naypyitaw,18567.76,1831,5265.1765,110568.7065,105303.53,1561.904762,5265.1765,2319.9
2,Yangon,18625.49,1859,5057.1605,106200.3705,101143.21,1619.047619,5057.1605,2389.2


In [6]:
# Bar chart of total sales per city, with cities ordered from highest to lowest total.
px.bar(
    data_frame=df_groupby_city.sort_values(by="Total", ascending=False),
    x="City",
    y="Total",
)

In [10]:
# bar plot of total sales by product line and customer type
# Group on both columns so each (product line, customer type) pair gets its own row.
# numeric_only=True restricts the sum to the numeric columns; pandas >= 2.0 no
# longer drops the text columns automatically.
df_groupby_product_line_customer_type = (
    df.groupby(["Product line", "Customer type"])
    .sum(numeric_only=True)
    .reset_index()
)
# Sort by "Total" descending before plotting; colour the bars by customer type.
px.bar(
    df_groupby_product_line_customer_type.sort_values(["Total"], ascending=False),
    x="Product line",
    y="Total",
    color="Customer type",
)

In [18]:
# line plot of total sales by date
# Parse the text dates with an explicit month/day/year format (the rows shown by
# df.head() look like 1/27/2019) instead of relying on format inference.
df["Date"] = pd.to_datetime(df["Date"], format="%m/%d/%Y")
# numeric_only=True restricts the sum to the numeric columns; pandas >= 2.0 no
# longer drops text columns such as "Time" and "Payment" automatically.
df_groupby_date = df.groupby(["Date"]).sum(numeric_only=True).reset_index()
px.line(df_groupby_date, x="Date", y="Total")