# Plotly Fall Challenge 2022

## Imports

In [75]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt

## Data import

In [76]:
raw_df = pd.read_csv('../raw_data/supermarket.csv')

## Data quality

### Missing values, duplicates, etc

In [3]:
raw_df.dtypes

Invoice ID                  object
Branch                      object
City                        object
Customer type               object
Gender                      object
Product line                object
Unit price                 float64
Quantity                     int64
Tax 5%                     float64
Total                      float64
Date                        object
Time                        object
Payment                     object
cogs                       float64
gross margin percentage    float64
gross income               float64
Rating                     float64
dtype: object

In [4]:
raw_df.isna().sum()

Invoice ID                 0
Branch                     0
City                       0
Customer type              0
Gender                     0
Product line               0
Unit price                 0
Quantity                   0
Tax 5%                     0
Total                      0
Date                       0
Time                       0
Payment                    0
cogs                       0
gross margin percentage    0
gross income               0
Rating                     0
dtype: int64

In [5]:
raw_df.duplicated().sum()

0

In [6]:
raw_df['Invoice ID'].nunique()

1000

In [7]:
raw_df['Branch'].unique()

array(['A', 'C', 'B'], dtype=object)

In [8]:
raw_df['City'].unique()

array(['Yangon', 'Naypyitaw', 'Mandalay'], dtype=object)

In [9]:
raw_df['Customer type'].unique()

array(['Member', 'Normal'], dtype=object)

In [10]:
raw_df['Gender'].unique()

array(['Female', 'Male'], dtype=object)

In [11]:
raw_df['Product line'].unique()

array(['Health and beauty', 'Electronic accessories',
       'Home and lifestyle', 'Sports and travel', 'Food and beverages',
       'Fashion accessories'], dtype=object)

In [12]:
raw_df['Payment'].unique()

array(['Ewallet', 'Cash', 'Credit card'], dtype=object)

In [13]:
raw_df['gross margin percentage'].unique()

array([4.76190476])

There is a unique `gross margin percentage`

In [14]:
raw_df.head()

Unnamed: 0,Invoice ID,Branch,City,Customer type,Gender,Product line,Unit price,Quantity,Tax 5%,Total,Date,Time,Payment,cogs,gross margin percentage,gross income,Rating
0,750-67-8428,A,Yangon,Member,Female,Health and beauty,74.69,7,26.1415,548.9715,1/5/2019,13:08,Ewallet,522.83,4.761905,26.1415,9.1
1,226-31-3081,C,Naypyitaw,Normal,Female,Electronic accessories,15.28,5,3.82,80.22,3/8/2019,10:29,Cash,76.4,4.761905,3.82,9.6
2,631-41-3108,A,Yangon,Normal,Male,Home and lifestyle,46.33,7,16.2155,340.5255,3/3/2019,13:23,Credit card,324.31,4.761905,16.2155,7.4
3,123-19-1176,A,Yangon,Member,Male,Health and beauty,58.22,8,23.288,489.048,1/27/2019,20:33,Ewallet,465.76,4.761905,23.288,8.4
4,373-73-7910,A,Yangon,Normal,Male,Sports and travel,86.31,7,30.2085,634.3785,2/8/2019,10:37,Ewallet,604.17,4.761905,30.2085,5.3


 * `Total = Unit price * Quantity * 1.05`
 * `COGS = Unit price * Quantity`
 * `Gross income = Total - COGS (seems to be equal to 5% Tax)`

### Convert to `datetime`

In [77]:
raw_df['Timestamp'] = raw_df['Date'] + ' ' + raw_df['Time']
raw_df['Timestamp'] = pd.to_datetime(raw_df['Timestamp'])

In [78]:
raw_df['Date'] = pd.to_datetime(raw_df['Date'], format='%m/%d/%Y')

In [79]:
raw_df['Hour'] = pd.to_datetime(raw_df['Time'], format='%H:%M').dt.hour

In [11]:
raw_df.dtypes

Invoice ID                         object
Branch                             object
City                               object
Customer type                      object
Gender                             object
Product line                       object
Unit price                        float64
Quantity                            int64
Tax 5%                            float64
Total                             float64
Date                       datetime64[ns]
Time                               object
Payment                            object
cogs                              float64
gross margin percentage           float64
gross income                      float64
Rating                            float64
Timestamp                  datetime64[ns]
Hour                                int64
dtype: object

### Discretize `Rating`

* 0 to 5 -> Bad
* 5.1 to 7.5 -> Average
* 7.5 to 10 -> Good

In [80]:
def discretize_rating(x):
    if x <= 5:
        return 'bad'
    elif (x > 5) and (x <= 7.5):
        return 'average'
    return 'good'

In [81]:
raw_df['Rating_cat'] = raw_df['Rating'].apply(discretize_rating)

## EDA

### City profiles

Yangon    -> A </br>
Mandalay  -> B </br>
Naypyitaw -> C

In [82]:
gender_per_city = raw_df.groupby(['City', 'Gender'], as_index=False).count()[['City', 'Gender', 'Invoice ID']]
gender_per_city

Unnamed: 0,City,Gender,Invoice ID
0,Mandalay,Female,162
1,Mandalay,Male,170
2,Naypyitaw,Female,178
3,Naypyitaw,Male,150
4,Yangon,Female,161
5,Yangon,Male,179


In [83]:
fig = px.bar(gender_per_city, x='City', y='Invoice ID', color='Gender', title = 'Gender distribution per city')
fig.update_layout(title_x=0.5)
fig.show()

In [84]:
fig = make_subplots(rows=1, cols=3, specs=[[{'type':'domain'}, {'type':'domain'}, {'type':'domain'}]],
                    subplot_titles=['Mandalay', 'Yangon', 'Naypyitaw'])

fig1 = go.Pie(labels=gender_per_city[gender_per_city['City'] == 'Mandalay']['Gender'],
              values=gender_per_city[gender_per_city['City'] == 'Mandalay']['Invoice ID'],
              name="Mandalay")

fig2 = go.Pie(labels=gender_per_city[gender_per_city['City'] == 'Yangon']['Gender'],
              values=gender_per_city[gender_per_city['City'] == 'Yangon']['Invoice ID'],
              name="Yangon")

fig3 = go.Pie(labels=gender_per_city[gender_per_city['City'] == 'Naypyitaw']['Gender'],
              values=gender_per_city[gender_per_city['City'] == 'Naypyitaw']['Invoice ID'],
              name="Naypyitaw")

fig.add_trace(fig1, row=1, col=1)
fig.add_trace(fig2, row=1, col=2)
fig.add_trace(fig3, row=1, col=3)

fig.show()

In [85]:
membership_per_city = raw_df.groupby(['City', 'Gender', 'Customer type'], as_index=False).count()[['City', 'Gender', 'Customer type', 'Invoice ID']]
membership_per_city

Unnamed: 0,City,Gender,Customer type,Invoice ID
0,Mandalay,Female,Member,85
1,Mandalay,Female,Normal,77
2,Mandalay,Male,Member,80
3,Mandalay,Male,Normal,90
4,Naypyitaw,Female,Member,96
5,Naypyitaw,Female,Normal,82
6,Naypyitaw,Male,Member,73
7,Naypyitaw,Male,Normal,77
8,Yangon,Female,Member,80
9,Yangon,Female,Normal,81


In [86]:
fig = px.bar(membership_per_city, x='City', y='Invoice ID', color='Gender', pattern_shape='Customer type', title = 'Membership distribution per city')
fig.update_layout(title_x=0.5)
fig.show()

In [87]:
total_income_per_city = raw_df.groupby(['City', 'Product line'], as_index=False).sum()[['City', 'Product line', 'gross income']]
total_income_per_city

Unnamed: 0,City,Product line,gross income
0,Mandalay,Electronic accessories,811.9735
1,Mandalay,Fashion accessories,781.5865
2,Mandalay,Food and beverages,724.5185
3,Mandalay,Health and beauty,951.46
4,Mandalay,Home and lifestyle,835.6745
5,Mandalay,Sports and travel,951.819
6,Naypyitaw,Electronic accessories,903.2845
7,Naypyitaw,Fashion accessories,1026.67
8,Naypyitaw,Food and beverages,1131.755
9,Naypyitaw,Health and beauty,791.206


In [88]:
fig = px.bar(total_income_per_city, x='City', y='gross income', color='Product line' ,title='Total income per city')
fig.update_layout(title_x=0.5)
fig.show()

In [89]:
total_items_sold_per_city = raw_df.groupby(['City', 'Payment'], as_index=False).sum()[['City', 'Payment', 'Quantity']]
total_items_sold_per_city

Unnamed: 0,City,Payment,Quantity
0,Mandalay,Cash,628
1,Mandalay,Credit card,599
2,Mandalay,Ewallet,593
3,Naypyitaw,Cash,696
4,Naypyitaw,Credit card,543
5,Naypyitaw,Ewallet,592
6,Yangon,Cash,572
7,Yangon,Credit card,580
8,Yangon,Ewallet,707


In [90]:
fig = px.bar(total_items_sold_per_city, x='City', y= 'Quantity', color='Payment', title = 'Total items sold per city')
fig.update_layout(title_x=0.5)
fig.show()

In [91]:
ratings_per_city = raw_df.groupby(['City', 'Rating_cat'], as_index=False).count()[['City', 'Rating_cat', 'Invoice ID']]
ratings_per_city

Unnamed: 0,City,Rating_cat,Invoice ID
0,Mandalay,average,143
1,Mandalay,bad,66
2,Mandalay,good,123
3,Naypyitaw,average,141
4,Naypyitaw,bad,48
5,Naypyitaw,good,139
6,Yangon,average,142
7,Yangon,bad,60
8,Yangon,good,138


In [93]:
fig = px.bar(ratings_per_city, x='City', y= 'Invoice ID', color='Rating_cat', title = 'City ratings')
fig.update_layout(title_x=0.5)
fig.show()

In [94]:
date_sorted_df = raw_df.sort_values('Date')
date_sorted_df.head()

Unnamed: 0,Invoice ID,Branch,City,Customer type,Gender,Product line,Unit price,Quantity,Tax 5%,Total,Date,Time,Payment,cogs,gross margin percentage,gross income,Rating,Timestamp,Hour,Rating_cat
856,770-42-8960,B,Mandalay,Normal,Male,Food and beverages,21.12,8,8.448,177.408,2019-01-01,19:31,Cash,168.96,4.761905,8.448,6.3,2019-01-01 19:31:00,19,average
567,651-88-7328,A,Yangon,Normal,Female,Fashion accessories,65.74,9,29.583,621.243,2019-01-01,13:55,Cash,591.66,4.761905,29.583,7.7,2019-01-01 13:55:00,13,good
245,530-90-9855,A,Yangon,Member,Male,Home and lifestyle,47.59,8,19.036,399.756,2019-01-01,14:47,Cash,380.72,4.761905,19.036,5.7,2019-01-01 14:47:00,14,average
696,182-52-7000,A,Yangon,Member,Female,Sports and travel,27.04,4,5.408,113.568,2019-01-01,20:26,Ewallet,108.16,4.761905,5.408,6.9,2019-01-01 20:26:00,20,average
523,133-14-7229,C,Naypyitaw,Normal,Male,Health and beauty,62.87,2,6.287,132.027,2019-01-01,11:43,Cash,125.74,4.761905,6.287,5.0,2019-01-01 11:43:00,11,bad


In [95]:
timeline_df = date_sorted_df.groupby(['City','Date'], as_index=False).sum()[['City', 'Date', 'gross income']]
timeline_df

Unnamed: 0,City,Date,gross income
0,Mandalay,2019-01-01,73.1760
1,Mandalay,2019-01-02,55.3715
2,Mandalay,2019-01-03,48.4140
3,Mandalay,2019-01-04,24.3135
4,Mandalay,2019-01-05,23.1790
...,...,...,...
258,Yangon,2019-03-26,56.5400
259,Yangon,2019-03-27,50.5060
260,Yangon,2019-03-28,78.7110
261,Yangon,2019-03-29,46.5530


In [139]:
fig = make_subplots(rows=3, cols=1)

fig.append_trace(go.Scatter(
    x=timeline_df[timeline_df['City']=='Mandalay']['Date'],
    y=timeline_df[timeline_df['City']=='Mandalay']['gross income'],
    mode='lines+markers',
    name='Mandalay',
), row=1, col=1)

fig.add_hline(y=timeline_df[timeline_df['City']=='Mandalay']['gross income'].mean(),
              row=1, col=1, line_width=1, line_dash="dot", line_color="black",
              annotation_text="mean<br>income", 
              annotation_position="bottom left")

fig.append_trace(go.Scatter(
    x=timeline_df[timeline_df['City']=='Yangon']['Date'],
    y=timeline_df[timeline_df['City']=='Yangon']['gross income'],
    mode='lines+markers',
    name='Yangon'
), row=2, col=1)

fig.add_hline(y=timeline_df[timeline_df['City']=='Yangon']['gross income'].mean(),
              row=2, col=1, line_width=1, line_dash="dot", line_color="black",
              annotation_text="mean<br>income", 
              annotation_position="bottom left")

fig.append_trace(go.Scatter(
    x=timeline_df[timeline_df['City']=='Naypyitaw']['Date'],
    y=timeline_df[timeline_df['City']=='Naypyitaw']['gross income'],
    mode='lines+markers',
    name='Naypyitaw'
), row=3, col=1)

fig.add_hline(y=timeline_df[timeline_df['City']=='Naypyitaw']['gross income'].mean(),
              row=3, col=1, line_width=1, line_dash="dot", line_color="black",
              annotation_text="mean<br>income", 
              annotation_position="top left")

fig.update_layout(height=800, width=1000, title_text="Income",
                  hovermode="x unified", title_x=0.5)
fig.show()

In [127]:
product_ratings_df = raw_df.groupby(['City', 'Product line'], as_index=False).mean()[['City', 'Product line', 'Rating']]
product_ratings_df

Unnamed: 0,City,Product line,Rating
0,Mandalay,Electronic accessories,7.116364
1,Mandalay,Fashion accessories,6.722581
2,Mandalay,Food and beverages,6.994
3,Mandalay,Health and beauty,7.1
4,Mandalay,Home and lifestyle,6.516
5,Mandalay,Sports and travel,6.509677
6,Naypyitaw,Electronic accessories,6.747273
7,Naypyitaw,Fashion accessories,7.44
8,Naypyitaw,Food and beverages,7.080303
9,Naypyitaw,Health and beauty,6.998077


In [128]:
px.bar(data_frame=product_ratings_df, x='City', y='Rating', color='Product line', barmode='group')

### Product profiles

In [141]:
raw_df.head(3)

Unnamed: 0,Invoice ID,Branch,City,Customer type,Gender,Product line,Unit price,Quantity,Tax 5%,Total,Date,Time,Payment,cogs,gross margin percentage,gross income,Rating,Timestamp,Hour,Rating_cat
0,750-67-8428,A,Yangon,Member,Female,Health and beauty,74.69,7,26.1415,548.9715,2019-01-05,13:08,Ewallet,522.83,4.761905,26.1415,9.1,2019-01-05 13:08:00,13,good
1,226-31-3081,C,Naypyitaw,Normal,Female,Electronic accessories,15.28,5,3.82,80.22,2019-03-08,10:29,Cash,76.4,4.761905,3.82,9.6,2019-03-08 10:29:00,10,good
2,631-41-3108,A,Yangon,Normal,Male,Home and lifestyle,46.33,7,16.2155,340.5255,2019-03-03,13:23,Credit card,324.31,4.761905,16.2155,7.4,2019-03-03 13:23:00,13,average


In [176]:
mean_price = raw_df.groupby('Product line', as_index=False).sum()[['Product line', 'Unit price', 'Quantity']]
mean_price['mean_price'] = mean_price['Unit price']/mean_price['Quantity']
mean_price = mean_price.sort_values('mean_price', ascending=False)
mean_price['bar_values'] = [35,30,25,20,15,10]
mean_price

Unnamed: 0,Product line,Unit price,Quantity,mean_price,bar_values
1,Fashion accessories,10173.35,902,11.278659,35
5,Sports and travel,9460.88,920,10.283565,30
2,Food and beverages,9745.54,952,10.236912,25
3,Health and beauty,8337.88,854,9.763326,20
4,Home and lifestyle,8850.71,911,9.715379,15
0,Electronic accessories,9103.77,971,9.375664,10


In [178]:
px.bar(data_frame=mean_price, x='Product line', y='bar_values')

In [171]:
mean_rating = raw_df.groupby('Product line', as_index=False).mean()[['Product line', 'Rating']].sort_values('Rating', ascending=False)
mean_rating['bar_value'] = [35,30,25,20,15,10]
mean_rating

Unnamed: 0,Product line,Rating,bar_value
2,Food and beverages,7.113218,35
1,Fashion accessories,7.029213,30
3,Health and beauty,7.003289,25
0,Electronic accessories,6.924706,20
5,Sports and travel,6.916265,15
4,Home and lifestyle,6.8375,10


In [172]:
px.bar(data_frame=mean_rating, x='Product line', y='bar_value')

### Customer profiles

In [181]:
raw_df.head(3)

Unnamed: 0,Invoice ID,Branch,City,Customer type,Gender,Product line,Unit price,Quantity,Tax 5%,Total,Date,Time,Payment,cogs,gross margin percentage,gross income,Rating,Timestamp,Hour,Rating_cat
0,750-67-8428,A,Yangon,Member,Female,Health and beauty,74.69,7,26.1415,548.9715,2019-01-05,13:08,Ewallet,522.83,4.761905,26.1415,9.1,2019-01-05 13:08:00,13,good
1,226-31-3081,C,Naypyitaw,Normal,Female,Electronic accessories,15.28,5,3.82,80.22,2019-03-08,10:29,Cash,76.4,4.761905,3.82,9.6,2019-03-08 10:29:00,10,good
2,631-41-3108,A,Yangon,Normal,Male,Home and lifestyle,46.33,7,16.2155,340.5255,2019-03-03,13:23,Credit card,324.31,4.761905,16.2155,7.4,2019-03-03 13:23:00,13,average


In [184]:
client_profiles = raw_df.groupby(['Gender', 'Customer type'], as_index=False).sum()[['Gender', 'Customer type',
                                                                   'Unit price', 'Quantity', 'gross income']]
client_profiles['mean_spent'] = client_profiles['Unit price']/client_profiles['Quantity']
client_profiles

Unnamed: 0,Gender,Customer type,Unit price,Quantity,gross income,mean_spent
0,Female,Member,14558.14,1492,4197.4735,9.757466
1,Female,Normal,13129.1,1377,3796.9515,9.534568
2,Male,Member,13601.56,1293,3622.6905,10.519381
3,Male,Normal,14383.33,1348,3762.2535,10.670126
