In [2]:
import pandas as pd
import numpy as np
import plotly.express as px

In [3]:
# Load the shopping-cart data set; the cells below all read from `df`
df = pd.read_parquet(path='Shoppingcarts.parquet')

# County Analysis

In [4]:
# Rank counties by their number of distinct orders, strongest county first
top_counties = (
    df.groupby('county', as_index=False)['order_id']
    .nunique()
    .sort_values(by='order_id', ascending=False)
    .rename(columns={'order_id': 'no_of_orders'})
)

Top 10 Counties

In [5]:
# Bar chart of the ten counties at the top of the ranking (most distinct orders)
px.bar(
    top_counties.iloc[:10],
    x='county',
    y='no_of_orders',
    title='Top 10 Counties',
)

Anteil an Gesamtbestellungen

In [7]:
# Percentage of all distinct orders that the ten strongest counties account for
top10_order_count = top_counties['no_of_orders'].iloc[:10].sum()
overall_order_count = df['order_id'].nunique()
print('Die Top 10 Counties machen', (top10_order_count / overall_order_count * 100).round(2), '% der Gesamtbestellungen aus.')

Die Top 10 Counties machen 48.9 % der Gesamtbestellungen aus.


Weakest 4 Counties

In [9]:
# Bar chart of the last four rows of the ranking, i.e. the counties with the fewest orders
px.bar(
    top_counties.iloc[-4:],
    x='county',
    y='no_of_orders',
    title='Weakest 4 Counties',
)

Most Popular Department by County

In [10]:
# For every county take the most frequent department (the first value that
# mode() returns when several are tied). The resulting frame has one row per
# county, and no `values` column is passed to the pie chart.
px.pie(
    df.groupby('county', as_index=False)
      .agg({'department': lambda values: values.mode().iloc[0]})
      .sort_values(by='department'),
    names='department',
    color='county',
    title='Most Popular Department by County',
)

Most Popular Aisle by County

In [11]:
# For every county take the most frequent aisle (the first value that mode()
# returns when several are tied) and plot the per-county winners as a pie chart.
px.pie(
    df.groupby('county', as_index=False)
      .agg({'aisle': lambda values: values.mode().iloc[0]})
      .sort_values(by='aisle'),
    names='aisle',
    color='county',
    title='Most Popular Aisle by County',
)

Most Popular Product by County

In [12]:
# For every county take the most frequent product name (the first value that
# mode() returns when several are tied) and plot the per-county winners.
# The Dark24 palette is used as the discrete colour sequence.
px.pie(
    df.groupby('county', as_index=False)
      .agg({'product_name': lambda values: values.mode().iloc[0]})
      .sort_values(by='product_name'),
    names='product_name',
    color='county',
    title='Most Popular Product by County',
    color_discrete_sequence=px.colors.qualitative.Dark24,
)

# High Times by County

In [13]:
# Number of distinct orders per hour of day, drawn as one line per county.
# order_id is aggregated with nunique() -- the same way the county ranking and
# the weekday/hour breakdown count orders -- because count() tallies rows: an
# order that occupies several rows of df would be counted once per row and
# inflate 'no_of_orders'.
px.line(
    df.groupby(['order_hour_of_day', 'county'], as_index=False)['order_id']
      .nunique()
      .rename(columns={'order_id': 'no_of_orders'}),
    x='order_hour_of_day',
    y='no_of_orders',
    color='county',
    title='Top Order Hours of the Day',
)

Top Order Hour by County

In [14]:
# Determine the most frequent order hour for each county (first value returned
# by mode() on ties), ordered by hour.
# NOTE(review): the mode is taken over the rows of df; if df holds several rows
# per order, larger orders weigh more heavily -- confirm this is intended.
top_hour = (
    df.groupby('county', as_index=False)
    .agg({'order_hour_of_day': lambda hours: hours.mode().iloc[0]})
    .sort_values(by='order_hour_of_day', ascending=True)
)

# Count how many counties share each top hour and plot that distribution
px.bar(
    top_hour.groupby('order_hour_of_day', as_index=False)['county']
            .count()
            .rename(columns={'county': 'no_of_counties'}),
    x='order_hour_of_day',
    y='no_of_counties',
    title='Most Popular Order Hour of Day by County',
    range_x=[0, 23],
)

High Times per County by Weekday and Hour of Day

In [22]:
# Distinct orders per county, weekday and hour of day, busiest combination first
high_time = (
    df.groupby(["county", "order_dow", "order_hour_of_day"], as_index=False)["order_id"]
    .nunique()
    .sort_values(by="order_id", ascending=False)
)

# Highest order count reached by each county (columns: county, order_id)
high_time_max = (
    high_time.groupby("county", as_index=False)["order_id"]
    .max()
    .sort_values(by="order_id", ascending=False)
)

# Join the per-county maximum back onto the detailed counts to recover the
# weekday/hour at which it occurs. A county whose maximum is reached in several
# slots yields several rows, which validate="one_to_many" permits.
px.sunburst(
    high_time_max.merge(
        high_time,
        how="left",
        on=["county", "order_id"],
        validate="one_to_many",
    ),
    path=['order_dow', 'order_hour_of_day', 'county'],
    values='order_id',
    title='Top order hour of day by weekday by county',
)