In [1]:
import pandas as pd
import numpy as np
import plotly.express as px

In [2]:
df = pd.read_parquet('Shoppingcarts.parquet')

df

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,order_dow,order_hour_of_day,days_since_prior_order,product_name,aisle_id,department_id,department,aisle,county
0,1,17704,3,1,205970,5,17,12.0,Lemons,123,4,produce,packaged vegetables fruits,Ventura
1,1,17461,7,1,205970,5,17,12.0,Air Chilled Organic Boneless Skinless Chicken ...,35,12,meat seafood,poultry counter,Ventura
2,1,46667,6,1,205970,5,17,12.0,Organic Ginger Root,83,4,produce,fresh vegetables,Ventura
3,1,17668,5,1,205970,5,17,12.0,Unsweetened Chocolate Almond Breeze Almond Milk,91,16,dairy eggs,soy lactosefree,Ventura
4,1,33754,1,1,205970,5,17,12.0,Total 2% with Strawberry Lowfat Greek Strained...,120,16,dairy eggs,yogurt,Ventura
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6145743,604939,41950,4,0,52726,1,11,2.0,Organic Tomato Cluster,83,4,produce,fresh vegetables,Madera
6145744,604939,38061,9,0,52726,1,11,2.0,Organic Raw Coconut Butter,36,16,dairy eggs,butter,Madera
6145745,604939,10667,8,0,52726,1,11,2.0,Imported Butter,36,16,dairy eggs,butter,Madera
6145746,604939,18811,3,1,52726,1,11,2.0,Organic Apple Juice,98,7,beverages,juice nectars,Madera


# County Analysis

In [3]:
top_counties = pd.DataFrame(df.groupby('county', as_index=False)['order_id'].nunique().sort_values(by= 'order_id', ascending=False))

top_counties.columns = ['county', 'number_of_orders']

top_counties

Unnamed: 0,county,number_of_orders
4,Calaveras,45045
9,Fresno,35263
29,Orange,33902
10,Glenn,33517
14,Kern,31088
50,Sutter,25452
21,Mariposa,23973
55,Ventura,23962
26,Monterey,23898
43,Santa Cruz,19711


# Most Popular Department by County

In [7]:
top_department = df.groupby('county', as_index=False).agg({'department' : lambda x: x.mode().iat[0]}).sort_values(by = 'department')

top_department

Unnamed: 0,county,department
3,Butte,alcohol
35,San Bernardino,beverages
51,Tehama,dairy eggs
45,Sierra,dairy eggs
0,Alameda,produce
31,Plumas,produce
32,Riverside,produce
33,Sacramento,produce
34,San Benito,produce
36,San Diego,produce


In [9]:
px.pie(top_department, names='department', color='county', title='Most Popular Department by County')

# Most Popular Aisle by County

In [10]:
top_aisle = df.groupby('county', as_index=False).agg({'aisle' : lambda x: x.mode().iat[0]}).sort_values(by = 'aisle')

top_aisle

Unnamed: 0,county,aisle
3,Butte,dog food care
0,Alameda,fresh fruits
26,Monterey,fresh fruits
27,Napa,fresh fruits
32,Riverside,fresh fruits
33,Sacramento,fresh fruits
34,San Benito,fresh fruits
36,San Diego,fresh fruits
39,San Luis Obispo,fresh fruits
41,Santa Barbara,fresh fruits


In [12]:
px.pie(top_aisle, names= 'aisle', color='county', title= 'Most Popular Aisle by County')

# Most Popular Product by County

In [13]:
top_product = df.groupby('county', as_index=False).agg({'product_name' : lambda x: x.mode().iat[0]}).sort_values(by= 'product_name')

top_product

Unnamed: 0,county,product_name
15,Kings,Bag of Organic Bananas
45,Sierra,Bag of Organic Bananas
40,San Mateo,Bag of Organic Bananas
7,Del Norte,Bag of Organic Bananas
25,Mono,Bag of Organic Bananas
8,El Dorado,Bag of Organic Bananas
27,Napa,Bag of Organic Bananas
54,Tuolumne,Bag of Organic Bananas
33,Sacramento,Bag of Organic Bananas
5,Colusa,Bag of Organic Bananas


In [14]:
px.pie(top_product, names= 'product_name', color= 'county', title= 'Most Popular Product by County', color_discrete_sequence=px.colors.qualitative.Dark24)

# High Times by County

In [15]:
top_order_county = df.groupby(['order_hour_of_day', 'county'], as_index=False)['order_id'].count()
top_order_county.rename(columns= {'order_id' : 'no_of_orders'}, inplace= True)

top_order_county

Unnamed: 0,order_hour_of_day,county,no_of_orders
0,0,Alameda,137
1,0,Alpine,957
2,0,Amador,692
3,0,Calaveras,3231
4,0,Colusa,221
...,...,...,...
1341,23,Tulare,267
1342,23,Tuolumne,258
1343,23,Ventura,3775
1344,23,Yolo,928


In [16]:
px.line(top_order_county, x='order_hour_of_day', y='no_of_orders', color= 'county', title= 'Top Order Hours of the Day')

In [18]:
top_hour = df.groupby('county', as_index=False).agg({'order_hour_of_day' : lambda x: x.mode().iat[0]}).sort_values(by = 'order_hour_of_day', ascending= True)

top_hour

Unnamed: 0,county,order_hour_of_day
0,Alameda,9
54,Tuolumne,10
48,Sonoma,10
47,Solano,10
45,Sierra,10
41,Santa Barbara,10
40,San Mateo,10
35,San Bernardino,10
33,Sacramento,10
21,Mariposa,10


No. of Counties that have each hour as their top hour

In [19]:
counties_top_hour = top_hour.groupby('order_hour_of_day', as_index= False)['county'].count()
counties_top_hour.rename(columns={'county' : 'no_of_counties'}, inplace=True)

counties_top_hour

Unnamed: 0,order_hour_of_day,no_of_counties
0,9,1
1,10,20
2,11,9
3,12,4
4,13,8
5,14,8
6,15,6
7,16,1
8,18,1


In [20]:
px.bar(counties_top_hour, x= 'order_hour_of_day', y= 'no_of_counties', title= 'Most Popular Order Hour of Day by County', range_x = [0, 23])

High Times per County (Lena -> Sunburst)

In [23]:
df12=df.groupby(["order_dow","order_hour_of_day","county","order_id"],as_index=False)["product_id"].count()
df12

Unnamed: 0,order_dow,order_hour_of_day,county,order_id,product_id
0,0,0,Alameda,7165,6
1,0,0,Alameda,39544,8
2,0,0,Alameda,326688,2
3,0,0,Alameda,356832,3
4,0,0,Alameda,389134,2
...,...,...,...,...,...
604934,6,23,Yuba,523411,10
604935,6,23,Yuba,549655,7
604936,6,23,Yuba,563403,8
604937,6,23,Yuba,564392,4


In [24]:
df13 = df12.groupby(["county","order_dow","order_hour_of_day"],as_index=False)["order_id"].count().sort_values(by="order_id",ascending=False)
df13

Unnamed: 0,county,order_dow,order_hour_of_day,order_id
499,Calaveras,0,12,805
501,Calaveras,0,14,803
500,Calaveras,0,13,790
502,Calaveras,0,15,773
498,Calaveras,0,11,748
...,...,...,...,...
5385,San Bernardino,5,13,1
5273,San Benito,4,3,1
6008,San Mateo,0,5,1
5386,San Bernardino,5,15,1


In [25]:
df13_max = df13.groupby(["county"],as_index=False).max().sort_values(by="order_id",ascending=False)
df13_max = df13_max.drop(columns=["order_dow","order_hour_of_day"],axis=1)
df13_max

Unnamed: 0,county,order_id
4,Calaveras,805
9,Fresno,653
14,Kern,646
10,Glenn,584
29,Orange,563
57,Yuba,450
50,Sutter,448
26,Monterey,426
55,Ventura,413
21,Mariposa,391


In [26]:
df14=pd.merge(df13_max,df13, how="left", on=["county","order_id"],validate="one_to_many")

df14

Unnamed: 0,county,order_id,order_dow,order_hour_of_day
0,Calaveras,805,0,12
1,Fresno,653,1,10
2,Kern,646,1,10
3,Glenn,584,0,14
4,Orange,563,0,13
...,...,...,...,...
58,Trinity,6,6,15
59,Placer,3,1,10
60,Butte,1,5,8
61,Butte,1,6,16


In [27]:
px.sunburst(df14, path=['order_dow', 'order_hour_of_day', 'county'], values='order_id', title= 'Sunburst')

# Top 10 Counties

In [4]:
top_10_counties = top_counties.head(10)

px.bar(top_10_counties, x='county', y='number_of_orders', title= 'Top 10 Counties')

In [5]:
px.pie(top_10_counties, values = 'number_of_orders', names = 'county', title = 'Top 10 Counties')

Anteil an Gesamtbestellungen

In [29]:
top_10_percentage = (top_10_counties.number_of_orders.sum()/df.order_id.nunique()*100).round(2)

print('Die Top 10 Counties machen', top_10_percentage, '% der Gesamtbestellungen aus.') 

Die Top 10 Counties machen 48.9 % der Gesamtbestellungen aus.


# Weakest 4 Counties

In [6]:
weakest_4_counties = top_counties.tail(4)

px.bar(weakest_4_counties, x= 'county', y= 'number_of_orders', title= 'Weakest 4 Counties')