In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from datetime import datetime
import geopandas as gpd
from shapely.geometry import Point, Polygon

In [None]:
# Load the cleaned orders export into a DataFrame.
# NOTE(review): absolute, machine-specific path - the notebook will not run on
# another machine without editing it.
orders_clean = pd.read_csv('/Users/josh/Documents/Data Science/Apsey Farms/orders_clean.csv')

In [None]:
# Drop the leftover 'Unnamed: 0' column (presumably an index saved with the CSV).
# Re-assigning with errors='ignore' instead of inplace=True keeps the cell
# idempotent: re-running it once the column is gone no longer raises KeyError.
orders_clean = orders_clean.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
# Show up to 100 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 100)

In [None]:
# Preview the first rows of the loaded data.
orders_clean.head()

In [None]:
# Parse 'Created at' as UTC timestamps, then express them in US/Eastern time.
created_at_utc = pd.to_datetime(orders_clean['Created at'], utc=True)
orders_clean['Created at'] = created_at_utc.dt.tz_convert('US/Eastern')

## Understanding the Customer Base

In [None]:
# total number of unique customers
# nunique() ignores missing emails, which matches the groupby('Email') cells
# (groupby drops NaN keys); len(unique()) counted NaN as one extra customer.
orders_clean['Email'].nunique()

In [None]:
# recurring customers: emails with more than one distinct order.
# Rows are line items and 'Name' identifies an order, so counting rows per
# email would flag a single multi-item order as "recurring".
orders_clean.groupby('Email')['Name'].nunique().gt(1).sum()

In [None]:
# Total 'Subtotal' per customer email, largest spender first.
# Select 'Subtotal' before summing: summing the whole frame also tries to sum
# the datetime and text columns, which raises TypeError on pandas >= 2.0.
customer_order_totals = (
    orders_clean
    .groupby('Email')['Subtotal']
    .sum()
    .sort_values(ascending=False)
    .reset_index()
)
customer_order_totals

In [None]:
# Sanity check: the per-customer totals should add up to the overall subtotal.
# groupby('Email') drops rows with a missing email, so a gap between the two
# numbers would point to orders without an email.
print(customer_order_totals['Subtotal'].sum())
print(orders_clean['Subtotal'].sum())

In [None]:
# Grand total of 'Subtotal' across all customers (denominator for pct_of_total).
total_customer_dollars = customer_order_totals['Subtotal'].sum()
total_customer_dollars

In [None]:
# Each customer's share of total order dollars, stored as a fraction (not multiplied by 100).
customer_order_totals['pct_of_total'] = customer_order_totals['Subtotal'] / total_customer_dollars

In [None]:
# Preview the customers with the largest totals (frame is sorted descending).
customer_order_totals.head()

In [None]:
# Combined share of total order dollars from the 150 largest customers.
customer_order_totals['pct_of_total'].head(150).sum()

In [None]:
# Running share of total order dollars as customers are added, largest first.
cumulative_share = customer_order_totals['pct_of_total'].cumsum()
ax = cumulative_share.plot()
ax.set_title('Cumulative Sum of Order Amount ($)')
ax.set_xlabel('Number of Customers')
ax.set_ylabel('% of Total Order Amount ($)')

# Red dashed guides at 150 customers and at a cumulative share of 0.75.
guide_style = dict(color='r', linestyle='--')
ax.axvline(x=150, **guide_style)
ax.axhline(y=0.75, **guide_style);

In [None]:
# total number of orders = number of distinct order names (rows are line items)
orders_clean['Name'].nunique()

In [None]:
# recurring orders
# Frequency of each value in 'Tags'; dropna=False also counts rows with no tag.
# NOTE(review): presumably a tag marks subscription/recurring orders - confirm.
orders_clean['Tags'].value_counts(dropna=False)

## Customer Order Amount by State

In [None]:
# Total order amount per shipping state, with columns renamed to match the
# shapefile's 'state_name' key so the two can be merged.
# 'Subtotal' is selected before summing: summing the whole frame fails on the
# datetime/text columns with pandas >= 2.0. The chain avoids inplace mutation,
# so the cell is safe to re-run.
state_dollars = (
    orders_clean
    .groupby('Shipping Province Name')['Subtotal']
    .sum()
    .reset_index()
    .rename(columns={'Shipping Province Name': 'state_name', 'Subtotal': 'order_amount'})
)
state_dollars.sort_values('order_amount', ascending=False)

In [None]:
# Sanity check: per-state totals vs. the overall subtotal. groupby drops rows
# with a missing state, so any difference comes from orders without one.
print(orders_clean['Subtotal'].sum())
print(state_dollars['order_amount'].sum())

In [None]:
# Number of rows with no shipping state.
orders_clean['Shipping Province Name'].isnull().sum()

In [None]:
# Load the state shapefile as a GeoDataFrame.
# NOTE(review): absolute, machine-specific path - not portable as written.
usa = gpd.read_file('/Users/josh/Documents/Data Science/Apsey Farms/States 21basic/geo_export_99f25753-6a02-4b7a-b22f-2d3e41e2a010.shp')

In [None]:
# Preview the shapefile's attribute table.
usa.head()

In [None]:
# Quick look at every geometry in the shapefile.
usa.plot();

In [None]:
# remove Hawaii and Alaska
# Filter by state name instead of dropping row labels 0 and 50, which would
# silently remove the wrong states if the shapefile's row order changed.
# NOTE(review): assumes 'state_name' holds full names spelled 'Hawaii' and
# 'Alaska' - confirm against usa.head().
state_map = usa[~usa['state_name'].isin(['Hawaii', 'Alaska'])]
state_map.plot();

In [None]:
# Attach order totals to every mapped state; states without orders get 0.
state_map_dollars = (
    state_map
    .merge(state_dollars, how='left', on='state_name')
    .assign(order_amount=lambda merged: merged['order_amount'].fillna(0))
)
state_map_dollars.head()

In [None]:
# Split the states that have orders into three revenue tiers
# (>= $5,000, $1,000 up to but not including $5,000, below $1,000).
high_dollar_states = list(state_dollars[state_dollars['order_amount']>=5000]['state_name'])
medium_dollar_state = list(state_dollars[(state_dollars['order_amount']>=1000) & (state_dollars['order_amount']<5000)]['state_name'])
low_dollar_states = list(state_dollars[state_dollars['order_amount']<1000]['state_name'])

fig, ax = plt.subplots(figsize=(12,12))
# Faint base layer so states without orders are still outlined.
state_map_dollars.plot(ax=ax, edgecolor='b', alpha=0.1)

# One plot call per tier instead of one per state. Empty selections are
# skipped: state_dollars can hold names that are not on the map (Hawaii and
# Alaska were removed from state_map), and plotting an empty GeoDataFrame
# only produces a warning.
tier_styles = [
    (high_dollar_states, 'darkred'),
    (medium_dollar_state, 'lightcoral'),
    (low_dollar_states, 'mistyrose'),
]
for tier_states, tier_color in tier_styles:
    tier_shapes = state_map_dollars[state_map_dollars['state_name'].isin(tier_states)]
    if not tier_shapes.empty:
        tier_shapes.plot(ax=ax, color=tier_color, edgecolor='b', linewidth=1)

ax.set(title='Total Order Amount ($) by State');

In [None]:
# which states order the most of each enterprise
# The weight columns are selected before summing; summing every column first
# fails on the datetime/text columns with pandas >= 2.0.
enterprise_weight_cols = ['item_weight_beef', 'item_weight_pork',
                          'item_weight_chicken', 'item_weight_turkey']
(
    orders_clean
    .groupby('Shipping Province Name')[enterprise_weight_cols]
    .sum()
    .sort_values('item_weight_beef', ascending=False)
)

## Orders by Product Type

In [None]:
# Share of line items by product type, largest slice first.
product_type_counts = (
    orders_clean
    .groupby('product_type')
    .size()
    .sort_values(ascending=False)
)
slice_colors = ['cornflowerblue', 'mediumseagreen', 'coral']
product_type_counts.plot.pie(autopct='%.1f%%', colors=slice_colors)
plt.title('Product Type % of \nTotal Number of Line Items Ordered')
plt.ylabel('');

In [None]:
# Share of order dollars by product type, largest slice first.
# 'Subtotal' is selected before summing (whole-frame sum breaks on pandas >= 2.0).
product_type_dollars = (
    orders_clean
    .groupby('product_type')['Subtotal']
    .sum()
    .sort_values(ascending=False)
)
product_type_dollars.plot.pie(autopct='%.1f%%',
                              colors=['mediumseagreen', 'cornflowerblue', 'coral'])
plt.title('Product Type % of Total Order Amount ($)')
plt.ylabel('');

In [None]:
# yearly $ by product type
# Rows are order years, columns are product types. 'Subtotal' is selected
# before summing (whole-frame sum fails on pandas >= 2.0), and unstack() on the
# (year, product_type) Series gives the final layout directly, without the
# reset_index/set_index round trip.
annual_product_amt = (
    orders_clean
    .groupby([orders_clean['Created at'].dt.year, 'product_type'])['Subtotal']
    .sum()
    .unstack()
)
annual_product_amt

In [None]:
# Stacked bars: one bar per year, one segment per product type.
product_type_colors = ['coral', 'mediumseagreen', 'cornflowerblue']
ax = annual_product_amt.plot.bar(stacked=True, rot=0, color=product_type_colors)
ax.set_xlabel('')
ax.set_ylabel('Order Amount ($)')
ax.set_title('Annual Order Amount ($) by Product Type');

## Orders by Enterprise

In [None]:
# Share of line items by enterprise, largest slice first.
enterprise_counts = (
    orders_clean
    .groupby('enterprise')
    .size()
    .sort_values(ascending=False)
)
enterprise_count_colors = [
    'mediumseagreen', 'cornflowerblue', 'coral', 'plum',
    'papayawhip', 'lightgray', 'lightsalmon', 'gold',
]
enterprise_counts.plot.pie(colors=enterprise_count_colors,
                           autopct='%.1f%%',
                           figsize=(6, 6))
plt.title('Enterprise % of \nTotal Number of Line Items Ordered')
plt.ylabel('');

In [None]:
# Share of order dollars by enterprise, largest slice first.
# 'Subtotal' is selected before summing (whole-frame sum breaks on pandas >= 2.0).
enterprise_dollars = (
    orders_clean
    .groupby('enterprise')['Subtotal']
    .sum()
    .sort_values(ascending=False)
)
enterprise_dollars.plot.pie(colors=['mediumseagreen',
                                    'papayawhip',
                                    'plum',
                                    'lightgray',
                                    'coral',
                                    'cornflowerblue',
                                    'gold',
                                    'lightsalmon'],
                            autopct='%.1f%%',
                            figsize=(6, 6))
plt.title('Enterprise % of Total Order Amount ($)')
plt.ylabel('');

In [None]:
# Yearly order dollars by enterprise: rows are order years, columns are enterprises.
# 'Subtotal' is selected before summing (whole-frame sum fails on pandas >= 2.0);
# unstack() on the (year, enterprise) Series gives the final layout directly.
annual_enterprise_amt = (
    orders_clean
    .groupby([orders_clean['Created at'].dt.year, 'enterprise'])['Subtotal']
    .sum()
    .unstack()
)
annual_enterprise_amt

In [None]:
# Stacked bars: one bar per year, one segment per enterprise.
enterprise_colors = ['mediumseagreen', 'plum', 'lightgray', 'papayawhip',
                     'coral', 'lightsalmon', 'cornflowerblue', 'gold']
ax = annual_enterprise_amt.plot.bar(stacked=True, rot=0, color=enterprise_colors)
ax.set_xlabel('')
ax.set_ylabel('Order Amount ($)')
ax.set_title('Annual Order Amount ($) by Enterprise')
# Anchor the legend at the axes' upper-right corner.
ax.legend(bbox_to_anchor=(1, 1));

## Orders by Enterprise-Product Type Combination

In [None]:
# Line-item count for each enterprise / product-type pair, smallest first so
# the largest bar ends up at the top of the horizontal chart.
combo_counts = orders_clean.groupby(['enterprise', 'product_type']).size()
ax = combo_counts.sort_values().plot.barh()
ax.set_xlabel('Number of Line Items')
ax.set_ylabel('')
ax.set_title('Total Number of Line Items by Enterprise & Product Type');

In [None]:
# order amounts per enterprise / product-type pair
# 'Subtotal' is selected before summing (whole-frame sum breaks on pandas >= 2.0).
combo_dollars = (
    orders_clean
    .groupby(['enterprise', 'product_type'])['Subtotal']
    .sum()
    .sort_values()
)
ax = combo_dollars.plot.barh()
ax.set(xlabel='Order Amount ($)', ylabel='', title='Total Order Amount by Enterprise & Product Type');

Create two separate DataFrames, then merge them into one: 1) the total number of line items and its % of total for each enterprise-product combination, and 2) the total order amount ($) and its % of total for each enterprise-product combination.

In [None]:
# order amount by enterprise-product combinations, largest first
# 'Subtotal' is selected before summing (whole-frame sum breaks on pandas >= 2.0).
order_combs_dollar = (
    orders_clean
    .groupby(['enterprise', 'product_type'])['Subtotal']
    .sum()
    .sort_values(ascending=False)
)
order_combs_dollar_df = pd.DataFrame(order_combs_dollar)
order_combs_dollar_df

In [None]:
# Collapse the (enterprise, product_type) index into a single tuple-valued key
# column and fall back to a plain integer index.
# The frame is rebuilt from the order_combs_dollar Series each time, so the cell
# can be re-run safely; the previous inplace reset_index/drop chain raised
# KeyError on a second run.
order_combs_dollar_df = order_combs_dollar.to_frame()
order_combs_dollar_df['enterprise_product_type'] = order_combs_dollar_df.index
order_combs_dollar_df = order_combs_dollar_df.reset_index(drop=True)
order_combs_dollar_df

In [None]:
# total $ for all orders (sum of 'Subtotal' over every row)
total_orders_amt = orders_clean['Subtotal'].sum()

# add each combination's % of total order dollars to order_combs_dollar_df (0-100 scale)
order_combs_dollar_df['$_pct_of_total'] = (order_combs_dollar_df['Subtotal'] / total_orders_amt) * 100

In [None]:
# Line-item count per enterprise-product combination, largest first.
order_combs_num = (
    orders_clean
    .groupby(['enterprise', 'product_type'])
    .size()
    .sort_values(ascending=False)
)
# The Series is unnamed, so the resulting frame's single column is labelled 0.
order_combs_num_df = order_combs_num.to_frame()
order_combs_num_df

In [None]:
# Collapse the (enterprise, product_type) index into a single tuple-valued key
# column, name the count column, and fall back to a plain integer index.
# The frame is rebuilt from the order_combs_num Series each time, so the cell
# can be re-run safely; the previous inplace reset_index/drop/rename chain
# raised KeyError on a second run.
order_combs_num_df = order_combs_num.to_frame('num_line_items')
order_combs_num_df['enterprise_product_type'] = order_combs_num_df.index
order_combs_num_df = order_combs_num_df.reset_index(drop=True)
order_combs_num_df

In [None]:
# total number of line items (rows in orders_clean) - a count, not a dollar amount
total_orders_num = len(orders_clean)

# add each combination's % of all line items to order_combs_num_df (0-100 scale)
order_combs_num_df['#_pct_of_total'] = (order_combs_num_df['num_line_items'] / total_orders_num) * 100

In [None]:
# Display the line-item counts with their % of total.
order_combs_num_df

In [None]:
# Combine the count table and the dollar table on the shared combination key,
# keeping every row of the count table.
order_combs_final = order_combs_num_df.merge(
    order_combs_dollar_df,
    how='left',
    on='enterprise_product_type',
)
order_combs_final

## Drill Down: Products Ordered - All Products

In [None]:
# Top 20 products by number of line items (quantity and weight are ignored).
# Re-sorted ascending so the most frequent product is drawn at the top.
top_20_line_items = orders_clean['item_name'].value_counts().head(20).sort_values()
ax = top_20_line_items.plot.barh(figsize=(6, 6), title='Top 20 Products Ordered')
ax.set_xlabel('Number of Line Items');

In [None]:
# Top 20 products by order dollars, re-sorted ascending so the largest bar is on top.
# 'Subtotal' is selected before summing (whole-frame sum breaks on pandas >= 2.0).
top_20_dollars = (
    orders_clean
    .groupby('item_name')['Subtotal']
    .sum()
    .sort_values(ascending=False)
    .head(20)
    .sort_values()
)
ax = top_20_dollars.plot.barh(figsize=(6, 6))
ax.set(title='Top 20 Products Ordered by Amount ($): Jan 2018 - July 2021', xlabel='Amount ($)',ylabel='');

In [None]:
# names of the 5 products with the highest total order dollars
# 'Subtotal' is selected before summing (whole-frame sum breaks on pandas >= 2.0).
top_5_products = list(
    orders_clean
    .groupby('item_name')['Subtotal']
    .sum()
    .sort_values(ascending=False)
    .head(5)
    .index
)
top_5_products

In [None]:
# Combined revenue of the top 5 products for each 'order_month'.
# 'Subtotal' is selected before summing (whole-frame sum breaks on pandas >= 2.0).
top_5_monthly = (
    orders_clean[orders_clean['item_name'].isin(top_5_products)]
    .groupby('order_month')['Subtotal']
    .sum()
)
ax = top_5_monthly.plot()
ax.set(title='Monthly Revenue for Top 5 Revenue-Generating Products', xlabel='Month of Order', ylabel='Amount ($)');

In [None]:
# zoom in on 2020 and 2021
# 'Subtotal' is selected before summing (whole-frame sum breaks on pandas >= 2.0).
is_top_5 = orders_clean['item_name'].isin(top_5_products)
is_recent = orders_clean['Created at'].dt.year.isin([2020, 2021])
top_5_monthly_recent = (
    orders_clean[is_top_5 & is_recent]
    .groupby('order_month')['Subtotal']
    .sum()
)
ax = top_5_monthly_recent.plot.bar()
ax.set(title='Monthly Revenue for Top 5 Revenue-Generating Products', xlabel='Month of Order', ylabel='Amount ($)');

Find the average revenue for each calendar month (January through December), averaged over the years covered by the data.

In [None]:
# Total 'Subtotal' per calendar month (index 1-12), pooled across all years.
# 'Subtotal' is selected before summing (whole-frame sum breaks on pandas >= 2.0).
monthly_stats = (
    orders_clean
    .groupby(orders_clean['Created at'].dt.month)[['Subtotal']]
    .sum()
)
# Divide by 4 calendar years as a first pass; the following cell adjusts the
# months that only have 3 years of data.
monthly_stats['avg_revenue'] = monthly_stats['Subtotal'] / 4
monthly_stats

In [None]:
# we only have data through 7/2021, so avg_revenue should be 'Subtotal'/ 3 years for months 8-12
# The averages are derived from monthly_stats itself rather than from subtotals
# pasted in by hand, so they stay correct whenever the underlying data changes.
late_months = monthly_stats.index >= 8
monthly_stats.loc[late_months, 'avg_revenue'] = monthly_stats.loc[late_months, 'Subtotal'] / 3
monthly_stats

In [None]:
# Average revenue per calendar month; x-axis runs from 1 (Jan) to 12 (Dec).
avg_by_month = monthly_stats['avg_revenue']
ax = avg_by_month.plot.bar(rot=0)
ax.set_title('Average Monthly Revenue from All Product Orders')
ax.set_xlabel('Month')
ax.set_ylabel('Average Amount ($)');

## Drill Down: Products Ordered - Single Items

In [None]:
# top 10 single items by $
# .copy() makes single_item_orders an independent frame, so adding columns to
# it later does not trigger SettingWithCopyWarning.
single_item_orders = orders_clean[orders_clean['product_type']=='Single item'].copy()
# 'Subtotal' is selected before summing (whole-frame sum breaks on pandas >= 2.0).
top_10_single = (
    single_item_orders
    .groupby('item_name')['Subtotal']
    .sum()
    .sort_values(ascending=False)
    .head(10)
    .sort_values()
)
ax = top_10_single.plot.barh()
ax.set(title='Top 10 Single Items Ordered by Amount ($): \nJan 2018 - July 2021', xlabel='Amount ($)',ylabel='');

In [None]:
# Single-item line items placed in 2020 or 2021.
is_single_item = orders_clean['product_type'] == 'Single item'
is_recent = orders_clean['Created at'].dt.year.isin([2020, 2021])
single_item_orders_recent = orders_clean[is_single_item & is_recent]
# 'Subtotal' is selected before summing (whole-frame sum breaks on pandas >= 2.0).
top_10_single_recent = (
    single_item_orders_recent
    .groupby('item_name')['Subtotal']
    .sum()
    .sort_values(ascending=False)
    .head(10)
    .sort_values()
)
ax = top_10_single_recent.plot.barh()
ax.set(title='Top 10 Single Items Ordered by Amount ($): \nJan 2020 - July 2021', xlabel='Amount ($)',ylabel='');

In [None]:
# calculate price per pound for single items
# assign() returns a new frame instead of writing into a filtered slice of
# orders_clean, which avoids SettingWithCopyWarning and keeps the cell re-runnable.
# NOTE(review): rows with a zero or missing total_item_weight yield inf/NaN - confirm none exist.
single_item_orders = single_item_orders.assign(
    price_per_pound=single_item_orders['Lineitem price'] / single_item_orders['total_item_weight']
)

In [None]:
# 10 single items with the highest average price per pound.
# The column is selected before averaging: mean() over the whole frame fails on
# the text/datetime columns with pandas >= 2.0.
(
    single_item_orders
    .groupby('item_name')['price_per_pound']
    .mean()
    .sort_values(ascending=False)
    .head(10)
)

In [None]:
# Inspect every single-item row for 'Beef - Hanger Steak'.
single_item_orders[single_item_orders['item_name']=='Beef - Hanger Steak']

In [None]:
# top single items by weight - if uneven, might indicate not utilizing full carcass
# The weight column is selected before summing (whole-frame sum breaks on pandas >= 2.0).
top_10_by_weight = (
    single_item_orders
    .groupby('item_name')['total_item_weight']
    .sum()
    .sort_values(ascending=False)
    .head(10)
    .sort_values()
)
ax = top_10_by_weight.plot.barh()
# The x-axis shows pounds, not dollars; the label previously read 'Amount ($)'.
ax.set(title='Top 10 Single Items Ordered by Weight (lbs): \nJan 2018 - July 2021', xlabel='Weight (lbs)',ylabel='');

## Drill Down: Products Ordered - Bundles

In [None]:
# top 10 bundles by $
bundle_orders = orders_clean[orders_clean['product_type']=='Bundle']
# 'Subtotal' is selected before summing (whole-frame sum breaks on pandas >= 2.0).
top_10_bundles = (
    bundle_orders
    .groupby('item_name')['Subtotal']
    .sum()
    .sort_values(ascending=False)
    .head(10)
    .sort_values()
)
ax = top_10_bundles.plot.barh()
ax.set(title='Top 10 Bundles Ordered by Amount ($): \nJan 2018 - July 2021', xlabel='Amount ($)',ylabel='');