Re-running and consolidating analyses 3.0, 3.1, and customer metrics on original data (through 7/22/21) but using orders_clean_v2 (from cleaning & prep v2.1) and using 'total_item_price' rather than 'Subtotal' for product analysis. 

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from datetime import datetime
import geopandas as gpd
from shapely.geometry import Point, Polygon
from matplotlib.ticker import FormatStrFormatter, StrMethodFormatter
from statsmodels.tsa.filters.hp_filter import hpfilter
import lifetimes

In [None]:
pd.options.display.max_columns = 100

In [None]:
# Read the cleaned orders export and discard the 'Unnamed: 0' column
# (presumably the index written out when the CSV was saved).
orders_clean = pd.read_csv(
    '/Users/josh/Documents/Data Science/Apsey Farms/orders_clean_v2.csv'
).drop(columns='Unnamed: 0')

In [None]:
orders_clean.head()

In [None]:
# convert 'Created at' to datetime
orders_clean['Created at'] = pd.to_datetime(orders_clean['Created at'], utc=True).dt.tz_convert('US/Eastern')

In [None]:
# create copy of orders data and convert to time series
orders_time_series = orders_clean.copy()
orders_time_series = orders_time_series.set_index('Created at')

In [None]:
orders_time_series.head()

In [None]:
# Measures that are totalled in each calendar rollup.
_rollup_cols = ['Subtotal', 'total_item_weight', 'item_weight_beef',
                'item_weight_pork', 'item_weight_chicken',
                'item_weight_turkey', 'total_item_price', 'dollars_beef',
                'dollars_pork', 'dollars_chicken',
                'dollars_turkey', 'dollars_eggs']
_rollup_measures = orders_time_series[_rollup_cols]

# Totals per month-end, quarter-end and year-end bin.
orders_time_month = _rollup_measures.resample(rule='M').sum()

orders_time_quarter = _rollup_measures.resample(rule='Q').sum()

orders_time_year = _rollup_measures.resample(rule='Y').sum()

# Customer Analysis

## Understanding the Customer Base

In [None]:
# total number of unique customers
orders_clean['Email'].nunique()

In [None]:
# recurring customers: emails with more than one distinct order.
# Rows are line items, so count unique order 'Name's per email rather than
# rows -- a single multi-item order must not make a customer "recurring".
(orders_clean.groupby('Email')['Name'].nunique() > 1).sum()

In [None]:
# Total Subtotal per customer email, biggest spender first.
# 'Subtotal' is selected before summing so that non-numeric columns (such as
# the datetime 'Created at') are never handed to sum().
customer_order_totals = (
    orders_clean.groupby('Email')['Subtotal']
    .sum()
    .sort_values(ascending=False)
    .reset_index()
)
customer_order_totals

In [None]:
print(customer_order_totals['Subtotal'].sum())
print(orders_clean['Subtotal'].sum())

In [None]:
total_customer_dollars = customer_order_totals['Subtotal'].sum()
total_customer_dollars

In [None]:
customer_order_totals['pct_of_total'] = customer_order_totals['Subtotal'] / total_customer_dollars

In [None]:
customer_order_totals.head()

In [None]:
customer_order_totals['pct_of_total'].head(150).sum()

In [None]:
# plot running total/cumulative sum
ax = customer_order_totals['pct_of_total'].cumsum().plot()
ax.set(title='Cumulative Sum of Order Amount ($)', xlabel='Number of Customers', ylabel='% of Total Order Amount ($)')
ax.axvline(x=150, color='r', linestyle='--')
ax.axhline(y=0.75, color='r', linestyle='--');

In [None]:
# total number of orders = number of distinct order names
orders_clean['Name'].nunique()

In [None]:
# recurring orders
orders_clean['Tags'].value_counts(dropna=False)

## Avg Order Value (AOV)
AOV = Revenue / Number of Orders

## AOV for the business over time

In [None]:
monthly_aov = orders_time_series.fillna(value={'Subtotal': 0})

In [None]:
# Roll the rows up into month-end bins: distinct orders and summed Subtotal.
# Built-in 'nunique'/'sum' aggregations run vectorised and name the output
# columns directly, replacing the per-group Python lambdas.
monthly_aov = monthly_aov.groupby(pd.Grouper(freq='M')).agg(
    num_orders=('Name', 'nunique'),
    total_revenue=('Subtotal', 'sum'),
)
monthly_aov.head()

In [None]:
monthly_aov['avg_order_value'] = monthly_aov['total_revenue']/monthly_aov['num_orders']

In [None]:
monthly_aov.head()

In [None]:
# Monthly AOV
ax = monthly_aov['avg_order_value'].plot()
ax.set(xlabel='', ylabel='Average Order Value ($)', title='Average Order Value by Month');

In [None]:
print(monthly_aov.sort_values('avg_order_value', ascending=False).head(1))
print(monthly_aov.sort_values('avg_order_value', ascending=False).tail(1))

In [None]:
# Cumulative AOV Over Time - all revenue to date divided by all orders to date.
# An expanding mean of the monthly AOVs would weight a slow month the same as
# a busy one and would not equal Revenue / Number of Orders.
cumulative_aov = monthly_aov['total_revenue'].cumsum() / monthly_aov['num_orders'].cumsum()
ax = cumulative_aov.plot()
ax.set(xlabel='', ylabel='Average Order Value ($)', title='Cumulative Average Order Value Over Time');

In [None]:
# moving/rolling monthly average
monthly_aov['avg_order_value'].rolling(window=2).mean().plot()

## AOV per Customer
AOV = Revenue / Number of Orders

In [None]:
# need to first replace all NaNs in Subtotal column with 0 so total_order_dollars doesn't show up as NaN
orders_by_customer = orders_clean.fillna(value={'Subtotal': 0})

In [None]:
# transform data to customer level: one row per email holding
#   age                 - days between the first and the latest order
#   num_orders          - number of distinct order names
#   total_order_dollars - summed Subtotal
# Named aggregation sets the column names directly and uses the built-in
# reducers instead of Python-level lambdas where possible.
orders_by_customer = orders_by_customer.groupby('Email').agg(
    age=('Created at', lambda x: (x.max() - x.min()).days),
    num_orders=('Name', 'nunique'),
    total_order_dollars=('Subtotal', 'sum'),
)
orders_by_customer

In [None]:
orders_by_customer['avg_order_value'] = orders_by_customer['total_order_dollars'] / orders_by_customer['num_orders']

In [None]:
ax = orders_by_customer['avg_order_value'].plot.hist(bins=100)
ax.set(xlabel='Average Order Value', ylabel='Frequency', title='Customer Distribution of Average Order Value')
ax.xaxis.set_major_formatter(StrMethodFormatter('{x:,.0f}'));

In [None]:
pd.DataFrame(orders_by_customer['avg_order_value'].describe()).round(2)

In [None]:
# top 10 customers by AOV
orders_by_customer.sort_values('avg_order_value', ascending=False).head(10)

In [None]:
# average value of total orders
round(orders_by_customer['total_order_dollars'].mean(),2)

In [None]:
# average total number of orders
round(orders_by_customer['num_orders'].mean(),2)

In [None]:
# average length of customer relationship (in days) - time between customers first and most recent order
round(orders_by_customer['age'].mean(),2)

## Customer Lifetime Value (CLV)
CLV = ((Avg Sales * Purchase Frequency) / Churn) * Profit Margin

where:
 * Avg Sales = Total Sales / Total Number of Orders
 * Purchase Frequency = Total Number of Orders / Total Unique Customers
 * Retention Rate = Number of Customers with More Than 1 Order / Total Unique Customers
 * Churn = 1 - Retention Rate
 * Profit Margin = based on business context

In [None]:
# Inputs to the simple CLV formula, all taken from the per-customer table.
avg_order_freq = orders_by_customer['num_orders'].mean()
# retention = share of customers with more than one order
repeat_customer_count = orders_by_customer[orders_by_customer['num_orders'] > 1].shape[0]
all_customer_count = orders_by_customer.shape[0]
retention_rate = repeat_customer_count / all_customer_count
churn = 1 - retention_rate

## CLV by Customer

In [None]:
orders_by_customer['lifetime_value'] = (orders_by_customer['avg_order_value']*avg_order_freq)/churn

In [None]:
orders_by_customer.head()

In [None]:
orders_by_customer['lifetime_value'].mean()

In [None]:
orders_by_customer['lifetime_value'].plot.hist(bins=100)

Need a more complex model that takes into account whether each customer is alive (active) or not:
 * If customer is still alive, CLV = (avg_order_value * avg_order_freq)/churn  
 * If customer is not alive, CLV = total_order_dollars
 
Let's define alive as having placed an order within the last 120 days.

In [None]:
lifetime_summary = lifetimes.utils.summary_data_from_transaction_data(orders_clean, 'Email', 'Created at', 'Subtotal')
lifetime_summary = lifetime_summary.reset_index()
lifetime_summary.head()

In [None]:
lifetime_summary['days_since_last_order'] = lifetime_summary['T'] - lifetime_summary['recency']

In [None]:
lifetime_summary

In [None]:
# fit the BG/NBD model to our summary data
bgf = lifetimes.BetaGeoFitter(penalizer_coef=0.0)
bgf.fit(lifetime_summary['frequency'], lifetime_summary['recency'], lifetime_summary['T'])

In [None]:
lifetime_summary['probability_alive'] = bgf.conditional_probability_alive(lifetime_summary['frequency'],
                                                                          lifetime_summary['recency'],
                                                                          lifetime_summary['T'])

In [None]:
def alive_or_not(value, threshold_days=120):
    """Return True when a customer's last order is recent enough to be active.

    Parameters
    ----------
    value : number
        Days since the customer's most recent order.
    threshold_days : number, default 120
        Exclusive cut-off: the customer is alive only when ``value`` is
        strictly below it.

    Returns
    -------
    bool
        ``True`` if ``value < threshold_days``, otherwise ``False``. NaN
        never compares as less-than, so a missing value yields ``False``.
    """
    return bool(value < threshold_days)

In [None]:
# Flag each customer as alive or not from the days since their last order.
days_idle = lifetime_summary['days_since_last_order']
lifetime_summary['alive'] = days_idle.map(alive_or_not)

In [None]:
lifetime_summary

In [None]:
orders_by_customer_2 = orders_by_customer.copy()
orders_by_customer_2 = orders_by_customer_2.reset_index()
orders_by_customer_2.head()

In [None]:
orders_by_customer_2 = pd.merge(left=orders_by_customer_2, right=lifetime_summary, how='left', on='Email')

In [None]:
# Customers flagged alive keep the projected lifetime value; everyone else
# (not alive, or without a flag after the left merge) is valued at what they
# have already spent.
is_alive = orders_by_customer_2['alive'] == True
projected_value = orders_by_customer_2['lifetime_value']
realised_value = orders_by_customer_2['total_order_dollars']
orders_by_customer_2['clv'] = np.where(is_alive, projected_value, realised_value)

In [None]:
orders_by_customer_2

In [None]:
orders_by_customer_2['clv'].mean()

In [None]:
ax = orders_by_customer_2['clv'].plot.hist(bins=100)
ax.set(xlabel='Customer Lifetime Value', ylabel='Frequency', title='Distribution of Customer Lifetime Value')
ax.xaxis.set_major_formatter(StrMethodFormatter('{x:,.0f}'));

In [None]:
pd.DataFrame(orders_by_customer_2['clv'].describe()).round(2)

In [None]:
orders_by_customer_2[orders_by_customer_2['alive']==True]

# Best/High-Value Customers

In [None]:
avg_total_spend = orders_by_customer_2['total_order_dollars'].mean()
avg_total_spend

In [None]:
avg_num_orders = orders_by_customer_2['num_orders'].mean()
avg_num_orders

In [None]:
active_customers = orders_by_customer_2[orders_by_customer_2['alive']==True]
active_customers

In [None]:
# active customers whose total spend exceeds the average total spend AND
# whose number of orders exceeds the average number of orders
active_high_value_customers = active_customers[(active_customers['total_order_dollars']>avg_total_spend)&(active_customers['num_orders']>avg_num_orders)]
active_high_value_customers

In [None]:
top_20_customers = active_high_value_customers.sort_values('clv', ascending=False).head(20)
top_20_customers

In [None]:
# top_20_customers.to_csv('top_20_customers.csv')

## Customer Order Amount by State

In [None]:
# Total Subtotal per shipping state, with the column names used by the map
# merge ('state_name', 'order_amount'). Only 'Subtotal' is summed, so no
# non-numeric column is passed to sum().
state_dollars = (
    orders_clean.groupby('Shipping Province Name')['Subtotal']
    .sum()
    .reset_index()
    .rename(columns={'Shipping Province Name':'state_name','Subtotal':'order_amount'})
)
state_dollars.sort_values('order_amount',ascending=False)

In [None]:
# Rank states by order amount, renumber the rows, and add each state's
# percentage of the overall total.
state_dollars_sorted = (
    state_dollars.sort_values('order_amount',ascending=False)
    .reset_index(drop=True)
)
all_states_total = state_dollars_sorted['order_amount'].sum()
state_dollars_sorted['pct_of_total'] = state_dollars_sorted['order_amount']/all_states_total*100

In [None]:
state_dollars_sorted.head(10)

In [None]:
print(orders_clean['Subtotal'].sum())
print(state_dollars['order_amount'].sum())

In [None]:
orders_clean['Shipping Province Name'].isnull().sum()

In [None]:
usa = gpd.read_file('/Users/josh/Documents/Data Science/Apsey Farms/States 21basic/geo_export_99f25753-6a02-4b7a-b22f-2d3e41e2a010.shp')

In [None]:
usa.head()

In [None]:
usa.plot();

In [None]:
# remove Hawaii and Alaska
# Filter on the state name rather than on row labels 0 and 50, so the result
# does not depend on the row order of the shapefile.
state_map = usa[~usa['state_name'].isin(['Hawaii', 'Alaska'])]
state_map.plot();

In [None]:
state_map_dollars = pd.merge(left=state_map, right=state_dollars, how='left', on='state_name')
state_map_dollars['order_amount'] = state_map_dollars['order_amount'].fillna(0)
state_map_dollars.head()

In [None]:
# Bucket states into three spend tiers by total order amount.
high_dollar_states = list(state_dollars[state_dollars['order_amount']>=5000]['state_name'])
medium_dollar_state = list(state_dollars[(state_dollars['order_amount']>=1000) & (state_dollars['order_amount']<5000)]['state_name'])
low_dollar_states = list(state_dollars[state_dollars['order_amount']<1000]['state_name'])

fig, ax = plt.subplots(figsize=(12,12))
# faint base layer: every state on the map, including those with no orders
state_map_dollars.plot(ax=ax, edgecolor='b', alpha=0.1)

# Shade each tier with a single plot call instead of one call per state.
tier_colors = [(high_dollar_states, 'darkred'),
               (medium_dollar_state, 'lightcoral'),
               (low_dollar_states, 'mistyrose')]
for tier_states, tier_color in tier_colors:
    tier_shapes = state_map_dollars[state_map_dollars['state_name'].isin(tier_states)]
    # a tier may contain no state that exists on the map; skip it
    if not tier_shapes.empty:
        tier_shapes.plot(ax=ax, color=tier_color, edgecolor='b', linewidth=1)

In [None]:
# which states order the most of each enterprise
# Sum only the four weight columns, ranked by beef weight.
enterprise_weight_cols = ['item_weight_beef','item_weight_pork',
                          'item_weight_chicken','item_weight_turkey']
orders_clean.groupby('Shipping Province Name')[enterprise_weight_cols].sum().sort_values('item_weight_beef',ascending=False)

In [None]:
instate_orders_time = orders_time_series[orders_time_series['Shipping Province Name']=='Michigan']
outstate_orders_time = orders_time_series[(orders_time_series['Shipping Province Name']!='Michigan') & (orders_time_series['Shipping Province Name'].notna())]

In [None]:
# Month-end totals of the order measures, computed separately for in-state
# and out-of-state orders.
_monthly_measure_cols = ['Subtotal', 'total_item_weight', 'item_weight_beef',
                         'item_weight_pork', 'item_weight_chicken',
                         'item_weight_turkey', 'total_item_price', 'dollars_beef',
                         'dollars_pork', 'dollars_chicken',
                         'dollars_turkey', 'dollars_eggs']
instate_monthly_orders = instate_orders_time[_monthly_measure_cols].resample(rule='M').sum()
outstate_monthly_orders = outstate_orders_time[_monthly_measure_cols].resample(rule='M').sum()

In [None]:
ax = instate_monthly_orders['Subtotal'].plot(figsize=(6,6))
ax.set(xlabel='', ylabel='Order Amount ($)', title='Total Monthly In-State (MI) Orders ($)')
ax.yaxis.set_major_formatter(StrMethodFormatter('{x:,.0f}'));

In [None]:
ax = outstate_monthly_orders['Subtotal'].plot(figsize=(6,6))
ax.set(xlabel='', ylabel='Order Amount ($)', title='Total Monthly Out-of-State (non-MI) Orders ($)')
ax.yaxis.set_major_formatter(StrMethodFormatter('{x:,.0f}'));

In [None]:
ax = instate_monthly_orders['Subtotal'].plot()
ax = outstate_monthly_orders['Subtotal'].plot()
ax.set(xlabel='', ylabel='Order Amount ($)', title='Total Monthly Orders ($): In-State v. Out-of-State')
ax.yaxis.set_major_formatter(StrMethodFormatter('{x:,.0f}'))
ax.legend(['In-State (MI)','Out-of-State (non-MI)']);

In [None]:
# zoom in on out of state spike starting Nov 2020 - which states were the biggest contributors? How many customers?
# Keep orders from 2020-11-01 onward (open-ended slice to the right). The
# index is sorted first because slicing by a date label needs a monotonic index.
outstate_orders_after_nov2020 = (
    outstate_orders_time.sort_index().loc['2020-11-01':]
    .groupby('Shipping Province Name')['Subtotal']
    .sum()
    .sort_values(ascending=False)
    .to_frame()
)
outstate_orders_after_nov2020['pct_of_total'] = outstate_orders_after_nov2020['Subtotal']/outstate_orders_after_nov2020['Subtotal'].sum()*100
outstate_orders_after_nov2020

In [None]:
ax = instate_monthly_orders[['dollars_beef', 'dollars_pork',
                    'dollars_chicken', 'dollars_turkey', 'dollars_eggs']].plot(figsize=(12,8))
ax.set(xlabel='', ylabel='Order Amount ($)', title='Monthly In-State (MI) Order Amount ($) by Enterprise')
ax.yaxis.set_major_formatter(StrMethodFormatter('{x:,.0f}'))
ax.legend(['Beef','Pork','Chicken','Turkey','Eggs']);

In [None]:
# orders by city (MI): total Subtotal per shipping city, largest first,
# plus each city's percentage of the in-state total.
michigan_city_orders = (
    instate_orders_time.groupby('Shipping City')['Subtotal']
    .sum()
    .sort_values(ascending=False)
    .to_frame()
)
michigan_city_orders['pct_of_total'] = michigan_city_orders['Subtotal']/michigan_city_orders['Subtotal'].sum()*100
michigan_city_orders.head(10)

In [None]:
top10_michigan_cities = list(michigan_city_orders.head(10).index)
top10_michigan_cities

In [None]:
# monthly trend of top 10 in-state cities
ax = instate_orders_time[instate_orders_time['Shipping City'].isin(top10_michigan_cities)]['Subtotal'].resample(rule='M').sum().plot()
ax.set(xlabel='', ylabel='Order Amount ($)', title='Monthly Order Amount ($) for Top 10 In-State (MI) Cities')
ax.yaxis.set_major_formatter(StrMethodFormatter('{x:,.0f}'))

# Product Analysis

## Orders by Product Type

In [None]:
orders_clean.groupby('product_type').size().sort_values(ascending=False).plot.pie(autopct = '%.1f%%',
                                                                                  colors=['cornflowerblue',
                                                                                          'mediumseagreen',
                                                                                          'coral'])
plt.title('Product Type % of \nTotal Number of Line Items Ordered')
plt.ylabel('');

In [None]:
# Share of total order dollars by product type, largest slice first.
# Only 'total_item_price' is summed, not every column in the frame.
product_type_dollars = (
    orders_clean.groupby('product_type')['total_item_price']
    .sum()
    .sort_values(ascending=False)
)
product_type_dollars.plot.pie(autopct = '%.1f%%',
                              colors=['mediumseagreen',
                                      'cornflowerblue',
                                      'coral'])
plt.title('Product Type % of Total Order Amount ($)')
plt.ylabel('');

In [None]:
# yearly $ by product type: one row per order year ('Created at'),
# one column per product type
annual_product_amt = (
    orders_clean.groupby([orders_clean['Created at'].dt.year,'product_type'])['total_item_price']
    .sum()
    .unstack()
)
annual_product_amt

In [None]:
ax = annual_product_amt.plot.bar(color=['coral','mediumseagreen','cornflowerblue'],stacked=True,rot=0)
ax.set(xlabel='',ylabel='Order Amount ($)',title='Annual Order Amount ($) by Product Type')
ax.yaxis.set_major_formatter(StrMethodFormatter('{x:,.0f}'));

## Orders by Enterprise* 
*Includes enterprise combinations e.g. Beef, Pork (not broken down at the individual enterprise level)

In [None]:
orders_clean.groupby('enterprise').size().sort_values(ascending=False).plot.pie(colors=['mediumseagreen',
                                                                                       'cornflowerblue',
                                                                                       'coral',
                                                                                       'plum',
                                                                                       'papayawhip',
                                                                                       'lightgray',
                                                                                       'lightsalmon',
                                                                                       'gold'],
                                                                                autopct = '%.1f%%',
                                                                                figsize=(6,6))
plt.title('Enterprise % of \nTotal Number of Line Items Ordered')
plt.ylabel('');

In [None]:
# Total order dollars per enterprise, largest first.
orders_clean.groupby('enterprise')['total_item_price'].sum().sort_values(ascending=False)

In [None]:
# Share of total order dollars by enterprise, largest slice first.
# Colours are assigned by slice position.
enterprise_dollars = (
    orders_clean.groupby('enterprise')['total_item_price']
    .sum()
    .sort_values(ascending=False)
)
enterprise_dollars.plot.pie(colors=['mediumseagreen',
                                    'papayawhip',
                                    'plum',
                                    'lightgray',
                                    'coral',
                                    'cornflowerblue',
                                    'gold',
                                    'lightsalmon'],
                            autopct = '%.1f%%',
                            figsize=(6,6))
plt.title('Enterprise % of Total Order Amount ($)')
plt.ylabel('');

In [None]:
# Yearly $ by enterprise: one row per order year ('Created at'),
# one column per enterprise.
annual_enterprise_amt = (
    orders_clean.groupby([orders_clean['Created at'].dt.year,'enterprise'])['total_item_price']
    .sum()
    .unstack()
)
annual_enterprise_amt

In [None]:
ax = annual_enterprise_amt.plot.bar(color=['mediumseagreen','plum','lightgray',
                                           'papayawhip','coral','lightsalmon','cornflowerblue','gold'],
                                    stacked=True,rot=0,figsize=(6,6))
ax.set(xlabel='',ylabel='Order Amount ($)',title='Annual Order Amount ($) by Enterprise')
#ax.legend(bbox_to_anchor=(1,1))
ax.yaxis.set_major_formatter(StrMethodFormatter('{x:,.0f}'));

## Orders by Enterprise* 
*Broken down at the individual enterprise level. Individual enterprise order amounts for orders spanning more than one enterprise (e.g. Beef and Pork bundles) were calculated by dividing total order amount by number of enterprises.

In [None]:
total_dollars_beef = orders_clean['dollars_beef'].sum()
total_dollars_pork = orders_clean['dollars_pork'].sum()
total_dollars_chicken = orders_clean['dollars_chicken'].sum()
total_dollars_turkey = orders_clean['dollars_turkey'].sum()
total_dollars_eggs = orders_clean['dollars_eggs'].sum()

In [None]:
total_dollars = pd.DataFrame({'enterprise':['Beef','Pork','Chicken','Turkey','Eggs'], 
              'total_dollars':[total_dollars_beef, total_dollars_pork, total_dollars_chicken, 
                               total_dollars_turkey, total_dollars_eggs]})
total_dollars = total_dollars.set_index('enterprise')
total_dollars

In [None]:
ax = total_dollars.plot.pie(y='total_dollars', autopct = '%.1f%%', figsize=(6,6))
ax.set(title='Enterprise % of Total Order Amount ($)',ylabel='')
ax.get_legend().remove();

In [None]:
# Stacked bars of annual order dollars, one segment per enterprise.
ax = orders_time_year[['dollars_beef', 'dollars_pork',
                    'dollars_chicken', 'dollars_turkey', 'dollars_eggs']].plot.bar(figsize=(6,6),stacked=True)
ax.set(xlabel='', ylabel='Order Amount ($)', title='Annual Order Amount ($) by Enterprise')
# Take the year labels from the index so they follow the data range.
ax.set_xticklabels(orders_time_year.index.strftime('%Y'),rotation=0)
ax.yaxis.set_major_formatter(StrMethodFormatter('{x:,.0f}'))
ax.legend(['Beef','Pork','Chicken','Turkey','Eggs']);

In [None]:
ax = orders_time_month[['dollars_beef', 'dollars_pork',
                    'dollars_chicken', 'dollars_turkey', 'dollars_eggs']].plot(figsize=(12,8))
ax.set(xlabel='', ylabel='Order Amount ($)', title='Monthly Order Amount ($) by Enterprise')
ax.yaxis.set_major_formatter(StrMethodFormatter('{x:,.0f}'))
ax.legend(['Beef','Pork','Chicken','Turkey','Eggs']);

In [None]:
# Split each enterprise's dollar series in orders_time_month into a cycle and
# a trend component with the Hodrick-Prescott filter, and store the trend as a
# new '<enterprise>_dollars_trend' column for plotting next to the raw series.
# NOTE(review): lamb=129600 is the smoothing value usually recommended for
# monthly data -- confirm it suits these series.
beef_dollars_cycle, beef_dollars_trend = hpfilter(orders_time_month['dollars_beef'],lamb=129600)
orders_time_month['beef_dollars_trend'] = beef_dollars_trend

pork_dollars_cycle, pork_dollars_trend = hpfilter(orders_time_month['dollars_pork'],lamb=129600)
orders_time_month['pork_dollars_trend'] = pork_dollars_trend

chicken_dollars_cycle, chicken_dollars_trend = hpfilter(orders_time_month['dollars_chicken'],lamb=129600)
orders_time_month['chicken_dollars_trend'] = chicken_dollars_trend

turkey_dollars_cycle, turkey_dollars_trend = hpfilter(orders_time_month['dollars_turkey'],lamb=129600)
orders_time_month['turkey_dollars_trend'] = turkey_dollars_trend

eggs_dollars_cycle, eggs_dollars_trend = hpfilter(orders_time_month['dollars_eggs'],lamb=129600)
orders_time_month['eggs_dollars_trend'] = eggs_dollars_trend

In [None]:
ax = orders_time_month[['dollars_beef','beef_dollars_trend']].plot()
ax.yaxis.set_major_formatter(StrMethodFormatter('{x:,.0f}'))
ax.legend(['Beef Order Amount ($)','Trend'])
ax.set(xlabel='');

In [None]:
ax = orders_time_month[['dollars_pork','pork_dollars_trend']].plot()
ax.yaxis.set_major_formatter(StrMethodFormatter('{x:,.0f}'))
ax.set_ylim([0,5000])
ax.legend(['Pork Order Amount ($)','Trend'])
ax.set(xlabel='');

In [None]:
ax = orders_time_month[['dollars_chicken','chicken_dollars_trend']].plot()
ax.yaxis.set_major_formatter(StrMethodFormatter('{x:,.0f}'))
ax.legend(['Chicken Order Amount ($)','Trend'])
ax.set(xlabel='');

In [None]:
ax = orders_time_month[['dollars_turkey','turkey_dollars_trend']].plot()
ax.yaxis.set_major_formatter(StrMethodFormatter('{x:,.0f}'))
ax.legend(['Turkey Order Amount ($)','Trend'])
ax.set(xlabel='');

In [None]:
ax = orders_time_month[['dollars_eggs','eggs_dollars_trend']].plot()
ax.yaxis.set_major_formatter(StrMethodFormatter('{x:,.0f}'))
ax.set_ylim([0,800])
ax.legend(['Eggs Order Amount ($)','Trend'])
ax.set(xlabel='');

In [None]:
# total order amount (lbs) by enterprise
orders_clean[['item_weight_beef','item_weight_pork','item_weight_chicken','item_weight_turkey']].sum()

In [None]:
# Total weight ordered per enterprise, heaviest first.
enterprise_lbs = orders_clean[['item_weight_beef','item_weight_pork',
              'item_weight_chicken','item_weight_turkey']].sum().sort_values(ascending=False)
ax = enterprise_lbs.plot.bar()
# Build the tick labels and bar annotations from the sorted totals so they
# cannot drift out of step with the data.
ax.set_xticklabels([col.replace('item_weight_', '').capitalize() for col in enterprise_lbs.index],rotation=0)
for bar_pos, lbs in enumerate(enterprise_lbs):
    ax.text(bar_pos, lbs, f'{lbs:,.0f}', ha='center', va='bottom')
ax.set_title('Total Ordered Amount (lbs) by Enterprise')
ax.set(ylabel='Weight (lbs)')
ax.set_ylim([0,25000])
ax.yaxis.set_major_formatter(StrMethodFormatter('{x:,.0f}'));

In [None]:
ax = orders_time_month[['item_weight_beef','item_weight_pork', 
                        'item_weight_chicken','item_weight_turkey']].plot(figsize=(12,8))
ax.set(xlabel='', ylabel='Total Order Amount (lbs)', title='Monthly Order Amount (lbs) by Enterprise')
ax.yaxis.set_major_formatter(StrMethodFormatter('{x:,.0f}'));

In [None]:
# Total order dollars for each bulk item.
orders_clean[orders_clean['product_type']=='Bulk item'].groupby('item_name')['total_item_price'].sum()

In [None]:
# Bar chart of total order dollars per bulk item.
bulk_item_dollars = orders_clean[orders_clean['product_type']=='Bulk item'].groupby('item_name')['total_item_price'].sum()
ax = bulk_item_dollars.plot.bar()
# NOTE(review): the labels assume exactly three bulk items that sort as
# beef, chicken, turkey -- confirm against item_name.
ax.set_xticklabels(labels=['Beef','Chicken','Turkey'],rotation=0)
ax.set_title('Total Order Amount ($) for Bulk Orders')
ax.set(xlabel='',ylabel='Order Amount ($)')
# Annotate each bar with its own total so the figures always match the data.
for bar_pos, dollars in enumerate(bulk_item_dollars):
    ax.text(bar_pos, dollars, f'{dollars:,.0f}', ha='center', va='bottom')
ax.set_ylim([0,15000])
ax.yaxis.set_major_formatter(StrMethodFormatter('{x:,.0f}'));

In [None]:
bulk_orders_time = orders_time_series[orders_time_series['product_type']=='Bulk item']
bulk_orders_month = bulk_orders_time[['dollars_beef','dollars_chicken','dollars_turkey']].resample(rule='M').sum()
bulk_orders_month.head()

In [None]:
ax = bulk_orders_month.plot(figsize=(8,6))
ax.set(xlabel='', ylabel='Order Amount ($)', title='Monthly Order Amount ($) for Bulk Orders')
ax.yaxis.set_major_formatter(StrMethodFormatter('{x:,.0f}'));

## Orders by Enterprise-Product Type Combination

In [None]:
# number of line items
ax = orders_clean.groupby(['enterprise','product_type']).size().sort_values().plot.barh()
ax.set(xlabel='Number of Line Items', ylabel='', title='Total Number of Line Items by Enterprise & Product Type')
ax.xaxis.set_major_formatter(StrMethodFormatter('{x:,.0f}'));

In [None]:
# order amounts: total dollars per (enterprise, product type) pair, smallest
# first so the largest bar is drawn at the top
ax = orders_clean.groupby(['enterprise','product_type'])['total_item_price'].sum().sort_values().plot.barh()
ax.set(xlabel='Order Amount ($)', ylabel='', title='Total Order Amount by Enterprise & Product Type')
ax.xaxis.set_major_formatter(StrMethodFormatter('{x:,.0f}'));

Create two separate dataframes then merge into one: 1) total # line items & % of total by enterprise-product combination, 2) total order amount ($) and % of total by enterprise-product combination.

In [None]:
# order amount by enterprise-product combinations, largest first
# (only 'total_item_price' is summed)
order_combs_dollar = orders_clean.groupby(['enterprise','product_type'])['total_item_price'].sum().sort_values(ascending=False)
order_combs_dollar_df = pd.DataFrame(order_combs_dollar)
order_combs_dollar_df

In [None]:
order_combs_dollar_df['enterprise_product_type'] = order_combs_dollar_df.index
order_combs_dollar_df.reset_index(inplace=True)
order_combs_dollar_df.drop(['enterprise','product_type'],axis=1,inplace=True)
order_combs_dollar_df

In [None]:
# total $ for all orders
total_orders_amt = orders_clean['total_item_price'].sum()

# add % of total to order_combs_df
order_combs_dollar_df['$_pct_of_total'] = (order_combs_dollar_df['total_item_price'] / total_orders_amt) * 100

In [None]:
# number of line items by enterprise-product combinations
order_combs_num = orders_clean.groupby(['enterprise','product_type']).size().sort_values(ascending=False)
order_combs_num_df = pd.DataFrame(order_combs_num)
order_combs_num_df

In [None]:
order_combs_num_df['enterprise_product_type'] = order_combs_num_df.index
order_combs_num_df.reset_index(inplace=True)
order_combs_num_df.drop(['enterprise','product_type'],axis=1,inplace=True)
order_combs_num_df.rename(columns={0:'num_line_items'},inplace=True)
order_combs_num_df

In [None]:
# total number of rows (line items) in orders_clean
total_orders_num = len(orders_clean)

# add each combination's % of all line items to order_combs_num_df
order_combs_num_df['#_pct_of_total'] = (order_combs_num_df['num_line_items'] / total_orders_num) * 100

In [None]:
order_combs_num_df

In [None]:
# merge the two dataframes
order_combs_final = pd.merge(left= order_combs_num_df, right=order_combs_dollar_df, how='left', on='enterprise_product_type')
order_combs_final.sort_values('total_item_price',ascending=False)

In [None]:
order_combs_final.sum()

## Drill Down: Products Ordered - All Products

In [None]:
# top 20 products, not taking into account total #/weight ordered
ax = orders_clean['item_name'].value_counts().head(20).sort_values().plot.barh(figsize=(6,6), 
                                                                          title='Top 20 Products Ordered')
ax.set(xlabel='Number of Line Items');

In [None]:
# Top 20 products by total dollars; re-sorted ascending so the largest bar
# is drawn at the top of the horizontal chart.
ax = orders_clean.groupby('item_name')['total_item_price'].sum().sort_values(ascending=False).head(20).sort_values().plot.barh(figsize=(6,6))
ax.set(title='Top 20 Products Ordered by Amount ($): Jan 2018 - July 2021', xlabel='Amount ($)',ylabel='')
ax.xaxis.set_major_formatter(StrMethodFormatter('{x:,.0f}'));

In [None]:
# top 5 revenue-generating products: names of the five items with the
# largest summed 'total_item_price'
top_5_products = list(orders_clean.groupby('item_name')['total_item_price'].sum().sort_values(ascending=False).head(5).index)
top_5_products

In [None]:
# Revenue per 'order_month' from the top 5 products only.
top5_rows = orders_clean[orders_clean['item_name'].isin(top_5_products)]
ax = top5_rows.groupby('order_month')['total_item_price'].sum().plot()
ax.set(title='Monthly Revenue for Top 5 Revenue-Generating Products', xlabel='Month of Order', ylabel='Amount ($)')
ax.yaxis.set_major_formatter(StrMethodFormatter('{x:,.0f}'));

In [None]:
# Revenue per 'order_month' from the top 5 products, for orders created in
# 2020 or 2021.
is_top_product = orders_clean['item_name'].isin(top_5_products)
is_2020_or_2021 = orders_clean['Created at'].dt.year.isin([2020, 2021])
orders_clean[is_top_product & is_2020_or_2021].groupby('order_month')['total_item_price'].sum()

In [None]:
# zoom in on 2020 and 2021: revenue per 'order_month' from the top 5 products
is_top_product = orders_clean['item_name'].isin(top_5_products)
is_2020_or_2021 = orders_clean['Created at'].dt.year.isin([2020, 2021])
ax = orders_clean[is_top_product & is_2020_or_2021].groupby('order_month')['total_item_price'].sum().plot.bar()
ax.set(title='Monthly Revenue for Top 5 Revenue-Generating Products', xlabel='Month of Order', ylabel='Amount ($)');

Find the monthly average revenue

In [None]:
# Revenue per calendar month (1-12) pooled over all years, and the average
# revenue that month brings in per year of data.
order_month_num = orders_clean['Created at'].dt.month
order_year_num = orders_clean['Created at'].dt.year
monthly_stats = orders_clean.groupby(order_month_num)['total_item_price'].sum().to_frame()
# Divide each month by the number of distinct years in which it occurs in the
# data, rather than a fixed 4, so months missing from a partial year are not
# under-stated.
# NOTE(review): a month with no orders at all in some year is not counted as
# a year of data here -- confirm that is acceptable.
years_with_data = order_year_num.groupby(order_month_num).nunique()
monthly_stats['avg_revenue'] = monthly_stats['total_item_price']/years_with_data
monthly_stats = monthly_stats[['total_item_price','avg_revenue']]
monthly_stats

In [None]:
# we only have data through 7/2021, so avg_revenue should be 'total_item_price'/ 3 years for months 8-12
# The totals are read from the table itself instead of being typed in as
# literals, and rows are addressed by month number instead of by position.
partial_year_months = monthly_stats.index >= 8
monthly_stats.loc[partial_year_months, 'avg_revenue'] = monthly_stats.loc[partial_year_months, 'total_item_price']/3
monthly_stats

In [None]:
# monthly averages, where Jan = 1 and Dec = 12
ax = monthly_stats['avg_revenue'].plot.bar(rot=0)
ax.set(title='Average Monthly Revenue from All Product Orders', xlabel='Month', ylabel='Average Amount ($)')
ax.yaxis.set_major_formatter(StrMethodFormatter('{x:,.0f}'));

## Drill Down: Products Ordered - Single Items

In [None]:
# top 10 single items by $
# .copy() makes an independent frame, so adding columns to it later does not
# write into a slice of orders_clean.
single_item_orders = orders_clean[orders_clean['product_type']=='Single item'].copy()
ax = single_item_orders.groupby('item_name')['total_item_price'].sum().sort_values(ascending=False).head(10).sort_values().plot.barh()
ax.set(title='Top 10 Single Items Ordered by Amount ($): \nJan 2018 - July 2021', xlabel='Amount ($)',ylabel='')
ax.xaxis.set_major_formatter(StrMethodFormatter('{x:,.0f}'));

In [None]:
# Top 10 single items by dollars, restricted to orders created in 2020 or 2021.
single_item_orders_recent = orders_clean[(orders_clean['product_type']=='Single item') & (orders_clean['Created at'].dt.year.isin([2020, 2021]))]
ax = single_item_orders_recent.groupby('item_name')['total_item_price'].sum().sort_values(ascending=False).head(10).sort_values().plot.barh()
ax.set(title='Top 10 Single Items Ordered by Amount ($): \nJan 2020 - July 2021', xlabel='Amount ($)',ylabel='')
ax.xaxis.set_major_formatter(StrMethodFormatter('{x:,.0f}'));

In [None]:
# calculate price per pound for single items
# assign() returns a new frame, so the column is never written into a slice
# of orders_clean (which triggers pandas' SettingWithCopyWarning).
# NOTE(review): 'Lineitem price' looks like a per-unit price while
# 'total_item_weight' looks like a line total -- confirm both are on the
# same basis.
single_item_orders = single_item_orders.assign(
    price_per_pound=single_item_orders['Lineitem price']/single_item_orders['total_item_weight']
)

In [None]:
# Ten single items with the highest mean price per pound.
single_item_orders.groupby('item_name')['price_per_pound'].mean().sort_values(ascending=False).head(10)

In [None]:
single_item_orders[single_item_orders['item_name']=='Beef - Hanger Steak']

In [None]:
# top single items by weight - if uneven, might indicate not utilizing full carcass
ax = single_item_orders.groupby('item_name')['total_item_weight'].sum().sort_values(ascending=False).head(10).sort_values().plot.barh()
ax.set(title='Top 10 Single Items Ordered by Weight (lbs): \nJan 2018 - July 2021', xlabel='Amount (lbs)',ylabel='')
ax.xaxis.set_major_formatter(StrMethodFormatter('{x:,.0f}'));

## Drill Down: Products Ordered - Bundles

In [None]:
# top 10 bundles by $
bundle_orders = orders_clean[orders_clean['product_type']=='Bundle']
# Rank by dollars ('total_item_price') so the bars match the chart title and
# the '$' axis label.
ax = bundle_orders.groupby('item_name')['total_item_price'].sum().sort_values(ascending=False).head(10).sort_values().plot.barh()
ax.set(title='Top 10 Bundles Ordered by Amount ($): \nJan 2018 - July 2021', xlabel='Amount ($)',ylabel='')
ax.xaxis.set_major_formatter(StrMethodFormatter('{x:,.0f}'));
