Major Sections in this Notebook:
 * [Data Collection](#Data-Collection)
 * [Data Cleaning & Prep](#Data-Cleaning-&-Prep)
 * [Analysis](#Analysis)

# Data Collection

In [None]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests
import re
from bs4 import BeautifulSoup

Get all the urls for the single item pages on apseyfarms.com that we'll need to scrape

In [None]:
def get_urls(url_list, timeout=30):
    """Collect the collection links found on each page in ``url_list``.

    Every anchor whose ``href`` contains ``/collections`` is kept, and its
    ``href`` is returned with ``'http://www.apseyfarms.com'`` prepended.
    Links are returned in page order and duplicates are preserved.

    Args:
        url_list: Iterable of page URLs to download.
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        list: The prefixed ``href`` strings gathered from all pages.

    Raises:
        requests.RequestException: If a page cannot be fetched in time or the
            server answers with an HTTP error status.
    """
    collection_href = re.compile("/collections")
    links = []
    for url in url_list:
        # Without a timeout a stalled connection would hang the notebook, and
        # without the status check an error page would be parsed as if it
        # were a real product listing.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        content = response.content.decode("utf-8")
        parser = BeautifulSoup(content, 'html.parser')

        # keep every anchor tag whose "href" contains "/collections".
        # The site prefix is prepended verbatim, so an href that is already
        # absolute ends up with two hosts and must be filtered out by the
        # caller.
        for link in parser.find_all('a', attrs={'href': collection_href}):
            links.append('http://www.apseyfarms.com' + link.get('href'))
    return links

In [None]:
urls_to_scrape = ['https://apseyfarms.com/collections/beef-1', 'https://apseyfarms.com/collections/beef-1?page=2',
                 'https://apseyfarms.com/collections/chicken', 'https://apseyfarms.com/collections/pork',
                 'https://apseyfarms.com/collections/pork?page=2']

In [None]:
single_item_links = get_urls(urls_to_scrape)

In [None]:
single_item_links

In [None]:
# Collection index and pagination links are matched by the "/collections"
# filter too, but they are not single-item pages and must not be scraped.
urls_to_remove = ['http://www.apseyfarms.com/collections', 'http://www.apseyfarms.comhttps://apseyfarms.com/collections/bundles',
                 'http://www.apseyfarms.com/collections/beef-1?page=2','http://www.apseyfarms.com/collections/beef-1?page=1',
                 'http://www.apseyfarms.com/collections/pork?page=2','http://www.apseyfarms.com/collections/pork?page=1']
# Drop every occurrence of every unwanted URL in one pass. Removing them one
# at a time and stopping at the first URL that is no longer present can leave
# later URLs in the list. The list is updated in place so existing references
# to it stay valid.
unwanted_urls = set(urls_to_remove)
single_item_links[:] = [link for link in single_item_links if link not in unwanted_urls]

In [None]:
single_item_links[:10]

Now that we have all the urls from which we'll be scraping saved in a list, let's iterate through those sites and pull out the info we need: item name and quantity (package size).

In [None]:
def get_item_info(url_list, timeout=30):
    """Scrape the item name and raw package-size text from each item page.

    The name is the text between the first ``"title":"`` and the first
    ``","handle":`` in the page source. The size is the 50 characters that
    follow the first ``Package size:``. Both are taken by plain substring
    search, so the size text is raw and needs cleaning afterwards. When a
    marker is absent, ``str.find`` returns -1 and the slice yields unrelated
    page text.

    Args:
        url_list: Iterable of single-item page URLs.
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        dict: Maps item name to raw size text. Pages that yield the same name
        overwrite earlier entries.

    Raises:
        requests.RequestException: If a page cannot be fetched in time or the
            server answers with an HTTP error status.
    """
    title_marker = '"title":"'
    size_marker = 'Package size:'
    item_dict = {}
    for url in url_list:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        # Only the raw text is searched, so the page is not parsed as HTML.
        content = response.content.decode("utf-8")

        item_name_start_index = content.find(title_marker) + len(title_marker)
        item_name_end_index = content.find('","handle":')
        item_name = content[item_name_start_index:item_name_end_index]

        # Fixed 50-character window after the marker; trimmed downstream.
        item_size_start_index = content.find(size_marker) + len(size_marker)
        item_size = content[item_size_start_index:item_size_start_index + 50]

        item_dict[item_name] = item_size
    return item_dict

In [None]:
# creates a dictionary with keys as item names and values as package size
single_item_dict = get_item_info(single_item_links)

In [None]:
single_item_dict

The scraped info is a bit messy, so we'll save the dictionary to a pandas dataframe in order to clean up the data and structure into our desired format.

In [None]:
single_item_df = pd.DataFrame.from_dict(single_item_dict, orient='index').reset_index().rename(columns={'index':'item_name',0:'item_size'})

In [None]:
single_item_df.head()

In [None]:
# for the most part, it looks like the item quantity appears before the substring 'Ingredients'
# so let's pull out everything before that substring into a new column
single_item_df['item_size_new'] = single_item_df['item_size'].str.split('Ingredients').str[0]

In [None]:
single_item_df.head()

In [None]:
# Clean up the new item_size_new column, which holds the text that preceded
# 'Ingredients' in the raw 50-character size window.
# The replacements run in order and each one sees the output of the previous
# one, so their sequence matters.
single_item_df['item_size_new'] = single_item_df['item_size_new'].str.replace('\n','')
# strip the HTML residue that followed the size text in the page source
single_item_df['item_size_new'] = single_item_df['item_size_new'].str.replace('"> <!-- /snippets/social-meta-tags.l','')
single_item_df['item_size_new'] = single_item_df['item_size_new'].str.replace('roughly ','')
# same pattern as two lines above; it was already removed there
single_item_df['item_size_new'] = single_item_df['item_size_new'].str.replace('"> <!-- /snippets/social-meta-tags.l','')
# NOTE(review): the shorter '...social-meta-tags.l' text has already been
# removed above, so this longer pattern presumably never matches - confirm
single_item_df['item_size_new'] = single_item_df['item_size_new'].str.replace('"> <!-- /snippets/social-meta-tags.liq', 'b')
# shorter variant of the residue with a leading space (presumably where the
# 50-character window cut the text off earlier - confirm)
single_item_df['item_size_new'] = single_item_df['item_size_new'].str.replace(' "> <!-- /snippets/social-meta-t','')
# drop descriptive words so that only the number and unit remain
single_item_df['item_size_new'] = single_item_df['item_size_new'].str.replace(' tubes','')
single_item_df['item_size_new'] = single_item_df['item_size_new'].str.replace(' cuts','')
single_item_df['item_size_new'] = single_item_df['item_size_new'].str.replace(' steaks','')

# manually check and replace item sizes that didn't pull properly
# (rows are selected by exact item_name, so a renamed item is silently skipped)
single_item_df.loc[single_item_df['item_name']=='Beef - Tongue',['item_size_new']] = '1.5-2.5 lbs'
single_item_df.loc[single_item_df['item_name']=='Beef - Rump Roast',['item_size_new']] = '2-3 lbs'
single_item_df.loc[single_item_df['item_name']=='Beef - Flat Iron Steak',['item_size_new']] = '6-10 oz'
single_item_df.loc[single_item_df['item_name']=='Beef - Round Roast',['item_size_new']] = '2-3 lbs'
single_item_df.loc[single_item_df['item_name']=='Chicken - Whole',['item_size_new']] = '3.5-4.5 lbs'
single_item_df.loc[single_item_df['item_name']=='Pork - Smoked Ham Roast',['item_size_new']] = '2.5-3.5 lbs'
single_item_df.loc[single_item_df['item_name']=='Pork - Kielbasa',['item_size_new']] = '1 lb'
single_item_df.loc[single_item_df['item_name']=='Pork - Bratwurst',['item_size_new']] = '1 lb'
single_item_df.loc[single_item_df['item_name']=='Pork - Hocks',['item_size_new']] = '1.8-2.5 lbs'
single_item_df.loc[single_item_df['item_name']=='Pork - Tongue',['item_size_new']] = '8 oz'
single_item_df.loc[single_item_df['item_name']=='Pork - Bone-in Chops',['item_size_new']] = '1 lb'
single_item_df.loc[single_item_df['item_name']=='Pork - Boneless Chops',['item_size_new']] = '1 lb'

# we'll give both 'Chicken - Eggs' items a quantity of 1
# (addressed by hard-coded row positions 40 and 42; column position 2 is
# item_size_new. These positions depend on the scrape order.)
single_item_df.iloc[40,2] = 1
single_item_df.iloc[42,2] = 1

# fix the item name (a row value, not a column) that still carries the
# escaped ampersand '\u0026' from the page source
single_item_df.loc[single_item_df['item_name']=='Chicken - Legs \\u0026 Thighs',['item_name']] = 'Chicken - Legs & Thighs'

In [None]:
# drop the old item_size column and rename the new one
single_item_df.drop('item_size',axis=1,inplace=True)
single_item_df.rename(columns={'item_size_new':'item_size'},inplace=True)

In [None]:
single_item_df.head()

In [None]:
# we have two items labeled 'Chicken - Eggs', which do not have an associated weight
# let's go ahead and drop these rows
single_item_df.drop([40,42],inplace=True)

In [None]:
# extract the unit of measurement (lb, lbs, oz) into a new column
def get_measure(value):
    """Return the weight unit mentioned in ``value``.

    'lb' is checked before 'oz', so a string containing both yields 'lb'.
    Returns None when neither unit appears.
    """
    for unit in ('lb', 'oz'):
        if unit in value:
            return unit
    return None

single_item_df['measure'] = single_item_df['item_size'].apply(get_measure)

In [None]:
single_item_df.head()

In [None]:
# Strip the unit of measurement and stray characters from item_size so that
# only the number (or number range) is left.
size_text = single_item_df['item_size']
# units with a leading space first, then any that were glued to the number
for unit_text in (' lbs', ' lb', ' oz', 'lb', 'oz'):
    size_text = size_text.str.replace(unit_text, '')
size_text = size_text.str.strip().str.rstrip()
# leftover byte-order mark and a non-breaking space followed by 's'
for stray_text in ('\ufeff', '\xa0s'):
    size_text = size_text.str.replace(stray_text, '')
single_item_df['item_size'] = size_text

In [None]:
# turn ranges of values in the item_size column into a single value by taking the average of the range min and max
# then create a new column with this value
def find_avg_quantity(value):
    """Collapse a 'low-high' size range into its midpoint.

    Args:
        value: A size such as '1', '2-3' or '1.5-2.5'. Non-string values are
            accepted and inspected through their string form, matching the
            later redefinition of this helper in the notebook.

    Returns:
        The midpoint as a float when ``value`` contains exactly one '-',
        ``value`` unchanged when it contains none, and None when it contains
        more than one.

    Raises:
        ValueError: If either side of the range is not a number.
    """
    # str() keeps numeric cells (for example a manually assigned 1) from
    # crashing on the missing .split method.
    quants = str(value).split('-')
    if len(quants) == 1:
        return value
    elif len(quants) == 2:
        return (float(quants[0])+float(quants[1]))/2
    return None
    
single_item_df['quantity'] = single_item_df['item_size'].apply(find_avg_quantity)

In [None]:
single_item_df.head()

In [None]:
# convert quantities in oz to lbs
single_item_df['quantity'] = single_item_df['quantity'].astype('float')
single_item_df['quantity_lb'] = np.where(single_item_df['measure']=='oz', 
                                         single_item_df['quantity']/16, single_item_df['quantity'])

In [None]:
single_item_df.head()

In [None]:
# drop old columns
single_items = single_item_df.drop(['item_size','measure','quantity'],axis=1)
single_items.head()

Now that we have the item info from apseyfarms.com cleaned up, let's restructure the dataframe into the format we'll need for our analysis.

In [None]:
single_items_crosswalk = single_items.copy()
single_items_crosswalk['quantity_beef_lb'] = np.where(single_items_crosswalk['item_name'].str.contains('Beef'),
                                                     single_items_crosswalk['quantity_lb'],0)
single_items_crosswalk['quantity_pork_lb'] = np.where(single_items_crosswalk['item_name'].str.contains('Pork'),
                                                     single_items_crosswalk['quantity_lb'],0)
single_items_crosswalk['quantity_chicken_lb'] = np.where(single_items_crosswalk['item_name'].str.contains('Chicken'),
                                                     single_items_crosswalk['quantity_lb'],0)

In [None]:
# Label each row's enterprise by its position in the crosswalk (presumably
# mirroring the beef / chicken / pork order of the scraped collections -
# verify the ranges if the scrape changes).
single_items_crosswalk['enterprise'] = 'tbd'
enterprise_col = single_items_crosswalk.columns.get_loc('enterprise')
# Assign through a single .iloc[rows, col] call. The chained form
# df.iloc[rows]['enterprise'] = ... writes to a temporary object, which
# triggers SettingWithCopyWarning and is a no-op under pandas copy-on-write.
single_items_crosswalk.iloc[0:30, enterprise_col] = 'Beef'
single_items_crosswalk.iloc[30:42, enterprise_col] = 'Chicken'
single_items_crosswalk.iloc[42:66, enterprise_col] = 'Pork'

In [None]:
single_items_crosswalk.head()

In [None]:
# Renumber the rows from zero (discarding the old labels), rename the total
# weight column, and add the constant turkey / product_type columns.
single_items_crosswalk = (
    single_items_crosswalk
    .reset_index(drop=True)
    .rename(columns={'quantity_lb': 'total_quantity_lb'})
    .assign(quantity_turkey_lb=0, product_type='Single item')
)

In [None]:
pd.options.display.max_rows = 75
single_items_crosswalk

In [None]:
# save the dataframe to a csv that we can use later
single_items_crosswalk.to_csv('single_items_crosswalk.csv',index=False)

# Data Cleaning & Prep

In [None]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# read in orders data, single_items_crosswalk from Data Collection notebook, 
# and bundles_crosswalk (manually created csv from associated bundled product info)
orders = pd.read_csv('/Users/josh/Documents/Data Science/Apsey Farms/orders_export_1.csv')
single_items_crosswalk = pd.read_csv('/Users/josh/Documents/Data Science/Apsey Farms/single_items_crosswalk.csv')
bundles_crosswalk = pd.read_csv('/Users/josh/Documents/Data Science/Apsey Farms/bundle_crosswalk.csv')

In [None]:
pd.options.display.max_columns = 100
orders.head()

In [None]:
orders.info()

In [None]:
single_items_crosswalk.head()

In [None]:
bundles_crosswalk.head()

## Create a single product crosswalk dataframe
Ultimately, we want one product crosswalk that we'll use to cross-reference order data. So, let's get bundles_crosswalk into the same format as single_items_crosswalk and combine the two datasets.

In [None]:
# restructure bundles_crosswalk dataframe
bundles_crosswalk.drop('Contents', axis=1, inplace=True)
bundles_crosswalk['product_type'] = 'Bundle'
bundles_crosswalk.rename(columns={'Bundle Name':'item_name', 'Beef':'quantity_beef_lb', 
                                 'Pork':'quantity_pork_lb', 'Chicken':'quantity_chicken_lb', 
                                 'Turkey':'quantity_turkey_lb', 'Total Weight':'total_quantity_lb',
                                'Enterprise':'enterprise'},inplace=True)

In [None]:
bundles_crosswalk.head()

In [None]:
# combine crosswalk dataframes
product_crosswalk = pd.concat([single_items_crosswalk,bundles_crosswalk])
product_crosswalk.reset_index(drop=True, inplace=True)
product_crosswalk['Lineitem name'] = product_crosswalk['item_name']

In [None]:
product_crosswalk.head()

## Preliminary cleaning of orders data

Let's start by dropping columns we don't need.

In [None]:
pd.options.display.max_columns = 100
pd.options.display.max_rows = 75

In [None]:
orders.isnull().sum()

In [None]:
# find columns with >50% of values missing
cols_over_half_missing_values = list(orders.columns[orders.isnull().sum()/len(orders) > 0.5])
cols_over_half_missing_values

In [None]:
# for now let's keep some of these columns we might use in our analysis and drop the rest
# we'll also drop additional columns that won't be useful (e.g. 'Currency' only has one value of USD)
remove_from_drop_cols = ['Discount Code','Shipping Method','Shipping City','Shipping Zip',
                                                'Shipping Province','Notes','Tags','Shipping Province Name']
add_to_drop_cols = ['Currency','Billing Street','Billing Address1','Billing Country','Payment Reference','Vendor',
                    'Outstanding Balance','Source']

# Build a new list instead of aliasing cols_over_half_missing_values, so that
# list is left untouched and this cell can be re-run. Filtering also avoids the
# ValueError that list.remove() raises when a column to keep is not among the
# mostly-missing columns.
drop_cols = [col for col in cols_over_half_missing_values if col not in remove_from_drop_cols]

for col in add_to_drop_cols:
    # skip columns that are already scheduled to be dropped
    if col not in drop_cols:
        drop_cols.append(col)

In [None]:
drop_cols

In [None]:
# drop columns from orders dataset
orders.drop(drop_cols,axis=1,inplace=True)

In [None]:
orders.head()

Let's convert the 'Created at' column to datetime, then create a new column with just the month and year of each order.

In [None]:
orders['Created at'] = pd.to_datetime(orders['Created at'], utc=True).dt.tz_convert('US/Eastern')

In [None]:
orders['Created at'].dtype

In [None]:
orders['order_month'] = orders['Created at'].dt.strftime('%Y-%m')
orders['order_month'] = pd.to_datetime(orders['order_month'])
orders['order_month'] = orders['order_month'].dt.date

Now let's explore some of our features to determine if there is additional cleaning we can do.

In [None]:
# number of unique line items (products)
len(orders['Lineitem name'].unique())

In [None]:
orders['Lineitem name'].value_counts()

Looks like we have some suspicious "products", e.g. UPS Shipping. Since "legit" products most likely contain certain words (e.g. "beef", "pork", "chicken"), let's filter those out of the 'Lineitem name' column and investigate the remaining suspicious items further.

In [None]:
# Collect line items whose name contains none of the product keywords.
# Two views are built in a single pass:
#   suspicious_items_dict - item name -> number of occurrences
#   suspicious_items_list - every occurrence, in order (duplicates kept)
product_keywords = ('beef','pork','chicken','turkey','steak','bundle','box','bone','egg','steer','rib')

suspicious_items_dict = {}
suspicious_items_list = []
for item in orders['Lineitem name']:
    lowered_name = item.lower()
    if any(keyword in lowered_name for keyword in product_keywords):
        continue
    suspicious_items_dict[item] = suspicious_items_dict.get(item, 0) + 1
    suspicious_items_list.append(item)

In [None]:
suspicious_items_list

In [None]:
# convert dict to df
suspicious_items_df = pd.DataFrame.from_dict(suspicious_items_dict, orient='index').reset_index()
suspicious_items_df = suspicious_items_df.rename(columns={'index':'Lineitem name', 0:'count'})
suspicious_items_df.sort_values('count', ascending=False)

Now that we've narrowed our list of suspicious products down, let's investigate them further to determine if they should be removed from our dataset for analysis.

In [None]:
num_suspicious_lines = len(orders[orders['Lineitem name'].isin(suspicious_items_list)])
num_total_lines = len(orders)
print('Count of suspicious line items:', num_suspicious_lines)
print('Suspicious line items as a % of total line items:', round(num_suspicious_lines/num_total_lines*100,2))

In [None]:
orders[orders['Lineitem name'].isin(suspicious_items_list)][['Lineitem name','Lineitem quantity','Total','Subtotal','Discount Amount',
                                                       'Lineitem price','Created at']]

In [None]:
# total amount paid from products
orders[orders['Lineitem name'].isin(suspicious_items_list)]['Total'].value_counts()

In [None]:
# Lineitem names for suspicious_items with total = $0
orders[(orders['Lineitem name'].isin(suspicious_items_list)) & (orders['Total']==0)]['Lineitem name']

Since the majority of the suspicious_items have a total amount paid of $0 and suspicious_items only make up <3\% of the total number of line items, let's drop these rows from our dataset. 

In [None]:
suspicious_items_index = list(orders[orders['Lineitem name'].isin(suspicious_items_list)].index)
orders = orders.drop(suspicious_items_index)

In [None]:
# check that rows were dropped
orders[orders['Lineitem name'].isin(suspicious_items_list)]

We know that Apsey Farms sometimes gives away products for free for promotions, gifts, etc. We won't consider these giveaways to be "true" sales/orders, so let's drop them from our dataset.

In [None]:
len(orders[orders['Total']==0])

In [None]:
# check for orders with $0 total
orders[orders['Total']==0].head(10)

In [None]:
free_giveaways_index = list(orders[orders['Total']==0].index)
orders = orders.drop(free_giveaways_index)

In [None]:
# check that rows were dropped
orders[orders['Total']==0]

In [None]:
# number of line items (our true orders/sales) we're left with for analysis
len(orders)

## Modify product crosswalk to be a comprehensive single source of truth
#### Add items from orders data not already in crosswalk to crosswalk

In [None]:
# get unique item names from orders, convert to df, and add to crosswalk
unique_lineitem_names = pd.DataFrame(orders['Lineitem name'].unique(), columns=['Lineitem name'])
product_crosswalk_full = pd.concat([product_crosswalk,unique_lineitem_names])

In [None]:
product_crosswalk_full

In [None]:
# check for duplicates
product_crosswalk_full.duplicated(['Lineitem name']).sum()

In [None]:
# remove duplicates, keeping the first entry as it contains the associated feature values
product_crosswalk_full = product_crosswalk_full.drop_duplicates(['Lineitem name'])

In [None]:
# confirm duplicates were dropped (370 - 72 = 298)
product_crosswalk_full.shape

#### Use Fuzzy Matching to impute values for missing items. 

In [None]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

In [None]:
# first let's separate the items with known values from those with missing values
known_items = product_crosswalk_full[product_crosswalk_full['item_name'].notna()]['item_name']
missing_items = product_crosswalk_full[product_crosswalk_full['item_name'].isna()]['Lineitem name']
print(known_items[:5])
print('\n')
print(missing_items[:5])
print('\n')
print('Total # known items:',len(known_items))
print('Total # missing items:',len(missing_items))

In [None]:
# use fuzzywuzzy.process.extractOne() to find the top known item match for each missing item
fuzzy_top_choice = {}
for item in missing_items:
    fuzzy_top_choice[item] = process.extractOne(item, known_items)
fuzzy_top_choice

For the most part, it appears that choices with a score of 90 or greater (the second value in the choice tuple) are a good match with the missing item. However, fuzzywuzzy did not do as well when the score is less than 90. So, we'll impute the values for missing items using choices with a score greater than or equal to 90. For cases where the score was less than 90, we'll use a different fuzzy function (fuzz.token_set_ratio) to pull out the top 3 matches so that we can manually choose the best one.

In [None]:
# use score_cutoff parameter to filter for good matches using process.extractOne
# Items with no match that clears the cutoff map to None.
# NOTE(review): the accompanying text describes a threshold of 90, while the
# cutoff passed here is 89. Whether a score of exactly 89 is kept depends on
# how the installed fuzzywuzzy version compares against score_cutoff - confirm.
fuzzy_top_choice = {}
for item in missing_items:
    fuzzy_top_choice[item] = process.extractOne(item, known_items, score_cutoff=89)
fuzzy_top_choice

In [None]:
# we'll use these libraries to extract keys with top values from dictionaries
import heapq
from operator import itemgetter

In [None]:
# Split the fuzzy_top_choice results into two dataframes:
#   good_matches_df - items whose best match cleared the score cutoff
#   fuzzy_below_90  - items without such a match, each with its three best
#                     fuzz.token_set_ratio candidates for manual labelling
# Rows are accumulated in plain Python containers and each dataframe is built
# once after the loop. This avoids the repeated concat/rebuild on every
# iteration and guarantees that both names exist even when one group is empty.
# fuzzy_below_90 gets a default 0..n-1 index; the later
# reset_index().drop('index') step handles it unchanged.

below_cutoff_rows = []
good_matches = {}

for key, value in fuzzy_top_choice.items():
    if value is None:
        # no match cleared the cutoff: rank every known item by token_set_ratio
        ratios = {item: fuzz.token_set_ratio(key, item) for item in known_items}
        # top 3 as (choice name, score) tuples, best first
        top_3_items = heapq.nlargest(3, ratios.items(), key=itemgetter(1))
        # keep only the choice names, not the scores
        below_cutoff_rows.append([key] + [choice for choice, _score in top_3_items])
    else:
        # value is the (choice, score, index) tuple returned by extractOne
        good_matches[key] = value

fuzzy_below_90 = pd.DataFrame(below_cutoff_rows,
                              columns=['Lineitem name','choice_1','choice_2','choice_3'])
good_matches_df = pd.DataFrame.from_dict(good_matches, orient='index', columns=['choice','score','index'])

In [None]:
print('Number of "good" matches:',len(good_matches))
print('Number of "bad" matches:',len(fuzzy_below_90))

In [None]:
good_matches_df = good_matches_df.reset_index().drop(['score','index'],axis=1)
good_matches_df = good_matches_df.rename(columns={'level_0':'Lineitem name','choice':'item_name'})
good_matches_df

In [None]:
fuzzy_below_90 = fuzzy_below_90.reset_index().drop('index',axis=1)
fuzzy_below_90

In [None]:
# export fuzzy_below_90 to csv so that we can manually label the best match
fuzzy_below_90.to_csv('fuzzy_below_90.csv')

Let's impute values for good matches using the matching items in product_crosswalk_full, then we'll add the good matches back to our product crosswalk. Note that we'll need to drop the old, non-imputed items from product crosswalk; we'll do this at the end, after we've added values for good and bad matches to the product crosswalk.

In [None]:
good_matches_impute = pd.merge(left=good_matches_df, right=product_crosswalk_full, how='left', on='item_name')
good_matches_impute = good_matches_impute.drop('Lineitem name_y',axis=1).rename(columns={'Lineitem name_x':'Lineitem name'})
good_matches_impute

In [None]:
product_crosswalk_final = pd.concat([product_crosswalk_full,good_matches_impute])

In [None]:
product_crosswalk_final

We manually labeled the best choice for our fuzzy_below_90 matches. Let's pull the data back in and add it to our product crosswalk.

In [None]:
fuzzy_below_90_labeled = pd.read_csv('/Users/josh/Documents/Data Science/Apsey Farms/fuzzy_below_90_labeled.csv')

In [None]:
fuzzy_below_90_labeled.drop(['Unnamed: 0','choice_1','choice_2','choice_3'],axis=1,inplace=True)
fuzzy_below_90_labeled.rename(columns={'final_choice':'item_name'},inplace=True)

In [None]:
fuzzy_below_90_labeled.head()

Now we'll impute values for our fuzzy_below_90 matches using the matching items in product_crosswalk_full, then add those matches back to our product crosswalk. 

In [None]:
fuzzy_matches_impute = pd.merge(left=fuzzy_below_90_labeled, right=product_crosswalk_full, how='left', on='item_name')
fuzzy_matches_impute = fuzzy_matches_impute.drop('Lineitem name_y',axis=1).rename(columns={'Lineitem name_x':'Lineitem name'})
fuzzy_matches_impute

In [None]:
fuzzy_matches_impute['product_type'].isna().sum()

Looks like we have a number of items that did not match a previously defined product, so we'll have to fill in values for these. Notes:
 * for "bulk" items, the 'Lineitem quantity' field in the orders data indicates the weight in pounds, rather than quantity of items ordered. We'll leave the quantity columns blank for now, and impute those values when we merge the product crosswalk back to our orders data.
 * items labeled "eggs" will not have an associated weight, rather we measure quantity by the dozen. So, we'll also leave the quantity columns blank for those items.

In [None]:
fuzzy_matches_impute.to_csv('fuzzy_matches_impute.csv')

In [None]:
fuzzy_matches_impute_labeled = pd.read_csv('/Users/josh/Documents/Data Science/Apsey Farms/fuzzy_matches_impute_labeled.csv')

In [None]:
fuzzy_matches_impute_labeled

Let's merge these items into our product crosswalk, then drop the old, non-imputed/duplicate items.

In [None]:
product_crosswalk_final = pd.concat([product_crosswalk_final,fuzzy_matches_impute_labeled])

In [None]:
product_crosswalk_final

In [None]:
product_crosswalk_final.reset_index(inplace=True)

In [None]:
product_crosswalk_final.drop('index',axis=1,inplace=True)

In [None]:
product_crosswalk_final['item_name'].isnull().sum()

In [None]:
null_items = list(product_crosswalk_final[product_crosswalk_final['item_name'].isnull()].index)

In [None]:
product_crosswalk_final = product_crosswalk_final.drop(null_items)

In [None]:
# check that null_items were dropped
product_crosswalk_final['item_name'].isnull().sum()

In [None]:
# check that product crosswalk contains 298 items
len(product_crosswalk_final)

#### Use weights specified in item names to update item attributes
For example, 'Lineitem Name' = 'Ground Beef - 6 lbs' would previously have been matched with the standard ground beef item and assumed its standard quantity of 1 lb; however, 'quantity_beef_lb' for this item should instead be 6 (lbs).

In [None]:
# find all Lineitem name's containing 'lb' or 'oz'
# only include product_type = 'Single item' since we've manually populated values for some bundles and don't want
# values for bulk items
single_products = product_crosswalk_final[product_crosswalk_final['product_type']=='Single item']
products_with_quantity = single_products[(single_products['Lineitem name'].str.contains('lb')) | (single_products['Lineitem name'].str.contains('oz'))]
products_with_quantity

In [None]:
import re

In [None]:
# Extract the quantity and unit of measurement from each Lineitem name.
#   quantity - a number with an optional trailing '+', optionally followed by
#              '-' and a second number to form a range (e.g. '6', '5+',
#              '1.5-2.5', '10 - 12.5')
#   measure  - 'lb', 'lbs' or 'oz' in any letter case, optionally separated
#              from the quantity by a space and/or hyphen ('6 lbs', '8oz',
#              '5-lb')
# The upper bound of a range may have any number of digits, and the unit must
# be spelled out in full, so a bare 'l' or 'o' after a number is not treated
# as a unit.
pattern = (
    r"(?P<quantity>\d*\.?\d+\+?(?:\s?-\s?\d*\.?\d+)?)"
    r"\s?-?\s?"
    r"(?P<measure>lbs?|oz)"
)
quantity_extract = products_with_quantity['Lineitem name'].str.extract(pattern, flags=re.I)
# The match is case-insensitive but later comparisons use lowercase 'oz',
# so normalise the captured unit here.
quantity_extract['measure'] = quantity_extract['measure'].str.lower()
quantity_extract

In [None]:
# add extract to products_with_quantity dataframe by joining on the index
products_quant_extracted = pd.merge(left=products_with_quantity, right=quantity_extract, how='left', left_index=True, right_index=True)
products_quant_extracted

In [None]:
# drop rows with missing quantity values
drop_rows = list(products_quant_extracted[products_quant_extracted['quantity'].isna()].index)
products_quant_extracted = products_quant_extracted.drop(drop_rows)

In [None]:
# check that rows were dropped
print(products_quant_extracted['quantity'].isna().sum())
print(len(products_quant_extracted))

In [None]:
# turn ranges of values in the quantity column into a single value by taking the average of the range min and max
# then create a new column with this value
def find_avg_quantity(value):
    """Return the midpoint of a 'low-high' range, or ``value`` if it is no range.

    The value is inspected through its string form. Exactly one '-' yields
    the float average of the two sides, no '-' returns ``value`` untouched,
    and anything else returns None.
    """
    pieces = str(value).split('-')
    if len(pieces) == 2:
        low, high = pieces
        return (float(low) + float(high)) / 2
    if len(pieces) == 1:
        return value
    return None
    
products_quant_extracted['quantity_avg'] = products_quant_extracted['quantity'].apply(find_avg_quantity)

In [None]:
products_quant_extracted.head()

In [None]:
products_quant_extracted['quantity_avg'].value_counts()

In [None]:
# remove '+' from quantity_avg
def remove_plus_sign(value):
    """Strip every '+' from a string quantity such as '5+'.

    Non-string values (for example the float midpoints produced for ranges)
    are returned unchanged. Only real strings are edited, because a float
    whose text form contains '+' (such as 1e+20) has no ``replace`` method.
    """
    if isinstance(value, str):
        return value.replace('+','')
    return value

products_quant_extracted['quantity_avg'] = products_quant_extracted['quantity_avg'].apply(remove_plus_sign)

In [None]:
products_quant_extracted.head(8)

In [None]:
products_quant_extracted['measure'].value_counts()

In [None]:
# convert quantity_avg's in oz to lbs
products_quant_extracted['quantity_avg'] = products_quant_extracted['quantity_avg'].astype('float')
products_quant_extracted['quantity_avg_lb'] = np.where(products_quant_extracted['measure']=='oz', 
                                         products_quant_extracted['quantity_avg']/16, products_quant_extracted['quantity_avg'])

In [None]:
products_quant_extracted.head()

In [None]:
# use quantity_avg to update quantity values
# excluding Turkey since we updated those manually previously
products_quant_extracted['quantity_beef_lb'] = np.where(products_quant_extracted['enterprise']=='Beef',
                                                     products_quant_extracted['quantity_avg_lb'],0)
products_quant_extracted['quantity_pork_lb'] = np.where(products_quant_extracted['enterprise']=='Pork',
                                                     products_quant_extracted['quantity_avg_lb'],0)
products_quant_extracted['quantity_chicken_lb'] = np.where(products_quant_extracted['enterprise']=='Chicken',
                                                     products_quant_extracted['quantity_avg_lb'],0)

In [None]:
products_quant_extracted.head()

In [None]:
# reset total_quantity_lb
products_quant_extracted['total_quantity_lb'] = products_quant_extracted['quantity_beef_lb']+products_quant_extracted['quantity_pork_lb']+products_quant_extracted['quantity_chicken_lb']+products_quant_extracted['quantity_turkey_lb']
products_quant_extracted.head()

In [None]:
# drop unnecessary columns
products_quant_extracted = products_quant_extracted.drop(['quantity','measure','quantity_avg','quantity_avg_lb'],
                                                        axis=1)
products_quant_extracted.head()

In [None]:
products_quant_extracted.shape

Now that we've updated the quantity attributes for items that contained a quantity in their name, let's merge these items back into our product crosswalk.

In [None]:
product_crosswalk_final = pd.concat([product_crosswalk_final,products_quant_extracted])

In [None]:
product_crosswalk_final.duplicated(['Lineitem name']).sum()

In [None]:
# we want to keep the last duplicate row, since products_quant_extracted was added to the end of the 
# product_crosswalk_final df
product_crosswalk_final.drop_duplicates(['Lineitem name'],keep='last',inplace=True)

In [None]:
product_crosswalk_final.reset_index(inplace=True)

In [None]:
product_crosswalk_final.drop('index',axis=1,inplace=True)

In [None]:
product_crosswalk_final.head()

In [None]:
product_crosswalk_final.shape

In [None]:
product_crosswalk_final['product_type'].value_counts()

In [None]:
product_crosswalk_final['enterprise'].value_counts()

In [None]:
product_crosswalk_final.to_csv('product_crosswalk_final.csv')

## Add features to Orders data that we'll use in our analysis

In [None]:
orders_clean = orders.copy()

In [None]:
# merge orders with product crosswalk
orders_clean = pd.merge(left=orders_clean, right=product_crosswalk_final, how='left', on='Lineitem name')

In [None]:
orders_clean.head()

In [None]:
orders_clean[orders_clean['total_quantity_lb'].isna()]

In [None]:
# create new columns with the total item weight and weight per enterprise
# note: doesn't apply to bulk items and eggs
orders_clean['total_item_weight'] = orders_clean['Lineitem quantity'] * orders_clean['total_quantity_lb']
orders_clean['item_weight_beef'] = orders_clean['Lineitem quantity'] * orders_clean['quantity_beef_lb']
orders_clean['item_weight_pork'] = orders_clean['Lineitem quantity'] * orders_clean['quantity_pork_lb']
orders_clean['item_weight_chicken'] = orders_clean['Lineitem quantity'] * orders_clean['quantity_chicken_lb']
orders_clean['item_weight_turkey'] = orders_clean['Lineitem quantity'] * orders_clean['quantity_turkey_lb']

In [None]:
orders_clean.head(10)

In [None]:
orders_clean.to_csv('orders_clean.csv')

# Analysis

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from datetime import datetime
import geopandas as gpd
from shapely.geometry import Point, Polygon

In [None]:
# NOTE(review): absolute, machine-specific path; presumably the same file that
# orders_clean.to_csv('orders_clean.csv') wrote earlier - confirm before re-running elsewhere
orders_clean = pd.read_csv('/Users/josh/Documents/Data Science/Apsey Farms/orders_clean.csv')

In [None]:
# the row index saved in the CSV comes back as an 'Unnamed: 0' column; remove it
orders_clean.drop(columns='Unnamed: 0', inplace=True)

In [None]:
# show up to 100 columns when a dataframe is displayed
pd.set_option('display.max_columns', 100)

In [None]:
# preview the reloaded orders
orders_clean.head()

In [None]:
# convert 'Created at' to datetime: parse as UTC first, then express in US/Eastern time
created_utc = pd.to_datetime(orders_clean['Created at'], utc=True)
orders_clean['Created at'] = created_utc.dt.tz_convert('US/Eastern')

## Understanding the Customer Base

In [None]:
# total number of unique customers (distinct 'Email' values; a missing email counts as one value)
orders_clean['Email'].nunique(dropna=False)

In [None]:
# recurring customers: emails that appear on more than one distinct order.
# Each row is a line item, so counting rows per email would also flag a customer
# who bought several products in a single order; 'Name' identifies the order.
(orders_clean.groupby('Email')['Name'].nunique() > 1).sum()

In [None]:
# total Subtotal per customer email, highest spender first.
# 'Subtotal' is selected before summing so only that column is aggregated; summing
# the whole frame also tries to add up the datetime and text columns, which
# recent pandas versions reject.
customer_order_totals = (
    orders_clean.groupby('Email')['Subtotal']
    .sum()
    .sort_values(ascending=False)
    .reset_index()
)
customer_order_totals

In [None]:
# sanity check: print the sum of the per-customer totals, then the overall Subtotal, to compare them
for subtotal_sum in (customer_order_totals['Subtotal'].sum(), orders_clean['Subtotal'].sum()):
    print(subtotal_sum)

In [None]:
# grand total of the per-customer Subtotal sums (denominator for the percentages)
total_customer_dollars = customer_order_totals['Subtotal'].sum()
total_customer_dollars

In [None]:
# each customer's share of the grand total, as a fraction between 0 and 1
customer_order_totals['pct_of_total'] = customer_order_totals['Subtotal'].div(total_customer_dollars)

In [None]:
# preview the top customers with their share of the total
customer_order_totals.head()

In [None]:
# combined share of the total held by the first 150 rows of customer_order_totals
customer_order_totals['pct_of_total'].head(150).sum()

In [None]:
# plot running total/cumulative sum of the customers' share of the total
cumulative_share = customer_order_totals['pct_of_total'].cumsum()
ax = cumulative_share.plot()
ax.set_title('Cumulative Sum of Order Amount ($)')
ax.set_xlabel('Number of Customers')
ax.set_ylabel('% of Total Order Amount ($)')
# reference lines: 150 customers on the x axis, 75% of the total on the y axis
ax.axvline(x=150, color='r', linestyle='--')
ax.axhline(y=0.75, color='r', linestyle='--');

In [None]:
# total number of orders = number of distinct order names (each order spans one or more rows)
orders_clean['Name'].nunique()

In [None]:
# recurring orders
# counts of each value in the 'Tags' column, with untagged rows (NaN) included
orders_clean['Tags'].value_counts(dropna=False)

## Customer Order Amount by State

In [None]:
# total Subtotal per shipping state, as a frame with columns state_name / order_amount.
# 'Subtotal' is selected before summing so only that column is aggregated; summing
# the whole frame also tries to add up the datetime and text columns, which
# recent pandas versions reject.
state_dollars = (
    orders_clean.groupby('Shipping Province Name')['Subtotal']
    .sum()
    .rename('order_amount')
    .rename_axis('state_name')
    .reset_index()
)
state_dollars.sort_values('order_amount',ascending=False)

In [None]:
# sanity check: print the overall Subtotal, then the sum of the per-state totals, to compare them
for subtotal_sum in (orders_clean['Subtotal'].sum(), state_dollars['order_amount'].sum()):
    print(subtotal_sum)

In [None]:
# number of order rows without a shipping state
orders_clean['Shipping Province Name'].isnull().sum()

In [None]:
# load the state boundaries shapefile
# NOTE(review): absolute, machine-specific path - adjust before re-running elsewhere
usa = gpd.read_file('/Users/josh/Documents/Data Science/Apsey Farms/States 21basic/geo_export_99f25753-6a02-4b7a-b22f-2d3e41e2a010.shp')

In [None]:
# preview the shapefile attributes
usa.head()

In [None]:
# quick look at the raw map; the trailing semicolon hides the returned object in the cell output
usa.plot();

In [None]:
# remove Hawaii and Alaska
# NOTE(review): relies on those two states sitting at index labels 0 and 50 of the shapefile - confirm
state_map = usa.drop(index=[0, 50])
state_map.plot();

In [None]:
# add each state's order amount to the map; the left join keeps every mapped state,
# and states without any orders get an order amount of 0
state_map_dollars = state_map.merge(state_dollars, how='left', on='state_name')
state_map_dollars['order_amount'] = state_map_dollars['order_amount'].fillna(0)
state_map_dollars.head()

In [None]:
# bucket the states that have orders into three spending tiers:
# >= $5000, $1000 up to (not including) $5000, and < $1000
order_amounts = state_dollars['order_amount']
high_dollar_states = list(state_dollars[order_amounts>=5000]['state_name'])
medium_dollar_state = list(state_dollars[(order_amounts>=1000) & (order_amounts<5000)]['state_name'])
low_dollar_states = list(state_dollars[order_amounts<1000]['state_name'])

fig, ax = plt.subplots(figsize=(12,12))
# faint base layer so states without any orders are still outlined
state_map_dollars.plot(ax=ax, edgecolor='b', alpha=0.1)

# draw each tier with a single plot call instead of one call per state; a tier whose
# states are all absent from the map is skipped rather than plotting an empty frame
tier_colors = [
    (high_dollar_states, 'darkred'),
    (medium_dollar_state, 'lightcoral'),
    (low_dollar_states, 'mistyrose'),
]
for tier_states, tier_color in tier_colors:
    tier_shapes = state_map_dollars[state_map_dollars['state_name'].isin(tier_states)]
    if not tier_shapes.empty:
        tier_shapes.plot(ax=ax, color=tier_color, edgecolor='b', linewidth=1)

In [None]:
# which states order the most of each enterprise (total weight per state, sorted by beef).
# The weight columns are selected before summing so only they are aggregated; summing
# the whole frame also tries to add up the datetime and text columns, which
# recent pandas versions reject.
enterprise_weight_cols = ['item_weight_beef','item_weight_pork',
                          'item_weight_chicken','item_weight_turkey']
(orders_clean.groupby('Shipping Province Name')[enterprise_weight_cols]
 .sum()
 .sort_values('item_weight_beef',ascending=False))

## Orders by Product Type

In [None]:
# share of line items per product type, largest slice first
line_items_by_type = orders_clean.groupby('product_type').size().sort_values(ascending=False)
type_colors = ['cornflowerblue', 'mediumseagreen', 'coral']
line_items_by_type.plot.pie(autopct = '%.1f%%', colors=type_colors)
plt.title('Product Type % of \nTotal Number of Line Items Ordered')
plt.ylabel('');

In [None]:
# share of the total Subtotal per product type, largest slice first.
# 'Subtotal' is selected before summing so only that column is aggregated; summing
# the whole frame also tries to add up the datetime and text columns, which
# recent pandas versions reject.
# NOTE(review): assumes 'Subtotal' is meaningful on every line-item row - confirm
amount_by_type = orders_clean.groupby('product_type')['Subtotal'].sum().sort_values(ascending=False)
amount_by_type.plot.pie(autopct = '%.1f%%',
                        colors=['mediumseagreen',
                                'cornflowerblue',
                                'coral'])
plt.title('Product Type % of Total Order Amount ($)')
plt.ylabel('');

In [None]:
# yearly $ by product type: one row per order year, one column per product type.
# 'Subtotal' is selected before summing (summing the whole frame also tries to add up
# the datetime and text columns, which recent pandas versions reject), and unstack()
# directly yields the year-indexed table without a reset_index/set_index round trip.
order_year = orders_clean['Created at'].dt.year
annual_product_amt = orders_clean.groupby([order_year,'product_type'])['Subtotal'].sum().unstack()
annual_product_amt

In [None]:
# stacked bars: one bar per year, one coloured segment per product type
product_type_colors = ['coral','mediumseagreen','cornflowerblue']
ax = annual_product_amt.plot.bar(color=product_type_colors, stacked=True, rot=0)
ax.set_xlabel('')
ax.set_ylabel('Order Amount ($)')
ax.set_title('Annual Order Amount ($) by Product Type');

## Orders by Enterprise

In [None]:
# share of line items per enterprise, largest slice first
enterprise_line_items = orders_clean.groupby('enterprise').size().sort_values(ascending=False)
enterprise_colors = ['mediumseagreen', 'cornflowerblue', 'coral', 'plum',
                     'papayawhip', 'lightgray', 'lightsalmon', 'gold']
enterprise_line_items.plot.pie(colors=enterprise_colors, autopct = '%.1f%%', figsize=(6,6))
plt.title('Enterprise % of \nTotal Number of Line Items Ordered')
plt.ylabel('');

In [None]:
# share of the total Subtotal per enterprise, largest slice first.
# 'Subtotal' is selected before summing so only that column is aggregated; summing
# the whole frame also tries to add up the datetime and text columns, which
# recent pandas versions reject.
amount_by_enterprise = orders_clean.groupby('enterprise')['Subtotal'].sum().sort_values(ascending=False)
amount_by_enterprise.plot.pie(colors=['mediumseagreen',
                                      'papayawhip',
                                      'plum',
                                      'lightgray',
                                      'coral',
                                      'cornflowerblue',
                                      'gold',
                                      'lightsalmon'],
                              autopct = '%.1f%%',
                              figsize=(6,6))
plt.title('Enterprise % of Total Order Amount ($)')
plt.ylabel('');

In [None]:
# yearly $ by enterprise: one row per order year, one column per enterprise.
# 'Subtotal' is selected before summing (summing the whole frame also tries to add up
# the datetime and text columns, which recent pandas versions reject), and unstack()
# directly yields the year-indexed table without a reset_index/set_index round trip.
order_year = orders_clean['Created at'].dt.year
annual_enterprise_amt = orders_clean.groupby([order_year,'enterprise'])['Subtotal'].sum().unstack()
annual_enterprise_amt

In [None]:
# stacked bars: one bar per year, one coloured segment per enterprise
enterprise_bar_colors = ['mediumseagreen', 'plum', 'lightgray', 'papayawhip',
                         'coral', 'lightsalmon', 'cornflowerblue', 'gold']
ax = annual_enterprise_amt.plot.bar(color=enterprise_bar_colors, stacked=True, rot=0)
ax.set_xlabel('')
ax.set_ylabel('Order Amount ($)')
ax.set_title('Annual Order Amount ($) by Enterprise')
# anchor the legend at the upper-right corner of the axes (axes coordinates 1, 1)
ax.legend(bbox_to_anchor=(1,1));

## Orders by Enterprise-Product Type Combination

In [None]:
# number of line items per (enterprise, product type) pair, sorted ascending for the bar chart
combo_line_items = orders_clean.groupby(['enterprise','product_type']).size().sort_values()
ax = combo_line_items.plot.barh()
ax.set_xlabel('Number of Line Items')
ax.set_ylabel('')
ax.set_title('Total Number of Line Items by Enterprise & Product Type');

In [None]:
# order amounts per (enterprise, product type) pair, sorted ascending for the bar chart.
# 'Subtotal' is selected before summing so only that column is aggregated; summing
# the whole frame also tries to add up the datetime and text columns, which
# recent pandas versions reject.
combo_amounts = orders_clean.groupby(['enterprise','product_type'])['Subtotal'].sum().sort_values()
ax = combo_amounts.plot.barh()
ax.set(xlabel='Order Amount ($)', ylabel='', title='Total Order Amount by Enterprise & Product Type');

Create two separate dataframes then merge into one: 1) total # line items & % of total by enterprise-product combination, 2) total order amount ($) and % of total by enterprise-product combination.

In [None]:
# order amount by enterprise-product combinations, largest first.
# 'Subtotal' is selected before summing so only that column is aggregated; summing
# the whole frame also tries to add up the datetime and text columns, which
# recent pandas versions reject.
order_combs_dollar = (
    orders_clean.groupby(['enterprise','product_type'])['Subtotal']
    .sum()
    .sort_values(ascending=False)
)
order_combs_dollar_df = order_combs_dollar.to_frame()
order_combs_dollar_df

In [None]:
# keep each (enterprise, product_type) index pair as a single key column,
# then replace the two-level index with a plain 0..n-1 index
combo_keys = order_combs_dollar_df.index
order_combs_dollar_df['enterprise_product_type'] = combo_keys
order_combs_dollar_df.reset_index(drop=True, inplace=True)
order_combs_dollar_df

In [None]:
# grand total of Subtotal over all order rows
total_orders_amt = orders_clean['Subtotal'].sum()

# express each combination's Subtotal as a percentage of that grand total
order_combs_dollar_df['$_pct_of_total'] = order_combs_dollar_df['Subtotal'].div(total_orders_amt).mul(100)

In [None]:
# number of line items by enterprise-product combinations, largest first
order_combs_num = (
    orders_clean.groupby(['enterprise','product_type'])
    .size()
    .sort_values(ascending=False)
)
# the counts series is unnamed, so its single column is labelled 0
order_combs_num_df = order_combs_num.to_frame()
order_combs_num_df

In [None]:
# keep each (enterprise, product_type) index pair as a single key column,
# replace the two-level index with a plain 0..n-1 index, and name the count column
combo_keys = order_combs_num_df.index
order_combs_num_df['enterprise_product_type'] = combo_keys
order_combs_num_df.reset_index(drop=True, inplace=True)
order_combs_num_df.rename(columns={0:'num_line_items'}, inplace=True)
order_combs_num_df

In [None]:
# total number of line items (rows) across all orders
total_orders_num = orders_clean.shape[0]

# express each combination's line-item count as a percentage of that total
order_combs_num_df['#_pct_of_total'] = order_combs_num_df['num_line_items'].div(total_orders_num).mul(100)

In [None]:
# display the line-item counts and percentages per combination
order_combs_num_df

In [None]:
# merge the two dataframes: counts on the left, dollar amounts joined on the combination key
order_combs_final = order_combs_num_df.merge(order_combs_dollar_df, how='left', on='enterprise_product_type')
order_combs_final

## Drill Down: Products Ordered - All Products

In [None]:
# top 20 products, not taking into account total #/weight ordered
# (re-sorted ascending so the most frequent product ends up at the top of the horizontal bars)
top_20_counts = orders_clean['item_name'].value_counts().head(20).sort_values()
ax = top_20_counts.plot.barh(figsize=(6,6), title='Top 20 Products Ordered')
ax.set_xlabel('Number of Line Items');

In [None]:
# top 20 products by total Subtotal, re-sorted ascending so the largest bar is on top.
# 'Subtotal' is selected before summing so only that column is aggregated; summing
# the whole frame also tries to add up the datetime and text columns, which
# recent pandas versions reject.
amount_by_item = orders_clean.groupby('item_name')['Subtotal'].sum()
top_20_amounts = amount_by_item.sort_values(ascending=False).head(20).sort_values()
ax = top_20_amounts.plot.barh(figsize=(6,6))
ax.set(title='Top 20 Products Ordered by Amount ($): Jan 2018 - July 2021', xlabel='Amount ($)',ylabel='');

In [None]:
# names of the top 5 revenue-generating products (used for the monthly plots).
# 'Subtotal' is selected before summing so only that column is aggregated; summing
# the whole frame also tries to add up the datetime and text columns, which
# recent pandas versions reject.
amount_by_item = orders_clean.groupby('item_name')['Subtotal'].sum()
top_5_products = list(amount_by_item.sort_values(ascending=False).head(5).index)
top_5_products

In [None]:
# Subtotal per 'order_month' for the top 5 products only.
# 'Subtotal' is selected before summing so only that column is aggregated; summing
# the whole frame also tries to add up the datetime and text columns, which
# recent pandas versions reject.
top_5_rows = orders_clean[orders_clean['item_name'].isin(top_5_products)]
ax = top_5_rows.groupby('order_month')['Subtotal'].sum().plot()
ax.set(title='Monthly Revenue for Top 5 Revenue-Generating Products', xlabel='Month of Order', ylabel='Amount ($)');

In [None]:
# zoom in on 2020 and 2021: same monthly revenue for the top 5 products, as bars.
# 'Subtotal' is selected before summing so only that column is aggregated; summing
# the whole frame also tries to add up the datetime and text columns, which
# recent pandas versions reject.
is_top_5 = orders_clean['item_name'].isin(top_5_products)
is_recent = orders_clean['Created at'].dt.year.isin([2020, 2021])
ax = orders_clean[is_top_5 & is_recent].groupby('order_month')['Subtotal'].sum().plot.bar()
ax.set(title='Monthly Revenue for Top 5 Revenue-Generating Products', xlabel='Month of Order', ylabel='Amount ($)');

Find the monthly average revenue

In [None]:
# Subtotal per calendar month (1-12) summed over all years, plus a per-year average.
# 'Subtotal' is selected before summing so only that column is aggregated; summing
# the whole frame also tries to add up the datetime and text columns, which
# recent pandas versions reject.
calendar_month = orders_clean['Created at'].dt.month
monthly_stats = orders_clean.groupby(calendar_month)[['Subtotal']].sum()
# NOTE(review): 4 is presumably the number of years covered (2018-2021) - confirm
monthly_stats['avg_revenue'] = monthly_stats['Subtotal']/4
monthly_stats

In [None]:
# we only have data through 7/2021, so avg_revenue should be 'Subtotal'/ 3 years for months 8-12
# The averages are derived from the Subtotal column, selected by month number, rather
# than from pasted, rounded totals at fixed row positions, so they stay correct if the
# data changes or a month is missing.
late_months = monthly_stats.index >= 8
monthly_stats.loc[late_months, 'avg_revenue'] = monthly_stats.loc[late_months, 'Subtotal'] / 3
monthly_stats

In [None]:
# monthly averages, where Jan = 1 and Dec = 12
avg_by_month = monthly_stats['avg_revenue']
ax = avg_by_month.plot.bar(rot=0)
ax.set_title('Average Monthly Revenue from All Product Orders')
ax.set_xlabel('Month')
ax.set_ylabel('Average Amount ($)');

## Drill Down: Products Ordered - Single Items

In [None]:
# top 10 single items by $
# .copy() makes single_item_orders an independent frame, so columns can be added to it
# later without pandas' SettingWithCopyWarning
single_item_orders = orders_clean[orders_clean['product_type']=='Single item'].copy()
# 'Subtotal' is selected before summing so only that column is aggregated; summing
# the whole frame also tries to add up the datetime and text columns, which
# recent pandas versions reject.
single_item_amounts = single_item_orders.groupby('item_name')['Subtotal'].sum()
ax = single_item_amounts.sort_values(ascending=False).head(10).sort_values().plot.barh()
ax.set(title='Top 10 Single Items Ordered by Amount ($): \nJan 2018 - July 2021', xlabel='Amount ($)',ylabel='');

In [None]:
# top 10 single items by $, restricted to orders created in 2020 or 2021
is_single_item = orders_clean['product_type']=='Single item'
is_recent = orders_clean['Created at'].dt.year.isin([2020, 2021])
single_item_orders_recent = orders_clean[is_single_item & is_recent]
# 'Subtotal' is selected before summing so only that column is aggregated; summing
# the whole frame also tries to add up the datetime and text columns, which
# recent pandas versions reject.
recent_amounts = single_item_orders_recent.groupby('item_name')['Subtotal'].sum()
ax = recent_amounts.sort_values(ascending=False).head(10).sort_values().plot.barh()
ax.set(title='Top 10 Single Items Ordered by Amount ($): \nJan 2020 - July 2021', xlabel='Amount ($)',ylabel='');

In [None]:
# calculate price per pound for single items
# The unit price is divided by the weight of one unit ('total_quantity_lb').
# 'total_item_weight' is quantity x unit weight, so dividing a unit price by it
# understates the price per pound whenever more than one unit was ordered.
# NOTE(review): assumes 'Lineitem price' is the price of a single unit - confirm
# against the order export documentation.
# assign() returns a new frame, which avoids writing into a slice of orders_clean.
single_item_orders = single_item_orders.assign(
    price_per_pound=single_item_orders['Lineitem price'] / single_item_orders['total_quantity_lb']
)

In [None]:
# 10 single items with the highest average price per pound.
# 'price_per_pound' is selected before averaging so only that column is aggregated;
# averaging the whole frame also tries to average the text columns, which
# recent pandas versions reject.
single_item_orders.groupby('item_name')['price_per_pound'].mean().sort_values(ascending=False).head(10)

In [None]:
# inspect the individual order rows for one product
single_item_orders[single_item_orders['item_name']=='Beef - Hanger Steak']

In [None]:
# top single items by weight - if uneven, might indicate not utilizing full carcass
# 'total_item_weight' is selected before summing so only that column is aggregated;
# summing the whole frame also tries to add up the datetime and text columns, which
# recent pandas versions reject.
weight_by_item = single_item_orders.groupby('item_name')['total_item_weight'].sum()
ax = weight_by_item.sort_values(ascending=False).head(10).sort_values().plot.barh()
# the x axis shows pounds, not dollars
ax.set(title='Top 10 Single Items Ordered by Weight (lbs): \nJan 2018 - July 2021', xlabel='Weight (lbs)',ylabel='');

## Drill Down: Products Ordered - Bundles

In [None]:
# top 10 bundles by $
bundle_orders = orders_clean[orders_clean['product_type']=='Bundle']
# 'Subtotal' is selected before summing so only that column is aggregated; summing
# the whole frame also tries to add up the datetime and text columns, which
# recent pandas versions reject.
bundle_amounts = bundle_orders.groupby('item_name')['Subtotal'].sum()
ax = bundle_amounts.sort_values(ascending=False).head(10).sort_values().plot.barh()
ax.set(title='Top 10 Bundles Ordered by Amount ($): \nJan 2018 - July 2021', xlabel='Amount ($)',ylabel='');