In [None]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests
import re
from bs4 import BeautifulSoup

Get the URLs of all the single-item pages on apseyfarms.com that we'll need to scrape.

In [None]:
def get_urls(url_list, timeout=30):
    """Collect every "/collections" link found on the given pages.

    Parameters
    ----------
    url_list : iterable of str
        Pages to download and scan for anchor tags.
    timeout : float, optional
        Seconds to wait for each HTTP response. Without a timeout,
        ``requests.get`` can block indefinitely on a stalled server.

    Returns
    -------
    list of str
        One entry per matching anchor, in page order, each built by
        prefixing the raw ``href`` with 'http://www.apseyfarms.com'.
        Duplicates are kept.

    Raises
    ------
    requests.HTTPError
        If a page answers with a 4xx/5xx status.
    requests.Timeout
        If a page does not answer within ``timeout`` seconds.
    """
    links = []
    # Matches any href that *contains* "/collections" (not only relative ones).
    collections_href = re.compile("/collections")
    for url in url_list:
        response = requests.get(url, timeout=timeout)
        # Fail loudly rather than silently scraping an error page.
        response.raise_for_status()
        content = response.content.decode("utf-8")
        parser = BeautifulSoup(content, 'html.parser')

        # The site prefix is prepended unconditionally, so an href that is
        # already absolute yields a doubled URL. That output is left as-is
        # because the urls_to_remove cell filters on that exact string.
        for link in parser.find_all('a', attrs={'href': collections_href}):
            links.append('http://www.apseyfarms.com' + link.get('href'))
    return links

In [None]:
# Collection listing pages to crawl; paginated collections are listed page by page.
urls_to_scrape = [
    'https://apseyfarms.com/collections/beef-1',
    'https://apseyfarms.com/collections/beef-1?page=2',
    'https://apseyfarms.com/collections/chicken',
    'https://apseyfarms.com/collections/pork',
    'https://apseyfarms.com/collections/pork?page=2',
]

In [None]:
# Download each listing page and gather every "/collections" link it contains
# (this performs one HTTP request per entry in urls_to_scrape).
single_item_links = get_urls(urls_to_scrape)

In [None]:
# Show the full list of scraped links, including navigation/pagination links.
single_item_links

In [None]:
# Links picked up by the "/collections" filter that are not single-item pages
# (collection index, bundles, pagination links).
urls_to_remove = ['http://www.apseyfarms.com/collections', 'http://www.apseyfarms.comhttps://apseyfarms.com/collections/bundles',
                 'http://www.apseyfarms.com/collections/beef-1?page=2','http://www.apseyfarms.com/collections/beef-1?page=1',
                 'http://www.apseyfarms.com/collections/pork?page=2','http://www.apseyfarms.com/collections/pork?page=1']

# Drop every occurrence of each unwanted link. The previous remove()/except/break
# loop stopped at the first URL that was already gone, so later URLs in the list
# could be left behind, and it capped removals at 10 passes. Slice assignment
# keeps the same list object, so the filtering still happens in place.
unwanted_urls = set(urls_to_remove)
single_item_links[:] = [url for url in single_item_links if url not in unwanted_urls]

In [None]:
# Spot-check the first ten links that remain after filtering.
single_item_links[:10]

Now that we have all the URLs we'll be scraping saved in a list, let's iterate through those pages and pull out the info we need: item name and quantity (package size).

In [None]:
def get_item_info(url_list, timeout=30):
    """Scrape the item name and raw package-size text from each product page.

    The name is the text between the first '"title":"' and the first
    '","handle":' in the page source. The size is the 50 characters that
    follow the first 'Package size:' and is deliberately left uncleaned.

    Parameters
    ----------
    url_list : iterable of str
        Product page URLs to download.
    timeout : float, optional
        Seconds to wait for each HTTP response before giving up.

    Returns
    -------
    dict
        Maps item name -> raw size text. Pages that yield the same name
        overwrite one another (last one wins).

    Raises
    ------
    requests.HTTPError
        If a page answers with a 4xx/5xx status.
    requests.Timeout
        If a page does not answer within ``timeout`` seconds.
    """
    title_open = '"title":"'
    title_close = '","handle":'
    size_label = 'Package size:'

    item_dict = {}
    for url in url_list:
        response = requests.get(url, timeout=timeout)
        # Fail loudly rather than slicing text out of an error page.
        response.raise_for_status()
        # Only the raw text is searched; the page is no longer parsed with
        # BeautifulSoup because the parsed tree was never used.
        content = response.content.decode("utf-8")

        # NOTE: str.find returns -1 when a marker is missing, in which case
        # the slices below return unrelated text rather than raising.
        item_name_start_index = content.find(title_open) + len(title_open)
        item_name_end_index = content.find(title_close)
        item_name = content[item_name_start_index:item_name_end_index]

        item_size_start_index = content.find(size_label) + len(size_label)
        item_size = content[item_size_start_index:item_size_start_index + 50]

        item_dict[item_name] = item_size
    return item_dict

In [None]:
# Scrape every remaining link: builds a dictionary mapping item name to the raw
# 50-character text that follows 'Package size:' on that page (one request per link).
single_item_dict = get_item_info(single_item_links)

In [None]:
# Inspect the raw scrape results (item name -> uncleaned size text).
single_item_dict

The scraped info is a bit messy, so we'll load the dictionary into a pandas DataFrame in order to clean up the data and structure it into our desired format.

In [None]:
# One row per scraped item: the dict keys become 'item_name', the values 'item_size'.
single_item_df = (
    pd.DataFrame.from_dict(single_item_dict, orient='index')
    .reset_index()
    .rename(columns={'index': 'item_name', 0: 'item_size'})
)

In [None]:
# Preview the freshly built frame (item_name, item_size).
single_item_df.head()

In [None]:
# In most rows the quantity sits before the word 'Ingredients', so keep only
# the text to the left of its first occurrence in a new column.
single_item_df['item_size_new'] = (
    single_item_df['item_size']
    .str.split('Ingredients')
    .str.get(0)
)

In [None]:
# Preview the frame with the new item_size_new column.
single_item_df.head()

In [None]:
# Strip scraping debris from the item quantity text. The substitutions run in
# list order, each one on the result of the previous one.
# NOTE(review): the '...social-meta-tags.l' entry appears twice, and the longer
# '...social-meta-tags.liq' -> 'b' entry looks unable to match because the
# shorter prefix has already been removed by then -- confirm the intended order.
scrape_debris = [
    ('\n', ''),
    ('"> <!-- /snippets/social-meta-tags.l', ''),
    ('roughly ', ''),
    ('"> <!-- /snippets/social-meta-tags.l', ''),
    ('"> <!-- /snippets/social-meta-tags.liq', 'b'),
    (' "> <!-- /snippets/social-meta-t', ''),
    (' tubes', ''),
    (' cuts', ''),
    (' steaks', ''),
]
for debris, substitute in scrape_debris:
    single_item_df['item_size_new'] = single_item_df['item_size_new'].str.replace(debris, substitute)

# Sizes that did not scrape properly, entered by hand (item name -> size).
manual_sizes = {
    'Beef - Tongue': '1.5-2.5 lbs',
    'Beef - Rump Roast': '2-3 lbs',
    'Beef - Flat Iron Steak': '6-10 oz',
    'Beef - Round Roast': '2-3 lbs',
    'Chicken - Whole': '3.5-4.5 lbs',
    'Pork - Smoked Ham Roast': '2.5-3.5 lbs',
    'Pork - Kielbasa': '1 lb',
    'Pork - Bratwurst': '1 lb',
    'Pork - Hocks': '1.8-2.5 lbs',
    'Pork - Tongue': '8 oz',
    'Pork - Bone-in Chops': '1 lb',
    'Pork - Boneless Chops': '1 lb',
}
for product, size in manual_sizes.items():
    single_item_df.loc[single_item_df['item_name'] == product, ['item_size_new']] = size

# Give both 'Chicken - Eggs' items a quantity of 1.
# NOTE(review): rows 40 and 42 (third column) are addressed by position, which
# assumes the scrape order has not changed -- verify before re-running.
for egg_row in (40, 42):
    single_item_df.iloc[egg_row, 2] = 1

# Fix the item name whose ampersand came through as a literal \u0026 escape.
single_item_df.loc[single_item_df['item_name'] == 'Chicken - Legs \\u0026 Thighs', ['item_name']] = 'Chicken - Legs & Thighs'

In [None]:
# Swap the raw scraped column for its cleaned replacement (both steps modify
# single_item_df in place).
del single_item_df['item_size']
single_item_df.rename(columns={'item_size_new': 'item_size'}, inplace=True)

In [None]:
# Preview the frame after the cleaned column has taken the item_size name.
single_item_df.head()

In [None]:
# The two 'Chicken - Eggs' items have no associated weight, so remove them.
# The rows are identified by their index labels (40 and 42).
single_item_df.drop(index=[40, 42], inplace=True)

In [None]:
# extract the unit of measurement (lb, lbs, oz) into a new column
def get_measure(value):
    """Return the weight unit mentioned in ``value``.

    'lb' is checked first (it also covers 'lbs'), then 'oz'. Returns None
    when neither substring is present.
    """
    for unit in ('lb', 'oz'):
        if unit in value:
            return unit
    return None

# Tag each row with its unit ('lb', 'oz', or None when no unit is found).
single_item_df['measure'] = single_item_df['item_size'].apply(get_measure)

In [None]:
# Preview the frame with the new measure column.
single_item_df.head()

In [None]:
# Remove the unit of measurement from item_size so only the number (or range)
# remains. Spaced unit tokens go first, then any unit glued to the number.
size_text = single_item_df['item_size']
for unit_token in (' lbs', ' lb', ' oz', 'lb', 'oz'):
    size_text = size_text.str.replace(unit_token, '')

# Trim surrounding whitespace, then drop a byte-order mark and the
# non-breaking-space + 's' left behind when 'lb' is cut out of '\xa0lbs'.
size_text = size_text.str.strip()
for stray in ('\ufeff', '\xa0s'):
    size_text = size_text.str.replace(stray, '')

single_item_df['item_size'] = size_text

In [None]:
# turn ranges of values in the item_size column into a single value by taking the average of the range min and max
# then create a new column with this value
def find_avg_quantity(value):
    """Convert a size string into a single numeric quantity.

    Parameters
    ----------
    value : str
        Either a single number ('8') or a 'min-max' range ('2-3').

    Returns
    -------
    float or None
        The number itself for a single value, the midpoint for a range, or
        None when the text has more than one '-' separator. Single values
        used to come back as the unconverted string; they are now floats so
        the return type is consistent.

    Raises
    ------
    ValueError
        If a part of the text is not a valid number.
    """
    quants = value.split('-')
    if len(quants) == 1:
        return float(value)
    if len(quants) == 2:
        return (float(quants[0]) + float(quants[1])) / 2
    # More than one '-': not a simple range, so no quantity can be derived.
    return None
    
# Collapse each size (single value or min-max range) into one quantity per row.
single_item_df['quantity'] = single_item_df['item_size'].apply(find_avg_quantity)

In [None]:
# Preview the frame with the new quantity column.
single_item_df.head()

In [None]:
# Express every quantity in pounds: ounce rows are divided by 16, all other
# rows keep their value unchanged.
# NOTE(review): rows whose measure is not 'oz' (including a missing measure)
# are treated as pounds -- confirm that is intended.
single_item_df['quantity'] = single_item_df['quantity'].astype('float')
measured_in_oz = single_item_df['measure'] == 'oz'
quantity_values = single_item_df['quantity']
single_item_df['quantity_lb'] = np.where(measured_in_oz, quantity_values / 16, quantity_values)

In [None]:
# Preview the frame with the new quantity_lb column.
single_item_df.head()

In [None]:
# Keep only the item name and its weight in pounds; the intermediate columns
# are left out of the new frame (single_item_df itself is not modified).
intermediate_columns = ['item_size', 'measure', 'quantity']
single_items = single_item_df.drop(columns=intermediate_columns)
single_items.head()

Now that we have the item info from apseyfarms.com cleaned up, let's restructure the dataframe into the format we'll need for our analysis.

In [None]:
# Work on a copy so single_items stays untouched, then add one weight column
# per species: the item's weight where the name mentions that species, else 0.
single_items_crosswalk = single_items.copy()
for species in ('Beef', 'Pork', 'Chicken'):
    name_mentions_species = single_items_crosswalk['item_name'].str.contains(species)
    single_items_crosswalk['quantity_' + species.lower() + '_lb'] = np.where(
        name_mentions_species, single_items_crosswalk['quantity_lb'], 0
    )

In [None]:
# Label each row with its enterprise, starting from a 'tbd' placeholder.
single_items_crosswalk['enterprise'] = 'tbd'

# Assign by row position in a single .iloc call. The previous chained form
# (df.iloc[a:b]['enterprise'] = ...) wrote to a temporary slice, which raises
# SettingWithCopyWarning and leaves the frame unchanged under Copy-on-Write.
# NOTE(review): the position ranges assume the rows are ordered beef, chicken,
# pork with exactly these counts -- verify after any re-scrape.
enterprise_col = single_items_crosswalk.columns.get_loc('enterprise')
for start, stop, enterprise in [(0, 30, 'Beef'), (30, 42, 'Chicken'), (42, 66, 'Pork')]:
    single_items_crosswalk.iloc[start:stop, enterprise_col] = enterprise

In [None]:
# Preview the crosswalk with the per-species and enterprise columns.
single_items_crosswalk.head()

In [None]:
# Renumber the rows from 0 (discarding the old index labels) and give the
# weight column its final name; both steps modify the frame in place.
single_items_crosswalk.reset_index(drop=True, inplace=True)
single_items_crosswalk.rename(columns={'quantity_lb': 'total_quantity_lb'}, inplace=True)

# Constant columns: no turkey in any single item, and every row is a single item.
single_items_crosswalk['quantity_turkey_lb'] = 0
single_items_crosswalk['product_type'] = 'Single item'

In [None]:
# Raise the display limit to 75 rows, then show the crosswalk.
pd.set_option('display.max_rows', 75)
single_items_crosswalk

In [None]:
# Save the finished crosswalk to single_items_crosswalk.csv in the working
# directory (without the index column) so it can be used later.
single_items_crosswalk.to_csv('single_items_crosswalk.csv',index=False)