In [1]:
from bs4 import BeautifulSoup
import requests
from collections import deque
from urllib.request import urlparse, urljoin
import re
import csv
import json
import time

In [2]:
# Starting URL for the crawl: the meat-fish section of the Sainsbury's groceries site.
url = 'https://www.sainsburys.co.uk/shop/gb/groceries/meat-fish'

#### 1. Get Soup Object

In [3]:
def get_page(url, timeout=30):
    """Fetch ``url`` with HTTP GET and parse the response body as HTML.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection, which would freeze the whole crawl.

    Returns:
        A ``BeautifulSoup`` tree (``html.parser``) when the response status
        is 200; ``None`` when the request fails or any other status is
        returned. Callers must check for ``None``.
    """
    try:
        r = requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        # Report the failure instead of hiding it, but keep the
        # "return None on error" contract that callers rely on.
        print(f'Request for {url!r} failed: {e}')
        return None
    if r.status_code != 200:
        return None
    return BeautifulSoup(r.content, 'html.parser')

In [5]:
# Smoke test: fetch the start URL to check that get_page works.
s = get_page(url)
# Uncomment the next line to display the parsed page.
#s

#### 2. Extract all department links

In [6]:
def extract_department_links(url):
    """Collect the department links listed on the page at ``url``.

    The page is fetched with ``get_page`` and the first
    ``<ul class="categories departments">`` element is searched for
    ``<li>`` items; the ``href`` of the first anchor in each item is kept.

    Args:
        url: Address of the page that lists the departments.

    Returns:
        List of ``href`` values exactly as they appear in the page. The list
        is empty when the page could not be fetched or has no department list.
    """
    links = []
    soup = get_page(url)
    # Pause after the request to avoid hammering the site.
    time.sleep(2)
    # get_page returns None on a failed request or non-200 status.
    if soup is None:
        return links
    ul = soup.find('ul', class_='categories departments')
    if not ul:
        return links
    for li in ul.find_all('li'):
        a = li.find('a', href=True)
        if a:
            links.append(a['href'])
    return links

In [7]:
# Collect the department links found on the start page and display them.
department_links = extract_department_links(url)
department_links

['https://www.sainsburys.co.uk/shop/gb/groceries/meat-fish/meatandfish-essentials',
 'https://www.sainsburys.co.uk/shop/gb/groceries/meat-fish/discover-meatandfish',
 'https://www.sainsburys.co.uk/shop/gb/groceries/meat-fish/meat-and-fish-price-lockdown',
 'https://www.sainsburys.co.uk/shop/gb/groceries/meat-fish/bbq',
 'https://www.sainsburys.co.uk/shop/gb/groceries/meat-fish/build-a-roast-dinner',
 'https://www.sainsburys.co.uk/shop/gb/groceries/meat-fish/chicken---turkey',
 'https://www.sainsburys.co.uk/shop/gb/groceries/meat-fish/bacon-sausages',
 'https://www.sainsburys.co.uk/shop/gb/groceries/meat-fish/ham--deli-meats---dips',
 'https://www.sainsburys.co.uk/shop/gb/groceries/meat-fish/pork',
 'https://www.sainsburys.co.uk/shop/gb/groceries/meat-fish/fish-seafood',
 'https://www.sainsburys.co.uk/shop/gb/groceries/meat-fish/beef',
 'https://www.sainsburys.co.uk/shop/gb/groceries/meat-fish/lamb',
 'https://www.sainsburys.co.uk/shop/gb/groceries/meat-fish/ready-to-cook-314359-44',
 '

#### 3. Extract product page links (after processing secondary shelf links)

In [8]:
def extract_links_to_product_pages(department_links):
    """Crawl from the department links and collect product listing pages.

    Every link is fetched at most once. A page that contains a
    ``<ul class="productLister gridView">`` is recorded as a product page.
    Otherwise the links inside its ``<ul class="categories aisles">`` (if
    present) are queued and checked the same way.

    Args:
        department_links: Iterable of URLs to start the crawl from.

    Returns:
        List of URLs whose page contains the product grid, in the order
        they were discovered. Links that cannot be fetched are skipped.
    """
    # Base used to turn relative shelf links into absolute URLs.
    mainpage = 'https://www.sainsburys.co.uk/'
    product_pages = []
    visited = set()
    queue = deque(department_links)

    while queue:
        link = queue.popleft()

        # Never fetch the same link twice.
        if link in visited:
            continue
        visited.add(link)

        soup = get_page(link)
        # Pause after each request to avoid hammering the site.
        time.sleep(3)

        # get_page returns None on a failed request or non-200 status;
        # skip the link instead of crashing on soup.find.
        if soup is None:
            continue

        # A product grid means this link is a product listing page.
        if soup.find('ul', class_='productLister gridView'):
            product_pages.append(link)
            continue

        # No product grid: look for an intermediate shelf ("aisles") list
        # and queue its links for the same inspection.
        aisles = soup.find('ul', class_='categories aisles')
        if not aisles:
            continue
        for li in aisles.find_all('li'):
            a = li.find('a', href=True)
            if a:
                queue.append(urljoin(mainpage, a['href']))
    return product_pages

In [9]:
# Crawl from the department links to collect the product listing pages.
# The crawl sleeps 3 seconds per fetched link, so this cell is slow.
product_pages = extract_links_to_product_pages(department_links)

In [10]:
# Show how many listing pages were found, then the pages themselves.
print(len(product_pages))
print(product_pages)

48
['https://www.sainsburys.co.uk/shop/gb/groceries/meat-fish/meatandfish-essentials', 'https://www.sainsburys.co.uk/shop/gb/groceries/meat-fish/meat-and-fish-price-lockdown', 'https://www.sainsburys.co.uk/shop/gb/groceries/meat-fish/bbq', 'https://www.sainsburys.co.uk/shop/gb/groceries/meat-fish/build-a-roast-dinner', 'https://www.sainsburys.co.uk/shop/gb/groceries/meat-fish/ready-to-cook-314359-44', 'https://www.sainsburys.co.uk/shop/gb/groceries/meat-fish/mince', 'https://www.sainsburys.co.uk/shop/gb/groceries/meat-fish/turkey-44', 'https://www.sainsburys.co.uk/shop/gb/groceries/meat-fish/duck--game---venison', 'https://www.sainsburys.co.uk/shop/gb/groceries/meat-fish/meat-free-', 'https://www.sainsburys.co.uk/shop/gb/groceries/meat-fish/sauces--marinades---yorkshire-puddings-310901-44', 'https://www.sainsburys.co.uk/shop/gb/groceries/meat-fish/discover-summer-eating', 'https://www.sainsburys.co.uk/shop/gb/groceries/meat-fish/all-chicken-44', 'https://www.sainsburys.co.uk/shop/gb/gr

In [11]:
# Keep every 20th listing page as a smaller sample for the next step, and display it.
product_pages_2 = product_pages[::20]
product_pages_2

['https://www.sainsburys.co.uk/shop/gb/groceries/meat-fish/meatandfish-essentials',
 'https://www.sainsburys.co.uk/shop/gb/groceries/meat-fish/all-ham--deli-meats---dips',
 'https://www.sainsburys.co.uk/shop/gb/groceries/meat-fish/steak']

#### 4. Extract each product link (after processing next-page links)

In [12]:
def extract_product_details(product_pages):
    """Collect product links from product listing pages, following pagination.

    Despite its name, this function returns product *links*, not parsed
    product details. For each listing page, the ``href`` of the first anchor
    in every ``<li class="gridItem">`` of the
    ``<ul class="productLister gridView">`` is collected. If the page has a
    ``<li class="next">`` link, a follow-up URL is queued.

    Args:
        product_pages: Iterable of listing-page URLs to start from.

    Returns:
        List of ``href`` values as found in the pages, in discovery order.
        Pages that cannot be fetched are skipped.
    """
    products = []
    visited = set()
    queue = deque(product_pages)
    while queue:
        product_page = queue.popleft()
        if product_page in visited:
            continue
        visited.add(product_page)
        # Pause before each request to avoid hammering the site.
        time.sleep(2)

        soup = get_page(product_page)
        # get_page returns None on a failed request or non-200 status.
        # The "next page" lookup below needs a parsed page, so skip here.
        if soup is None:
            continue

        ul = soup.find('ul', class_='productLister gridView')
        if ul:
            for li in ul.find_all('li', class_='gridItem'):
                a = li.find('a', href=True)
                if a:
                    products.append(a['href'])

        next_page = soup.find('li', class_='next')
        if next_page:
            a = next_page.find('a', href=True)
            if a:
                # Rebuild the "next page" link as
                # <listing page without fragment>#<query string of the link>.
                # When the link has no '?', find() returns -1 and the whole
                # href is used as the tail.
                # NOTE(review): URL fragments are not sent to the server in
                # an HTTP request, so the fetched page may be the same as the
                # un-fragmented one - confirm that pagination really advances.
                qm = a['href'].find('?')
                if '#' in product_page:
                    product_page = product_page[:product_page.find('#')]
                queue.append(product_page + '#' + a['href'][qm + 1:])
    return products
                    
            

In [13]:
# Collect product links from the sampled listing pages.
products = extract_product_details(product_pages_2)

In [14]:
# Show how many product links were collected, then the links themselves.
print(len(products))
print(products)

265
['https://www.sainsburys.co.uk/shop/gb/groceries/product/details/meatandfish-essentials/sainsburys-beef-mince-5-fat-500g', 'https://www.sainsburys.co.uk/shop/gb/groceries/product/details/meatandfish-essentials/sainsburys-responsibly-sourced-scottish-salmon-fillet-x2-240g', 'https://www.sainsburys.co.uk/shop/gb/groceries/product/details/meatandfish-essentials/sainsburys-whole-chicken-breast-fillets-640g', 'https://www.sainsburys.co.uk/shop/gb/groceries/product/details/meatandfish-essentials/sainsburys-british-breaded-ham-120g', 'https://www.sainsburys.co.uk/shop/gb/groceries/product/details/meatandfish-essentials/sainsburys-air-dried-lean-ham-finely-sliced--taste-the-difference-120g', 'https://www.sainsburys.co.uk/shop/gb/groceries/product/details/meatandfish-essentials/sainsburys-british-honey-roast-ham-120g', 'https://www.sainsburys.co.uk/shop/gb/groceries/product/details/meatandfish-essentials/sainsburys-chicken-fillets-1kg', 'https://www.sainsburys.co.uk/shop/gb/groceries/produc

#### 5. Extracting product information

In [16]:
# Keep every 20th product link as a smaller sample and display the sample size.
products_short = products[::20]
len(products_short)

14

In [17]:
# Display the sampled product links.
products_short

['https://www.sainsburys.co.uk/shop/gb/groceries/product/details/meatandfish-essentials/sainsburys-beef-mince-5-fat-500g',
 'https://www.sainsburys.co.uk/shop/PromotionDisplayView?catalogId=10203&productId=121332&langId=44&storeId=10151&promotionId=10410513',
 'https://www.sainsburys.co.uk/shop/gb/groceries/product/details/meatandfish-essentials/sainsburys-british-free-range-whole-chicken--taste-the-difference-900g---22kg',
 '',
 'https://www.sainsburys.co.uk/shop/gb/groceries/product/details/all-ham--deli-meats---dips/sainsburys-cooked-chicken-slices-125g',
 'https://www.sainsburys.co.uk/shop/gb/groceries/product/details/all-ham--deli-meats---dips/sainsburys-british-smoked-ham-120g',
 'https://www.sainsburys.co.uk/shop/gb/groceries/product/details/steak/sainsburys-21-day-matured-beef-fillet-steak-170g',
 'https://www.sainsburys.co.uk/shop/gb/groceries/product/details/steak/sainsburys-picanha-steak--taste-the-difference-225g',
 'https://www.sainsburys.co.uk/shop/gb/groceries/product/de

In [18]:
# Info to extract:
#     name
#     url
#     item code
#     nutrition per 100g: energy kcals, energy kjoules, fat, saturates, carbs, total sugars, starch, fibre, protein, salt
#     country of origin
#     price per unit
#     unit
#     number of reviews
#     avg ratings

In [19]:
# A single product URL used to prototype the extraction of product information.
a_link = 'https://www.sainsburys.co.uk/shop/gb/groceries/product/details/meatandfish-essentials/sainsburys-beef-mince-5-fat-500g'

In [20]:
# Start an empty record for the product and fetch its page.
# get_page returns None when the fetch fails, so soup may be None here.
product_info = {}
soup = get_page(a_link)

In [21]:
# Record the product URL in the product record.
product_info['url'] = a_link



{'url': 'https://www.sainsburys.co.uk/shop/gb/groceries/product/details/meatandfish-essentials/sainsburys-beef-mince-5-fat-500g'}


In [25]:
# Need to use JS tools to scrape info from site: in the recorded run the
# HTML returned by get_page contained no <h1>, so both prints showed None.


# Product name: text of the first <h1>, or None when the page could not be
# fetched or has no <h1>. `name` is assigned here so the cell no longer
# depends on a variable left over from an earlier kernel session.
h1 = soup.find('h1') if soup else None
name = h1.get_text(strip=True) if h1 else None
print(h1)
print(name)


None
None


#### 6. Export data

In [30]:
# Hand-written sample records (one dict per product) used to try out the export code.
product_information = [{'name': 'meat', 'url': 'abc', 'price': '123'},
                      {'name': 'fish', 'url': 'abcde', 'price': '456'},
                      {'name': 'veggie', 'url': 'xyz', 'price': '789'},
                      {'name': 'milk', 'url': 'zzz', 'price': '10'}]

In [47]:
#Write csv
from csv import DictWriter



In [46]:
def get_field_names(product_information):
    """Return the column names needed to export ``product_information``.

    Keys are gathered from every record, not only the first one, so a
    record with an extra key does not make ``csv.DictWriter`` raise
    ``ValueError`` later on.

    Args:
        product_information: Sequence of dicts, one per product.

    Returns:
        List of distinct keys in first-seen order. For records that all share
        the first record's keys this is exactly ``list(records[0].keys())``.
        An empty input yields an empty list.
    """
    # dict.fromkeys de-duplicates while preserving insertion order.
    return list(dict.fromkeys(
        key for record in product_information for key in record
    ))


# Display the field names derived from the sample records.
get_field_names(product_information)

['name', 'url', 'price']


In [52]:
# Export the records to result.csv with a header row.
# newline='' is what the csv module requires for files handed to a writer, so
# the writer alone controls line endings; the explicit UTF-8 encoding keeps
# non-ASCII text from depending on the platform default encoding.
with open('result.csv', 'w', newline='', encoding='utf-8') as outfile:
    fieldnames = get_field_names(product_information)
    writer = csv.DictWriter(outfile, fieldnames=fieldnames, delimiter=',', lineterminator='\n')
    writer.writeheader()
    writer.writerows(product_information)

In [57]:
# Use in case product info is wrapped inside a class
# with open('result.csv', 'w') as out2:
#     fieldnames = get_field_names(product_information)
#     writer = csv.DictWriter(out2, fieldnames = fieldnames, delimiter = ',', lineterminator = '\n')
#     writer.writeheader()
#     writer.writerows(map(lambda p: p.__dict__, product_information))

In [58]:
# Write JSON
import json


In [59]:
# Export the records to r2.json: serialise to a JSON string, then write it in one call.
with open('r2.json', 'w') as outfile:
    outfile.write(json.dumps(product_information))