In [25]:
from bs4 import BeautifulSoup as soup
from IPython.display import display_html
from lxml import etree
import numpy as np
import pandas as pd
import requests
import warnings
import time
import re
warnings.filterwarnings('ignore')

In [51]:
SHOPRITE_SEARCH_URL = 'https://www.shoprite.co.za/search/all?q='
SHOPRITE_URL = 'https://www.shoprite.co.za'

CHECKERS_SEARCH_URL = 'https://www.checkers.co.za/search/all?q='
CHECKERS_URL = 'https://www.checkers.co.za/'

WOOLWORTHS_SEARCH_URL = 'https://www.woolworths.co.za/cat/Food/Pantry/_/N-1lw4dzx'
WOOLWORTHS_URL = 'https://www.woolworths.co.za/'

In [3]:
FOOD_BASKET = ['large eggs 6', 'large eggs 18', 'extra large eggs 30', 'table salt 500g',
               'table salt 1kg', 'rice 2kg', 'rice 5kg', 'rice 1kg', 'rice canister 10l', 
              'sugar 500g', 'sugar 1kg', 'sugar 2.5kg', 'sugar 5kg', 'sugar 10kg',
              'flour 500g', 'flour 1kg', 'flour 2.5kg', 'flour 5kg', 'flour 10kg',
              'frozen chicken', 'pork bangers', 'pork rashers', 'pork loin chops',
              'pork braai chops', 'pork chops', 'pork shoulder ribs', 'stewing pork',
              'beef goulash', 'ground beef', 'beef parcel', 'beef brisket', 'coarse salt 500g',
              'fine salt 500g', 'medium salt 500g']

In [20]:
user_basket = np.random.choice(FOOD_BASKET, 1)

In [5]:
def fetch_html(url):
    headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'}
    response = requests.get(url, verify=False, headers=headers, timeout=10)
    print("html downloaded successfully")
    page = soup(response.content, 'html.parser')

    return page

In [6]:
def extract_product_links(html_page):
  
    product_div = html_page.find('div', {"class": re.compile('(\w+-)*search-landing')})
    content = product_div.find('div', class_='yCmsContentSlot')
    component = content.find('div', class_='yCmsComponent')
    product_wrappers = component.find('div', class_='search-landing__block--products')
    product_row = product_wrappers.find('div', class_='row')
    products = product_row.findAll('div', class_='product-frame')
    
    links = []
    for product in products:
        figure = product.find('figure')
        product_link = figure.find('a', href=True)['href']
        links.append(product_link)
    
    return links

In [7]:
def match_product_url(links_arr, item):
    
    pattern = r'([\w+-/%?]+' + str(item.replace(' ', '\-')) + '[\w+-/%?]+)'

    found = []

    for link in links_arr:
        try:
            match = re.search(pattern, link.lower()).group()
            found.append({link: match})
        except Exception:
            None
    
    return found

In [8]:
def extract_brand(link):
    
    text = list(link.values())[0].split('/')[-3]
    
    try:
        value = re.search(r'(\d+(\-)?\d+)', text).group()
        new_link = re.sub(value, value.replace('-', '.'), text)
        
    except Exception:
        new_link = text
    
    
    return new_link.replace('-', ' ').title()

In [9]:
def get_brands(arr_links):
    
    brands = []
    counter = 1
    
    for item in arr_links:
        brands.append({counter: extract_brand(item)})
        counter+=1
        
    return brands

In [10]:
def evaluate_user_choice(user_choice, links, matching_product_urls, brands, item, store_url):
    
    html = ""
    for brand in brands:
        for key, value in brand.items():
            if eval(user_choice) == key:
                k_found = match_product_url(list(matching_product_urls[0].keys()), item)
                if len(k_found) == 0:
                    k_found = closest_products(links, matching_product_urls)
                url = list(k_found[0].keys())[0]
                k_fetch = fetch_html(store_url+url)
                html = k_fetch
                
    return html

In [11]:
def get_product_details(html):
    product_specs = {}
    
    product_div = html.find('div', class_='pdp')
    product_details = product_div.find('div', class_='pdp__details')
    regex = r'\d+\.\d+'
    price = re.search(regex, product_details.find('div', class_='special-price__price').find('span').get_text().strip().lstrip('R')).group()

    try:
        promo_price = product_details.find('div', class_='special-price__extra').find('span').get_text().strip().lstrip('R')
        promo_condition = product_details.find('span', class_='special-price__extra__text').get_text()
    except Exception:
        promo_price = None
        promo_condition = None
        
    try:
        extra_details = product_div.find('div', 'pdp__extras')
        offer = extra_details.find('div', class_= 'extra-message').find('span', class_='extra-message__title').get_text().strip()
        promo_price_1 = re.search(regex, offer).group()
        promo_duration = extra_details.find('div', class_= 'extra-message').find('span', class_='extra-message__valid').get_text().strip().replace('&nbsp;' '')
    except Exception:
        offer = None
        promo_price_1 = None
        promo_duration = None
    
    if not promo_price and not promo_price_1:
        promo_price = None
        
    elif not promo_price and promo_price_1:
        promo_price = promo_price_1
    
    else:
        promo_price = promo_price
        
    product_desc = product_details.find('div', class_='pdp__description').get_text()
    product_name = product_details.find('h1', class_='pdp__name').get_text()
    product_brand = product_name.split()[0]
    product_image_url = SHOPRITE_URL + product_div.find('div', class_='pdp__image').find('img')['src']

    product_specs['Brand'] = product_brand
    product_specs['Name'] = product_name + "**" if promo_price and (price > promo_price) else product_name
    product_specs['Description'] = product_desc
    product_specs['Regular Price'] = price
    product_specs['Offer'] = offer
    product_specs['Promo Price'] = promo_price
    product_specs['Promo Condition'] = promo_condition
    product_specs['Product Image'] = product_image_url
    
    return product_specs

In [12]:
def total_cost(basket_details):
    
    receipt = {}
    total_cost = 0
    regular_total = 0

    for item in basket_details:
        if item['Promo Price'] and (item['Promo Price'] < item['Regular Price']):
            price = item['Promo Price']
        else:
            price = item['Regular Price']

        total_cost += float(price)
        regular_total += float(item['Regular Price'])
        
        receipt[item['Name']] =  (price, item['Regular Price'])
    
    receipt['Total'] = ("R " + str(round(total_cost,2)), "R " + str(round(regular_total, 2)))
    receipt['Savings'] = (f"R {round(abs(total_cost - regular_total), 2)}", "-")
    
    return receipt
    

In [13]:
def closest_products(product_links, product_match):
    all_matches = []
    key = product_links
    value = [link.lower() for link in product_links]
    
    for i in range(len(key)):
        all_matches.append(dict(zip([key[i]], [value[i]])))
    product_match = all_matches
    
    return product_match

In [14]:
def evaluate_user_basket(basket, search_url, store_url):
    user_product_details = []

    for item in basket:
        search_result = fetch_html(search_url+item)
        product_links = extract_product_links(search_result)
        product_match = match_product_url(product_links, item)
        if len(product_match) == 0:
            product_match = closest_products(product_links, product_match)
        product_brands = get_brands(product_match)

        print(product_brands)

        user_choice = input('select brand > ')
        html = evaluate_user_choice(user_choice, product_links, product_match, product_brands, item, store_url)
        product_details = get_product_details(html)

        user_product_details.append(product_details)

        basket_cost = total_cost(user_product_details)
        
    return basket_cost

In [15]:
def get_receipt(receipt):
    receipt = pd.DataFrame.from_dict(receipt, orient='index',
                                    columns=['Price', 'Regular Price'])

    return receipt

In [16]:
def compare_stores(stores, user_basket):
    receipts = []
    promos = {}
    for store in stores:
        for name, urls in store.items():
            store_receipt = evaluate_user_basket(basket=user_basket, search_url=urls[0], store_url=urls[1])
            receipt = get_receipt(store_receipt)
            promo_total = receipt.iloc[-2].values[0].strip('R').strip()
            promos[name] = promo_total
            receipt_styler = receipt.style\
                        .set_table_attributes("style='display:inline; margin-right:30px'")\
                        .set_caption(name.upper() + ' RECEIPT')
            
            receipts.append(receipt_styler)
    print("#"*120)
    display_html([tbl._repr_html_() for tbl in receipts], raw=True)
    
    min_total = min(promos.values())
    for store, total in promos.items():
        if total == min_total:
            return (f" WE RECOMMEND SHOPPING FROM {store} ".upper().center(100, "*"))

In [17]:
stores = [{"checkers": (CHECKERS_SEARCH_URL, CHECKERS_URL)}]

In [18]:
compare_stores(stores, user_basket)

html downloaded successfully
[{1: 'Buffalo Fine Sea Salt 500G'}, {2: 'Buffalo Fine Sea Salt 1Kg'}, {3: 'Imbo Fine Desiccated Coconut 500G'}, {4: 'Medirite Epsom Salts 500G'}, {5: 'Cerebos Iodated Table Salt 1Kg'}, {6: 'Cerebos Iodated Table Salt 500G'}, {7: 'Cerebos Iodated Table Salt Pack 1Kg'}, {8: 'Bonnita Salted Butter Brick 500G'}, {9: 'Kim Isodated Coarse Salt 500G'}, {10: 'Marina Lighthouse Salt Flask 500G'}]


select brand >  2


html downloaded successfully
html downloaded successfully
[{1: 'Lunds Large Eggs 6 Pack'}, {2: 'Lunds Extra Large Eggs 6 Pack'}, {3: 'Nulaid Premium Grade 1 Canola Large Eggs 6 X %3E51G'}, {4: 'Nulaid Premium Grade 1 Canola Extra Large Eggs 6 X %3E59G'}]


select brand >  3


html downloaded successfully
html downloaded successfully
[{1: 'Huletts White Sugar 2.5Kg'}, {2: 'Checkers Housebrand White Sugar 2.5Kg'}, {3: 'Huletts Kosher White Sugar 2.5Kg Bag'}]


select brand >  1


html downloaded successfully
html downloaded successfully
[{1: 'Cerebos Iodated Table Salt 500G'}, {2: 'Cerebos Iodated Table Salt 500G'}, {3: 'Checkers Housebrand Iodated Table Salt 500G'}]


select brand >  1


html downloaded successfully
html downloaded successfully
[{1: 'Huletts White Sugar 5Kg'}, {2: 'Selati White Sugar 5Kg'}, {3: 'Selati Golden Brown Sugar 5Kg'}]


select brand >  3


html downloaded successfully


Unnamed: 0,Price,Regular Price
Buffalo Fine Sea Salt 500g,8.99,8.99
Lunds Large Eggs 6 Pack,15.99,15.99
Huletts White Sugar 2.5kg,49.99,49.99
Cerebos Iodated Table Salt 500g,24.99,24.99
Huletts White Sugar 5kg,102.99,102.99
Total,R 202.95,R 202.95
Savings,R 0.0,-


'******************************* WE RECOMMEND SHOPPING FROM CHECKERS ********************************'