# Shoprite Data Scraper

In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

In [2]:
# accumulator for the scraped data: one empty list per output column
all_products = {
    column: []
    for column in ("names", "prices", "sku", "categories", "tag")
}

In [3]:
# get the text of every occurrence of an HTML tag
def get_contents(tag, page_soup=None):
    """Return the text of every ``tag`` element found in a parsed page.

    Parameters
    ----------
    tag : str
        Tag name to search for, e.g. ``'h2'``.
    page_soup : optional
        Parsed document exposing ``find_all``. When omitted, the
        notebook-level ``soup`` variable is used, so existing calls such as
        ``get_contents('h2')`` keep working unchanged.

    Returns
    -------
    list of str
        One entry per matching element, in the order ``find_all`` yields them.
    """
    document = soup if page_soup is None else page_soup
    # get_text() always returns a string (possibly empty), never None, so the
    # former 'Null' fallback branch could not run and has been removed.
    return [element.get_text() for element in document.find_all(tag)]

In [4]:
# get the SKU, category and tag shown on a single product page
def _span_text(page, css_class):
    """Return the text of the first <span class=css_class> in ``page``, or 'Null' if absent."""
    span = page.find('span', class_=css_class)
    return span.get_text() if span is not None else 'Null'


def get_details(name, timeout=30):
    """Fetch one product page and extract its SKU, category and tag text.

    The URL is ``https://shoprite.ng/product/<name>`` with every space replaced
    by ``-``. No other characters are rewritten, so punctuation in ``name``
    (commas, ``&``, ...) goes into the URL as-is.

    Parameters
    ----------
    name : str
        Product name as scraped from the listing page.
    timeout : float, optional
        Seconds passed to ``requests.get`` so a stalled connection cannot
        block the notebook forever. Defaults to 30.

    Returns
    -------
    tuple of (str, str, str)
        ``(sku, category, tag)``: the text of the ``sku``, ``posted_in`` and
        ``tagged_as`` spans. Any span missing from the response is reported
        as the string ``'Null'``. The HTTP status is not checked, so an error
        page simply produces ``'Null'`` for whatever it lacks.

    Raises
    ------
    requests.exceptions.RequestException
        Propagated from ``requests.get`` on connection failure or timeout.
    """
    single_prod_url = f"https://shoprite.ng/product/{name}".replace(' ', '-')
    r = requests.get(single_prod_url, timeout=timeout)

    page = BeautifulSoup(r.content, 'html.parser')

    # Each span is looked up once; the helper supplies the 'Null' placeholder.
    return (
        _span_text(page, "sku"),
        _span_text(page, "posted_in"),
        _span_text(page, "tagged_as"),
    )

In [5]:
# scrape listing pages 1-5 and collect every product into all_products

# Start from empty columns so that re-running this cell does not append the
# same products to all_products a second time.
for column in all_products.values():
    column.clear()

for page in range(1, 6):
    url = f"https://shoprite.ng/products/?product-page={page}"
    # timeout keeps a stalled connection from hanging the notebook
    res = requests.get(url, timeout=30)

    # parse the listing page; get_contents() reads this notebook-level `soup`
    soup = BeautifulSoup(res.content, 'html.parser')

    # product names come from the <h2> elements, prices from <bdi>
    names = get_contents('h2')
    prices = get_contents('bdi')

    # Fetch each product page ONCE and unpack all three fields from that
    # single response (previously every product was requested three times,
    # once per field).
    skus, categories, tags = [], [], []
    for name in names:
        sku, category, tag = get_details(name)
        skus.append(sku)
        categories.append(category)
        tags.append(tag)

    # The first <h2> entry is dropped from every name-derived column, while
    # prices are kept in full -- presumably the first <h2> is a page heading
    # rather than a product (TODO confirm against the live page).
    # NOTE(review): pd.DataFrame requires equal-length columns, so a page where
    # len(names) - 1 != len(prices) will make the DataFrame cell fail.
    all_products['names'].extend(names[1:])
    all_products['prices'].extend(prices)
    all_products['sku'].extend(skus[1:])
    all_products['categories'].extend(categories[1:])
    all_products['tag'].extend(tags[1:])

In [7]:
# turn the column lists into a table (one row per product) and display it
prod_df = pd.DataFrame(data=all_products)
prod_df

Unnamed: 0,names,prices,sku,categories,tag
0,Chewy Caramel Alpenliebe 6.5G,₦34.99,Candy Milk Chewy Caramel Alpenliebe 6.5G,Category: Confectionery and Snacks,Tag: Candy
1,Milk Chewy Caramel Alpenliebe 6.5G,₦34.99,Milk Chewy Caramel Alpenliebe 6.5G,Category: Candy & bubble Gum,Tag: Alpenliebe
2,Seasoning Powder Jollof Maggi 8G,₦52.99,Seasoning Powder Jollof Maggi 8G,"Category: Condiments, Oils & Spices",Tag: Seasoning
3,Seasoning Powder Chicken Maggi 10G,₦52.99,Seasoning Powder Chicken Maggi 10G,"Category: Condiments, Oils & Spices",Tag: Seasoning
4,Chocolate Slab Milk Dune 5G,₦54.99,Chocolate Slab Milk Dune 5G,Category: Chocolate,Tag: Chocolate
...,...,...,...,...,...
95,"Coca Cola 350Ml Nrb, Reg",₦149.99,Null,Category: Carbonated Juices,Tag: Soft Drink
96,Oats Quaker 40G Sachet,₦149.99,Oats Quaker 40G Sachet,Category: Cereals,Tag: Oats
97,Noodles Tasty Chicken Supreme 100G,₦149.99,Noodles Tasty Chicken Supreme 100G,"Category: Rice, Grains & Pasta",Tag: Noodles Tasty Chicken
98,Sugar Free 7Up 400Ml,₦149.99,Sugar Free 7Up 400Ml,Category: Soda & Energy drink,Tag: 7Up


In [8]:
# (number of rows, number of columns) of the scraped table
prod_df.shape

(100, 5)

## Save to CSV

In [11]:
# Write the scraped table to a CSV file (path is relative to the working
# directory). to_csv writes the DataFrame index by default, so the file's
# first column is the unnamed row index.
prod_df.to_csv('shop_rite_messy_data.csv')