# Example for The Market Food Shop (Nigeria)

In [1]:
import time
from datetime import date
from urllib.robotparser import RobotFileParser
import json

import requests
from bs4 import BeautifulSoup
import pandas as pd
from pydantic import BaseModel

Define functions

In [2]:
def safe_get(s: requests.sessions.Session, robots: RobotFileParser, link: str) -> requests.models.Response:
    """Wrapper for a request session get call that respects the robots.txt file
    Parameters:
        s (requests.sessions.Session): Requests session with User-Agent properly set
        robots (RobotFileParser): Initialized robots.txt parsed object for the specific website
        link (str): link that you want to retrive respecting the robots.txt file
    
    Returns:
        Response from session get call or None if the link is forbidden by robots.txt
    """
    if robots.can_fetch(s.headers.get("User-Agent"), link):
        response = s.get(link)
    else:
        response = None
    return response

## Get Robots.txt

In [3]:
root_url = "https://www.themarketfoodshop.com/"

In [4]:
# Setup session and user-agent
headers = {  # Need to be replaced with your details
    'User-Agent': 'Webscraping Capacity Building 1.0',
    'From': 'luigi.palumbo@unitus.it'  
}

s = requests.Session()
s.headers.update(headers)

In [5]:
# Parse robots.txt
robots_tfms = RobotFileParser(root_url + "robots.txt")
robots_tfms.read()

In [6]:
delay = robots_tfms.crawl_delay(s.headers.get("User-Agent"))

## Get the homepage and parse directories

In [7]:
homepage = s.get(root_url)

In [8]:
page = BeautifulSoup(homepage.text, 'html.parser')

In [9]:
div_list = page.find_all("div", {"class": "sub-menu-dropdown"})

In [10]:
link_list = []
for div in div_list:
    for l in div.find_all("a"):
        new_link = {}
        new_link["name"] = l.get_text()
        new_link["link"] = l.get("href")
        link_list.append(new_link)

# Link list has the starting page fo all categories

Get links of individual products

In [11]:
category_page = s.get(link_list[0].get("link"))

In [12]:
category_page = BeautifulSoup(category_page.text, 'html.parser')


In [13]:
product_links = [item.find("a").get("href") for item in category_page.find_all("h3", {"class": "product-title"})]
product_links

['https://www.themarketfoodshop.com/product/bag-of-rice-aroso/',
 'https://www.themarketfoodshop.com/product/black-beans/',
 'https://www.themarketfoodshop.com/product/brown-rice-1kg/',
 'https://www.themarketfoodshop.com/product/bag-of-rice/',
 'https://www.themarketfoodshop.com/product/honey-beans-2/',
 'https://www.themarketfoodshop.com/product/oloyin-beans/',
 'https://www.themarketfoodshop.com/product/ijebu-garri/',
 'https://www.themarketfoodshop.com/product/kidney-beans-nigeria-2/',
 'https://www.themarketfoodshop.com/product/ofada-rice/',
 'https://www.themarketfoodshop.com/product/olotu-beans/',
 'https://www.themarketfoodshop.com/product/tasty-pot-peeled-beans/',
 'https://www.themarketfoodshop.com/product/buy-cotonou-rice-online/']

Find the next page inside the category

In [14]:
next_page = category_page.find("ul", {"class": "page-numbers"}).find("a", {"class": "next"}).get("href")

## Parsing product pages

### With size selection

In [15]:
test_with = s.get("https://www.themarketfoodshop.com/product/bag-of-rice-aroso/")

In [16]:
test_with = BeautifulSoup(test_with.text, 'html.parser')


In [17]:
data_json = json.loads(test_with.find("form", {"class": "variations_form"}).get("data-product_variations"))


In [40]:
data_json

[{'attributes': {'attribute_bag-of-rice-aroso-long-grain-rice': '50kg'},
  'availability_html': '',
  'backorders_allowed': False,
  'dimensions': {'length': '', 'width': '', 'height': ''},
  'dimensions_html': 'N/A',
  'display_price': 50000,
  'display_regular_price': 50000,
  'image': {'title': 'Buy Bag Of Rice (Aroso) online',
   'caption': '',
   'url': 'https://www.themarketfoodshop.com/wp-content/uploads/2018/11/buy-Bag-Of-Rice-Aroso-online.jpg',
   'alt': 'Bag Of Rice (Aroso)',
   'src': 'https://www.themarketfoodshop.com/wp-content/uploads/2018/11/buy-Bag-Of-Rice-Aroso-online-300x168.jpg',
   'srcset': 'https://www.themarketfoodshop.com/wp-content/uploads/2018/11/buy-Bag-Of-Rice-Aroso-online-300x168.jpg 300w, https://www.themarketfoodshop.com/wp-content/uploads/2018/11/buy-Bag-Of-Rice-Aroso-online-768x430.jpg 768w, https://www.themarketfoodshop.com/wp-content/uploads/2018/11/buy-Bag-Of-Rice-Aroso-online.jpg 960w',
   'sizes': '(max-width: 300px) 100vw, 300px',
   'full_src': '

In [18]:
test = []
for item in data_json:
    product_temp = {}
    product_temp["size"] = [v for k,v in item.get("attributes").items() if k.startswith("attri")][0]  # It need improvement
    product_temp["price"] = item.get("display_price")
    test.append(product_temp)

In [19]:
test

[{'size': '50kg', 'price': 50000},
 {'size': '25kg', 'price': 27000},
 {'size': '10kg', 'price': 14000}]

In [20]:
test_with.find("h1", {"class": "product_title"}).get_text()

'Aroso Rice (Original)'

### Without size selection

In [21]:
test_without = s.get("https://www.themarketfoodshop.com/product/black-beans/")

In [22]:
test_without = BeautifulSoup(test_without.text, 'html.parser')


In [23]:
test_without.find_all("script", {"type": "application/ld+json"})

[<script class="yoast-schema-graph" type="application/ld+json">{"@context":"https://schema.org","@graph":[{"@type":"WebPage","@id":"https://www.themarketfoodshop.com/product/black-beans/","url":"https://www.themarketfoodshop.com/product/black-beans/","name":"Buy Black Beans For Frejon Online From the Market Food Shop","isPartOf":{"@id":"https://www.themarketfoodshop.com/#website"},"primaryImageOfPage":{"@id":"https://www.themarketfoodshop.com/product/black-beans/#primaryimage"},"image":{"@id":"https://www.themarketfoodshop.com/product/black-beans/#primaryimage"},"thumbnailUrl":"https://www.themarketfoodshop.com/wp-content/uploads/2018/04/buy-black-beans-online.jpg","datePublished":"2018-04-14T04:17:36+00:00","dateModified":"2022-04-10T13:02:27+00:00","description":"Buy black beans for making frejon, a meal enjoyed by catholics on Good friday. Delivery available across Lagos and Nigeria.","breadcrumb":{"@id":"https://www.themarketfoodshop.com/product/black-beans/#breadcrumb"},"inLanguag

In [24]:
test_without.find("script", {"type": "application/ld+json"}, class_=lambda x: x!= "yoast-schema-graph")

<script type="application/ld+json">{"@context":"https:\/\/schema.org\/","@type":"Product","@id":"https:\/\/www.themarketfoodshop.com\/product\/black-beans\/#product","name":"Black beans for frejon meal","url":"https:\/\/www.themarketfoodshop.com\/product\/black-beans\/","description":"Black beans\r\n\r\nBlack beans for making frejon meal\r\n\r\nalso known as ewa ibeji.\r\n\r\n&amp;nbsp;\r\n\r\nEach pack contains 2 derica\r\n\r\n&amp;nbsp;\r\n\r\nQuantity - 2 derica","image":"https:\/\/www.themarketfoodshop.com\/wp-content\/uploads\/2018\/04\/buy-black-beans-online.jpg","sku":"MFS-313","offers":[{"@type":"Offer","price":"4000","priceValidUntil":"2024-12-31","priceSpecification":{"price":"4000","priceCurrency":"NGN","valueAddedTaxIncluded":"false"},"priceCurrency":"NGN","availability":"http:\/\/schema.org\/InStock","url":"https:\/\/www.themarketfoodshop.com\/product\/black-beans\/","seller":{"@type":"Organization","name":"Online Food Market for Nigerians | The Market Food Shop","url":"ht

In [38]:
json.loads(test_without.find("script", {"type": "application/ld+json"}).get_text())

{'@context': 'https://schema.org',
 '@graph': [{'@type': 'WebPage',
   '@id': 'https://www.themarketfoodshop.com/product/black-beans/',
   'url': 'https://www.themarketfoodshop.com/product/black-beans/',
   'name': 'Buy Black Beans For Frejon Online From the Market Food Shop',
   'isPartOf': {'@id': 'https://www.themarketfoodshop.com/#website'},
   'primaryImageOfPage': {'@id': 'https://www.themarketfoodshop.com/product/black-beans/#primaryimage'},
   'image': {'@id': 'https://www.themarketfoodshop.com/product/black-beans/#primaryimage'},
   'thumbnailUrl': 'https://www.themarketfoodshop.com/wp-content/uploads/2018/04/buy-black-beans-online.jpg',
   'datePublished': '2018-04-14T04:17:36+00:00',
   'dateModified': '2022-04-10T13:02:27+00:00',
   'description': 'Buy black beans for making frejon, a meal enjoyed by catholics on Good friday. Delivery available across Lagos and Nigeria.',
   'breadcrumb': {'@id': 'https://www.themarketfoodshop.com/product/black-beans/#breadcrumb'},
   'inLa

In [25]:
data_wo_json = json.loads(
    test_without.find(
        "script",
        {"type": "application/ld+json"},
        class_=lambda x: x!= "yoast-schema-graph").get_text())

In [26]:
data_wo_json.get("offers")[0].get("price")

'4000'

In [27]:
data_wo_json

{'@context': 'https://schema.org/',
 '@type': 'Product',
 '@id': 'https://www.themarketfoodshop.com/product/black-beans/#product',
 'name': 'Black beans for frejon meal',
 'url': 'https://www.themarketfoodshop.com/product/black-beans/',
 'description': 'Black beans\r\n\r\nBlack beans for making frejon meal\r\n\r\nalso known as ewa ibeji.\r\n\r\n&amp;nbsp;\r\n\r\nEach pack contains 2 derica\r\n\r\n&amp;nbsp;\r\n\r\nQuantity - 2 derica',
 'image': 'https://www.themarketfoodshop.com/wp-content/uploads/2018/04/buy-black-beans-online.jpg',
 'sku': 'MFS-313',
 'offers': [{'@type': 'Offer',
   'price': '4000',
   'priceValidUntil': '2024-12-31',
   'priceSpecification': {'price': '4000',
    'priceCurrency': 'NGN',
    'valueAddedTaxIncluded': 'false'},
   'priceCurrency': 'NGN',
   'availability': 'http://schema.org/InStock',
   'url': 'https://www.themarketfoodshop.com/product/black-beans/',
   'seller': {'@type': 'Organization',
    'name': 'Online Food Market for Nigerians | The Market Fo

# Put all together into a function

In [34]:
def scrape_category(
    link: str, 
    category: str,
    Item: BaseModel,
    s: requests.session(),
    delay: float = 1) -> list:
    """Function to scrape a category following pagination.
    Parameters:
        link (str): starting link for a category
        category (str): category name
        Item (BaseModel): class of the data object for the specific source
        s (requests.Session()): Requests session with User-Agent properly set
        delay (float): delay in seconds between calls to prevent overloading the source

    Returns:
        list of product with all information
    """
    time.sleep(delay)
    page = s.get(link)
    page = BeautifulSoup(page.text, 'html.parser')
    links = [item.find("a").get("href") for item in page.find_all("h3", {"class": "product-title"})]
    results = []
    for l in links[:3]:
        time.sleep(delay)
        product = s.get(l)
        product = BeautifulSoup(product.text, 'html.parser')
        parsed_product = {}
        parsed_product["link"] = l
        parsed_product["name"] = product.find("h1", {"class": "product_title"}).get_text()
        if product.find("form", {"class": "variations_form"}):
            data_json = json.loads(product.find("form", {"class": "variations_form"}).get("data-product_variations"))
            for item in data_json:
                parsed_product["size"] = [v for k,v in item.get("attributes").items() if k.startswith("attri")][0]  # It need improvement
                parsed_product["price"] = item.get("display_price")
                
                results.append(Item(**parsed_product))
        else:
            data_json = json.loads(
                product.find(
                    "script",
                    {"type": "application/ld+json"},
                    class_=lambda x: x!= "yoast-schema-graph"
                ).get_text()
            )
            parsed_product["price"] = data_json.get("offers")[0].get("price")
            parsed_product["description"] = data_json.get("description")
            results.append(Item(**parsed_product))

    # Follow pagination if exists
    try:
        next_page = page.find("ul", {"class": "page-numbers"}).find("a", {"class": "next"})
        if next_page is not None:
            next_page = next_page.get("href")
            next_results = scrape_category(link=next_page, category=category, Item=Item, s=s, delay=delay)
            results.extend(next_results)
    except AttributeError:
        pass
            
    return results


## Define data model

In [29]:
class Product(BaseModel):
    link: str
    source: str
    category: str = None
    subcategory: str = None
    subsubcategory: str = None
    name: str = None
    brand: str = None
    size: str = None
    uid: str = None
    price: float
    regular_price: float = None
    currency: str
    in_stock: str = None
    description: str = None
    date: str = date.today().strftime("%Y-%m-%d")

class Themarket(Product):
    source: str = "The Market Food Shop"
    currency: str = "NGN"

## Test on grains category

In [35]:
data = scrape_category(
    link = link_list[0].get("link"), 
    category = link_list[0].get("name"),
    Item = Themarket,
    s = s,
    delay = delay)

In [36]:
data_df = pd.DataFrame([prod.dict(exclude_none=True) for prod in data])

In [37]:
data_df

Unnamed: 0,link,source,name,size,price,currency,date
0,https://www.themarketfoodshop.com/product/bag-...,The Market Food Shop,Aroso Rice (Original),50kg,50000.0,NGN,2023-03-28
1,https://www.themarketfoodshop.com/product/bag-...,The Market Food Shop,Aroso Rice (Original),25kg,27000.0,NGN,2023-03-28
2,https://www.themarketfoodshop.com/product/bag-...,The Market Food Shop,Aroso Rice (Original),10kg,14000.0,NGN,2023-03-28
3,https://www.themarketfoodshop.com/product/blac...,The Market Food Shop,Black beans for frejon meal,,4000.0,NGN,2023-03-28
4,https://www.themarketfoodshop.com/product/brow...,The Market Food Shop,Brown rice (1kg),,4400.0,NGN,2023-03-28
5,https://www.themarketfoodshop.com/product/thai...,The Market Food Shop,Thailand rice (Aroso),,53000.0,NGN,2023-03-28
6,https://www.themarketfoodshop.com/product/whit...,The Market Food Shop,White Beans,,1100.0,NGN,2023-03-28
7,https://www.themarketfoodshop.com/product/buy-...,The Market Food Shop,Yellow Garri,,2200.0,NGN,2023-03-28


In [39]:
data_df.to_csv("themarketfoodshop_{}.csv".format(date.today().strftime("%Y-%m-%d")), index=False)
