# Import necessary libs

In [54]:
from datetime import datetime, timedelta
from bs4 import BeautifulSoup as bs
from dotenv import load_dotenv
import threading, queue
import pandas as pd
import numpy as np
import requests
import logging
import boto3
import json
import re
import os

# Get environment variable
load_dotenv('.env')
AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")

#initialize S3
S3 = boto3.resource('s3',
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY
)

#initialize dynamodb
dynamodb = boto3.resource('dynamodb', region_name='ap-southeast-2',
  aws_access_key_id=AWS_ACCESS_KEY_ID,
  aws_secret_access_key=AWS_SECRET_ACCESS_KEY
)

table_scrape_hist = dynamodb.Table('ScrapingHistory')

last_refresh = datetime.now()

# Necessary functions

In [56]:
# Function for filter the urls
def include_url_lambda(x):
    url = x.get("href")
    if url in visited_links:
        return False
    visited_links.add(url)
    if not include_url.search(url):
        return False
    to_visits.put(url)
    return True

# Function for scraping data from urls
def scrape_fn(url):
    while True:
        try:
            resp = requests.get(url)
            html = bs(resp.text)
            _ = [x for x in filter(include_url_lambda, html.findAll("a", {"rel": "bookmark"}))]
            raw_pages[url] = resp.text
            raw_articles[url] = html.findAll("article", {"class": "main-article"})[0]
            break
        except Exception as e:
            print(f"Error: {e} -- Trying to reconnect to url {url}.")
            
# Wrapper function for validate outdated of s3 credential and auto replacing
def validate_update(fn):
    def return_fn(*ar, **kw):
        global S3, last_refresh, refresh_time
        _now = datetime.now()
        if (last_refresh + refresh_time) <= _now:
            S3 = boto3.resource('s3', 
                aws_access_key_id=AWS_ACCESS_KEY_ID,
                aws_secret_access_key=AWS_SECRET_ACCESS_KEY
            )
            last_refresh = _now

        return fn(*ar, **kw)
    return return_fn

# Store the data to the S3 instance
@validate_update
def store_data_on_s3(prefix, source, ext_data, raw_data):
    try:
        now = datetime.now()
        filename = f"{prefix}-{now.strftime('%Y-%m-%d_%H:%M:%S')}.json"

        # Simple extracted recipes
        ext_s3 = S3.Object("eltrial", f"SCRAPING/SIMPLE_EXTRACTED/{filename}")

        # Save raw pages for debugging and correction
        raw_s3 = S3.Object("eltrial", f"SCRAPING/RAW/{filename}")

        ext_json = json.dumps(ext_data)
        raw_json = json.dumps(raw_data)

        ext_s3.put(Body=ext_json, ACL='public-read')
        raw_s3.put(Body=raw_json, ACL='public-read')
        
        table_scrape_hist.put_item(
            Item={
                "source": source,
                "timestamp": now.strftime("%Y-%m-%d %H:%M:%S"),
            }
        )
        
        print("Storing the data is completed")
    except Exception as e:
        print(f"Error: {e}")

# Settings

In [57]:
MAX_THREAD = 100
NOT_FOUND_TEXT = "Oops! That page can’t be found."
include_url = re.compile("food-recipe")
# include_url = re.compile("\/food-recipe|\/food-recommend|\/easy-menu")
BASE_URL = "https://food.mthai.com/food-recipe"
refresh_time = timedelta(hours=1)

# Crawl data from sources

In [58]:
# Main Thread

if __name__ == "__main__":
    
    visited_links = set()
    to_visits = queue.Queue()
    raw_pages = {}
    raw_articles = {}
    num_page = 1
    
    while True:
        
        if num_page == 1:
            start_url = BASE_URL
        else:
            start_url = f"{BASE_URL}/page/{num_page}"
        
        print(f"Searching for page {num_page}.")
        resp = requests.get(start_url)
        page_text = resp.text
        
        # Stop crawling when current page is not found.
        if resp.status_code == 404:
            print("Not found...\nComplete scraping.")
            break
        
        html = bs(page_text)
        _ = [x for x in filter(include_url_lambda, html.findAll("a", {"rel": "bookmark"}))]

        while True:
            threads = []

            if to_visits.empty():
                break

            qsize = to_visits.qsize()
            num_threads = min(MAX_THREAD, qsize)
            print(f"Generating {num_threads} threads ...")
            for _t in range(num_threads):
                _url = to_visits.get()
                _thread = threading.Thread(target=scrape_fn, args=(_url,))
                _thread.start()
                threads.append(_thread)

            for _t in threads:
                _t.join()
            print(f"finish run {num_threads} threads. {to_visits.qsize()} remaining urls")

            del threads
        print("finished.")
        
        num_page += 1

Searching for page 1.
Generating 13 threads ...
finish run 13 threads. 8 remaining urls
Generating 8 threads ...
finish run 8 threads. 5 remaining urls
Generating 5 threads ...
finish run 5 threads. 1 remaining urls
Generating 1 threads ...
finish run 1 threads. 0 remaining urls
finished.
Searching for page 2.
Generating 11 threads ...
finish run 11 threads. 11 remaining urls
Generating 11 threads ...
finish run 11 threads. 11 remaining urls
Generating 11 threads ...
finish run 11 threads. 3 remaining urls
Generating 3 threads ...
finish run 3 threads. 0 remaining urls
finished.
Searching for page 3.
Generating 9 threads ...
finish run 9 threads. 6 remaining urls
Generating 6 threads ...
finish run 6 threads. 6 remaining urls
Generating 6 threads ...
finish run 6 threads. 2 remaining urls
Generating 2 threads ...
finish run 2 threads. 2 remaining urls
Generating 2 threads ...
finish run 2 threads. 0 remaining urls
finished.
Searching for page 4.
Generating 9 threads ...
finish run 9 th

Generating 7 threads ...
finish run 7 threads. 0 remaining urls
finished.
Searching for page 49.
Generating 10 threads ...
finish run 10 threads. 0 remaining urls
finished.
Searching for page 50.
Generating 9 threads ...
finish run 9 threads. 2 remaining urls
Generating 2 threads ...
finish run 2 threads. 0 remaining urls
finished.
Searching for page 51.
Generating 7 threads ...
finish run 7 threads. 0 remaining urls
finished.
Searching for page 52.
Generating 11 threads ...
finish run 11 threads. 0 remaining urls
finished.
Searching for page 53.
Generating 10 threads ...
finish run 10 threads. 2 remaining urls
Generating 2 threads ...
finish run 2 threads. 0 remaining urls
finished.
Searching for page 54.
Generating 10 threads ...
finish run 10 threads. 5 remaining urls
Generating 5 threads ...
finish run 5 threads. 0 remaining urls
finished.
Searching for page 55.
Generating 9 threads ...
finish run 9 threads. 3 remaining urls
Generating 3 threads ...
finish run 3 threads. 0 remainin

In [59]:
# Print number of pages
len(raw_pages), len(raw_articles)

(910, 910)

# Simple Extract Recipe From Pages

In [60]:
ingredients = {}
EXCLUDE_WORDS = set(["หมายถึง", "อาหารเจ", "เคล็ดลับความอร่อย"])
ingredient_word = re.compile("ส่วนผสม|วัตถุดิบ|เครื่องปรุง|ส่วนประกอบ|วัตุดิบ|อัตราส่วน")
how_to_word = re.compile("ขั้นตอนการทำ|วิธีทำ|วิธีการทำ")

process = 0
_include = 0
_not_include = []

for url, page in raw_articles.items():
    strong = page.findAll(["strong", "p", "h1", "h2", "h3", "li", "a"])
    
    header_exists = False
    title = ""
    for s in strong:
        text = s.text
        is_ingredient = ingredient_word.search(text)
        is_how_to = how_to_word.search(text)

        if (s.name in ["strong", "h3", "h1"]) and not (is_ingredient or is_how_to) and text not in EXCLUDE_WORDS:
            title = f"{text}_{url}"
        else:
            if not header_exists:
                if is_ingredient:
                    header_exists = True
            else:
                if is_how_to:
                    header_exists = False
                else:
                    if s.name == "li":
                        if title not in ingredients:
                            ingredients[title] = [text]
                        else:
                            ingredients[title].append(text)
         
    process += 1
    print("\r", end=f"{process}")

910

In [61]:
ingredients

{'ข้าวผัดกุ้ง_https://food.mthai.com/food-recipe/138335.html': ['น้ำมันรำข้าว',
  'กระเทียมสับ 2 ช้อนโต๊ะ',
  'มันกุ้ง 80 กรัม',
  'เกลือ 1/2 ช้อนชา',
  'กุ้งสด 60 กรัม',
  'ข้าวสวย 1 ถ้วย',
  'ซอสหอยนางรม 2 ช้อนโต๊ะ',
  'แครอท (ซอย) 2 ช้อนโต๊ะ',
  'ต้นหอม (ซอย) 2 ช้อนโต๊ะ',
  'พริกไทยขาว (ป่น) 1 ช้อนชา'],
 ' ผัดเปรี้ยวหวาน_https://food.mthai.com/food-recipe/142292.html': ['เนื้อหมูสันนอก\xa0 200 กรัม',
  'สับปะรด 40 กรัม',
  'ซอสมะเขือเทศ 40 กรัม',
  'ซอสพริก 15 กรัม',
  'น้ำตาลทรายแดง 15 กรัม',
  'น้ำมันรำข้าว 30 กรัม',
  'กระเทียมสับ 10 กรัม',
  'หอมหัวใหญ่ 50 กรัม',
  'แตงกวา 50 กรัม',
  'มะเขือเทศ\xa0 50 กรัม',
  'สับปะรด 70 กรัม',
  'ต้นหอม\xa0 15 กรัม'],
 ' ราดหน้าหมูคะน้า\xa0_https://food.mthai.com/food-recipe/124172.html': ['ซอสหอยนางรม 3 ช้อนโต๊ะ',
  'ซอสปรุงรส 2 ช้อนชา',
  'ซีอิ๊วขาว 2 1/2 ช้อนโต๊ะ',
  'น้ำมันงา 1 ช้อนโต๊ะ',
  'น้ำตาลทราย 2 ช้อนชา',
  'แป้งมันสำปะหลัง 1 ช้อนโต๊ะ',
  'พริกไทยป่น 2 ช้อนชา',
  'น้ำเปล่า 1/4 ถ้วย',
  'เนื้อหมูสันนอก 500 กรัม',
  'ไข่ขาว 2 ฟอง',


# Store crawling data to S3 for further utilization

In [62]:
store_data_on_s3(prefix="Mthai-recipe", source=BASE_URL, ext_data=ingredients, raw_data=raw_pages)

Storing the data is completed
