# Import necessary libraries

In [4]:
from datetime import datetime, timedelta
from bs4 import BeautifulSoup as bs
from dotenv import load_dotenv
import threading, queue
import pandas as pd
import numpy as np
import requests
import logging
import boto3
import json
import re
import os

# Read the AWS credentials from the local .env file
load_dotenv('.env')
AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")

# Keyword arguments shared by both boto3 resources created below
_aws_credentials = {
    "aws_access_key_id": AWS_ACCESS_KEY_ID,
    "aws_secret_access_key": AWS_SECRET_ACCESS_KEY,
}

# S3 resource (re-created periodically by validate_update)
S3 = boto3.resource('s3', **_aws_credentials)

# DynamoDB resource and the table that records each scraping run
dynamodb = boto3.resource('dynamodb', region_name='ap-southeast-2', **_aws_credentials)
table_scrape_hist = dynamodb.Table('ScrapingHistory')

# Time at which the S3 resource was last created
last_refresh = datetime.now()

# Necessary functions

In [14]:
# Guards the check-and-add on `visited_links`: this filter is called
# concurrently from the scraper threads started in the crawl loop.
_visited_lock = threading.Lock()


# Function for filtering the urls
def include_url_lambda(x):
    """Decide whether an anchor points to a not-yet-seen recipe page and queue it.

    The function is meant to be used as a predicate over anchor tags, but it
    also has side effects: a new URL is recorded in the global
    ``visited_links`` set and pushed onto the global ``to_visits`` queue.

    Args:
        x: Anchor-like object exposing ``get("href")`` (e.g. a BeautifulSoup tag).

    Returns:
        True if the link was new and has been queued, False otherwise
        (no href, no "view" in the href, already visited, or rejected by
        the global ``include_url`` pattern).
    """
    href = x.get("href")
    if href is None or "view" not in href:
        return False

    # Normalise every link onto the canonical host. Split only on the FIRST
    # "view" so that a later occurrence (e.g. "?ref=review") does not
    # truncate the rest of the URL.
    url = f"https://cooking.kapook.com/view{href.split('view', 1)[1]}"

    # Check-and-add must be atomic, otherwise two threads can both see the
    # URL as unvisited and queue it twice.
    with _visited_lock:
        if url in visited_links:
            return False
        visited_links.add(url)

    if not include_url.search(url):
        return False
    to_visits.put(url)
    return True

# Function for scraping data from urls
def scrape_fn(url, max_retries=5, timeout=30):
    """Download one page, queue the recipe links it contains and keep its content.

    On success the raw HTML is stored in the global ``raw_pages[url]`` and,
    if the page has a ``<div id="container">``, the first such element is
    stored in the global ``raw_articles[url]``.

    Args:
        url: Page to download.
        max_retries: Maximum number of attempts before giving up (at least
            one attempt is always made). Bounded so that a permanently
            failing URL cannot keep its thread alive forever.
        timeout: Timeout in seconds passed to ``requests.get``.

    Returns:
        None. Failures are printed; after the last attempt the URL is skipped.
    """
    import time  # local import: `time` is not among the notebook's top-level imports

    attempts = max(1, max_retries)
    for attempt in range(1, attempts + 1):
        try:
            resp = requests.get(url, timeout=timeout)
            html = bs(resp.text)
            # Called only for its side effect of queueing unseen recipe links.
            for anchor in html.findAll("a"):
                include_url_lambda(anchor)
            raw_pages[url] = resp.text
            containers = html.findAll("div", {"id": "container"})
            if containers:
                raw_articles[url] = containers[0]
            return
        except Exception as e:
            if attempt == attempts:
                print(f"Error: {e} -- Giving up on url {url} after {attempts} attempts.")
                return
            print(f"Error: {e} -- Trying to reconnect to url {url}.")
            # Short exponential backoff so failing hosts are not hammered.
            time.sleep(min(2 ** (attempt - 1), 10))
            
# Decorator that refreshes the S3 resource when it is older than `refresh_time`
def validate_update(fn):
    """Wrap ``fn`` so the global ``S3`` resource is re-created when stale.

    Before every call the wrapper compares the global ``last_refresh`` plus
    the global ``refresh_time`` with the current time; if that moment has
    passed, ``S3`` is rebuilt from the module-level credentials and
    ``last_refresh`` is updated.

    Args:
        fn: Callable to wrap.

    Returns:
        A wrapper with the same call signature, name and docstring as ``fn``
        that returns whatever ``fn`` returns.
    """
    from functools import wraps  # local import: not among the top-level imports

    @wraps(fn)  # keep fn's __name__/__doc__ on the wrapper
    def return_fn(*ar, **kw):
        global S3, last_refresh
        _now = datetime.now()
        if (last_refresh + refresh_time) <= _now:
            S3 = boto3.resource('s3',
                aws_access_key_id=AWS_ACCESS_KEY_ID,
                aws_secret_access_key=AWS_SECRET_ACCESS_KEY
            )
            last_refresh = _now

        return fn(*ar, **kw)
    return return_fn

# Persist the extracted recipes on S3 and log the run in DynamoDB
@validate_update
def store_data_on_s3(prefix, source, ext_data, raw_data):
    """Upload ``ext_data`` as JSON to S3 and record the run in the history table.

    Args:
        prefix: Leading part of the object file name; a timestamp and
            ``.json`` are appended.
        source: Value written to the ``source`` attribute of the history item.
        ext_data: JSON-serialisable extracted data to upload.
        raw_data: Raw pages; accepted but currently not uploaded.

    Returns:
        None. Any exception is caught and printed.
    """
    try:
        stored_at = datetime.now()
        stamp = stored_at.strftime('%Y-%m-%d_%H:%M:%S')
        object_key = f"SCRAPING/SIMPLE_EXTRACTED/{prefix}-{stamp}.json"

        # Simple extracted recipes.
        # NOTE(review): the object is uploaded with a public-read ACL --
        # confirm the scraped data is meant to be publicly readable.
        payload = json.dumps(ext_data)
        S3.Object("eltrial", object_key).put(Body=payload, ACL='public-read')

        # Record when this source was scraped.
        history_item = {
            "source": source,
            "timestamp": stored_at.strftime("%Y-%m-%d %H:%M:%S"),
        }
        table_scrape_hist.put_item(Item=history_item)

        print("Storing the data is completed")
    except Exception as e:
        print(f"Error: {e}")

# Settings

In [6]:
# Upper bound on the number of scraper threads started per crawl round
MAX_THREAD = 200
# Only recipe pages are crawled. The dots are escaped so they match a
# literal "." instead of any character.
include_url = re.compile(r"cooking\.kapook\.com/view")
# Alternative pattern (disabled):
# include_url = re.compile("\/food-recipe|\/food-recommend|\/easy-menu")
BASE_URL = "https://cooking.kapook.com/"
# Age after which the S3 resource is re-created
refresh_time = timedelta(hours=1)

# Crawl data from sources

In [7]:
# Main Thread

if __name__ == "__main__":

    # Shared crawl state, read and written by the scraper threads
    visited_links = set()
    to_visits = queue.Queue()
    raw_pages = {}
    raw_articles = {}

    # BASE_URL already ends with "/"; strip it so the seed URL does not
    # contain a double slash.
    start_url = f"{BASE_URL.rstrip('/')}/menu"

    # Seed the queue with the recipe links found on the menu page.
    resp = requests.get(start_url, timeout=30)
    page_text = resp.text

    html = bs(page_text)
    for anchor in html.findAll("a"):
        include_url_lambda(anchor)

    # Crawl in rounds: each round starts one thread per queued URL (capped at
    # MAX_THREAD) and waits for all of them; threads may queue further URLs.
    while not to_visits.empty():
        num_threads = min(MAX_THREAD, to_visits.qsize())
        print(f"Generating {num_threads} threads ...")

        threads = []
        for _t in range(num_threads):
            _url = to_visits.get()
            _thread = threading.Thread(target=scrape_fn, args=(_url,))
            _thread.start()
            threads.append(_thread)

        for _thread in threads:
            _thread.join()
        print(f"finish run {num_threads} threads. {to_visits.qsize()} remaining urls")

    print("finished.")

Generating 79 threads ...
finish run 79 threads. 363 remaining urls
Generating 200 threads ...
finish run 200 threads. 429 remaining urls
Generating 200 threads ...
finish run 200 threads. 361 remaining urls
Generating 200 threads ...
finish run 200 threads. 353 remaining urls
Generating 200 threads ...
finish run 200 threads. 293 remaining urls
Generating 200 threads ...
finish run 200 threads. 227 remaining urls
Generating 200 threads ...
finish run 200 threads. 110 remaining urls
Generating 110 threads ...
finish run 110 threads. 24 remaining urls
Generating 24 threads ...
finish run 24 threads. 14 remaining urls
Generating 14 threads ...
finish run 14 threads. 7 remaining urls
Generating 7 threads ...
finish run 7 threads. 1 remaining urls
Generating 1 threads ...
finish run 1 threads. 9 remaining urls
Generating 9 threads ...
finish run 9 threads. 5 remaining urls
Generating 5 threads ...
finish run 5 threads. 2 remaining urls
Generating 2 threads ...
finish run 2 threads. 1 remai

In [8]:
# Number of pages fetched vs. pages in which a <div id="container"> was found
len(raw_pages), len(raw_articles)

(1452, 1412)

# Simple recipe extraction from pages

In [9]:
# Maps "<title>_<url>" to the list of <li> texts collected for that title
ingredients = {}
# Heading texts that must not be taken as a recipe title
EXCLUDE_WORDS = set(["หมายถึง", "อาหารเจ", "เคล็ดลับความอร่อย"])
# Text that opens an ingredient section
ingredient_word = re.compile("ส่วนผสม|วัตถุดิบ|เครื่องปรุง|ส่วนประกอบ|วัตุดิบ|อัตราส่วน")
# Text that closes an ingredient section
how_to_word = re.compile("ขั้นตอนการทำ|วิธีทำ|วิธีการทำ")

process = 0
# Not used by the code visible in this notebook; kept in case other cells read them
_include = 0
_not_include = []

for url, page in raw_articles.items():
    elements = page.findAll(["strong", "p", "h1", "h2", "h3", "li", "a"])

    in_ingredient_section = False
    # Fallback key for list items seen before any title. It includes the URL
    # so that such items from different pages are not merged under one
    # shared "" key.
    title = f"_{url}"
    for s in elements:
        text = s.text
        is_ingredient = ingredient_word.search(text)
        is_how_to = how_to_word.search(text)

        is_title = (
            s.name in ["strong", "h3", "h1"]
            and not (is_ingredient or is_how_to)
            and text not in EXCLUDE_WORDS
        )
        if is_title:
            title = f"{text}_{url}"
        elif not in_ingredient_section:
            # Outside a section: only an ingredient heading changes state.
            if is_ingredient:
                in_ingredient_section = True
        elif is_how_to:
            # The how-to heading ends the ingredient section.
            in_ingredient_section = False
        elif s.name == "li":
            ingredients.setdefault(title, []).append(text)

    process += 1
    print("\r", end=f"{process}")

1412

In [11]:
# Number of keys in `ingredients`; a key is only created once a list item is collected for it
len(ingredients)

336

In [10]:
# Inspect the extracted data: key -> list of collected <li> texts
ingredients

{'ไมโครเวฟพร้อมจ้า ! ชวนทำไดฟูกุสตรอว์เบอร์รี เมนูไมโครเวฟ แป้งสีสวยเหนียวนุ่ม สอดไส้ถั่วแดงกวนกับผลไม้สด ปั้นลูกโตน่าหม่ำ จับแช่เย็นยิ่งฟิน_https://cooking.kapook.com/view206822.html': ['แป้งข้าวเหนียว 1+1/2 ถ้วย',
  'แป้งข้าวโพด 1 ช้อนโต๊ะ',
  'น้ำตาลทรายทรายป่นละเอียด 5 ช้อนโต๊ะ',
  'น้ำสะอาด 250 กรัม',
  'สีผสมอาหารสีแดง (ใส่หรือไม่ใส่ก็ได้) \xa0',
  'แป้งมันสำปะหลัง สำหรับทำแป้งนวล 100 กรัม (นำไปผัดด้วยไฟกลางค่อนข้างอ่อน 5 นาที)\n\t\xa0',
  'สตรอว์เบอร์รี\n\t\xa0',
  'เนยสดเค็ม 1/2 ช้อนโต๊ะ',
  'ถั่วแดงกระป๋อง 200 กรัม',
  'น้ำตาลทรายแดง 160 กรัม\n\t\xa0'],
 ' ขนมหวานวันหยุด เมนูคัพเค้กไมโครเวฟ เมนูไมโครเวฟ อร่อยนุ่มนานแต่งหน้าตามชอบ ทำง่ายไม่ต้องใช้เตาอบเหมาะกับเด็กหอ\xa0\xa0_https://cooking.kapook.com/view202667.html': ['เนยสด 120 กรัม',
  'น้ำตาลทราย 120 กรัม',
  'กลิ่นวานิลลา 1/2 ช้อนชา',
  'ไข่ 2 ฟอง',
  'แป้งสาลีอเนกประสงค์ 120 กรัม',
  'ครีมและเกล็ดน้ำตาลตกแต่งตามชอบ\n\t\xa0'],
 '\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0 สะบัดบ๊อบให้เมนูช็อกโกแลตบอลสุดเชย แล้วมาลองทำช็อกโกแล

# Store the crawled data on S3 for further use

In [15]:
# Upload the extracted ingredients; raw_pages is passed as raw_data, which store_data_on_s3 currently does not upload
store_data_on_s3(prefix="Kapook-recipe", source=BASE_URL, ext_data=ingredients, raw_data=raw_pages)

Storing the data is completed
