# Import necessary libs

In [1]:
# Standard library
import functools
import json
import logging
import os
import re
import threading, queue
import time
from datetime import datetime, timedelta
from urllib.parse import urljoin

# Third-party
import boto3
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
from dotenv import load_dotenv

# Read the AWS key pair from the environment after loading the local .env file
load_dotenv('.env')
AWS_ACCESS_KEY_ID = os.environ.get("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY")

# Credential keyword arguments shared by the boto3 resources created below
_AWS_CREDENTIALS = dict(
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
)

# S3 resource handle (validate_update replaces this global when it gets old)
S3 = boto3.resource('s3', **_AWS_CREDENTIALS)

# DynamoDB resource handle in the ap-southeast-2 region
dynamodb = boto3.resource('dynamodb', region_name='ap-southeast-2', **_AWS_CREDENTIALS)

# Table that store_data_on_s3 writes a (source, timestamp) item to
table_scrape_hist = dynamodb.Table('ScrapingHistory')

# When the S3 handle was last created; validate_update compares it to refresh_time
last_refresh = datetime.now()

# Necessary functions

In [66]:
# Serialises the check-then-add on visited_links: scrape_fn calls
# include_url_lambda from many threads at once.
_VISITED_LOCK = threading.Lock()


# Filter predicate for anchor tags: queue unseen recipe URLs
def include_url_lambda(x):
    """Decide whether an anchor points at a new recipe URL and queue it if so.

    The href is resolved against ``BASE_URL`` with ``urljoin`` so that
    relative links are completed while absolute links are left intact
    (simple prefixing would produce ``<BASE_URL>https://...`` for them).

    Side effects: every resolved URL is added to the global ``visited_links``
    set; URLs that also match ``include_url`` are put on the global
    ``to_visits`` queue.

    Args:
        x: Object exposing ``.get("href")`` (an anchor tag, or any mapping).

    Returns:
        True if the URL was unseen and matches ``include_url``; False if the
        href is missing/empty, the URL was already seen, or it does not match.
    """
    href = x.get("href")
    if not href:
        # <a> without an href (or an empty one) has nothing to crawl.
        return False

    url = urljoin(BASE_URL, href)

    # Membership test and insertion must be one atomic step, otherwise two
    # threads can both see the URL as new and queue it twice.
    with _VISITED_LOCK:
        if url in visited_links:
            return False
        visited_links.add(url)

    if not include_url.search(url):
        return False
    to_visits.put(url)
    return True

# Function for scraping data from urls
def scrape_fn(url, max_retries=5, timeout=30, backoff=1.0):
    """Fetch one page, queue the recipe links it contains and keep its HTML.

    On success the response text is stored in the global ``raw_pages`` under
    ``url`` and every ``<a>`` tag is passed to ``include_url_lambda``.

    Failures are retried a bounded number of times. An unbounded retry loop
    would keep a worker thread (and the ``join`` waiting on it) alive forever
    when a URL never succeeds.

    Args:
        url: Page to download.
        max_retries: Maximum number of attempts before giving up.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.
        backoff: Base delay in seconds; attempt ``k`` waits ``backoff * k``
            before the next try. Use 0 to retry immediately.

    Returns:
        None. Errors are printed rather than raised.
    """
    for attempt in range(1, max_retries + 1):
        try:
            resp = requests.get(
                url,
                headers={"user-agent": str(np.random.randint(0, 1e10))},
                timeout=timeout,
            )
            html = bs(resp.text)
            # Called for its side effects: queues unseen recipe URLs.
            for anchor in html.findAll("a"):
                include_url_lambda(anchor)
            raw_pages[url] = resp.text
            return
        except Exception as e:
            print(f"Error: {e} -- Trying to reconnect to url {url}.")
            if attempt < max_retries:
                time.sleep(backoff * attempt)

    print(f"Giving up on url {url} after {max_retries} attempts.")
            
# Decorator that recreates the S3 handle once it is older than refresh_time
def validate_update(fn):
    """Wrap ``fn`` so the global ``S3`` resource is rebuilt when it is stale.

    Before each call the wrapper checks whether ``last_refresh + refresh_time``
    has passed. If so it creates a new ``boto3`` S3 resource from the same
    ``AWS_ACCESS_KEY_ID`` / ``AWS_SECRET_ACCESS_KEY`` values, stores it in the
    global ``S3`` and updates the global ``last_refresh``.

    Args:
        fn: Callable to wrap.

    Returns:
        A wrapper with ``fn``'s name/docstring that forwards all arguments to
        ``fn`` and returns its result.
    """
    @functools.wraps(fn)  # keep fn.__name__ / __doc__ on the wrapper
    def return_fn(*ar, **kw):
        # refresh_time is only read, so it needs no global declaration.
        global S3, last_refresh
        _now = datetime.now()
        if (last_refresh + refresh_time) <= _now:
            S3 = boto3.resource('s3',
                aws_access_key_id=AWS_ACCESS_KEY_ID,
                aws_secret_access_key=AWS_SECRET_ACCESS_KEY
            )
            last_refresh = _now

        return fn(*ar, **kw)
    return return_fn

# Upload the extracted recipes to S3 and log the run in DynamoDB
@validate_update
def store_data_on_s3(prefix, source, ext_data, raw_data):
    """Write ``ext_data`` as JSON to S3 and record the scrape in DynamoDB.

    The object is stored in bucket ``eltrial`` under
    ``SCRAPING/SIMPLE_EXTRACTED/<prefix>-<timestamp>.json`` with a
    ``public-read`` ACL. Afterwards an item holding ``source`` and the same
    timestamp is put into ``table_scrape_hist``.

    Args:
        prefix: Leading part of the object file name.
        source: Value stored in the history item's ``source`` field.
        ext_data: JSON-serialisable data to upload.
        raw_data: Accepted but currently unused (raw-page upload is disabled).

    Returns:
        None. Any exception is caught and printed.
    """
    try:
        stored_at = datetime.now()
        key_stamp = stored_at.strftime('%Y-%m-%d_%H:%M:%S')
        object_key = f"SCRAPING/SIMPLE_EXTRACTED/{prefix}-{key_stamp}.json"

        # Simple extracted recipes
        S3.Object("eltrial", object_key).put(
            Body=json.dumps(ext_data),
            ACL='public-read',
        )

        history_item = {
            "source": source,
            "timestamp": stored_at.strftime("%Y-%m-%d %H:%M:%S"),
        }
        table_scrape_hist.put_item(Item=history_item)

        print("Storing the data is completed")
    except Exception as err:
        print(f"Error: {err}")

# Settings

In [70]:
# Upper bound on scraper threads started per crawl batch
MAX_THREAD = 200
# Only URLs matching this pattern are queued for crawling. The dots are
# escaped so they match a literal "." instead of any character.
include_url = re.compile(r"www\.wongnai\.com/recipes")
# Root of the crawled site; hrefs found on pages are combined with it
BASE_URL = "https://www.wongnai.com/"
# Age after which validate_update recreates the S3 resource
refresh_time = timedelta(hours=1)

# Crawl data from sources

In [19]:
# Main Thread
#
# Breadth-first crawl: fetch the start page, then repeatedly drain up to
# MAX_THREAD queued URLs into worker threads until the queue stays empty.

if __name__ == "__main__":

    # Shared crawl state, mutated by include_url_lambda and scrape_fn
    visited_links = set()      # every URL already considered
    to_visits = queue.Queue()  # recipe URLs still to be fetched
    raw_pages = {}             # url -> raw HTML text
    raw_articles = {}          # not used in the visible cells; kept as-is

    start_url = f"{BASE_URL}"

    # Seed the queue from the start page. A timeout is set because, without
    # one, requests may wait indefinitely on an unresponsive server.
    resp = requests.get(
        start_url,
        headers={"user-agent": str(np.random.randint(0, 1e10))},
        timeout=30,
    )
    page_text = resp.text

    html = bs(page_text)
    # Called for its side effects: queues unseen recipe URLs.
    for _anchor in html.findAll("a"):
        include_url_lambda(_anchor)

    while not to_visits.empty():
        threads = []

        qsize = to_visits.qsize()
        num_threads = min(MAX_THREAD, qsize)
        print(f"Generating {num_threads} threads ...")
        for _t in range(num_threads):
            _url = to_visits.get()
            _thread = threading.Thread(target=scrape_fn, args=(_url,))
            _thread.start()
            threads.append(_thread)

        # Wait for the whole batch; workers may have queued more URLs meanwhile.
        for _t in threads:
            _t.join()
        print(f"finish run {num_threads} threads. {to_visits.qsize()} remaining urls")

    print("finished.")

Generating 200 threads ...
finish run 200 threads. 1318 remaining urls
Generating 200 threads ...
finish run 200 threads. 2010 remaining urls
Generating 200 threads ...
finish run 200 threads. 2464 remaining urls
Generating 200 threads ...
finish run 200 threads. 2826 remaining urls
Generating 200 threads ...
finish run 200 threads. 3310 remaining urls
Generating 200 threads ...
finish run 200 threads. 3691 remaining urls
Generating 200 threads ...
finish run 200 threads. 4071 remaining urls
Generating 200 threads ...
finish run 200 threads. 4441 remaining urls
Generating 200 threads ...
finish run 200 threads. 4673 remaining urls
Generating 200 threads ...
finish run 200 threads. 4856 remaining urls
Generating 200 threads ...
finish run 200 threads. 5022 remaining urls
Generating 200 threads ...
finish run 200 threads. 5305 remaining urls
Generating 200 threads ...
finish run 200 threads. 5494 remaining urls
Generating 200 threads ...
finish run 200 threads. 5465 remaining urls
Genera

finish run 200 threads. 9388 remaining urls
Generating 200 threads ...
finish run 200 threads. 9364 remaining urls
Generating 200 threads ...
finish run 200 threads. 9311 remaining urls
Generating 200 threads ...
finish run 200 threads. 9290 remaining urls
Generating 200 threads ...
finish run 200 threads. 9250 remaining urls
Generating 200 threads ...
finish run 200 threads. 9247 remaining urls
Generating 200 threads ...
finish run 200 threads. 9238 remaining urls
Generating 200 threads ...
finish run 200 threads. 9202 remaining urls
Generating 200 threads ...
finish run 200 threads. 9144 remaining urls
Generating 200 threads ...
finish run 200 threads. 9110 remaining urls
Generating 200 threads ...
finish run 200 threads. 9121 remaining urls
Generating 200 threads ...
finish run 200 threads. 9111 remaining urls
Generating 200 threads ...
finish run 200 threads. 9075 remaining urls
Generating 200 threads ...
finish run 200 threads. 9040 remaining urls
Generating 200 threads ...
finish

finish run 200 threads. 1401 remaining urls
Generating 200 threads ...
finish run 200 threads. 1302 remaining urls
Generating 200 threads ...
finish run 200 threads. 1222 remaining urls
Generating 200 threads ...
finish run 200 threads. 1135 remaining urls
Generating 200 threads ...
finish run 200 threads. 1085 remaining urls
Generating 200 threads ...
finish run 200 threads. 992 remaining urls
Generating 200 threads ...
finish run 200 threads. 906 remaining urls
Generating 200 threads ...
finish run 200 threads. 843 remaining urls
Generating 200 threads ...
finish run 200 threads. 750 remaining urls
Generating 200 threads ...
finish run 200 threads. 665 remaining urls
Generating 200 threads ...
finish run 200 threads. 555 remaining urls
Generating 200 threads ...
finish run 200 threads. 478 remaining urls
Generating 200 threads ...
finish run 200 threads. 388 remaining urls
Generating 200 threads ...
finish run 200 threads. 305 remaining urls
Generating 200 threads ...
finish run 200 

In [20]:
# Number of pages whose HTML was stored in raw_pages (one entry per URL)
len(raw_pages)

49943

# Simple Extract Recipe From Pages

In [60]:
# Build an ingredient-name -> amount mapping from one parsed recipe payload.
# NOTE(review): `js` is not assigned anywhere above this cell in the visible
# notebook; it is only set inside the extraction loop in the following cell.
# On a fresh kernel run top-to-bottom this raises NameError. Looks like
# leftover scratch work - confirm and consider removing.
ings = {}
for h in js["store"]["recipeData"]["value"]["data"]["ingredients"]:
    ings[h["ingredientName"]] = h["amount"]


In [63]:
# Map recipe title -> {ingredient name: amount} for every crawled page that
# embeds a recipe payload in one of its <script> tags.
ingredients = {}
raw_pages_key_list = list(raw_pages.keys())
for k, page_url in enumerate(raw_pages_key_list):
    page_text = raw_pages[page_url]
    html = bs(page_text)
    rr = html.findAll("script")
    for i in rr:
        try:
            # Strip the "window._wn = " assignment wrapper so the rest parses as JSON.
            js = json.loads(i.contents[0].replace("\n\twindow._wn = ", "").replace("\n\t", "").replace(";", ""))
            recipe = js["store"]["recipeData"]["value"]["data"]
            title = recipe["title"]
            ings = {h["ingredientName"]: h["amount"] for h in recipe["ingredients"]}
            ingredients[title] = ings
            break
        except Exception:
            # Most scripts are not the recipe payload (empty tag, not JSON,
            # missing keys): try the next one. Catching Exception instead of
            # a bare except keeps Ctrl-C / kernel interrupt working.
            continue

    # Overwrite the same output line with the index of the page just processed.
    print("\r", end=f"{k}")

49942

In [64]:
# Number of distinct recipe titles extracted into `ingredients`
len(ingredients)

9221

# Store the crawled data in S3 for later use

In [68]:
# Upload the extracted recipes; raw_pages is passed through but store_data_on_s3 does not upload it
store_data_on_s3(prefix="Wongnai-recipe", source=BASE_URL, ext_data=ingredients, raw_data=raw_pages)

Storing the data is completed
