In [72]:
from dotenv import load_dotenv
from datetime import datetime
import pandas as pd
import requests
import psycopg2
import boto3
import json
import os
import re

# Load secrets from the local .env file so they do not have to live in the notebook.
load_dotenv('.env')
AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")

# libpq connection string for the recipe database. POSTGRES_DSN (set in the
# environment or in .env) takes precedence; the literal is only the legacy fallback.
# NOTE(review): move the fallback credentials into .env and drop the literal.
POSTGRES_DSN = os.getenv("POSTGRES_DSN", "dbname=eltrial user=top password=1234")

# S3 client used to download the scraped recipe dumps.
S3 = boto3.client(
    's3',
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
)

# DynamoDB resource and the recipe table handle.
dynamodb = boto3.resource(
    'dynamodb',
    region_name='ap-southeast-2',
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
)

table_recipe = dynamodb.Table('FoodRecipes')

# PostgreSQL connection; it is closed explicitly by a later cell (conn.close()).
conn = psycopg2.connect(POSTGRES_DSN)

In [60]:
# Location of the scraped recipe dumps in S3.
BUCKET = "eltrial"
OBJECT = "SCRAPING/SIMPLE_EXTRACTED/"
FILENAME = [
    "Mthai-recipe-2020-07-22_20:25:58.json",
    "Kapook-recipe-2020-07-25_15:25:52.json",
    "Wongnai-recipe-2020-07-25_13:54:26.json",
]

# Fetch every dump into the working directory under its own file name.
for f in FILENAME:
    S3.download_file(Bucket=BUCKET, Key=OBJECT + f, Filename=f)

In [65]:
# Load the three recipe dumps downloaded from S3. JSON text is UTF-8, so the
# encoding is passed explicitly instead of relying on the kernel's locale default.
with open("Wongnai-recipe-2020-07-25_13:54:26.json", encoding="utf-8") as f:
    wongnai = json.load(f)

with open("Kapook-recipe-2020-07-25_15:25:52.json", encoding="utf-8") as f:
    kapook = json.load(f)

with open("Mthai-recipe-2020-07-22_20:25:58.json", encoding="utf-8") as f:
    mthai = json.load(f)

In [124]:
# One or more digits, a single space, then the unit word, anchored at the end of the
# string. Raw string: `\d` is not a valid escape in a normal string literal.
num = re.compile(r"\d+ ‡∏Å‡∏£‡∏±‡∏°$")

In [130]:
# Spot-check the pattern on a single ingredient value. `j` and `k` are not assigned
# in this cell; they must already exist in the kernel from an earlier run.
# `.string` is the whole string that was searched (not just the matched part), and
# the expression raises AttributeError when nothing matches (search returns None).
num.search(wongnai[j][k]).string

'150 ‡∏Å‡∏£‡∏±‡∏°'

# Wongnai

In [574]:
# Unit spellings found in ingredient quantity strings.
# Keys are regular-expression fragments, not plain strings: each key is substituted
# verbatim into the number+unit templates in `patterns` further down this cell, which
# is why some keys carry `$` or `.*`.
# Values are either a unit label string or a (number, unit label) tuple. None of the
# visible cells read the values - presumably the number is a conversion factor into
# the labelled unit; confirm before relying on it.
# NOTE(review): the mapping looks inconsistent in places - one key maps to plain "kg"
# while "kg"/"Kg"/"KG" map to (1000, "g"), and the 15-factor entries use "ml" once
# and "g" elsewhere. Verify the intended targets.
units = {
    "‡∏Å‡∏£‡∏±‡∏°": "g",
    "g": "g",
    "grams": "g",
    "‡∏Å‡∏£‡∏∞‡∏°": "g",
    "‡∏Å‡∏¥‡πÇ‡∏•": "kg",
    "‡∏ü‡∏≠‡∏á": "unit",
    "‡∏ä‡∏¥‡πâ‡∏ô‡πÉ‡∏´‡∏ç‡πà": "unknown",
    "‡∏°‡∏¥‡∏•‡∏•‡∏¥‡∏•‡∏¥‡∏ï‡∏£": "ml",
    "‡∏Å‡πâ‡∏≠‡∏ô": "unknown",
    "‡∏ñ‡πâ‡∏ß‡∏¢": "unknown",
    "‡∏ä‡πâ‡∏≠‡∏ô‡πÇ‡∏ï‡πä‡∏∞": (15, "ml"),
    "‡∏ä‡∏¥‡πâ‡∏ô.*": "unknown",
    "‡∏Å‡∏£‡πâ‡∏°": "g",
    "ml": "ml",
    "kg": (1000, "g"),
    "‡∏Å‡∏Å": (1000, "g"),
    "‡∏Ç‡∏µ‡∏î": (100, "g"),
    "‡∏Å‡∏¥‡πÇ‡∏•‡∏Å‡∏£‡∏±‡∏°": (1000, "g"),
    "‡∏Å‡∏£‡∏°": "g",
    "‡∏ó‡πà‡∏≠‡∏ô": "unknown",
    "‡πÇ‡∏Ñ‡∏£‡∏á": "unknown",
    "‡πÉ‡∏ö": "unit",
    "‡πÅ‡∏û‡πá‡∏Ñ": "unknown",
    "mg": (0.001, "g"),
    "‡∏Å$": "g",
    "‡πÄ‡∏™‡πâ‡∏ô": "unknown",
    "‡∏ô‡πà‡∏≠‡∏á": "unknown",
    "‡∏õ‡∏µ‡∏Å": "unknown",
    "‡∏Ç‡∏≤": "unknown",
    "‡∏ß‡∏á": "unknown",
    "‡∏ï‡∏±‡∏ß": "unknown",
    "‡∏Ç‡∏∂‡∏î": "unknown",
    "‡∏ã‡∏≠‡∏á": "unknown",
    "‡∏≠‡∏Å": "unknown",
    "‡∏à‡∏≤‡∏ô": "unknown",
    "‡πÇ‡∏•": (1000, "g"),
    "‡πÅ‡∏ó‡πà‡∏á": "unknown",
    "‡∏ö‡∏≤‡∏ó": "baht",
    "‡πÑ‡∏°‡πâ": "unknown",
    "‡∏î‡∏∏‡πâ‡∏ô‡∏™‡∏±‡πâ‡∏ô": "unknown",
    "‡∏ï‡∏µ‡∏ô": "unknown",
    "Kg": (1000, "g"),
    "‡∏ó‡∏±‡∏û‡∏û‡∏µ": "unknown",
    "‡∏ü‡∏≠‡∏ß": "unit",
    "‡∏ñ‡∏≤‡∏î": "unknown",
    "‡∏ä‡πâ‡∏¥‡∏ô‡πÇ‡∏ï‡πä‡∏∞": (15, "g"),
    "‡∏´‡πà‡∏≠": "unknown",
    "‡∏•‡∏π‡∏Å": "unit",
    "KG": (1000, "g"),
    "‡πÅ‡∏ú‡πà‡∏ô.*": "unknown",
    "G": "g",
    "‡∏ï‡∏≤‡∏°‡∏™‡∏∞‡∏î‡∏ß‡∏Å": "unknown",
    "‡∏ä‡∏ï": (15, "g"),
    "pack": "unknown",
    "‡∏û‡∏ß‡∏á": "unknown",
    "‡∏ñ‡∏∏‡∏á": "unknown",
    "pcs": "unit",
    "‡∏™‡πÑ‡∏•‡∏î‡πå": "unknown",
    "‡πÇ‡∏Ñ‡∏£": "unknown",
    "‡∏ü$": "unit",
    "‡∏ä‡∏≤‡∏°": "unknown",
    "‡πÅ‡∏û‡∏Ñ": "unknown",
    "‡∏Ç‡∏µ‡πÄ": (100, "g"),
    "‡∏ä‡πâ‡∏≠‡∏ô$": "unknown",
    "‡∏≠‡∏±‡∏ô": "unknown",
    "‡πÅ‡∏û‡πä‡∏Ñ": "unknown",
    "‡∏Ñ‡∏£‡∏∂‡πà‡∏á‡πÇ‡∏•": (500, "g"),
    "‡∏´‡∏°‡πâ‡∏≠": "unknown",
    "‡∏Å‡∏•‡πà‡∏≠‡∏á": "unknown",
    "‡∏Å‡∏£‡∏µ‡∏°": "g",
    "‡∏Å‡∏ô‡∏±‡∏°": "g",
    "‡∏™‡∏∞‡πÇ‡∏û‡∏Å": "unknown",
    "‡∏ã‡∏µ‡∏Å": "unknown",
    "‡∏Ç‡∏£‡∏î": "unknown",
    "‡∏ä‡∏µ‡∏î": "unknown",
    "‡∏´‡∏¢‡∏¥‡∏ö‡∏°‡∏∑‡∏≠": "unknown",
    "‡πÇ‡∏Ñ‡∏•‡∏á": "unknown",
    "‡∏°‡∏∑‡∏≠": "unknown",
    "‡∏ñ‡∏ï": (140, "g"),
    "‡∏Å‡∏£‡∏±‡∏ö": "g",
    "‡πÅ$": "g",
    "‡∏Å‡∏°": "g",
    "‡∏ä‡∏∏‡∏î": "unknown",
    "‡∏ä‡πâ‡∏≠‡∏ô‡πÇ‡∏ï‡∏ì‡∏∞": (15, "g"),
    "‡∏´‡∏≠‡∏á": "unit",
    "‡∏´‡∏≤‡∏á": "unknown",
    "‡∏ä‡πâ‡∏≠‡∏ô‡πÇ‡∏ï‡πç‡∏∞": "unknown",
    "‡∏ó‡∏µ‡πà": "unknown",
}


# Number(-range)+unit templates; `{}` receives one key of `units` (itself a regex
# fragment). Raw strings are used because `\d` and `\ ` are not valid escapes in a
# normal string literal. `\t` is now resolved by the regex engine rather than by the
# string literal - it matches a TAB either way.
patterns = [
    r"\d+ - \d+\ *{}",   # "1 - 2" followed by optional spaces and the unit
    r"\d+-\d+\ *{}",     # "1-2" followed by optional spaces and the unit
    r"\d+\ *{}",         # "12" followed by optional spaces and the unit
    r"\d+\t*{}",         # "12" followed by optional tabs and the unit
]

# Every unit crossed with every template, unit-major (same order as the nested loops).
regex_patterns = [p.format(u) for u in units for p in patterns]

# Single alternation that recognises "<number> <known unit>" anywhere in a string.
regex_pattern = re.compile("|".join(regex_patterns))

In [575]:
ig

'‡∏Ñ‡∏£‡∏∂‡πà1/2'

In [577]:
# Ingredient names of interest; searched against the first space-separated token of
# each ingredient key. The last alternative was previously written with a leading `$`,
# which can never match (no text can follow end-of-string). It is anchored with `^`
# like the neighbouring alternatives so that it can actually match.
includes = re.compile("(‡∏´‡∏°‡∏π|‡πÑ‡∏Å‡πà|‡∏ß‡∏±‡∏ß|^‡πÑ‡∏Ç‡πà$|^‡πÑ‡∏Ç‡πà‡πÑ‡∏Å‡πà|^‡∏ô‡∏°)")
# Ingredient names to drop even when `includes` matched. Unlike `includes`, the scan
# loop below searches this against the full ingredient key, not only its first token.
# NOTE(review): several alternatives are substrings of other alternatives in the same
# pattern, so the longer spellings are redundant.
excludes = re.compile("‡∏ô‡∏°‡∏Ç‡πâ‡∏ô‡∏´‡∏ß‡∏≤‡∏ô|‡∏ú‡∏á|‡∏Ñ‡∏ô‡∏≠‡∏£‡πå|‡∏°‡∏≤‡∏°‡πà‡∏≤|‡∏Ñ‡∏≤‡∏£‡πå‡πÄ‡∏ô‡∏ä‡∏±‡πà‡∏ô|‡∏´‡∏°‡∏π‡∏Å‡∏£‡∏≠‡∏ö|‡∏Å‡∏£‡∏∞‡∏î‡∏π‡∏Å|‡∏õ‡∏π|^‡∏ã‡∏≠‡∏™|^‡πÄ‡∏Ñ‡∏£‡∏∑‡πà‡∏≠‡∏á‡πÄ‡∏ó‡∏®|^‡πÄ‡∏ï‡πâ‡∏≤‡∏´‡∏π‡πâ|‡∏≠‡∏¥‡∏ô‡∏ó‡∏ú‡∏≤‡∏•‡∏±‡∏°|‡∏•‡∏π‡∏Å‡∏ä‡∏¥‡πâ‡∏ô|‡∏´‡∏°‡∏π‡πÅ‡∏Æ‡∏°|‡πÑ‡∏Ç‡πà‡πÅ‡∏î‡∏á‡πÄ‡∏Ñ‡πá‡∏°|‡∏ô‡πâ‡∏≥‡∏°‡∏±‡∏ô|‡∏Ñ‡∏≠‡∏£‡πå|‡πÑ‡∏™‡πâ‡∏Å‡∏£‡∏≠‡∏Å|‡∏´‡∏±‡∏ß‡πÉ‡∏à|‡∏Å‡∏£‡∏∞‡πÄ‡∏û‡∏≤‡∏∞|‡∏õ‡∏≠‡∏î|‡∏£‡∏™‡∏î‡∏µ|‡∏Ñ‡∏ô‡∏≠|‡∏ô‡πâ‡∏≥‡∏ã‡∏∏‡∏õ|‡∏´‡∏°‡∏π‡∏¢‡∏≠|‡∏ô‡πâ‡∏≥|‡∏ã‡∏µ‡∏≠‡∏¥‡πä‡∏ß|‡∏ä‡∏∏‡∏î|^‡∏û‡∏£‡∏¥‡∏Å‡πÑ‡∏ó‡∏¢|‡∏ã‡∏µ‡∏≠‡∏¥‡πâ‡∏ß|‡∏™‡∏ï‡πä‡∏≠‡∏Å|‡πÅ‡∏õ‡πâ‡∏á|^‡∏û‡∏£‡∏¥‡∏Å|‡πÄ‡∏•‡∏∑‡∏≠‡∏î|‡∏ä‡∏∏‡∏õ‡∏Å‡πâ‡∏≠‡∏ô")
# Quantity strings to ignore; searched against the cleaned quantity text in the loop below.
excludes_units = re.compile("‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö|‡∏Ñ‡∏£‡∏∂‡πà1/2")
# Raw strings: `\d` and `\ ` are not valid escapes in a normal string literal.
# Any digit anywhere in the string.
num = re.compile(r"\d")
# Quantity with no unit at all: a bare integer, "N -D" (single digit after the dash),
# or a fraction "N/M".
no_unit = re.compile(r"^\d+$|^\d+\ *-\d\ *$|^\d+/\d+$")

# Scan the Wongnai recipes for the first quantity string that contains a digit but is
# not recognised by any of the quantity patterns, and print it for inspection.
# (A stray `units = []` used to sit here; it was never read in this cell and silently
# replaced the unit mapping built by the earlier cell, so it has been removed.)
l = False   # set once an unrecognised quantity has been found
f = 0       # number of recipes visited, including the one that produced the hit
for j, ingredients in wongnai.items():
    for k, quantity in ingredients.items():
        # `includes` only looks at the first word of the ingredient name,
        # `excludes` at the whole name.
        _k = k.split(" ")[0]
        if not includes.search(_k) or excludes.search(k):
            continue

        # Normalise: trim, drop zero-width spaces and dots.
        ig = quantity.strip().replace("\u200b", "").replace(".", "")
        unrecognised = (
            num.search(ig)
            and not regex_pattern.search(ig)
            and not no_unit.search(ig)
            and not excludes_units.search(ig)
        )
        if unrecognised:
            l = True
            print(ig)
            print(k, ig)
            break

    f += 1
    if l:
        break

print(f)

9221


In [115]:
# Concatenate every collection in `_units` into one flat list, preserving order.
units = [item for group in _units for item in group]

In [116]:
# Put the flattened `units` list into a DataFrame so its values can be read back as an array.
dfunits = pd.DataFrame(units)

In [117]:
# Collapse the frame to a 1-D array and keep the distinct values
# (pd.unique returns them in order of first appearance).
unq = pd.unique(dfunits.values.flatten())

In [118]:
unq.tolist()

['‡∏Å‡∏£‡∏∞‡∏õ‡πã‡∏≠‡∏á',
 '‡πÄ‡∏°‡πá‡∏î',
 '‡∏ü‡∏≠‡∏á',
 '‡πÄ‡∏•‡πá‡∏Å‡∏ô‡πâ‡∏≠‡∏¢',
 '‡∏û‡∏≠‡∏õ‡∏£‡∏∞‡∏°‡∏≤‡∏ì',
 '‡∏´‡πà‡∏≠',
 '‡∏Å‡∏£‡∏±‡∏°',
 '‡∏ä‡πâ‡∏≠‡∏ô‡πÇ‡∏ï‡πä‡∏∞',
 '‡∏ä‡πâ‡∏≠‡∏ô‡∏ä‡∏≤',
 '‡∏ñ‡πâ‡∏ß‡∏¢‡∏ï‡∏ß‡∏á',
 '‡∏Å‡πâ‡∏≠‡∏ô',
 '‡∏ä‡∏¥‡πâ‡∏ô‡πÉ‡∏´‡∏ç‡πà',
 '‡∏ï‡πâ‡∏ô',
 '‡∏ï‡∏≤‡∏°‡∏ä‡∏≠‡∏ö',
 '‡∏°‡∏¥‡∏•‡∏•‡∏¥‡∏•‡∏¥‡∏ï‡∏£',
 '',
 '‡πÅ‡∏û‡πá‡∏Ñ',
 '‡πÉ‡∏ö',
 '‡∏Ñ‡∏£‡∏∂‡πà‡∏á‡∏ä‡πâ‡∏≠‡∏ô‡πÇ‡∏ï‡πä‡∏∞',
 '‡∏Ñ‡∏£‡∏∂‡πà‡∏á‡∏ä‡πâ‡∏≠‡∏ô‡∏ä‡∏≤',
 '‡∏ô‡∏¥‡∏î‡∏´‡∏ô‡πà‡∏≠‡∏¢',
 '+‡∏ñ‡πâ‡∏ß‡∏¢',
 '‡∏ñ‡πâ‡∏ß‡∏¢',
 '‡∏Å‡∏¥‡πÇ‡∏•‡∏Å‡∏£‡∏±‡∏°',
 '‡∏•‡∏¥‡∏ï‡∏£',
 '‡∏ï‡∏±‡∏ß',
 '‡∏ñ‡∏∏‡∏á',
 '‡∏ä‡∏ï',
 '‡πÇ‡∏•',
 '‡∏ä‡∏ä',
 '‡∏ñ‡∏ï',
 '‡∏•‡∏π‡∏Å',
 '‡∏≠‡∏≠‡∏ô‡∏ã‡πå',
 '‡πÄ‡∏ï‡πá‡∏°‡πÅ‡∏Å‡πâ‡∏ß',
 '‡∏Å‡∏≥',
 '‡∏Å‡∏•‡∏µ‡∏ö',
 '‡∏´‡∏±‡∏ß',
 '‡∏£‡∏≤‡∏Å',
 '‡∏™‡πà‡∏ß‡∏ô',
 '‡∏ã‡∏≠‡∏á',
 '‡πÅ‡∏•‡πâ‡∏ß‡πÅ‡∏ï‡πà‡∏ä‡∏≠‡∏ö',
 '‡∏°‡∏•',
 '‡∏Ñ‡∏£‡∏∂‡πà‡∏á‡∏Å‡∏Å',
 '‡πÄ‡∏´‡∏¢‡∏≤‡∏∞',
 '‡πÄ‡∏´‡∏¢‡∏≤‡∏∞‡πÜ‡∏•‡∏á‡πÑ‡∏õ',
 '‡∏ä‡∏¥‡πâ‡∏ô',
 '‡∏Å‡∏µ‡∏ö',
 '‡∏Å‡∏Å',
 '‡∏Å‡∏≥‡∏°‡∏∑‡∏≠',
 '‡∏Ñ‡∏£‡∏∂‡πà‡∏á‡∏ñ‡πâ‡∏

# Insert Wongnai

In [70]:
# cursor = conn.cursor()
# for j in wongnai:
#     res = cursor.execute("INSERT INTO ingredients (menu_name, ingredient_json) VALUES(%s, %s)", 
#                (j, json.dumps(wongnai[j])))
# conn.commit()
# cursor.close()

In [71]:
kapook

{'‡πÑ‡∏°‡πÇ‡∏Ñ‡∏£‡πÄ‡∏ß‡∏ü‡∏û‡∏£‡πâ‡∏≠‡∏°‡∏à‡πâ‡∏≤ ! ‡∏ä‡∏ß‡∏ô‡∏ó‡∏≥‡πÑ‡∏î‡∏ü‡∏π‡∏Å‡∏∏‡∏™‡∏ï‡∏£‡∏≠‡∏ß‡πå‡πÄ‡∏ö‡∏≠‡∏£‡πå‡∏£‡∏µ ‡πÄ‡∏°‡∏ô‡∏π‡πÑ‡∏°‡πÇ‡∏Ñ‡∏£‡πÄ‡∏ß‡∏ü ‡πÅ‡∏õ‡πâ‡∏á‡∏™‡∏µ‡∏™‡∏ß‡∏¢‡πÄ‡∏´‡∏ô‡∏µ‡∏¢‡∏ß‡∏ô‡∏∏‡πà‡∏° ‡∏™‡∏≠‡∏î‡πÑ‡∏™‡πâ‡∏ñ‡∏±‡πà‡∏ß‡πÅ‡∏î‡∏á‡∏Å‡∏ß‡∏ô‡∏Å‡∏±‡∏ö‡∏ú‡∏•‡πÑ‡∏°‡πâ‡∏™‡∏î ‡∏õ‡∏±‡πâ‡∏ô‡∏•‡∏π‡∏Å‡πÇ‡∏ï‡∏ô‡πà‡∏≤‡∏´‡∏°‡πà‡∏≥ ‡∏à‡∏±‡∏ö‡πÅ‡∏ä‡πà‡πÄ‡∏¢‡πá‡∏ô‡∏¢‡∏¥‡πà‡∏á‡∏ü‡∏¥‡∏ô_https://cooking.kapook.com/view206822.html': ['‡πÅ‡∏õ‡πâ‡∏á‡∏Ç‡πâ‡∏≤‡∏ß‡πÄ‡∏´‡∏ô‡∏µ‡∏¢‡∏ß 1+1/2 ‡∏ñ‡πâ‡∏ß‡∏¢',
  '‡πÅ‡∏õ‡πâ‡∏á‡∏Ç‡πâ‡∏≤‡∏ß‡πÇ‡∏û‡∏î 1 ‡∏ä‡πâ‡∏≠‡∏ô‡πÇ‡∏ï‡πä‡∏∞',
  '‡∏ô‡πâ‡∏≥‡∏ï‡∏≤‡∏•‡∏ó‡∏£‡∏≤‡∏¢‡∏ó‡∏£‡∏≤‡∏¢‡∏õ‡πà‡∏ô‡∏•‡∏∞‡πÄ‡∏≠‡∏µ‡∏¢‡∏î 5 ‡∏ä‡πâ‡∏≠‡∏ô‡πÇ‡∏ï‡πä‡∏∞',
  '‡∏ô‡πâ‡∏≥‡∏™‡∏∞‡∏≠‡∏≤‡∏î 250 ‡∏Å‡∏£‡∏±‡∏°',
  '‡∏™‡∏µ‡∏ú‡∏™‡∏°‡∏≠‡∏≤‡∏´‡∏≤‡∏£‡∏™‡∏µ‡πÅ‡∏î‡∏á (‡πÉ‡∏™‡πà‡∏´‡∏£‡∏∑‡∏≠‡πÑ‡∏°‡πà‡πÉ‡∏™‡πà‡∏Å‡πá‡πÑ‡∏î‡πâ) \xa0',
  '‡πÅ‡∏õ‡πâ‡∏á‡∏°‡∏±‡∏ô‡∏™‡∏≥‡∏õ‡∏∞‡∏´‡∏•‡∏±‡∏á ‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö‡∏ó‡∏≥‡πÅ‡∏õ‡πâ‡∏á‡∏ô‡∏ß‡∏• 100 ‡∏Å‡∏£‡∏±‡∏° (‡∏ô‡∏≥‡πÑ‡∏õ‡∏ú‡∏±‡∏î‡∏î‡πâ‡∏ß‡∏¢‡πÑ‡∏ü‡∏Å‡∏

In [46]:
# cursor = conn.cursor()
# cursor.execute("SELECT * FROM ingredients;")
# a = cursor.fetchall() 
# # cursor.close()

In [607]:
import requests  # already imported by the setup cell; kept so this cell runs on its own

# Query the local service for one keyword. A timeout keeps the kernel from hanging
# forever if the service is down, and raise_for_status() surfaces HTTP errors here
# instead of as a confusing JSON decoding failure on the next line.
response = requests.get("http://localhost:8000?keyword=‡∏´‡∏°‡∏π‡∏™‡∏±‡∏ö", timeout=60)
response.raise_for_status()
res = response.text
jl = json.loads(res)

In [608]:
# Save the decoded response next to the notebook for inspection outside the kernel.
with open("test.json", "w+") as f:
    f.write(json.dumps(jl))

In [609]:
jl["menu_prices"]

{'1 ‡πÅ‡∏Å‡∏á‡∏à‡∏∑‡∏î‡∏ï‡∏≥‡∏•‡∏∂‡∏á‡πÄ‡∏ï‡πâ‡∏≤‡∏´‡∏π‡πâ‡∏´‡∏°‡∏π‡∏™‡∏±‡∏ö üê∑': [47.514029888841016,
  47.52803412162209,
  47.54197472310934,
  47.55585198242546,
  47.56966618737945,
  47.58341762447259,
  47.59710657890439,
  47.61073333457848,
  47.62429817410851,
  47.63780137882403,
  47.65124322877629,
  47.66462400274407,
  47.67794397823947,
  47.691203431513635,
  47.70440263756252,
  47.71754187013255,
  47.73062140172636,
  47.74364150360838,
  47.7566024458105,
  47.76950449713767,
  47.78234792517344,
  47.79513299628557,
  47.807859975631494,
  47.82052912716386,
  47.83314071363598,
  47.84569499660729,
  47.85819223644879,
  47.870632692348394,
  47.88301662231637,
  47.895344283190646,
  47.10585170816856],
 '10 ‡πÅ‡∏Å‡∏á‡∏à‡∏∑‡∏î‡∏´‡∏°‡∏π‡∏™‡∏±‡∏ö‡∏ß‡∏∏‡πâ‡∏ô‡πÄ‡∏™‡πâ‡∏ô': [57.01683586660922,
  57.03364094594651,
  57.05036966773121,
  57.06702237891055,
  57.083599424855336,
  57.100101149367106,
  57.11652789468526,
  57.13288000149417,
  57.14915780893021,
  5

In [34]:

conn.close()