In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException
import time, os, re
from azure_helper import upload_img_by_url
from dotenv import load_dotenv
import pandas as pd 

In [2]:
# Read the local .env file first so the settings below come from it.
load_dotenv()
azure_url = os.getenv('AZURE_STORAGE_URL')

# One headless Chrome session, shared by every scraping helper in this notebook.
chrome_options = Options()
for chrome_flag in ("--headless", "--disable-gpu"):
    chrome_options.add_argument(chrome_flag)
driver = webdriver.Chrome(options=chrome_options)
# Implicit wait of 10 (seconds, per Selenium's API) applied to element lookups.
driver.implicitly_wait(10)

The chromedriver version (133.0.6943.141) detected in PATH at C:\WebDriver\chromedriver.exe might not be compatible with the detected chrome version (134.0.6998.178); currently, chromedriver 134.0.6998.165 is recommended for chrome 134.*, so it is advised to delete the driver in PATH and retry


In [3]:
def crawl_img_url(item):
    """Search Google Images for ``item`` and return the URL of one result image.

    Uses the module-level ``driver``. The search is submitted through the
    ``q`` input on images.google.com, then each thumbnail inside ``#rcnt`` is
    clicked in turn until the preview image exposes a non-empty ``src``.

    Args:
        item: Search text (here: a food name).

    Returns:
        The ``src`` of the first preview image that could be read, or ``None``
        when no thumbnail yielded one.
    """
    print(f"Searching image for: {item}")
    driver.get("https://images.google.com/")
    search_box = driver.find_element(By.NAME, "q")
    search_box.clear()
    search_box.send_keys(item)
    search_box.send_keys(Keys.RETURN)
    time.sleep(2)  # give the result page time to render

    main_images_div = driver.find_element(By.ID, "rcnt")
    images_holders = main_images_div.find_elements(By.CSS_SELECTOR, "img.YQ4gaf")

    for holder in images_holders:
        # A single thumbnail can fail (the original author observed SSL
        # errors); report the failure and move on to the next candidate
        # instead of aborting the whole search.
        try:
            holder.click()
            time.sleep(5)  # wait for the preview panel to load the image
            url = driver.find_element(
                By.CSS_SELECTOR, "img.sFlh5c.FyHeAf.iPVvYb, img.sFlh5c.FyHeAf"
            ).get_attribute("src")
        except Exception as exc:
            print(f"Skipping a result for {item}: {type(exc).__name__}")
            continue

        # An empty src is not a usable result; keep looking.
        if url:
            return url

    return None

In [4]:
def scrape_food_info():
    """Scrape the nutrition table at the URL given by FOOD_NUTRITION_WEBSITE.

    Opens the page with the module-level ``driver`` and reads every element
    with class ``row-render``. The first seven ``td`` cells of a row become
    one record; rows with fewer cells are reported and skipped.

    Returns:
        list[dict]: One dict per usable row with the keys ``food_name``,
        ``serving_size``, ``calories``, ``fat``, ``sugar``, ``protein`` and
        ``fiber``. Every value is the stripped cell text.

    Raises:
        ValueError: If FOOD_NUTRITION_WEBSITE is unset or empty.
    """
    url = os.getenv('FOOD_NUTRITION_WEBSITE')
    if not url:
        # Fail with a clear message before handing None to the browser.
        raise ValueError("Environment variable FOOD_NUTRITION_WEBSITE is not set")
    driver.get(url)

    # Column order matches the td order the rows are read in.
    columns = (
        'food_name',
        'serving_size',
        'calories',
        'fat',
        'sugar',
        'protein',
        'fiber',
    )

    food_list = []
    for row in driver.find_elements(By.CLASS_NAME, 'row-render'):
        properties = row.find_elements(By.TAG_NAME, 'td')
        if len(properties) < len(columns):
            # A short row cannot fill every column; skip it rather than
            # losing the whole scrape to an IndexError.
            print(f"Skipping row with {len(properties)} cells (expected {len(columns)})")
            continue
        food_list.append(
            {column: cell.text.strip() for column, cell in zip(columns, properties)}
        )

    return food_list

In [5]:
def save_as_csv(list_of_item, path='food_data.csv'):
    """Write a list of records to a CSV file.

    Args:
        list_of_item: Records accepted by ``pd.DataFrame`` (here: a list of
            dicts, one per food).
        path: Destination file. Defaults to ``food_data.csv`` in the current
            working directory.
    """
    df = pd.DataFrame(list_of_item)
    # index=False keeps the row index out of the file.
    df.to_csv(path, index=False)

In [6]:
def clean_name(name):
    """Turn a display name into an identifier-like string.

    Spaces become underscores first; after that every run of characters that
    ``\\W`` matches (anything that is not a word character) is removed.
    """
    return re.sub(r'\W+', '', name.replace(' ', '_'))

In [7]:
# Scrape the nutrition table, then attach an image URL to every record.
food_list = scrape_food_info()
print('Done scrape food info')
for food in food_list:
    external_src = None
    # Up to three searches per food; stop as soon as one returns a value.
    for _ in range(3):
        external_src = crawl_img_url(food['food_name'])
        if external_src is not None:
            break

    # Stays None when every attempt came back empty.
    food['img_src'] = external_src

Done scrape food info
Searching image for: Gỏi cuốn tôm thịt
Searching image for: Bánh mì thịt nướng
Searching image for: Bún đậu mắm tôm
Searching image for: Bún chả Hà Nội
Searching image for: Mì xào Bò
Searching image for: Sữa tươi trân châu đường đen
Searching image for: Trà Sữa
Searching image for: KimBap
Searching image for: Trứng vịt lộn
Searching image for: Củ sắn
Searching image for: Mận đỏ
Searching image for: Thanh long
Searching image for: Ổi
Searching image for: Vú sữa
Searching image for: Mít sấy
Searching image for: Chuối sấy
Searching image for: Cam
Searching image for: Đậu phộng chiên muối
Searching image for: Đậu phộng rang
Searching image for: Sơ ri
Searching image for: Nho ta (tím)
Searching image for: Hồng đỏ
Searching image for: Đu đủ
Searching image for: Đậu phộng da cá
Searching image for: Đậu phộng nấu
Searching image for: Khoai lang chiên
Searching image for: Khoai tây
Searching image for: Khế
Searching image for: Mãng cầu xiêm
Searching image for: Bắp nướng
S

In [9]:
# Collect the scraped records (each now carrying img_src) into a DataFrame
# and preview the first rows.
food_df = pd.DataFrame(food_list)
food_df.head()

Unnamed: 0,food_name,serving_size,calories,fat,sugar,protein,fiber,img_src
0,Gỏi cuốn tôm thịt,1 Cái,160,2,15,8,1,https://cdn.tgdd.vn/2021/08/CookRecipe/Avatar/...
1,Bánh mì thịt nướng,1 ổ vừa,460,12,50,18,3,https://www.bigc.vn/files/cam-nang-mua-sam-27-...
2,Bún đậu mắm tôm,1 phần,700,22,75,20,5,https://encrypted-tbn0.gstatic.com/images?q=tb...
3,Bún chả Hà Nội,1 phần,650,20,60,25,4,https://vcdn1-giadinh.vnecdn.net/2021/01/08/An...
4,Mì xào Bò,1 đĩa,650,18,65,22,4,https://cdn.tgdd.vn/2021/03/CookProduct/miaobo...


In [10]:
# Count missing values per column, e.g. foods whose image search returned None.
food_df.isna().sum()

food_name       0
serving_size    0
calories        0
fat             0
sugar           0
protein         0
fiber           0
img_src         0
dtype: int64

In [13]:
# Persist the table to food.csv, leaving the row index out of the file.
food_df.to_csv('food.csv', index=False)

In [None]:
def upload_to_azure(row, retries=3, delay=2):
    """Upload the image referenced by one row, retrying on failure.

    The blob name is ``clean_name(row['food_name'])`` plus ``.jpg``; the
    image itself is fetched from ``row['img_src']`` by ``upload_img_by_url``.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with ``food_name``
            and ``img_src`` entries.
        retries: Maximum number of upload attempts.
        delay: Seconds to wait between two attempts.

    Returns:
        Whatever ``upload_img_by_url`` returns on the first successful
        attempt (presumably the stored image's URL -- confirm in
        ``azure_helper``), or ``None`` when every attempt failed.
    """
    cleaned_name = clean_name(row['food_name'])
    image_name = f'{cleaned_name}.jpg'
    for attempt in range(1, retries + 1):
        try:
            return upload_img_by_url(image_name, row['img_src'])
        except Exception as e:
            if attempt < retries:
                print(f"Retrying... {e}")
                # Only pause when another attempt will follow.
                time.sleep(delay)
            else:
                print(f"Giving up on {image_name} after {retries} attempts: {e}")
    # Every attempt failed: return an explicit None rather than hitting an
    # unbound local, so one bad row cannot abort a DataFrame.apply run.
    return None

In [17]:
# Run upload_to_azure on every row and keep its return value in a new column.
food_df['azure_img_src'] = food_df.apply(upload_to_azure, axis=1)

Upload Gỏi_cuốn_tôm_thịt.jpg
Upload Bánh_mì_thịt_nướng.jpg
Upload Bún_đậu_mắm_tôm.jpg
Upload Bún_chả_Hà_Nội.jpg
Upload Mì_xào_Bò.jpg
Upload Sữa_tươi_trân_châu_đường_đen.jpg
Upload Trà_Sữa.jpg
Upload KimBap.jpg
Upload Trứng_vịt_lộn.jpg
Upload Củ_sắn.jpg
Upload Mận_đỏ.jpg
Upload Thanh_long.jpg
Upload Ổi.jpg
Upload Vú_sữa.jpg
Upload Mít_sấy.jpg
Upload Chuối_sấy.jpg
Upload Cam.jpg
Upload Đậu_phộng_chiên_muối.jpg
Upload Đậu_phộng_rang.jpg
Upload Sơ_ri.jpg
Upload Nho_ta_tím.jpg
Upload Hồng_đỏ.jpg
Upload Đu_đủ.jpg
Upload Đậu_phộng_da_cá.jpg
Upload Đậu_phộng_nấu.jpg
Upload Khoai_lang_chiên.jpg
Upload Khoai_tây.jpg
Upload Khế.jpg
Upload Mãng_cầu_xiêm.jpg
Upload Bắp_nướng.jpg
Upload Bắp_luộc.jpg
Upload Táo_tây.jpg
Upload Khoai_mì.jpg
Upload Khoai_từ.jpg
Upload Lê.jpg
Upload Bắp_xào.jpg
Upload Khoai_lang.jpg
Upload Bưởi.jpg
Upload Mãng_cầu_ta.jpg
Upload Dưa_hấu.jpg
Upload Chuối_già.jpg
Upload Khoai_môn.jpg
Upload Nho_Mỹ_đỏxanh.jpg
Upload Cóc.jpg
Upload Nho_khô.jpg
Upload Quýt.jpg
Upload Hạt_điều.

NameError: name 'sleep' is not defined