# 크롬 웹드라이버 클래스 import

In [14]:
from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
import pandas as pd
import re
import traceback
import time
import os.path
import copy

In [15]:
# data 디렉토리가 없으면 생성한다.
if not os.path.exists('./data'):
    os.makedirs('./data')

# 크롬 웹드라이버 객체 생성

In [16]:
driver = Chrome()

# 웹페이지 열고 이동하기

In [None]:
food_list = []
food_file_path = './data/FoodList.csv'

if not os.path.exists(food_file_path):
    # FoodList.csv 파일이 없으면 웹페이지를 가져와서 생성한다.

    url = '여기를 URL 로 채우세요'
    driver.get(url)

    # 웹페이지에서 모든 음식 목록과 해당 링크를 가져온다.

    # tftable 클래스 하위 tbody 요소 내의 모든 tr 요소 찾기
    tr_elements = driver.find_elements(By.CSS_SELECTOR, '.tftable>tbody>tr')

    tr_idx = 0
    for tr_element in tr_elements:
        td_elements = tr_element.find_elements(By.TAG_NAME, 'td')
        
        if len(td_elements) > 0:
            tr_idx += 1

            td_element = td_elements[1]
            food_name = td_element.text
            page_link = td_element.find_element(By.TAG_NAME, 'a').get_attribute('href')
            food_list.append([tr_idx, food_name, page_link])

    # FoodList.csv 에 저장한다.

    df1 = pd.DataFrame(food_list, columns=['idx', 'name', 'link'])
    df1.head(5)
    df1.to_csv(food_file_path, encoding='utf-8', index=False)
else:
    # FoodList.csv 파일이 있으면 파일에서 가져온다.
    df1 = pd.read_csv(food_file_path)
    food_list = df1.values.tolist()

# 필요한 정보 추출 함수

In [21]:
def find_gi_from_page_type1(dict, text):
    if text.find('(GI)') != -1:
        match1 = re.findall(r'\s+to\s+\d*\.?\d+,', text)
        match2 = re.findall(r'\d*\.?\d+', match1[0]) # r'-?\d*\.?\d+'
        gi = match2[0]
        dict['GI'] = gi
        return gi
    else:
        return None

In [22]:
def find_gl_from_page_type1(dict, text):
    if text.find('(GL)') != -1:
        match3 = re.findall(r'\s+to\s+\d*\.?\d+,', text)
        match4 = re.findall(r'\d*\.?\d+', match3[0]) # r'-?\d*\.?\d+'
        gl = match4[0]
        dict['GL'] = gl
        return gl
    else:
        return None

In [23]:
def find_nutrition_from_page_type1(dict, text):
    if (text.find('(GI)') == -1) and (text.find('(GL)') == -1):
        match5 = re.findall(r'\s+\d*\.?\d+\s+kcal', text) # r'\s+-?\d*\.?\d+\s+kcal'
        match6 = re.findall(r'\d*\.?\d+', match5[0]) # r'-?\d*\.?\d+'
        calory = match6[0]

        dict['calory'] = calory
        match7 = re.findall(r'\s+\d*\.?\d+\sgrams\s+of\s+\w+', text)
        for snippet in match7:
            words = snippet.split('grams of')
            grams = words[0].strip()
            nutrition = words[1].strip()
            dict[nutrition] = grams
        
        return dict
    else:
        return None

# 각 푸드 웹페이지 visit

In [30]:
def get_information1(idx, name, link):
    dict = {}
    dict['idx'] = idx
    dict['name'] = name
    dict['link'] = link

    page_type_2_xpath = f"//h2[text()='Overview']/following-sibling::table[1]//tr"

    try:
        tr_elements_by_overview = driver.find_elements(By.XPATH, page_type_2_xpath)
        if len(tr_elements_by_overview) > 0:
            print(f'Type 2 : {idx} {name}')
            for tr_element in tr_elements_by_overview:
                td_elements = tr_element.find_elements(By.TAG_NAME, 'td')
                for td_element in td_elements:
                    if td_element.text == 'Glycemic index':
                        gi = td_element.find_element(By.XPATH, "./following-sibling::td[1]").text
                        dict['GI'] = gi
                    elif td_element.text == 'Glycemic load':
                        gl = td_element.find_element(By.XPATH, "./following-sibling::td[1]").text
                        dict['GL'] = gl

            xpath = f"//h2[contains(text(), 'Nutrition Facts')]/following-sibling::table[1]//tr"
            tr_elements = driver.find_elements(By.XPATH, xpath)
            for tr_element in tr_elements:
                td_elements = tr_element.find_elements(By.TAG_NAME, 'td')
                if len(td_elements) > 1:
                    # 마지막 괄호와 괄호 안의 내용 제거
                    nutrition = re.sub(r"\s*\([^)]*\)$", '', td_elements[0].text.strip())
                    nutrition = nutrition.lower().strip()
                    if nutrition == 'calories':
                        nutrition = 'calory'
                    
                    value = td_elements[1].text.strip()
                    dict[nutrition] = value
        else:
            h2_elements_type1 = driver.find_elements(By.CSS_SELECTOR, 'h2.wp-block-heading')
            for idx, h2_element in enumerate(h2_elements_type1):
                paragraph = h2_element.find_element(By.XPATH, "./following-sibling::p[1]")

                if idx == 0: # GI
                    find_gi_from_page_type1(dict, paragraph.text)
                elif idx == 1: # GL
                    find_gl_from_page_type1(dict, paragraph.text)
                elif idx == 2:
                    find_nutrition_from_page_type1(dict, paragraph.text)
    except Exception as e:
        print(e)
        traceback.print_exc() # 전체 오류 내역 출력하기

    return dict

In [29]:
result_information = []

for food in food_list:
    food_idx = food[0]
    food_name = food[1]
    food_link = food[2]
            
    # visit
    driver.get(food_link)

    # visit
    dict = get_information1(food_idx, food_name, food_link)
    if (len(dict) > 3):
        result_information.append(dict)
    
    time.sleep(2) # 페이지 쓰로틀링

Type 2 : 21 Banana
Type 2 : 27 Basmati Rice
Type 2 : 49 Brown rice
Type 2 : 139 Egg
Type 2 : 175 Green apple


list index out of range


Traceback (most recent call last):
  File "C:\Users\Kim\AppData\Local\Temp\ipykernel_22816\2710307411.py", line 46, in get_information1
    find_nutrition_from_page_type1(dict, paragraph.text)
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Kim\AppData\Local\Temp\ipykernel_22816\2603213502.py", line 4, in find_nutrition_from_page_type1
    match6 = re.findall(r'\d*\.?\d+', match5[0]) # r'-?\d*\.?\d+'
                                      ~~~~~~^^^
IndexError: list index out of range




list index out of range


Traceback (most recent call last):
  File "C:\Users\Kim\AppData\Local\Temp\ipykernel_22816\2710307411.py", line 46, in get_information1
    find_nutrition_from_page_type1(dict, paragraph.text)
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Kim\AppData\Local\Temp\ipykernel_22816\2603213502.py", line 4, in find_nutrition_from_page_type1
    match6 = re.findall(r'\d*\.?\d+', match5[0]) # r'-?\d*\.?\d+'
                                      ~~~~~~^^^
IndexError: list index out of range


Type 2 : 468 Pineapple


In [31]:
# 589개 페이지 중에서 483개의 페이지에서 정보를 가져왔고 106개는 파싱 실패
print(len(food_list))
print(len(result_information))
print(len(food_list) - len(result_information))

589
483
106


In [None]:
# 파싱 실패한 106개의 페이지를 출력
for food in food_list:
    idx = food[0]
    try:
        # 리스트 컴프리헨션으로 검색
        result = [dict for dict in result_information if dict['idx'] == idx][0]
    except Exception as e:
        print(food)

In [33]:
# 106개의 파싱 실패 페이지를 위한 파서 함수
def get_information2(idx, name, link):
    dict = {}
    dict['idx'] = idx
    dict['name'] = name
    dict['link'] = link

    page_type_3_xpath = "//hr[contains(@class, 'wp-block-separator') and contains(@class, 'has-alpha-channel-opacity')]/preceding-sibling::p[1]"

    try:
        paragraph = driver.find_element(By.XPATH, page_type_3_xpath)
        sentences = paragraph.text.split('. ')
        for text in sentences:
            if text.find('(GI)') > -1:
                find_gi_from_page_type1(dict, text)
            elif text.find('(GL)') > -1:
                find_gl_from_page_type1(dict, text)
        
        tr_elements = paragraph.find_elements(By.XPATH, "//hr[contains(@class, 'wp-block-separator') and contains(@class, 'has-alpha-channel-opacity')]/following-sibling::table[1]//tr")
        if tr_elements[0].text.find('Nutrition Facts') > -1:
            for tr_element in tr_elements:
                td_elements = tr_element.find_elements(By.TAG_NAME, 'td')
                if len(td_elements) > 1:
                    # 마지막 괄호와 괄호 안의 내용 제거
                    nutrition = re.sub(r"\s*\([^)]*\)$", '', td_elements[0].text.split(' ')[0].strip())
                    nutrition = nutrition.lower().strip()
                    if nutrition == 'calories':
                        nutrition = 'calory'
                    
                    value = td_elements[1].text.strip()
                    dict[nutrition] = value
    except Exception as e:
        print(e)
        traceback.print_exc() # 전체 오류 내역 출력하기

    return dict

In [34]:
failed_list = []

for food in food_list:
    food_idx = food[0]

    try:
        # 리스트 컴프리헨션으로 검색
        result = [dict for dict in result_information if dict['idx'] == food_idx][0]
    except Exception as e:
        food_name = food[1]
        food_link = food[2]

        driver.get(food_link)
        
        # visit
        dict = get_information2(food_idx, food_name, food_link)
        failed_list.append(dict)

        time.sleep(2) # 페이지 쓰로틀링

In [38]:
print(len(food_list))
print(len(result_information))
print(len(failed_list))

589
483
106


In [41]:
# result_information의 복사본을 만들고 여기에 failed_list를 추가
final_result = copy.deepcopy(result_information)
final_result.extend(failed_list)

print(len(food_list))
print(len(result_information))
print(len(failed_list))
print(len(final_result))

589
483
106
589


In [42]:
# final_result 데이터를 idx로 정렬
sorted_list = sorted(final_result, key=lambda x: x['idx'])

In [None]:
df2 = pd.DataFrame(sorted_list)
df2.to_csv('data/Nutrition_beta2.csv', encoding='utf-8', index=False)
df2.to_excel('data/Nutrition_beta2.xlsx', index=False)

In [44]:
driver.quit()

# 부분적으로 에러난 항목들

In [None]:
# data 디렉토리 밑에 만들어진 csv 파일을 열어보면
# idx 가 534, 545, 575 인 row 는 GI 와 GL 값을 제대로 파싱을 하지 못했다는 것을 알 수 있다.
# 해당하는 데이터의 각 페이지를 직접 수동으로 들어가서 누락된 값을 csv 와 excel 파일에 채운다.

df2[df2.idx.isin([534, 545, 575])]

In [None]:
# idx 가 587 인 row 는 name 에 특수문자(“baguette”) 가 있어서
# 엑셀로 열었을 때 밀려보이므로 “ 와 ” 를 csv 와 excel 파일에서 지운다.

df2[df2.idx.isin([587])]