In [5]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import time
from tqdm.notebook import tqdm
import os
import threading
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from glob import glob
import shutil
# Worker-thread function (plus its download-wait helper)
def _wait_for_download(directory, known_files, timeout=60.0, poll_interval=0.1):
    """Wait for a new, completely written "지역_*" file to appear in ``directory``.

    Args:
        directory: Folder Chrome is downloading into.
        known_files: Paths matching "지역_*" that already existed before the
            download was triggered; they are never returned.
        timeout: Maximum number of seconds to wait.
        poll_interval: Seconds to sleep between directory scans.

    Returns:
        Path of the most recently created new file.

    Raises:
        TimeoutError: If no finished download shows up within ``timeout``.
    """
    deadline = time.monotonic() + timeout
    while True:
        # Chrome writes into a "*.crdownload" file and renames it when the
        # transfer is done, so any such file means a download is still running.
        in_progress = glob(os.path.join(directory, "*.crdownload"))
        finished = [
            candidate for candidate in glob(f"{directory}/지역_*")
            if candidate not in known_files and not candidate.endswith(".crdownload")
        ]
        if finished and not in_progress:
            return max(finished, key=os.path.getctime)
        if time.monotonic() >= deadline:
            raise TimeoutError(
                f"No completed download appeared in {directory} within {timeout} seconds")
        time.sleep(poll_interval)


def scrape_data(thread_id, sigungu_list_chunk):
    """Download one spreadsheet per district with a dedicated Chrome instance.

    Each worker downloads into its own ``data{thread_id}`` folder so that
    concurrent workers cannot pick up each other's files. Every download is
    renamed to ``<district>.xls`` and finally moved into the shared
    ``/home/kms/dev_ws/eda/data`` folder, which is where the later notebook
    cells read the files from.

    Args:
        thread_id: Number used for the per-worker download folder and the
            progress-bar label.
        sigungu_list_chunk: District option values this worker should process.
            An empty chunk returns immediately without starting a browser.

    Raises:
        TimeoutError: If a download does not finish in time.
        Selenium exceptions propagate unchanged; the browser is always closed.
    """
    if not sigungu_list_chunk:
        return

    base_dir = '/home/kms/dev_ws/eda'
    save_dir = f'{base_dir}/data{thread_id}'
    # Results are collected in ".../eda/data" (the folder the loading cells
    # glob), not in ".../eda" itself, which is what dirname(save_dir) yields.
    final_dir = f'{base_dir}/data'
    os.makedirs(save_dir, exist_ok=True)
    os.makedirs(final_dir, exist_ok=True)

    options = webdriver.ChromeOptions()
    prefs = {'download.default_directory': save_dir,
             'download.prompt_for_download': False,
             'download.directory_upgrade': True,
             'safebrowsing.enabled': False}
    options.add_experimental_option('prefs', prefs)
    url = "https://www.opinet.co.kr/searRgSelect.do"

    driver = webdriver.Chrome(options=options)
    try:
        for _ in range(2):  # load the page twice, as a retry
            driver.get(url)
            time.sleep(1)

        # Select the first non-empty province option.
        # NOTE(review): presumably this is Seoul - confirm against the page.
        sido_btn = driver.find_element(By.ID, "SIDO_NM0")
        sido_option = sido_btn.find_elements(By.TAG_NAME, "option")
        sido_list = [value for option in sido_option
                     if (value := option.get_attribute("value"))]
        sido_btn.send_keys(sido_list[0])
        time.sleep(1)

        # Process only the districts assigned to this worker.
        for option in tqdm(sigungu_list_chunk, desc=f"Thread {thread_id}"):
            # Explicit wait instead of implicitly_wait(): the latter only sets
            # a lookup timeout (it never pauses) and should not be mixed with
            # WebDriverWait.
            sigungu_btn = WebDriverWait(driver, timeout=40).until(
                EC.presence_of_element_located((By.ID, "SIGUNGU_NM0")))
            sigungu_btn.send_keys(option)
            driver.refresh()

            # Snapshot BEFORE clicking so a fast download cannot slip past.
            known_files = set(glob(f"{save_dir}/지역_*"))
            save = WebDriverWait(driver, timeout=40).until(
                EC.element_to_be_clickable((By.CLASS_NAME, "btn_type6_ex_save")))
            save.click()

            latest_file = _wait_for_download(save_dir, known_files)
            # os.replace overwrites an existing target on every platform.
            os.replace(latest_file, f"{save_dir}/{option}.xls")
    finally:
        driver.quit()

    # Move the renamed files into the shared folder, replacing older copies.
    for file in os.listdir(save_dir):
        src_path = os.path.join(save_dir, file)
        dst_path = os.path.join(final_dir, file)
        if os.path.isfile(dst_path):
            os.remove(dst_path)
        shutil.move(src_path, dst_path)
    # Remove the now-empty per-worker folder.
    os.rmdir(save_dir)
# Main code
def main():
    """Read the district list from the site and scrape it with worker threads.

    A short-lived Chrome session collects the non-empty option values of the
    "SIGUNGU_NM0" select. The list is cut into at most five contiguous chunks
    and each chunk is handed to ``scrape_data`` on its own thread; the
    function returns once every thread has finished.

    Raises:
        RuntimeError: If the page exposes no district options.
    """
    options = webdriver.ChromeOptions()
    driver = webdriver.Chrome(service=Service("/home/kms/dev_ws/eda/driver/chromedriver"),
                              options=options)
    try:
        url = "https://www.opinet.co.kr/searRgSelect.do"
        for _ in range(2):  # load the page twice, as a retry
            driver.get(url)
            time.sleep(1)
        # NOTE(review): no province is selected here, so this relies on the
        # page's default province being the wanted one - confirm.
        sigungu_btn = driver.find_element(By.ID, "SIGUNGU_NM0")
        sigungu_option = sigungu_btn.find_elements(By.TAG_NAME, "option")
        sigungu_list = [value for option in sigungu_option
                        if (value := option.get_attribute("value"))]
    finally:
        # Close the browser even when the lookup above fails.
        driver.quit()

    if not sigungu_list:
        # Previously this case crashed later with "range() arg 3 must not be zero".
        raise RuntimeError("No district options found in SIGUNGU_NM0; nothing to scrape")

    # Split the district list into at most num_threads contiguous chunks.
    num_threads = 5
    chunk_size = -(-len(sigungu_list) // num_threads)  # ceiling division, always >= 1
    sigungu_chunks = [sigungu_list[i:i+chunk_size]
                      for i in range(0, len(sigungu_list), chunk_size)]

    # Start one worker per non-empty chunk; worker ids start at 1.
    threads = []
    for i, chunk in enumerate(sigungu_chunks):
        thread = threading.Thread(target=scrape_data, args=(i+1, chunk))
        threads.append(thread)
        thread.start()
    # Wait until every worker has finished.
    for thread in threads:
        thread.join()
# Run the main function when this code is executed as the top-level module.
if __name__ == "__main__":
    main()

Thread 1:   0%|          | 0/5 [00:00<?, ?it/s]

Thread 2:   0%|          | 0/5 [00:00<?, ?it/s]

Thread 4:   0%|          | 0/5 [00:00<?, ?it/s]

Thread 5:   0%|          | 0/5 [00:00<?, ?it/s]

Thread 3:   0%|          | 0/5 [00:00<?, ?it/s]

In [6]:
# Count the .xls files in the data folder whose names end in "구".
len(glob("/home/kms/dev_ws/eda/data/*구.xls"))

25

In [7]:
# Collect the paths of the "*구.xls" files; glob returns them in no particular order.
file_list = glob("/home/kms/dev_ws/eda/data/*구.xls")
print(len(file_list))  # number of files found
file_list

25


['/home/kms/dev_ws/eda/data/강동구.xls',
 '/home/kms/dev_ws/eda/data/관악구.xls',
 '/home/kms/dev_ws/eda/data/은평구.xls',
 '/home/kms/dev_ws/eda/data/강북구.xls',
 '/home/kms/dev_ws/eda/data/광진구.xls',
 '/home/kms/dev_ws/eda/data/양천구.xls',
 '/home/kms/dev_ws/eda/data/용산구.xls',
 '/home/kms/dev_ws/eda/data/성동구.xls',
 '/home/kms/dev_ws/eda/data/중랑구.xls',
 '/home/kms/dev_ws/eda/data/노원구.xls',
 '/home/kms/dev_ws/eda/data/도봉구.xls',
 '/home/kms/dev_ws/eda/data/동대문구.xls',
 '/home/kms/dev_ws/eda/data/성북구.xls',
 '/home/kms/dev_ws/eda/data/강서구.xls',
 '/home/kms/dev_ws/eda/data/종로구.xls',
 '/home/kms/dev_ws/eda/data/금천구.xls',
 '/home/kms/dev_ws/eda/data/서대문구.xls',
 '/home/kms/dev_ws/eda/data/동작구.xls',
 '/home/kms/dev_ws/eda/data/중구.xls',
 '/home/kms/dev_ws/eda/data/서초구.xls',
 '/home/kms/dev_ws/eda/data/강남구.xls',
 '/home/kms/dev_ws/eda/data/마포구.xls',
 '/home/kms/dev_ws/eda/data/영등포구.xls',
 '/home/kms/dev_ws/eda/data/송파구.xls',
 '/home/kms/dev_ws/eda/data/구로구.xls']

In [8]:
import pandas as pd

# Preview the first workbook. header=2 uses the sheet's third row as the
# column labels, so the rows above it are not read as data.
data = pd.read_excel(file_list[0], header=2)
data.tail(2)

Unnamed: 0,지역,상호,주소,상표,전화번호,셀프여부,고급휘발유,휘발유,경유,실내등유
11,서울특별시,천호현대주유소,서울 강동구 천중로 67 (천호동),HD현대오일뱅크,02-484-9323,N,-,1938,1793,-
12,서울특별시,광성주유소,서울 강동구 올림픽로 673 (천호동),S-OIL,02-470-5133,N,-,1968,1858,-


In [9]:
# Read every workbook in file_list (same header row as the preview above)
# and stack them into one table.
data_list = []

for file in file_list:
    data = pd.read_excel(file, header=2)
    data_list.append(data)

# ignore_index=True gives the combined frame one continuous 0..N-1 index.
# Without it each workbook keeps its own 0..k labels, so the labels repeat
# and label-based lookups such as oil_data.loc[0] return several rows.
oil_data = pd.concat(data_list, ignore_index=True)
len(oil_data)

422

In [10]:
# Flag rows that exactly repeat an earlier row of oil_data, then count them.
duplicated = oil_data.duplicated()
duplicated.sum()

np.int64(0)