In [1]:
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

class EBSCrawling:
    """Selenium-driven crawler that copies course Q&A posts into an Excel file.

    The crawler logs in, opens the Q&A tab of one course (``QNA_URL``), walks
    the paginated question list, opens every listed question and appends its
    question/answer text to ``XLSX_PATH``.

    Every element is located through an absolute XPath, so the lookups depend
    on the exact page layout the XPaths were written against.

    Instances own a Chrome session; call :meth:`close` (or use the instance
    as a context manager) to release it.
    """

    LOGIN_URL = "https://www.ebsi.co.kr/ebs/pot/potl/login.ebs?destination=/ebs/pot/poti/main.ebs&alertYn=N"
    MAIN_URL = "https://www.ebsi.co.kr/ebs/pot/poti/main.ebs"
    QNA_URL = "https://www.ebsi.co.kr/ebs/lms/lmsx/retrieveSbjtDtl.ebs?courseId=S20230000735#qna"
    # NOTE(review): credentials are committed in source. Prefer loading them
    # from the environment or a local config file that is not checked in.
    ID = "hoonistwo"
    PASSWORD = "gksaudgns1!"
    XLSX_PATH = "result.xlsx"

    # Fixed sleeps (seconds) used instead of explicit waits.
    PAGE_LOAD_DELAY = 2
    PAGINATION_DELAY = 0.2

    # Locators on the question list page.
    NEXT_BUTTON_XPATH = """//*[@id="gotoTab"]/div/div[6]/div[2]/a[3]"""
    PAGE_NUM_XPATH = """//*[@id="gotoTab"]/div/div[6]/div[2]/div/a[2]"""
    QUESTION_LIST_XPATH = """//*[@id="gotoTab"]/div/div[5]/ul"""
    QUESTION_ITEM_CLASS = "bg_reply"

    def __init__(self):
        """Start a Chrome session with the crawler's browser options."""
        chrome_options = Options()
        # Add the "--headless" argument here to run without a browser window.
        chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
        chrome_options.add_argument("--disable-gpu")
        self.driver = webdriver.Chrome(options=chrome_options)

    def __enter__(self):
        """Return the crawler itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Quit the browser when the ``with`` block ends; never suppresses errors."""
        self.close()
        return False

    def close(self):
        """Quit the browser session owned by this crawler."""
        self.driver.quit()

    def login(self):
        """Open ``LOGIN_URL``, submit ``ID``/``PASSWORD`` and wait.

        The credentials are read from the class attributes so that changing
        or overriding ``ID``/``PASSWORD`` actually takes effect.
        """
        self.driver.get(self.LOGIN_URL)

        id_field = self.driver.find_element(By.XPATH, """//*[@id="loginFrm"]/input[11]""")
        password_field = self.driver.find_element(By.XPATH, """//*[@id="loginFrm"]/span/input""")

        id_field.send_keys(self.ID)
        password_field.send_keys(self.PASSWORD)

        self.driver.find_element(By.XPATH, """//*[@id="btnLogin"]""").click()
        self.wait()

    def go_to_main(self):
        """Load ``MAIN_URL`` and then the course Q&A list (``QNA_URL``)."""
        self.driver.get(self.MAIN_URL)
        self.wait()
        self.driver.get(self.QNA_URL)
        self.wait()

    def next_page(self, delay=None):
        """Click the pager's "next" link on the question list page.

        Args:
            delay: Seconds to sleep after the click; defaults to
                ``PAGINATION_DELAY``.
        """
        self.driver.find_element(By.XPATH, self.NEXT_BUTTON_XPATH).click()
        self.wait(self.PAGINATION_DELAY if delay is None else delay)

    def get_page_num(self):
        """Return the number shown by the pager link at ``PAGE_NUM_XPATH``.

        Callers use this value as the number of "next" clicks to perform.

        Raises:
            ValueError: If the link text is not an integer.
        """
        return int(self.driver.find_element(By.XPATH, self.PAGE_NUM_XPATH).text)

    def go_to_page(self, page_idx):
        """Reload the question list and click "next" ``page_idx`` times."""
        self.go_to_main()
        for _ in range(page_idx):
            self.next_page()

    def back(self):
        """Navigate one step back in the browser history and wait."""
        self.driver.back()
        self.wait()

    def _question_items(self):
        """Return the ``bg_reply`` elements of the question list on this page."""
        question_list = self.driver.find_element(By.XPATH, self.QUESTION_LIST_XPATH)
        return question_list.find_elements(By.CLASS_NAME, self.QUESTION_ITEM_CLASS)

    def get_question_num(self):
        """Return how many question entries the current list page shows."""
        return len(self._question_items())

    def click_question(self, i):
        """Open the ``i``-th (0-based) question entry of the current list page.

        Raises:
            IndexError: If the page has ``i`` or fewer entries.
        """
        self._question_items()[i].find_element(By.TAG_NAME, "a").click()
        self.wait()

    def go_to_question(self, page_idx, question_idx):
        """Navigate from scratch to one question, addressed by page and index."""
        self.go_to_page(page_idx)
        self.click_question(question_idx)

    def wait(self, seconds=None):
        """Sleep for ``seconds`` (default ``PAGE_LOAD_DELAY``) to let the page settle."""
        time.sleep(self.PAGE_LOAD_DELAY if seconds is None else seconds)

    def get_question_num_list(self):
        """Return the question count of every list page, in visiting order.

        The first entry is the count on the initial page; one more entry is
        appended after each "next" click.
        """
        self.go_to_main()
        page_num = self.get_page_num()
        question_num_list = [self.get_question_num()]
        # NOTE(review): this clicks "next" ``page_num`` times after already
        # counting the first page, producing ``page_num + 1`` entries. If
        # ``get_page_num`` is the total page count this is one click too many
        # -- confirm against the live pager before changing it.
        for _ in range(page_num):
            self.next_page()
            question_num_list.append(self.get_question_num())
        return question_num_list

    @staticmethod
    def _parse_count(text):
        """Convert a counter string such as ``"1,234"`` to an ``int``."""
        return int(text.replace(",", "").strip())

    def _read_post(self, section_idx, body_id):
        """Read one post block of the question detail page.

        Args:
            section_idx: Index of the block's ``div`` under ``gotoTab``.
            body_id: ``id`` of the element holding the post's ``<p>`` paragraphs.

        Returns:
            ``(title, body, view_count)`` where ``body`` is the paragraph
            texts joined without a separator.
        """
        header = f'//*[@id="gotoTab"]/div/div[{section_idx}]/div[1]/div'
        title = self.driver.find_element(By.XPATH, header + "/p[2]").text
        paragraphs = self.driver.find_element(By.XPATH, f'//*[@id="{body_id}"]').find_elements(By.TAG_NAME, "p")
        body = "".join(paragraph.text for paragraph in paragraphs)
        views = self._parse_count(self.driver.find_element(By.XPATH, header + "/ul/li[3]/span").text)
        return title, body, views

    def get_question_text(self):
        """Scrape the currently opened question page into a dict.

        Returns:
            A dict with the keys ``question_title``, ``question``,
            ``question_view``, ``answer_title``, ``answer``, ``answer_view``
            and ``comment``. ``comment`` is ``""`` when the page has no
            comment element.

        Raises:
            selenium.common.exceptions.NoSuchElementException: If any of the
                mandatory question/answer elements is missing.
            ValueError: If a view counter is not numeric.
        """
        question_title, question, question_view = self._read_post(2, "qstBd")
        answer_title, answer, answer_view = self._read_post(4, "ansBd")

        # Only the comment is optional; catching just the "not found" case
        # keeps Ctrl-C and genuine driver failures visible.
        try:
            comment = self.driver.find_element(By.XPATH, """//*[@id="gotoTab"]/div/div[4]/div[3]/div/p/em""").text
        except NoSuchElementException:
            comment = ""

        return {
            "question_title": question_title,
            "question": question,
            "question_view": question_view,
            "answer_title": answer_title,
            "answer": answer,
            "answer_view": answer_view,
            "comment": comment,
        }

    def save(self, sample):
        """Append one scraped record to the Excel file at ``XLSX_PATH``.

        The file is re-read and rewritten on every call, so each record is
        on disk as soon as it has been scraped.

        Args:
            sample: Mapping of column name to value for the new row.
        """
        new_df = pd.DataFrame([sample])
        try:
            existing = pd.read_excel(self.XLSX_PATH)
        except FileNotFoundError:
            # First record: there is nothing to append to yet.
            combined = new_df
        else:
            combined = pd.concat([existing, new_df], ignore_index=True)
        combined.to_excel(self.XLSX_PATH, index=False)

    def crawling(self):
        """Log in, then scrape and save every question of every list page."""
        self.login()
        question_num_list = self.get_question_num_list()
        print(question_num_list)
        for page_idx, question_num in enumerate(question_num_list):
            for question_idx in range(question_num):
                # Each question is reached from a fresh load of the list.
                self.go_to_question(page_idx, question_idx)
                result = self.get_question_text()
                result["page"] = page_idx + 1
                self.save(result)

    def test(self):
        """Exploratory navigation run: open questions and go back, saving nothing.

        Does not log in. For each ``i`` from 2 up to the pager number it
        clicks "next" ``i`` times from whatever page the browser is on, then
        opens the entries from index 4 onwards, going back and clicking
        "next" ``i`` times again after each one.
        NOTE(review): the start offsets (2 and 4) are hard-coded; their
        purpose is not visible here.
        """
        self.driver.get(self.QNA_URL)
        page_num = self.get_page_num()

        for i in range(2, page_num):
            for _ in range(i):
                self.next_page(self.PAGE_LOAD_DELAY)

            question_num = self.get_question_num()

            for j in range(4, question_num):
                self.click_question(j)
                self.back()
                for _ in range(i):
                    self.next_page(self.PAGE_LOAD_DELAY)

            
            
                
            
        
# Run the full crawl. The browser is always shut down afterwards so that the
# Chrome/chromedriver processes are not left running, even if the crawl fails.
crawling = EBSCrawling()
try:
    crawling.crawling()
finally:
    crawling.driver.quit()


@@@ 0
@@@ 0
@@@ 0
@@@ 0
@@@ 0
@@@ 0
@@@ 1
@@@ 1
