In [13]:

from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver import Keys, ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup as bs
from selenium.webdriver.chrome.options import Options

from collections import deque
import re
import random
import time
import concurrent.futures
from datetime import datetime

# Listing pages of the hot-deal boards; each crawler class below stores one of
# these as its `hot_deal_page`.
ARCA_LIVE_LINK = "https://arca.live/b/hotdeal"
RULI_WEB_LINK = "https://bbs.ruliweb.com/market/board/1020?view=default"
PPOM_PPU_LINK = "https://www.ppomppu.co.kr/zboard/zboard.php?id=ppomppu"
QUASAR_ZONE_LINK = "https://quasarzone.com/bbs/qb_saleinfo"
FM_KOREA_LINK = "https://www.fmkorea.com/hotdeal"
# Shared list of permanently failed items: every crawling() method appends
# (item_link, attempt_count) here once an item fails with retry_attempt >= 3.
# NOTE(review): the name reads like the boolean literal; renaming it would
# require touching every crawler class that appends to it.
false = []

class PAGES:
    """Base class shared by the per-site hot-deal crawlers.

    Provides the work queues used by the subclasses plus two helpers: one that
    starts a headless Chrome session and one that writes a document to MongoDB.
    """

    def __init__(self):
        self.refresh_delay = 60 # sec
        # Pending work; the subclasses push (item_link, retry_attempt) tuples.
        self.item_link_queue = deque()
        # Links that were already queued once; the subclasses push new links on
        # the left and trim from the right.
        self.previous_items_queue = deque()

    def set_drvier(self, site_name):
        """Start a headless Chrome driver and navigate it to ``site_name``.

        The misspelled name is kept because the crawler classes call it;
        ``set_driver`` is provided as a correctly spelled alias.

        Args:
            site_name: URL to open right after the browser starts.

        Returns:
            The Selenium Chrome driver. The caller owns it and is responsible
            for calling ``quit()`` on it.

        Raises:
            Exception: whatever Selenium raises while configuring or navigating;
                the browser is shut down before the error propagates.
        """
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        service = Service(executable_path=ChromeDriverManager().install())
        driver = webdriver.Chrome(options=chrome_options, service=service)
        try:
            driver.implicitly_wait(5)
            driver.get(site_name)
        except Exception:
            # Without this the Chrome process would outlive the failed call,
            # because the caller never receives a handle to quit it.
            driver.quit()
            raise

        return driver

    # Correctly spelled alias; existing callers keep using ``set_drvier``.
    set_driver = set_drvier

    def insert_to_db(self, **kwargs):
        """Insert the keyword arguments as one document into ``db.test``.

        Relies on the notebook-level ``db`` object created by the MongoDB
        connection cell, so that cell must have been executed first.
        """
        db.test.insert_one(kwargs)
    
class ARCA_LIVE(PAGES): # shopping_mall_link, shopping_mall, item_name, price, delivery, content, comment
    """Crawler for the arca.live hot-deal board."""

    def __init__(self):
        super().__init__()
        self.hot_deal_page = ARCA_LIVE_LINK

    def get_item_links(self):
        """Queue post links from the listing page that were not queued before.

        New links are appended to ``item_link_queue`` as ``(link, 0)`` and
        remembered in ``previous_items_queue`` (capped at 100 entries).
        """
        driver = self.set_drvier(self.hot_deal_page)
        try:
            for i in range(4, 49):
                try:
                    item = driver.find_element(By.CSS_SELECTOR, f"body > div.root-container > div.content-wrapper.clearfix > article > div > div.article-list > div.list-table.hybrid > div:nth-child({i}) > div > div > span.vcol.col-title > a")
                    item_link = item.get_attribute("href")
                except Exception as e:
                    print(e)
                    # Skip this row: falling through would use an undefined
                    # (first row) or stale (later rows) item_link.
                    continue

                if not item_link or item_link in self.previous_items_queue:
                    continue

                self.item_link_queue.append((item_link, 0))
                self.previous_items_queue.appendleft(item_link)
                if len(self.previous_items_queue) > 100:
                    self.previous_items_queue.pop()
        finally:
            # Each call starts its own browser; release it so Chrome processes
            # do not pile up across polling rounds.
            driver.quit()

    def crawling(self):
        """Visit every queued post, extract its fields and store them.

        A post that fails is re-queued with an incremented attempt counter;
        once it fails with ``retry_attempt >= 3`` it is recorded in the
        module-level ``false`` list instead.
        """
        driver = self.set_drvier(self.hot_deal_page)
        try:
            while self.item_link_queue:
                item_link, retry_attempt = self.item_link_queue.popleft()
                print(item_link, retry_attempt)
                try: # reported posts, security checks, etc.
                    # Navigation is inside the try so a page-load error goes
                    # through the same retry path as a parsing error.
                    driver.get(item_link)
                    time.sleep(5)
                    table = driver.find_element(By.TAG_NAME, "table")
                    rows = table.find_elements(By.TAG_NAME, "tr")
                    details = [row.text for row in rows]
                    # Drop the first whitespace-separated token of each row and
                    # join the rest without spaces.
                    # NOTE(review): this also removes spaces inside item_name - confirm intended.
                    shopping_mall_link, shopping_mall, item_name, price, delivery = ["".join(text.split()[1:]) for text in details]
                    content = driver.find_element(By.CSS_SELECTOR, "body > div.root-container > div.content-wrapper.clearfix > article > div > div.article-wrapper > div.article-body > div.fr-view.article-content").text
                    comment_box = driver.find_element(By.CSS_SELECTOR, "#comment > div.list-area")
                    comment = [element.text for element in comment_box.find_elements(By.CLASS_NAME, "text")]
                except Exception as e:
                    if retry_attempt >= 3:
                        print(e)
                        false.append((item_link, retry_attempt + 1))
                    else:
                        self.item_link_queue.append((item_link, retry_attempt + 1))
                    continue

                self.insert_to_db(item_link = item_link, shopping_mall_link = shopping_mall_link, shopping_mall = shopping_mall, price = price, item_name = item_name, delivery = delivery, content = content, comment = comment)
            print("Empty Queue")
        finally:
            driver.quit()

# Posts are sometimes uploaded with shopping_mall_link missing
class RULI_WEB(PAGES): # shopping_mall_link, item_name, content, comment
    """Crawler for the ruliweb hot-deal board."""

    def __init__(self):
        super().__init__()
        self.hot_deal_page = RULI_WEB_LINK

    def get_item_links(self):
        """Queue post links from the listing table that were not queued before.

        Only rows whose class attribute is exactly ``table_body blocktarget``
        are considered. New links are appended to ``item_link_queue`` as
        ``(link, 0)`` and remembered in ``previous_items_queue`` (capped at 100).
        """
        driver = self.set_drvier(self.hot_deal_page)
        try:
            item_table = driver.find_elements(By.CSS_SELECTOR, "#board_list > div > div.board_main.theme_default.theme_white.theme_white > table > tbody > tr")
            for item in item_table:
                try:
                    if item.get_attribute("class") != "table_body blocktarget":
                        # notices, best hot deals, etc.
                        continue
                    item_link = item.find_element(By.CSS_SELECTOR, "td.subject > div > a.deco").get_attribute("href")

                    if not item_link or item_link in self.previous_items_queue:
                        continue

                    self.item_link_queue.append((item_link, 0))
                    self.previous_items_queue.appendleft(item_link)
                    if len(self.previous_items_queue) > 100:
                        self.previous_items_queue.pop()

                except Exception as e:
                    print(e)
        finally:
            # Each call starts its own browser; release it so Chrome processes
            # do not pile up across polling rounds.
            driver.quit()

    def crawling(self):
        """Visit every queued post, extract its fields and store them.

        A post that fails is re-queued with an incremented attempt counter;
        once it fails with ``retry_attempt >= 3`` it is recorded in the
        module-level ``false`` list instead.
        """
        driver = self.set_drvier(self.hot_deal_page)
        try:
            while self.item_link_queue:
                item_link, retry_attempt = self.item_link_queue.popleft()
                print(item_link, retry_attempt)
                try: # reported posts, security checks, etc.
                    # Navigation is inside the try so a page-load error goes
                    # through the same retry path as a parsing error.
                    driver.get(item_link)
                    time.sleep(5)
                    item_name = driver.find_element(By.CSS_SELECTOR, "#board_read > div > div.board_main > div.board_main_top > div.user_view > div:nth-child(1) > div > h4 > span > span.subject_inner_text").text
                    print(item_name)
                    shopping_mall_link = driver.find_element(By.CSS_SELECTOR, "#board_read > div > div.board_main > div.board_main_view > div.row.relative > div > div.source_url.box_line_with_shadow > a").text
                    print(shopping_mall_link)
                    content = driver.find_element(By.TAG_NAME, "article").text
                    print(content)
                    comment = [element.text for element in driver.find_elements(By.CLASS_NAME, "comment")]
                    print(comment)
                except Exception as e:
                    if retry_attempt >= 3:
                        print(e)
                        false.append((item_link, retry_attempt + 1))
                    else:
                        self.item_link_queue.append((item_link, retry_attempt + 1))
                    continue

                self.insert_to_db(item_link = item_link, shopping_mall_link = shopping_mall_link, item_name = item_name, content = content, comment = comment)
            print("Empty Queue")
        finally:
            driver.quit()
        
class FM_KOREA(PAGES): # shopping_mall_link, shopping_mall, item_name, price, delivery, content, comment
    """Crawler for the fmkorea hot-deal board."""

    def __init__(self):
        super().__init__()
        self.hot_deal_page = FM_KOREA_LINK

    def get_item_links(self):
        """Queue post links from the listing page that were not queued before.

        New links are appended to ``item_link_queue`` as ``(link, 0)`` and
        remembered in ``previous_items_queue`` (capped at 100 entries).
        """
        driver = self.set_drvier(self.hot_deal_page)
        try:
            for i in range(1, 21):
                try:
                    item = driver.find_element(By.CSS_SELECTOR, f"#bd_1196365581_0 > div > div.fm_best_widget._bd_pc > ul > li:nth-child({i}) > div > h3 > a")
                    item_link = item.get_attribute("href")
                except Exception as e:
                    print(e)
                    # Skip this row: falling through would use an undefined
                    # (first row) or stale (later rows) item_link.
                    continue

                if not item_link or item_link in self.previous_items_queue:
                    continue

                self.item_link_queue.append((item_link, 0))
                self.previous_items_queue.appendleft(item_link)
                if len(self.previous_items_queue) > 100:
                    self.previous_items_queue.pop()
        finally:
            # Each call starts its own browser; release it so Chrome processes
            # do not pile up across polling rounds.
            driver.quit()

    def crawling(self):
        """Visit every queued post, extract its fields and store them.

        The page's ``xe_content`` elements are read positionally: the first six
        are the post fields and everything after them is treated as comments.
        A post that fails is re-queued with an incremented attempt counter;
        once it fails with ``retry_attempt >= 3`` it is recorded in the
        module-level ``false`` list instead.
        """
        driver = self.set_drvier(self.hot_deal_page)
        try:
            while self.item_link_queue:
                item_link, retry_attempt = self.item_link_queue.popleft()
                print(item_link, retry_attempt)
                try: # reported posts, security checks, etc.
                    # Navigation is inside the try so a page-load error goes
                    # through the same retry path as a parsing error.
                    driver.get(item_link)
                    time.sleep(5)
                    details = driver.find_elements(By.CLASS_NAME, "xe_content")
                    # Raises ValueError (and so triggers a retry) when fewer
                    # than six elements are present.
                    *fields, = details[:6]
                    shopping_mall_link, shopping_mall, item_name, price, delivery, content = [element.text for element in fields]
                    comment = [element.text for element in details[6:]]
                except Exception as e:
                    if retry_attempt >= 3:
                        print(e)
                        false.append((item_link, retry_attempt + 1))
                    else:
                        self.item_link_queue.append((item_link, retry_attempt + 1))
                    continue

                self.insert_to_db(item_link = item_link, shopping_mall_link = shopping_mall_link, shopping_mall = shopping_mall, item_name = item_name, price = price, delivery = delivery, content = content, comment = comment)
            print("Empty Queue")
        finally:
            driver.quit()
            
            
            
class QUASAR_ZONE(PAGES): # shopping_mall_link, shopping_mall, item_name, price, delivery, content, comment
    """Crawler for the quasarzone sale-info board."""

    def __init__(self):
        super().__init__()
        self.hot_deal_page = QUASAR_ZONE_LINK

    def get_item_links(self):
        """Queue post links from the listing table that were not queued before.

        New links are appended to ``item_link_queue`` as ``(link, 0)`` and
        remembered in ``previous_items_queue`` (capped at 100 entries).
        """
        driver = self.set_drvier(self.hot_deal_page)
        try:
            for i in range(1, 31):
                try:
                    item = driver.find_element(By.CSS_SELECTOR, f"#frmSearch > div > div.list-board-wrap > div.market-type-list.market-info-type-list.relative > table > tbody > tr:nth-child({i}) > td:nth-child(2) > div > div.market-info-list-cont > p > a")
                    item_link = item.get_attribute("href")
                except Exception as e:
                    print(e)
                    # Skip this row: falling through would use an undefined
                    # (first row) or stale (later rows) item_link.
                    continue

                if not item_link or item_link in self.previous_items_queue:
                    continue

                self.item_link_queue.append((item_link, 0))
                self.previous_items_queue.appendleft(item_link)
                if len(self.previous_items_queue) > 100:
                    self.previous_items_queue.pop()
        finally:
            # Each call starts its own browser; release it so Chrome processes
            # do not pile up across polling rounds.
            driver.quit()

    def crawling(self):
        """Visit every queued post, extract its fields and store them.

        A post that fails is re-queued with an incremented attempt counter;
        once it fails with ``retry_attempt >= 3`` it is recorded in the
        module-level ``false`` list instead.
        """
        driver = self.set_drvier(self.hot_deal_page)
        try:
            while self.item_link_queue:
                item_link, retry_attempt = self.item_link_queue.popleft()
                print(item_link, retry_attempt)
                try: # reported posts, security checks, etc.
                    # Navigation is inside the try so a page-load error goes
                    # through the same retry path as a parsing error.
                    driver.get(item_link)
                    time.sleep(5)
                    # The first two whitespace-separated tokens of the heading
                    # are dropped; the remainder is the item name.
                    title_words = driver.find_element(By.CSS_SELECTOR, "#content > div.sub-content-wrap > div.left-con-wrap > div.common-view-wrap.market-info-view-wrap > div > dl > dt > div:nth-child(1) > h1").text.split()[2:]
                    item_name = " ".join(title_words)
                    table = driver.find_element(By.TAG_NAME, "table")
                    rows = table.find_elements(By.TAG_NAME, "tr")
                    content = driver.find_element(By.CSS_SELECTOR, "#new_contents").text
                    comment = [element.text for element in driver.find_elements(By.CSS_SELECTOR, "#content > div.sub-content-wrap > div.left-con-wrap > div.reply-wrap > div.reply-area > div.reply-list")]
                    # Row parsing lives inside the try: a table with fewer than
                    # four rows raises ValueError, which must go through the
                    # retry path instead of aborting the whole crawl.
                    details = [row.text for row in rows]
                    shopping_mall_link, shopping_mall, price, delivery, *_ = ["".join(text.split()[1:]) for text in details]
                except Exception as e:
                    if retry_attempt >= 3:
                        print(e)
                        false.append((item_link, retry_attempt + 1))
                    else:
                        self.item_link_queue.append((item_link, retry_attempt + 1))
                    continue

                self.insert_to_db(item_link = item_link, shopping_mall_link = shopping_mall_link, shopping_mall = shopping_mall, item_name = item_name, price = price, delivery = delivery, content = content, comment = comment)
            print("Empty Queue")
        finally:
            driver.quit()

# Posts are sometimes uploaded without the shopping_mall tag
class PPOM_PPU(PAGES):
    """Crawler for the ppomppu hot-deal board."""

    def __init__(self):
        super().__init__()
        self.hot_deal_page = PPOM_PPU_LINK

    def get_item_links(self):
        """Queue post links from the listing table that were not queued before.

        New links are appended to ``item_link_queue`` as ``(link, 0)`` and
        remembered in ``previous_items_queue`` (capped at 100 entries).
        """
        driver = self.set_drvier(self.hot_deal_page)
        try:
            # Rows scanned: #revolution_main_table > tbody > tr:nth-child(9)
            # through tr:nth-child(33).
            for i in range(9, 34):
                try:
                    item = driver.find_element(By.CSS_SELECTOR, f"#revolution_main_table > tbody > tr:nth-child({i}) > td.baseList-space.title > div > div > a")
                    item_link = item.get_attribute("href")
                except Exception as e:
                    print(e)
                    # Skip this row: falling through would use an undefined
                    # (first row) or stale (later rows) item_link.
                    continue

                if not item_link or item_link in self.previous_items_queue:
                    continue

                self.item_link_queue.append((item_link, 0))
                self.previous_items_queue.appendleft(item_link)
                if len(self.previous_items_queue) > 100:
                    self.previous_items_queue.pop()
        finally:
            # Each call starts its own browser; release it so Chrome processes
            # do not pile up across polling rounds.
            driver.quit()

    def crawling(self):
        """Visit every queued post, extract its fields and store them.

        A post that fails is re-queued with an incremented attempt counter;
        once it fails with ``retry_attempt >= 3`` it is recorded in the
        module-level ``false`` list instead.
        """
        driver = self.set_drvier(self.hot_deal_page)
        try:
            while self.item_link_queue:
                item_link, retry_attempt = self.item_link_queue.popleft()
                print(item_link, retry_attempt)
                try: # reported posts, security checks, etc.
                    # Navigation is inside the try so a page-load error goes
                    # through the same retry path as a parsing error.
                    driver.get(item_link)
                    time.sleep(5)
                    item_name = driver.find_element(By.CSS_SELECTOR, "body > div.wrapper > div.contents > div.container > div > table:nth-child(9) > tbody > tr:nth-child(3) > td > table > tbody > tr > td:nth-child(5) > div > div.sub-top-text-box > font.view_title2").text
                    content = driver.find_element(By.CSS_SELECTOR, "body > div.wrapper > div.contents > div.container > div > table:nth-child(15) > tbody > tr:nth-child(1) > td > table > tbody > tr > td").text
                    comments = driver.find_element(By.ID, "quote").text
                    shopping_mall_link = driver.find_element(By.CSS_SELECTOR, "body > div.wrapper > div.contents > div.container > div > table:nth-child(9) > tbody > tr:nth-child(3) > td > table > tbody > tr > td:nth-child(5) > div > div.sub-top-text-box > div > a").get_attribute("href")
                    shopping_mall = driver.find_element(By.CSS_SELECTOR, "body > div.wrapper > div.contents > div.container > div > table:nth-child(9) > tbody > tr:nth-child(3) > td > table > tbody > tr > td:nth-child(5) > div > div.sub-top-text-box > font.view_title2 > span").text
                except Exception as e:
                    if retry_attempt >= 3:
                        print(e)
                        false.append((item_link, retry_attempt + 1))
                    else:
                        self.item_link_queue.append((item_link, retry_attempt + 1))
                    continue

                # NOTE(review): this document uses the key "comments" (a single
                # string) whereas the other crawlers store "comment" (a list);
                # left unchanged so existing readers of the collection keep working.
                self.insert_to_db(item_link = item_link, item_name = item_name, content = content, comments = comments, shopping_mall = shopping_mall, shopping_mall_link = shopping_mall_link)
            print("Empty Queue")
        finally:
            driver.quit()


In [1]:

import os

from pymongo.mongo_client import MongoClient

# The connection string carries the database username and password, so it is
# read from the environment instead of being stored in the notebook.
# SECURITY: the credentials that used to be hard-coded in this cell are
# compromised wherever this notebook was shared and should be rotated.
uri = os.environ.get("MONGODB_URI")
if not uri:
    raise RuntimeError(
        "Set the MONGODB_URI environment variable to the MongoDB connection "
        "string before running this cell."
    )

# Create a new client and connect to the server
client = MongoClient(uri)
db = client.test
print(db)
# Send a ping to confirm a successful connection
try:
    client.admin.command('ping')
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as e:
    print(e)

Database(MongoClient(host=['ac-ahopjww-shard-00-00.krlnafk.mongodb.net:27017', 'ac-ahopjww-shard-00-01.krlnafk.mongodb.net:27017', 'ac-ahopjww-shard-00-02.krlnafk.mongodb.net:27017'], document_class=dict, tz_aware=False, connect=True, retrywrites=True, w='majority', appname='test', authsource='admin', replicaset='atlas-g0516r-shard-0', tls=True), 'test')
Pinged your deployment. You successfully connected to MongoDB!


In [14]:
# One crawler object per hot-deal board, built in the same order as before;
# each keeps its own pending and seen-link queues between polling rounds.
quasar_zone, ppom_ppu, fm_korea, ruli_web, arca_live = (
    QUASAR_ZONE(),
    PPOM_PPU(),
    FM_KOREA(),
    RULI_WEB(),
    ARCA_LIVE(),
)


In [15]:
# Poll every board forever: first collect new links from all sites, then crawl
# the queued posts, then wait before the next round.
crawlers = (quasar_zone, ppom_ppu, fm_korea, ruli_web, arca_live)

while True:
    for crawler in crawlers:
        try:
            crawler.get_item_links()
        except Exception as e:
            # One site failing must not stop the others from being polled.
            print(f"{type(crawler).__name__}.get_item_links failed: {e!r}")

    for crawler in crawlers:
        try:
            crawler.crawling()
        except Exception as e:
            print(f"{type(crawler).__name__}.crawling failed: {e!r}")

    # refresh_delay (seconds) is set on every crawler but was never applied,
    # so the boards were polled back-to-back without any pause.
    time.sleep(min(crawler.refresh_delay for crawler in crawlers))

Message: no such element: Unable to locate element: {"method":"css selector","selector":"#revolution_main_table > tbody > tr:nth-child(29) > td.baseList-space.title > div > div > a"}
  (Session info: chrome-headless-shell=123.0.6312.86); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
	GetHandleVerifier [0x00BE4CC3+225091]
	(No symbol) [0x00B14E11]
	(No symbol) [0x009B9A7A]
	(No symbol) [0x009F175B]
	(No symbol) [0x009F188B]
	(No symbol) [0x00A27882]
	(No symbol) [0x00A0F5A4]
	(No symbol) [0x00A25CB0]
	(No symbol) [0x00A0F2F6]
	(No symbol) [0x009E79B9]
	(No symbol) [0x009E879D]
	sqlite3_dbdata_init [0x01059A63+4064547]
	sqlite3_dbdata_init [0x0106106A+4094762]
	sqlite3_dbdata_init [0x0105B968+4072488]
	sqlite3_dbdata_init [0x00D5C9C9+930953]
	(No symbol) [0x00B207E4]
	(No symbol) [0x00B1AD08]
	(No symbol) [0x00B1AE31]
	(No symbol) [0x00B0CAA0]
	BaseThreadInitThunk [0x769F7BA9+25