## Main

In [4]:
from seleniumbase import SB
import pandas as pd

In [53]:
def verify_success(sb):
    """Check that the "Logo Assembly" image is on the page, then pause.

    Args:
        sb: Browser driver exposing ``assert_element(selector, timeout=...)``
            and ``sleep(seconds)`` (a SeleniumBase ``SB`` session in this
            notebook).

    Whatever ``sb.assert_element`` raises when the image is not found is
    propagated unchanged to the caller.
    """
    logo_selector = 'img[alt="Logo Assembly"]'
    sb.assert_element(logo_selector, timeout=8)
    # Fixed pause after a successful check.
    sb.sleep(4)

def scroll_down(sb, step=100, settle_seconds=5):
    """Scroll the page down in fixed increments until past the bottom, then pause.

    The document height is re-read after every step, so the loop keeps going
    if the page grows while it is being scrolled. Scrolling stops as soon as
    the requested scroll position exceeds the reported height.

    Args:
        sb: Browser driver exposing ``execute_script(script)`` and
            ``sleep(seconds)``.
        step: Pixels to advance per scroll. Must be positive, otherwise the
            loop could never pass the bottom of the page.
        settle_seconds: Pause after the final scroll (presumably to let
            content triggered by scrolling finish loading -- the caller reads
            image ``data-src`` attributes right afterwards).

    Raises:
        ValueError: If ``step`` is not positive.
    """
    if step <= 0:
        raise ValueError("step must be a positive number of pixels")

    position = 0
    page_height = 1  # sentinel > 0 so the loop body runs at least once
    while position <= page_height:
        position += step
        sb.execute_script(f"window.scrollTo(0, {position});")
        # A missing height (None) is treated as 0 so the loop ends instead of
        # raising TypeError on the comparison.
        page_height = sb.execute_script("return document.body.scrollHeight") or 0
    sb.sleep(settle_seconds)

def _project_id_from_url(url):
    """Return the last path segment of ``url``, ignoring any trailing slash."""
    return url.rstrip("/").split("/")[-1]


def _non_empty_texts(sb, selector):
    """Return the non-empty ``.text`` of every element matching ``selector``."""
    texts = []
    for element in sb.find_elements(selector):
        text = element.text  # read once instead of once to test and once to store
        if text:
            texts.append(text)
    return texts


def zeczec_crawler(url):
    """Collect the details of one zeczec project page into a dict.

    Opens ``url`` in a fresh ``SB(uc_cdp=True, guest_mode=True)`` browser
    session. If the ``.js-expand-project-content`` container is visible, the
    basic info, body text, highlighted text and image sources are scraped.
    Otherwise ``verify_success`` is run and, if it raises, "Detected!" is
    printed (presumably the crawler was blocked -- confirm against the site).

    Args:
        url: Project page URL. Its last path segment is used as the project id.

    Returns:
        dict: Always contains ``'id'``. When the content container was visible
        it also contains ``'title'``, ``'category'``, ``'duration'``,
        ``'desc'``, ``'text'`` (str) and ``'color_text'``, ``'bold_text'``,
        ``'hyper_text'``, ``'img_list'`` (lists of str). When it was not
        visible, only ``'id'`` is present.

    Raises:
        Exception: Anything raised by the browser session or by a failed
            element lookup propagates to the caller.
    """
    project_id = _project_id_from_url(url)
    print(f'---- Start to collect {project_id} ----')

    project = {'id': project_id}

    with SB(uc_cdp=True, guest_mode=True) as sb:
        sb.open(url)

        if sb.is_element_visible('.js-expand-project-content'):
            # ------- basic info -------
            project['title'] = sb.find_element('h1.text-lg.tracking-wide').text
            project['category'] = sb.find_element('a[href^="/categories?category"]').text
            project['duration'] = sb.find_element('h3.inline-block.text-gray-500.text-xs').text
            project['desc'] = sb.find_element('p.text-gray-500.text-sm.tracking-wider').text

            print('collected [basic info]')

            # ------- content -------
            content_blocks = sb.find_elements('.js-expand-project-content')
            # When several containers match, the last one's text is kept
            # (same result as before). The empty-list fallback avoids an
            # unbound variable if the container vanished after the check.
            project['text'] = content_blocks[-1].text if content_blocks else ''

            print('collected [project content]')

            # ------- highlight content -------
            project['color_text'] = _non_empty_texts(
                sb, '.js-expand-project-content span[class*="color"]'
            )
            project['bold_text'] = _non_empty_texts(sb, '.js-expand-project-content b')
            project['hyper_text'] = _non_empty_texts(sb, '.js-expand-project-content a')

            print('collected [project highlight content]')

            # ------- img -------
            # Scroll through the page before reading the image attributes.
            scroll_down(sb)

            image_sources = (
                img.get_attribute('data-src')
                for img in sb.find_elements('.js-expand-project-content img')
            )
            # Images without a data-src attribute are skipped.
            project['img_list'] = [src for src in image_sources if src]

            print('collected [project img]')

            sb.sleep(5) # wait

        else:
            # Content container not visible: nothing is scraped and the
            # returned dict keeps only 'id'.
            print(False)
            try:
                verify_success(sb)
            except Exception:
                # Best-effort check: report and carry on rather than abort.
                print("Detected!")

    print(f'---- Complete to collect {project_id} ----')
    return project

## Loop

In [41]:
## --- Init Setting --- ##

# Accumulator for the per-project dicts produced by the crawl loop.
collected_data = list()

# Half-open positional window [start_idx, end_idx) into the project list.
start_idx = 10001
end_idx = 1000 + start_idx

# The output name carries the first and last (inclusive) positions of the window.
file_name = f'ProjectData_{start_idx}-{end_idx-1}.csv'
print(file_name)

# Load the list of project URLs and preview the rows in the window.
project_list = pd.read_csv('./ProjectList.csv')
project_list.iloc[start_idx:end_idx]

ProjectData_10001-11000.csv


Unnamed: 0,cover_text,url,id
10001,"Travel with Ü - 收藏你旅行的回憶與微笑\n102%\nNT$ 102,400...",https://www.zeczec.com/projects/travel-with-u,travel-with-u
10002,"食農繪本出版計劃\n111%\nNT$ 55,750\nperson\n33\n人\n集資成功",https://www.zeczec.com/projects/shi-nong-hui-b...,shi-nong-hui-ben-chu-ban-ji-hua
10003,"《HT.Chen白日夢遊先生》醫學系休學生影像日記\n152%\nNT$ 76,190\np...",https://www.zeczec.com/projects/ht-chen,ht-chen
10004,"EZPLUG™ 最小的智慧溫控插頭 - 舒肥烹調好幫手\n100%\nNT$ 1,004,5...",https://www.zeczec.com/projects/ezplug,ezplug
10005,SUSTAIN 地表最強發熱背心-超機能，多季節穿搭，輕薄保暖\n240%\nNT$ 720...,https://www.zeczec.com/projects/sustain-heated...,sustain-heated-vest
...,...,...,...
10996,"Stagg EKG600 | 溫控手沖壼 | 畫出生活與設計間最完美的水流\n1,242%\...",https://www.zeczec.com/projects/staggekg600,staggekg600
10997,"Travel with Ü - 收藏你旅行的回憶與微笑\n102%\nNT$ 102,400...",https://www.zeczec.com/projects/travel-with-u,travel-with-u
10998,"食農繪本出版計劃\n111%\nNT$ 55,750\nperson\n33\n人\n集資成功",https://www.zeczec.com/projects/shi-nong-hui-b...,shi-nong-hui-ben-chu-ban-ji-hua
10999,"《HT.Chen白日夢遊先生》醫學系休學生影像日記\n152%\nNT$ 76,190\np...",https://www.zeczec.com/projects/ht-chen,ht-chen


In [42]:
## --- Running --- ##
# Maps project URL -> the exception raised while crawling it.
error_log = {}

# The upper bound is clamped to the number of rows so that a window reaching
# past the end of project_list finishes cleanly instead of raising IndexError
# outside the try block.
for i in range(start_idx, min(end_idx, len(project_list))):
    # Read straight from the column; no need to build a full row Series.
    project_url = project_list['url'].iloc[i]
    print(f'{i}: {project_url}')
    try:
        data = zeczec_crawler(project_url)
    except Exception as err:
        # Best-effort crawl: record the failure and move on to the next URL.
        error_log[project_url] = err
        print(f'Collect Error! url: {project_url}')
    else:
        collected_data.append(data)

        print(f'Total: {len(collected_data)}')
        print()

10001: https://www.zeczec.com/projects/travel-with-u
---- Start to collect travel-with-u ----
collected [basic info]
collected [project content]
collected [project highlight content]
collected [project img]
---- Complete to collect travel-with-u ----
Total: 1

10002: https://www.zeczec.com/projects/shi-nong-hui-ben-chu-ban-ji-hua
---- Start to collect shi-nong-hui-ben-chu-ban-ji-hua ----
collected [basic info]
collected [project content]
collected [project highlight content]
collected [project img]
---- Complete to collect shi-nong-hui-ben-chu-ban-ji-hua ----
Total: 2

10003: https://www.zeczec.com/projects/ht-chen
---- Start to collect ht-chen ----
collected [basic info]
collected [project content]
collected [project highlight content]
collected [project img]
---- Complete to collect ht-chen ----
Total: 3

10004: https://www.zeczec.com/projects/ezplug
---- Start to collect ezplug ----
collected [basic info]
collected [project content]
collected [project highlight content]
collected [p

In [57]:
## --- Save --- ##

# file_name already ends in '.csv' (see the init cell), so it is used as-is.
# Appending another '.csv' wrote 'ProjectData_<start>-<end>.csv.csv'.
pd.DataFrame(collected_data).to_csv(file_name, index=False)

#### Test

In [5]:
# Manual check: crawl one project URL and display the returned dict.
# NOTE(review): this reuses the name `data`, which the crawl loop also assigns.
data = zeczec_crawler('https://www.zeczec.com/projects/scion-36')
data

---- Start to collect scion-36 ----
collected [basic info]
collected [project content]
collected [project highlight content]
collected [project img]
---- Complete to collect scion-36 ----


{'id': 'scion-36',
 'title': '【 i wash 薄型洗碗機 】 專為台灣住宅環境設計！超薄型Ｘ大容量Ｘ免安裝',
 'category': '科技',
 'duration': '2023/11/07 12:00 – 2023/12/06 02:00',
 'desc': '台灣屋小人多，讓洗碗機因「空間和容量不足」無法普及，面對講求便利性的特殊住宅環境，i wash 薄型洗碗機正式誕生！台灣首款 6 人份大容量洗碗機，前所未見 36cm 超薄厚度，輕鬆容納 46 件餐具，而且「不需安裝」，買回家就能開洗！',
 'text': '洗碗，那是過去的事。\n和家人在餐桌上享受豐盛的美食，是忙碌生活中彼此分享、連結的重要時光，而「洗碗」這項繁瑣的家務，消磨著這些日常喜悅。\ni wash 薄型洗碗機是讓你的家更像家的現代智慧，只要一個按鈕，清潔自動一次到位。不用再浪費時間站在水槽前、不用再擔心骯髒的碗盤山，省下大把時間，專注在美食和家人身上。\n在台灣，洗衣機的普及率已經幾乎來到 100%，但為何洗碗機卻遲遲無法普及呢？深入調查台灣家庭的使用需求，Scion 團隊發現以下這些困擾是讓許多人對洗碗機卻步的幾大關卡。\n\n\n\n\n在廚房，空間跟時間一樣重要。\n台灣的廚房其實沒有像國外那麽好的空間條件，可以安裝嵌入式洗碗機，而市面上許多桌上型洗碗機，即使體積不大，也會因為難以忽視的厚度，無法安置於狹小的廚房檯面。\n\n而 i wash 徹底改變了洗碗機的結構，打造出僅 36 公分的超薄機身，因為夠薄，所以不再受到空間限制，想放哪就放哪，成為小廚房的理想選擇，同時為大型廚房提供更多儲存空間，徹底解決台灣家庭安裝洗碗機的最大痛點。\n\n「買了洗碗機才發現：容量不夠，好難用...」\n這是許多人在入手洗碗機後才發現的惡夢，因為錯估了洗碗機的容量，還有自己的生活方式，導致洗碗機買了卻不常用，只能放著生灰塵。\n\n其實，市面上許多洗碗機雖然主打「四人份」，但其實放入兩人份的碗盤、餐具，空間早已所剩無幾，甚至無法放進烹飪用的廚具...。Scion 團隊知道：消費者需要的是更有延展空間的洗碗機，而「家庭人數 x 1.5 倍 合理且實際的容量！」\n據統計，台灣每三個家庭就有一個擁有超過 4 位家庭成員，三餐使用的碗盤餐具、再加上廚具，要洗的東西超