# 基本的なサイトのスクレイピング例（scrapyのみで可能な場合）

In [11]:
from multiprocessing import Process
from typing import List, Dict, Any, Iterator

import scrapy
from scrapy import signals
from scrapy.crawler import CrawlerProcess

In [12]:
class MetyaComicItem(scrapy.Item):
    """Container for the fields scraped from a single comic page."""
    
    # ID
    id = scrapy.Field()
    
    # URL
    url = scrapy.Field()
    
    # Title
    title = scrapy.Field()
    
    # Synopsis
    abstract = scrapy.Field()
    
    # Review
    review = scrapy.Field()
    

class MetyaComicSpider(scrapy.Spider):
    """Spider that scrapes a list of comic detail pages one URL at a time.

    Only the first URL is requested at start-up. Each following URL is
    scheduled from the ``item_scraped`` signal handler once the previous
    page has produced its item.
    """

    name = 'metya_comic_spider'
    # Must match the host of the crawled URLs; a mismatching value lets
    # Scrapy's offsite filtering treat every page as off-site.
    allowed_domains = ['mechacomic.jp']

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Create the spider and hook its handlers up to crawler signals."""
        spider = super().from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(spider.item_scraped_callback, signal=signals.item_scraped)
        return spider

    def __init__(self, urls: List[str], *args, **kwargs):
        """Store the URLs to crawl.

        Args:
            urls: Page URLs to scrape, visited in list order.
        """
        self.start_urls = urls
        # Index into ``start_urls`` of the page currently being processed.
        self.current_url_idx = 0
        super().__init__(*args, **kwargs)

    def spider_opened(self, spider):
        """Log that the spider has been opened (``spider_opened`` signal)."""
        self.logger.info("Spider opened: %s", spider.name)

    def start_requests(self):
        """Yield the request for the first URL only; the rest are chained."""
        if not self.start_urls:
            # Nothing to crawl; avoid an IndexError on an empty URL list.
            return
        url = self.start_urls[self.current_url_idx]
        yield scrapy.Request(url, callback=self.parse_manga_page)

    def parse_manga_page(self, response: scrapy.http.Response) -> Iterator[scrapy.Item]:
        """Extract one ``MetyaComicItem`` from a comic detail page.

        Fields whose XPath matches nothing are left unset on the item.
        """
        item = MetyaComicItem()

        # ID: last path segment of the URL (trailing slash ignored).
        item['id'] = response.url.rstrip("/").split("/")[-1]

        # URL
        item['url'] = response.url

        # Title
        title = response.xpath('//div[@class="p-bookInfo_title"]/h1/text()').get()
        if title:
            item['title'] = title

        # Synopsis
        abstract = response.xpath('//div[@class="p-bookInfo_summary"]//p/text()').get()
        if abstract:
            item['abstract'] = abstract.replace('\n', '')

        # Rating (average review score)
        review = response.xpath('//div[@class="p-bookInfo_body"]//span[@class="p-customerReview_averageScore"]/text()').get()
        if review:
            item['review'] = review.replace("\n", "").strip()

        yield item

    def item_scraped_callback(self, item, response, spider):
        """Advance to the next URL after an item was scraped (``item_scraped`` signal)."""
        self.logger.info(f"Scraped item: [{self.start_urls[self.current_url_idx]}]")
        self.current_url_idx += 1
        if self.current_url_idx < len(self.start_urls):
            self.schedule_next_page()

    def schedule_next_page(self):
        """Hand the request for ``start_urls[current_url_idx]`` to the engine."""
        next_url = self.start_urls[self.current_url_idx]
        self.logger.info(f"Scheduling next page: {next_url}")
        # The callback must be parse_manga_page: this spider does not define
        # ``parse``, so using it made every page after the first fail.
        next_request = scrapy.Request(next_url, callback=self.parse_manga_page)
        self.crawler.engine.crawl(next_request)

    
def start_crawl(settings: Dict[str, Any], urls: List[str]):
    """Create a CrawlerProcess from *settings* and run MetyaComicSpider over *urls*."""
    crawler_process = CrawlerProcess(settings=settings)
    crawler_process.crawl(MetyaComicSpider, urls)
    crawler_process.start()

In [13]:
# Pages to scrape in this run.
target_urls = ["https://mechacomic.jp/books/85773"]

# Crawler settings, including the custom 'exec_yyyymmdd' entry.
settings: Dict[str, Any] = {
    'DOWNLOAD_DELAY': 3,
    'TELNETCONSOLE_ENABLED': False,
    'exec_yyyymmdd': '2025-05-01',
}

# Run the crawl in a child process.
# NOTE(review): presumably so the cell can be re-run without restarting the
# kernel (a Twisted reactor cannot be restarted in-process) - confirm.
Process(
    target=start_crawl,
    args=(settings, target_urls),
).start()

2025-05-23 23:10:38 [scrapy.utils.log] INFO: Scrapy 2.11.1 started (bot: scrapybot)
2025-05-23 23:10:38 [scrapy.utils.log] INFO: Versions: lxml 5.2.1.0, libxml2 2.12.6, cssselect 1.2.0, parsel 1.9.1, w3lib 2.1.2, Twisted 24.3.0, Python 3.11.3 (main, May 23 2023, 13:42:38) [GCC 8.3.0], pyOpenSSL 24.1.0 (OpenSSL 3.2.1 30 Jan 2024), cryptography 42.0.5, Platform Linux-5.15.167.4-microsoft-standard-WSL2-x86_64-with-glibc2.28
2025-05-23 23:10:38 [scrapy.addons] INFO: Enabled addons:
[]


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
  return cls(crawler)

2025-05-23 23:10:38 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.epollreactor.EPollReactor
2025-05-23 23:10:38 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.logstats.LogStats']
2025-05-23 23:10:38 [scrapy.crawler] INFO: Overridden settings:
{'D

# 動的サイトのスクレイピング例（scrapy + selenium）

In [1]:
import time
from multiprocessing import Process
from typing import List, Dict, Any, Iterator

import scrapy
from scrapy import signals
from scrapy.crawler import CrawlerProcess
from scrapy.http import HtmlResponse
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By

In [7]:
# Needed when running inside a notebook
import nest_asyncio
nest_asyncio.apply()

In [8]:
def get_driver():
    """Build and return a new WebDriverWrapper."""
    wrapper = WebDriverWrapper()
    return wrapper

class WebDriverWrapper:
    """Wrapper around ``webdriver.Chrome`` that periodically recycles the browser.

    The wrapped browser is quit and recreated once ``max_loads_before_quit``
    pages have been loaded through :meth:`get`. Attributes not defined on the
    wrapper are forwarded to the underlying driver.
    """

    def __init__(self, max_loads_before_quit: int = 10):
        """Start a browser.

        Args:
            max_loads_before_quit: Number of page loads after which the
                browser is restarted on the next :meth:`get` call.
        """
        self.load_count = 0
        self.max_loads_before_quit = max_loads_before_quit
        self.driver = self._create_driver()

    def _create_driver(self) -> "webdriver.Chrome":
        """Create a new headless ``webdriver.Chrome`` instance."""
        options = Options()
        options.add_argument("--headless")
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")
        options.add_argument("--disable-gpu")  # disable the GPU
        options.add_argument("--disable-extensions")  # disable extensions
        options.add_argument('--window-size=1920,1080')
        webdriver_service = Service(ChromeDriverManager().install())
        return webdriver.Chrome(service=webdriver_service, options=options)

    def _restart_driver(self):
        """Quit the current browser (if any) and start a fresh one."""
        self.quit()
        self.driver = self._create_driver()
        self.load_count = 0

    def get(self, url):
        """Load *url*, restarting the browser first when the load limit is hit.

        If the wrapper was previously shut down with :meth:`quit`, a new
        browser is created instead of failing on a ``None`` driver.
        """
        if self.__dict__.get("driver") is None:
            self.driver = self._create_driver()
            self.load_count = 0
        elif self.load_count >= self.max_loads_before_quit:
            self._restart_driver()

        self.driver.get(url)
        self.load_count += 1

    def get_dom(self, xpath):
        """Return the elements matching *xpath* on the current page."""
        return self.driver.find_elements(By.XPATH, xpath)

    def __getattr__(self, name):
        """Forward unknown attributes to the underlying driver.

        Raises:
            AttributeError: If there is no active driver to forward to.
        """
        # __getattr__ only runs when normal lookup fails. Read "driver" from
        # the instance dict directly: going through ``self.driver`` here would
        # re-enter __getattr__ and recurse forever if it is not set yet.
        driver = self.__dict__.get("driver")
        if driver is None:
            raise AttributeError(
                f"{type(self).__name__!r} object has no attribute {name!r} "
                "(no active driver)"
            )
        return getattr(driver, name)

    def quit(self):
        """Explicitly shut the browser down; safe to call more than once."""
        driver = self.__dict__.get("driver")
        if driver:
            driver.quit()
            self.driver = None
            
# Module-level driver instance used by ManbaScrapySpiderMiddleware.process_request
# and shut down in ManbaSpider.closed.
driver = get_driver()
            
class ManbaScrapySpiderMiddleware:
    """Middleware that loads each request's URL in the module-level Selenium driver."""

    def process_request(self, request, spider):
        """Load the page in the browser and return its source as an HtmlResponse."""
        driver.get(request.url)
        # Fixed one-second pause after navigation before reading the page.
        time.sleep(1)
        rendered_url = driver.current_url
        rendered_html = driver.page_source
        return HtmlResponse(
            rendered_url,
            body=rendered_html,
            encoding='utf-8',
            request=request,
        )

In [9]:
class ManbaItem(scrapy.Item):
    """Container for the fields scraped from a single board page."""
    
    # ID
    id = scrapy.Field()
    # URL
    url = scrapy.Field()
    # Title
    title = scrapy.Field()
    # Synopsis
    abstract = scrapy.Field()
    # Review
    review = scrapy.Field()


class ManbaSpider(scrapy.Spider):
    """Spider that scrapes a list of board pages one URL at a time.

    Only the first URL is requested at start-up. Each following URL is
    scheduled from the ``item_scraped`` signal handler once the previous
    page has produced its item. The module-level ``driver`` is quit when
    the spider closes.
    """

    name = "manba_spider"
    allowed_domains = ["manba.co.jp"]

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Create the spider and hook its handlers up to crawler signals."""
        spider = super().from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(spider.item_scraped_callback, signal=signals.item_scraped)
        return spider

    def __init__(self, urls: List[str], *args, **kwargs):
        """Store the URLs to crawl.

        Args:
            urls: Page URLs to scrape, visited in list order.
        """
        self.start_urls = urls
        # Index into ``start_urls`` of the page currently being processed.
        self.current_url_idx = 0
        super().__init__(*args, **kwargs)

    def spider_opened(self, spider):
        """Log that the spider has been opened (``spider_opened`` signal)."""
        self.logger.info("Spider opened: %s", spider.name)

    def start_requests(self):
        """Yield the request for the first URL only; the rest are chained."""
        if not self.start_urls:
            # Nothing to crawl; avoid an IndexError on an empty URL list.
            return
        url = self.start_urls[self.current_url_idx]
        yield scrapy.Request(url, callback=self.parse_manga_page)

    def parse_manga_page(self, response):
        """Extract one ``ManbaItem`` from a board page.

        Fields whose XPath matches nothing are left unset on the item.
        """
        # NOTE(review): blocking one-second pause kept from the original;
        # presumably a crude throttle between pages - confirm it is needed.
        time.sleep(1)

        item = ManbaItem()

        # ID: last path segment of the URL (trailing slash ignored).
        item['id'] = response.url.rstrip("/").split("/")[-1]

        # URL
        item['url'] = response.url

        # Title
        title = response.xpath('//h1[@class="board-books-main-title"]/a/text()').get()
        if title:
            item['title'] = title

        # Synopsis
        abstract = response.xpath('//p[@class="summary"]/text()').get()
        if abstract:
            item['abstract'] = abstract.replace('\n', '')

        # Review
        review = response.xpath('//div[@class="board-metadata-favorite-ratings"]/div[@class="text"]/text()').get()
        if review:
            item['review'] = review

        yield item

    def item_scraped_callback(self, item, response, spider):
        """Advance to the next URL after an item was scraped (``item_scraped`` signal)."""
        self.logger.info(f"Scraped item: [{self.start_urls[self.current_url_idx]}]")
        self.current_url_idx += 1
        if self.current_url_idx < len(self.start_urls):
            self.schedule_next_page()

    def schedule_next_page(self):
        """Hand the request for ``start_urls[current_url_idx]`` to the engine."""
        next_url = self.start_urls[self.current_url_idx]
        self.logger.info(f"Scheduling next page: {next_url}")
        # The callback must be parse_manga_page: this spider does not define
        # ``parse``, so using it made every page after the first fail.
        next_request = scrapy.Request(next_url, callback=self.parse_manga_page)
        self.crawler.engine.crawl(next_request)

    def closed(self, response):
        """Quit the module-level Selenium driver when the spider closes.

        NOTE(review): Scrapy passes the close *reason* string to this hook,
        not a response; the parameter name is kept for compatibility.
        """
        driver.quit()
                    

def start_crawl(settings: Dict[str, Any], urls: List[str]):
    """Create a CrawlerProcess from *settings* and run ManbaSpider over *urls*."""
    crawler_process = CrawlerProcess(settings=settings)
    crawler_process.crawl(ManbaSpider, urls)
    crawler_process.start()

In [10]:
# Pages to scrape in this run.
target_urls = ["https://manba.co.jp/boards/19959"]

# Crawler settings; the Selenium-backed middleware defined in this notebook
# is registered as a downloader middleware, and the custom 'exec_yyyymmdd'
# entry is included.
settings: Dict[str, Any] = {
    'DOWNLOAD_DELAY': 3,
    'TELNETCONSOLE_ENABLED': False,
    'REQUEST_FINGERPRINTER_IMPLEMENTATION': "2.7",
    'TWISTED_REACTOR': "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
    'FEED_EXPORT_ENCODING': "utf-8",
    'CONCURRENT_REQUESTS': 3,
    'DOWNLOADER_MIDDLEWARES': {
        '__main__.ManbaScrapySpiderMiddleware': 543,
    },
    'exec_yyyymmdd': '2025-05-01',
}

# Run the crawl in a child process.
# NOTE(review): presumably so the cell can be re-run without restarting the
# kernel (a Twisted reactor cannot be restarted in-process) - confirm.
Process(
    target=start_crawl,
    args=(settings, target_urls),
).start()

2025-05-23 22:43:19 [scrapy.utils.log] INFO: Scrapy 2.11.1 started (bot: scrapybot)
2025-05-23 22:43:19 [scrapy.utils.log] INFO: Versions: lxml 5.2.1.0, libxml2 2.12.6, cssselect 1.2.0, parsel 1.9.1, w3lib 2.1.2, Twisted 24.3.0, Python 3.11.3 (main, May 23 2023, 13:42:38) [GCC 8.3.0], pyOpenSSL 24.1.0 (OpenSSL 3.2.1 30 Jan 2024), cryptography 42.0.5, Platform Linux-5.15.167.4-microsoft-standard-WSL2-x86_64-with-glibc2.28
2025-05-23 22:43:19 [scrapy.addons] INFO: Enabled addons:
[]
2025-05-23 22:43:19 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.asyncioreactor.AsyncioSelectorReactor
2025-05-23 22:43:19 [scrapy.utils.log] DEBUG: Using asyncio event loop: asyncio.unix_events._UnixSelectorEventLoop


2025-05-23 22:43:19 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.logstats.LogStats']
2025-05-23 22:43:19 [scrapy.crawler] INFO: Overridden settings:
{'CONCURRENT_REQUESTS': 3,
 'DOWNLOAD_DELAY': 3,
 'FEED_EXPORT_ENCODING': 'utf-8',
 'REQUEST_FINGERPRINTER_IMPLEMENTATION': '2.7',
 'TELNETCONSOLE_ENABLED': False,
 'TWISTED_REACTOR': 'twisted.internet.asyncioreactor.AsyncioSelectorReactor'}
2025-05-23 22:43:19 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 '__main__.ManbaScrapySpiderMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddle