In [6]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import urllib
import time

In [43]:
import time
import requests
from bs4 import BeautifulSoup
import sqlite3

def _first_two_div_texts(cell):
    """Return the stripped text of the first two <div>s in *cell*, or (None, None) if there are fewer than two."""
    divs = cell.find_all('div')
    if len(divs) >= 2:
        return divs[0].get_text(strip=True), divs[1].get_text(strip=True)
    return None, None


def _extract_accesses(item):
    """Return the access lines of one building block as a list of strings."""
    access_div = item.find('div', class_='cassetteitem_detail-text')
    if not access_div:
        return []
    raw_access_lines = access_div.find_all('li')
    if raw_access_lines:
        return [li.get_text(strip=True) for li in raw_access_lines]
    # No <li> children: fall back to one entry per text line of the container.
    return access_div.get_text('\n', strip=True).split('\n')


def scrape_suumo_all_pages(max_page=3, wait_seconds=1, timeout=30):
    """
    Request SUUMO search-result pages 1..max_page in order and collect room records.

    Each page is parsed for ``div.cassetteitem`` blocks (one per building);
    every ``tr.js-cassette_link`` row inside a block's ``cassetteitem_other``
    table yields one record.

    NOTE(review): the run recorded alongside this code reported
    "Total items: 0" for this URL, so the ``cassetteitem`` selectors may not
    match this result layout -- confirm against the live HTML.

    Args:
        max_page: Last page number to request (inclusive). The default is a
            small demo value; the original author mentions 2194 as a
            realistic page count.
        wait_seconds: Seconds to sleep before each request, to limit load on
            the site.
        timeout: Seconds ``requests`` waits for the server before giving up,
            so a stalled connection cannot hang the scrape indefinitely.

    Returns:
        A list of dicts with the keys ``building_name``, ``rent``, ``area``,
        ``direction``, ``building_type``, ``building_age``, ``accesses``
        (comma-joined string) and ``page``. Fields that cannot be found in
        the HTML are ``None``.

    Raises:
        requests.HTTPError: if a page responds with an error status
            (via ``raise_for_status``). Other ``requests`` exceptions, such as
            timeouts, propagate unchanged.
    """
    base_url = (
        "https://suumo.jp/jj/chintai/ichiran/FR301FC005/"
        "?fw2=&mt=9999999&cn=9999999&ta=12&et=9999999"
        "&sc=12101&sc=12102&sc=12103&sc=12104&sc=12105&sc=12106"
        "&shkr1=03&ar=030&bs=040&ct=9999999&shkr3=03"
        "&shkr2=03&mb=0&shkr4=03&cb=0.0"
    )

    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
            "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.5615.49 Safari/537.36"
        )
    }

    all_results = []

    for page in range(1, max_page + 1):
        url = f"{base_url}&page={page}"
        print(f"[Info] Requesting page {page} / {max_page} : {url}")
        time.sleep(wait_seconds)

        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        for item in soup.find_all('div', class_='cassetteitem'):
            title_elem = item.find('div', class_='cassetteitem_content-title')
            building_name = title_elem.get_text(strip=True) if title_elem else None

            table = item.find('table', class_='cassetteitem_other')
            if not table:
                continue

            # Access info belongs to the building, not the room, so compute it
            # once per building instead of once per room row.
            accesses = ", ".join(_extract_accesses(item))

            for row in table.find_all('tr', class_='js-cassette_link'):
                rent_elem = row.find('span', class_='cassetteitem_other-emphasis')
                rent = rent_elem.get_text(strip=True) if rent_elem else None

                details = row.find_all('td', class_='detailbox-property-col')
                if len(details) >= 2:
                    area, direction = _first_two_div_texts(details[0])
                    building_type, building_age = _first_two_div_texts(details[1])
                else:
                    area = direction = building_type = building_age = None

                all_results.append({
                    "building_name": building_name,
                    "rent": rent,
                    "area": area,
                    "direction": direction,
                    "building_type": building_type,
                    "building_age": building_age,
                    "accesses": accesses,  # comma-separated; the storage format is arbitrary
                    "page": page,
                })

    return all_results

def insert_data_to_db(data_list, db_name='suumo_data.db'):
    """
    Bulk-insert scraped listing records into the ``suumo_listings`` SQLite table.

    The table is created first if it does not exist. All rows are inserted
    with a single ``executemany`` call and committed once at the end. The
    connection is closed on every path, including errors; on an error nothing
    is committed.

    Args:
        data_list: Iterable of dicts, each containing the keys
            ``building_name``, ``rent``, ``area``, ``direction``,
            ``building_type``, ``building_age``, ``accesses`` and ``page``.
        db_name: Path of the SQLite database file (created if missing).

    Raises:
        KeyError: if a record lacks one of the required keys.
        sqlite3.Error: if a statement fails.
    """
    columns = (
        "building_name",
        "rent",
        "area",
        "direction",
        "building_type",
        "building_age",
        "accesses",
        "page",
    )

    # Build the rows before opening the database so that a malformed record
    # fails fast, without a connection being held open.
    records = [tuple(item[column] for column in columns) for item in data_list]

    # IF NOT EXISTS makes repeated runs against the same file safe.
    create_table_sql = """
    CREATE TABLE IF NOT EXISTS suumo_listings(
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        building_name TEXT,
        rent TEXT,
        area TEXT,
        direction TEXT,
        building_type TEXT,
        building_age TEXT,
        accesses TEXT,
        page INTEGER
    )
    """

    # Values are bound through placeholders, never formatted into the SQL.
    insert_sql = """
    INSERT INTO suumo_listings
    (building_name, rent, area, direction, building_type, building_age, accesses, page)
    VALUES (?, ?, ?, ?, ?, ?, ?, ?)
    """

    # Connecting creates the database file if it does not exist.
    conn = sqlite3.connect(db_name)
    try:
        cursor = conn.cursor()
        cursor.execute(create_table_sql)
        cursor.executemany(insert_sql, records)
        conn.commit()
    finally:
        # Always release the file handle; closing without a commit leaves
        # any partially inserted rows unsaved.
        conn.close()

def main():
    """Scrape the listing pages, then store every scraped record in SQLite."""
    print("[Info] Starting scraping...")
    scraped = scrape_suumo_all_pages()
    item_count = len(scraped)
    print(f"[Info] Scraping finished. Total items: {item_count}")

    # Persist all records to the database file in one call.
    print("[Info] Inserting data into SQLite DB...")
    insert_data_to_db(scraped, db_name='suumo_data.db')
    print("[Info] Done.")

# Run the scrape-and-store pipeline only when this code is the entry point
# (``__name__`` is "__main__"), not when it is imported.
if __name__ == "__main__":
    main()

[Info] Starting scraping...
[Info] Requesting page 1 / 3 : https://suumo.jp/jj/chintai/ichiran/FR301FC005/?fw2=&mt=9999999&cn=9999999&ta=12&et=9999999&sc=12101&sc=12102&sc=12103&sc=12104&sc=12105&sc=12106&shkr1=03&ar=030&bs=040&ct=9999999&shkr3=03&shkr2=03&mb=0&shkr4=03&cb=0.0&page=1
[Info] Requesting page 2 / 3 : https://suumo.jp/jj/chintai/ichiran/FR301FC005/?fw2=&mt=9999999&cn=9999999&ta=12&et=9999999&sc=12101&sc=12102&sc=12103&sc=12104&sc=12105&sc=12106&shkr1=03&ar=030&bs=040&ct=9999999&shkr3=03&shkr2=03&mb=0&shkr4=03&cb=0.0&page=2
[Info] Requesting page 3 / 3 : https://suumo.jp/jj/chintai/ichiran/FR301FC005/?fw2=&mt=9999999&cn=9999999&ta=12&et=9999999&sc=12101&sc=12102&sc=12103&sc=12104&sc=12105&sc=12106&shkr1=03&ar=030&bs=040&ct=9999999&shkr3=03&shkr2=03&mb=0&shkr4=03&cb=0.0&page=3
[Info] Scraping finished. Total items: 0
[Info] Inserting data into SQLite DB...
[Info] Done.


In [52]:
import requests
from bs4 import BeautifulSoup
import time
import sqlite3

DB_NAME = "suumo_data.db"  # SQLite database file name
TABLE_NAME = "suumo_listings"

def init_db(db_name=DB_NAME, table_name=TABLE_NAME):
    """
    Open the SQLite database and make sure the listings table exists.

    Args:
        db_name: Path of the SQLite database file (created if missing).
        table_name: Name of the table to create if absent. It is interpolated
            directly into the SQL text (identifiers cannot be bound as
            parameters), so it must come from trusted code, not user input.

    Returns:
        An open ``sqlite3.Connection``. The caller is responsible for
        closing it.

    Raises:
        sqlite3.Error: if the table cannot be created. The connection is
            closed before the error is re-raised.
    """
    conn = sqlite3.connect(db_name)
    create_table_sql = f"""
    CREATE TABLE IF NOT EXISTS {table_name} (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        building_name TEXT,
        rent TEXT,
        area TEXT,
        direction TEXT,
        building_type TEXT,
        building_age TEXT,
        accesses TEXT
    );
    """
    try:
        cur = conn.cursor()
        cur.execute(create_table_sql)
        conn.commit()
    except Exception:
        # The caller never receives the connection on failure, so it must be
        # released here or it would leak.
        conn.close()
        raise
    return conn


def insert_listing(conn, table_name, listing):
    """
    Insert a single listing record into ``table_name`` and commit immediately.

    ``listing`` is a dict; any of the seven expected keys that is absent is
    stored as NULL because values are read with ``dict.get``.
    """
    field_names = (
        "building_name",
        "rent",
        "area",
        "direction",
        "building_type",
        "building_age",
        "accesses",
    )
    # One bound value per placeholder, in column order.
    row = tuple(listing.get(field) for field in field_names)

    statement = f"""
    INSERT INTO {table_name} 
    (building_name, rent, area, direction, building_type, building_age, accesses)
    VALUES (?, ?, ?, ?, ?, ?, ?)
    """

    cursor = conn.cursor()
    cursor.execute(statement, row)
    conn.commit()


def _parse_property_element(prop):
    """Extract one listing record (dict) from a ``div.property-body-element`` tag."""
    # Building name: look for the nearest preceding title heading first, then
    # for one inside the element itself.
    title_tag = prop.find_previous("h2", class_="property_inner-title")
    if not title_tag:
        title_tag = prop.find("h2", class_="property_inner-title")
    building_name = title_tag.get_text(strip=True) if title_tag else None

    # Rent
    rent = None
    rent_col = prop.find("td", class_="detailbox-property-col detailbox-property--col1")
    if rent_col:
        rent_div = rent_col.find("div", class_="detailbox-property-point")
        rent = rent_div.get_text(strip=True) if rent_div else None

    # Floor area, direction, building type and age
    area = direction = building_type = building_age = None
    col3s = prop.find_all("td", class_="detailbox-property-col detailbox-property--col3")
    if len(col3s) >= 2:
        # Per the original author's notes: [layout, floor area, direction], etc.
        first_blocks = col3s[0].find_all("div")
        if len(first_blocks) >= 3:
            area = first_blocks[1].get_text(strip=True).replace("\n", "")
            direction = first_blocks[2].get_text(strip=True)

        # Per the original author's notes: [building type, building age], etc.
        second_blocks = col3s[1].find_all("div")
        if len(second_blocks) >= 2:
            building_type = second_blocks[0].get_text(strip=True)
            building_age = second_blocks[1].get_text(strip=True)

    # Access info: keep every non-empty <div> text in the note box except the
    # "見学予約可" / "新着" badge texts.
    access_list = []
    note_box = prop.find("div", class_="detailnote-box")
    if note_box:
        texts = (div_el.get_text(strip=True) for div_el in note_box.find_all("div"))
        access_list = [
            text
            for text in texts
            if text and "見学予約可" not in text and "新着" not in text
        ]

    return {
        "building_name": building_name,
        "rent": rent,
        "area": area,
        "direction": direction,
        "building_type": building_type,
        "building_age": building_age,
        "accesses": "・".join(access_list),
    }


def scrape_suumo_page(url, timeout=30):
    """
    Fetch one search-result page and return the listings found on it.

    Args:
        url: Full URL of the page to fetch.
        timeout: Seconds ``requests`` waits for the server before giving up,
            so a stalled connection cannot hang the scrape indefinitely.

    Returns:
        A list of dicts with the keys ``building_name``, ``rent``, ``area``,
        ``direction``, ``building_type``, ``building_age`` and ``accesses``
        ("・"-joined string); fields not found in the HTML are ``None``.
        An empty list is returned both when the page contains no
        ``property-body-element`` blocks and when the response status is not
        200 (an error line is printed in that case), so callers cannot tell
        those two situations apart from the return value alone.

    Raises:
        requests.RequestException: connection errors and timeouts propagate
            to the caller.
    """
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/115.0.0.0 Safari/537.36"
        )
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        print(f"Error: Status code {response.status_code} for {url}")
        return []

    soup = BeautifulSoup(response.text, "html.parser")
    property_elements = soup.find_all("div", class_="property-body-element")
    return [_parse_property_element(prop) for prop in property_elements]


def main():
    """
    Scrape the search-result pages one by one and store each listing in SQLite.

    Stops at the first page that yields no listings, or after the last
    configured page. The database connection is closed even if scraping or
    insertion raises partway through.
    """
    # --------- 1. Initialise the DB (connect and create the table) ----------
    conn = init_db()

    # --------- 2. Search URL ----------
    base_url = (
        "https://suumo.jp/jj/chintai/ichiran/FR301FC005/"
        "?fw2=&mt=9999999&cn=9999999&ta=12&et=9999999"
        "&sc=12101&sc=12102&sc=12103&sc=12104&sc=12105&sc=12106"
        "&shkr1=03&ar=030&bs=040&ct=9999999&shkr3=03&shkr2=03"
        "&srch_navi=1&mb=0&shkr4=03&cb=0.0"
    )

    # Upper bound on pages to fetch (the original author notes that the real
    # page count should be verified).
    last_page = 2194

    total_inserted = 0
    try:
        for page_num in range(1, last_page + 1):
            # Page 1 uses the bare URL; later pages append "&page={page_num}".
            url = base_url if page_num == 1 else base_url + f"&page={page_num}"

            print(f"[Info] Scraping page {page_num}: {url}")
            listings = scrape_suumo_page(url)
            print(f"[Info] -> found {len(listings)} listings.")

            # An empty result is treated as "no more pages".
            if not listings:
                print("[Info] No data on this page. Possibly last page reached.")
                break

            for listing in listings:
                insert_listing(conn, TABLE_NAME, listing)
                total_inserted += 1

            # Pause between pages to avoid hammering the site (a longer wait
            # is safer).
            time.sleep(5)

        print(f"[Info] Done. Total {total_inserted} records inserted.")
    finally:
        # Release the database handle on every exit path.
        conn.close()


# Start the page-by-page scrape only when this code is the entry point
# (``__name__`` is "__main__"), not when it is imported.
if __name__ == "__main__":
    main()

[Info] Scraping page 1: https://suumo.jp/jj/chintai/ichiran/FR301FC005/?fw2=&mt=9999999&cn=9999999&ta=12&et=9999999&sc=12101&sc=12102&sc=12103&sc=12104&sc=12105&sc=12106&shkr1=03&ar=030&bs=040&ct=9999999&shkr3=03&shkr2=03&srch_navi=1&mb=0&shkr4=03&cb=0.0
[Info] -> found 30 listings.
[Info] Scraping page 2: https://suumo.jp/jj/chintai/ichiran/FR301FC005/?fw2=&mt=9999999&cn=9999999&ta=12&et=9999999&sc=12101&sc=12102&sc=12103&sc=12104&sc=12105&sc=12106&shkr1=03&ar=030&bs=040&ct=9999999&shkr3=03&shkr2=03&srch_navi=1&mb=0&shkr4=03&cb=0.0&page=2
[Info] -> found 30 listings.
[Info] Scraping page 3: https://suumo.jp/jj/chintai/ichiran/FR301FC005/?fw2=&mt=9999999&cn=9999999&ta=12&et=9999999&sc=12101&sc=12102&sc=12103&sc=12104&sc=12105&sc=12106&shkr1=03&ar=030&bs=040&ct=9999999&shkr3=03&shkr2=03&srch_navi=1&mb=0&shkr4=03&cb=0.0&page=3
[Info] -> found 30 listings.
[Info] Scraping page 4: https://suumo.jp/jj/chintai/ichiran/FR301FC005/?fw2=&mt=9999999&cn=9999999&ta=12&et=9999999&sc=12101&sc=12102&