In [2]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import time
import csv

# Set up the Selenium WebDriver; webdriver_manager supplies the chromedriver binary
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

# Listing page where the crawl starts
base_url = "https://biz-maps.com/s/cities/131041/m-inds/512"

# Number of listing pages whose URLs are collected
max_pages = 2

# Seconds to pause after clicking "next" so the following page can load
page_load_wait = 3

# Collected listing-page URLs, in the order they were visited
urls = []

try:
    # The initial navigation lives inside the try/finally so that the browser
    # process is shut down even if this very first request fails.
    driver.get(base_url)

    # Walk through the listing pages one by one
    for page_index in range(max_pages):
        try:
            # Record the URL of the page currently shown
            current_url = driver.current_url
            urls.append(current_url)

            # Once the last requested page is recorded there is nothing left to
            # collect, so do not navigate further (avoids a pointless click,
            # a wasted wait and a misleading "next page not found" message).
            if page_index == max_pages - 1:
                break

            # Locate the "next page" link and follow it
            next_button = driver.find_element(By.XPATH, "//a[@rel='next']")
            next_button.click()
            time.sleep(page_load_wait)  # give the next page time to load
        except Exception as e:
            # Best effort: stop paginating and keep whatever was collected so far
            print("次のページが見つかりません:", e)
            break
finally:
    # Always close the browser, whatever happened above
    driver.quit()

# Persist the collected listing-page URLs as a single-column CSV file
csv_filename = "urlsds.csv"

# Header row followed by one single-cell row per URL
header_row = ["URL"]
url_rows = [[page_url] for page_url in urls]

with open(csv_filename, mode="w", newline="", encoding="utf-8") as out_file:
    csv_writer = csv.writer(out_file)
    csv_writer.writerows([header_row] + url_rows)

print(f"URLリストを {csv_filename} に保存しました。")

URLリストを urlsds.csv に保存しました。


In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import csv

# Record the start time of this cell
start_time = time.time()

# Input file (one URL per row below a header) and output file names
input_csv = "urlsds.csv"
output_csv = "companykokkoo_data.csv"

# HTTP headers sent with every request
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

# Load the listing-page URLs from the input CSV.
# newline="" is what the csv module expects for files handed to csv.reader.
with open(input_csv, mode="r", newline="", encoding="utf-8") as file:
    reader = csv.reader(file)
    # Skip the header row; the default avoids StopIteration on an empty file
    next(reader, None)
    # Ignore blank lines / empty first cells instead of failing with IndexError
    urls = [row[0] for row in reader if row and row[0].strip()]

# Seconds to wait for a response before giving up on a request.
# requests applies no timeout by default, so without this a stalled
# connection would block the notebook indefinitely.
REQUEST_TIMEOUT = 30

# Site root prepended to relative company links
SITE_ROOT = "https://biz-maps.com"

# External link that is stored as an empty string rather than as a company site
BLANKED_LINK = "https://www.hifcorp.co.jp/"

# Prefixes that identify an absolute web URL
URL_SCHEMES = ("http://", "https://")


def _first_external_link(company_url):
    """Fetch a company detail page and return the value to store as its website.

    Args:
        company_url: Absolute URL of the company's detail page.

    Returns:
        str: one of
          - the first absolute link on the page that does not contain
            "biz-maps.com",
          - "" when that first link equals BLANKED_LINK,
          - "None" when the page has no such link,
          - "Failed to fetch company page" when the response status is not 200,
          - "Error: <exception>" when the request or the parsing raises.
    """
    try:
        company_response = requests.get(
            company_url, headers=headers, timeout=REQUEST_TIMEOUT
        )
        if company_response.status_code != 200:
            return "Failed to fetch company page"

        company_soup = BeautifulSoup(company_response.content, "html.parser")
        for anchor in company_soup.find_all("a", href=True):
            href = anchor.get("href")
            # Require a real URL scheme at the start: a plain substring test
            # for "http" would also accept things like javascript: handlers or
            # relative redirect paths that merely contain a URL.
            if href.startswith(URL_SCHEMES) and "biz-maps.com" not in href:
                return "" if href == BLANKED_LINK else href
        return "None"
    except Exception as e:
        # Best effort: keep the row and record the failure in place of the link
        return f"Error: {e}"


# Accumulators across all listing pages
all_company_names = []
all_company_urls = []
all_extracted_links = []

# Visit every listing page and extract its companies
for url in urls:
    print(f"Processing URL: {url}")
    try:
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
        if response.status_code != 200:
            print(f"Failed to fetch {url}: {response.status_code}")
            continue

        soup = BeautifulSoup(response.content, "html.parser")

        # Per-page staging lists; merged into the accumulators only after the
        # whole page has been processed without an exception
        company_names = []
        company_urls = []
        extracted_links = []

        for company in soup.find_all("div", class_="results__name"):
            name = company.text.strip()

            # The company detail link is the <a> element enclosing the name
            parent = company.find_parent("a")
            company_url = parent.get("href") if parent else None
            if company_url and not company_url.startswith(URL_SCHEMES):
                company_url = SITE_ROOT + company_url

            # Look up the company's own website on its detail page
            link_text = _first_external_link(company_url) if company_url else "None"

            company_names.append(name)
            company_urls.append(company_url)
            extracted_links.append(link_text)

            time.sleep(2)  # delay between companies to reduce server load

        # Merge this page's results into the overall lists
        all_company_names.extend(company_names)
        all_company_urls.extend(company_urls)
        all_extracted_links.extend(extracted_links)
    except Exception as e:
        print(f"Error processing {url}: {e}")

# Assemble the result table and write it out as CSV.
# Note: the "企業URL" column is filled from the extracted external links.
column_names = ("企業名", "企業URL")
column_values = (all_company_names, all_extracted_links)
data = dict(zip(column_names, column_values))
df = pd.DataFrame.from_dict(data)
df.to_csv(path_or_buf=output_csv, encoding="utf-8-sig", index=False)

# Take the end timestamp and report how long the run took
end_time = time.time()
elapsed_time = end_time - start_time
print(
    f"Data saved to {output_csv}",
    f"Total processing time: {elapsed_time:.2f} seconds",
    sep="\n",
)

Processing URL: https://biz-maps.com/s/cities/131041/m-inds/512
Processing URL: https://biz-maps.com/s/cities/131041/m-inds/512?city%5B0%5D=131041&mid_industry%5B0%5D=512&ph=%25242y%252405%2524%252FmDYcz.znctbzLmKNi0ZaetolXAG6EzOcsglkD6dETs3QNt.JD7oy&page=2
Data saved to companykokkoo_data.csv
Total processing time: 110.13 seconds
