# In [2]:
# python 3.11.4

from bs4 import BeautifulSoup
import requests
import pandas as pd
pd.set_option('display.max_columns', 10)

# Result lists: one entry per room plan, all appended in the same loop
# iteration so the i-th element of every list describes the same plan.
room_plans = []
room_links = []
plan_urls = []
all_titles = []
all_hashtags = []
all_images = []

# Hashtags (anchor texts starting with '#') keyed by each plan's relative link.
hashtags_by_plan = {}

# Walk the 26 listing pages.
for page in range(1, 27):
    url = f"https://hellointerior.jp/tag/26?page={page}"
    # A timeout keeps one stalled connection from hanging the whole scrape, and
    # raise_for_status() surfaces HTTP errors instead of parsing an error page.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # The room list is read from the first <div class="inner"> on the page.
    inner = soup.find('div', class_='inner')
    if inner is None:
        # find() returns None when nothing matches; report and move on rather
        # than failing with an IndexError on an empty result list.
        print(f"page {page}: no <div class='inner'> found, skipping")
        continue
    lis = inner.find_all("li", class_="popular_room_list")

    for plan in lis:
        # Look the anchor up once; it carries both the plan name and its link.
        anchor = plan.find('a', class_='parse-image-home')
        if anchor is None:
            # No name and no link to record. Skipping before any append keeps
            # all result lists the same length.
            continue

        plan_name = anchor.text.strip()
        plan_link = anchor.get('href')
        full_plan_url = f"https://hellointerior.jp{plan_link}"

        room_plans.append(plan_name)
        room_links.append(plan_link)
        plan_urls.append(full_plan_url)

        # Collect the tag anchors once and feed both outputs from them:
        # all_hashtags keeps every anchor text, hashtags_by_plan only '#' ones.
        # Stripping before the '#' test keeps tags with leading whitespace.
        tag_texts = [
            tag.text.strip()
            for tag in plan.find_all('a', class_='font-family-hiragino-w3')
        ]
        hashtags_by_plan[plan_link] = [text for text in tag_texts if text.startswith('#')]
        all_hashtags.append(tag_texts)

        # Image link (the lazy-load 'data-src' attribute) and title ('alt').
        # Both fall back to a placeholder when the <img> itself is missing, so
        # a card without an image no longer raises AttributeError.
        img_tag = plan.find('img', class_='home-coordinate-image')
        if img_tag is None:
            img_link = None
            title = None
        else:
            img_link = img_tag.get('data-src')  # None when the attribute is absent
            title = img_tag.get('alt')

        all_images.append(img_link)
        all_titles.append(title if title else "不明")  # "不明" marks an unknown title

# Assemble the scraped lists into a single table. The dict's insertion order
# determines the column order.
df = pd.DataFrame(
    {
        "plan番号": room_plans,
        "リンク": room_links,
        "完全なURL": plan_urls,
        "物件タイトル": all_titles,
        # Each plan's tag list is flattened to one ", "-separated string.
        "ハッシュタグ": list(map(", ".join, all_hashtags)),
        "画像リンク": all_images,
    }
)

print(df)

# Write the table without the index column, encoded as UTF-8 with a BOM.
df.to_csv(
    'hellointerior_scraping.csv',
    index=False,
    encoding='utf-8-sig',
)

# Reload the scraped table from disk and keep an untouched copy in
# hashtag_info_0 so the expansion below can be rerun from a clean state.
hashtag_info = pd.read_csv('./hellointerior_scraping.csv')
hashtag_info_0 = hashtag_info.copy()

hashtag_info = hashtag_info_0.copy()
import warnings
# NOTE(review): this silences every warning for the rest of the session. It is
# kept unchanged here; consider narrowing it to a specific warning category.
warnings.filterwarnings('ignore')


def _split_tags(cell):
    """Return the set of tags in one comma-separated cell.

    Surrounding whitespace is stripped from each tag and empty pieces are
    dropped. A non-string cell (pandas reads an empty CSV field as NaN) yields
    an empty set instead of raising.
    """
    if not isinstance(cell, str):
        return set()
    return {piece.strip() for piece in cell.split(',') if piece.strip()}


# One set of tags per row, parsed once and reused for every indicator column.
tag_sets = hashtag_info['ハッシュタグ'].map(_split_tags)

# Every distinct tag that appears anywhere in the table.
features = set().union(*tag_sets)

# Build one 0/1 column per tag using exact membership, so a tag is not counted
# merely because it is a substring of a longer tag. Sorting makes the column
# order reproducible between runs.
indicator_columns = {
    feature: tag_sets.map(lambda tags, feature=feature: 1 if feature in tags else 0)
    for feature in sorted(features)
}

# Attach all indicator columns in one concat instead of inserting them one at a
# time, which avoids fragmenting the frame when there are many tags.
if indicator_columns:
    hashtag_info = pd.concat(
        [hashtag_info, pd.DataFrame(indicator_columns, index=hashtag_info.index)],
        axis=1,
    )

hashtag_info.to_csv('hellointerior_scraping_info.csv', index=False, encoding='utf-8-sig' )

hashtag_info.head()