In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin

# 1. URL of the index page that lists the spell and potion categories
url = "https://www.hp-lexicon.org/spells-and-potions/"

# 2. Request the page and parse its HTML
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops an HTTP error page from being parsed as the index.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')
# ids of the <span> anchors to look up on the page; the ids in the trailing
# comment are currently disabled.
sections = ["spells", "potions"] #, "magical_items_devices", "magical_and_mundane_plants", "miscellaneous_magic"]

In [2]:
# Map each section id to the list of (link text, href) pairs found in the
# first <ul> that follows the section's <span id=...> anchor.
sections_contents = {}
for section in sections:
    sections_contents[section] = []

    # find() returns None when the anchor is missing (e.g. the site layout
    # changed); report it and leave the section empty instead of crashing
    # with an AttributeError on None.
    section_anchor = soup.find('span', id=section)
    spells_section = section_anchor.find_next('ul') if section_anchor is not None else None
    if spells_section is None:
        print(f"Section '{section}' not found on the page; skipping")
        continue

    # href=True skips anchors without an href, which would otherwise raise KeyError.
    for link in spells_section.find_all('a', href=True):
        spell_name = link.text.strip()
        spell_url = link['href']
        sections_contents[section].append((spell_name, spell_url))

In [3]:
# Dump what was collected: a header per section, then one bullet per
# (name, url) tuple, then a blank gap before the next section.
for section in sections_contents:
    print(f"Section: {section}")
    for entry in sections_contents[section]:
        print(f" - {entry}")
    print("\n")

Section: spells
 - ('Spells', 'https://www.hp-lexicon.org/magiccategory/spells/')
 - ('Incantations', 'https://www.hp-lexicon.org/magiccategory/incantations/')
 - ('Magical effects', 'https://www.hp-lexicon.org/magiccategory/effects/')
 - ('Magical Disciplines', 'https://www.hp-lexicon.org/magiccategory/magical-discipline/')


Section: potions
 - ('Potions', 'https://www.hp-lexicon.org/magiccategory/potions/')
 - ('Potion Ingredients', 'https://www.hp-lexicon.org/thing-category/potion-ingredients/')




In [5]:
import re
from tqdm import tqdm

def summary_search(url, timeout=30):
    """Fetch an article page and return its leading paragraphs as one string.

    The text of every <p> inside the first ``div.col-md-8`` is collected in
    document order until an <h2> whose text is one of the stop headings
    below is reached.

    Args:
        url: Address of the article page to download.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The collected paragraph texts joined with newlines, with trailing
        whitespace stripped. An empty string is returned when the page has no
        ``div.col-md-8`` container or no paragraphs before a stop heading.
    """
    stop_headings = ('History and Notes', 'References from the canon', 'Known Animagi and Their Forms')

    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')
    div_data = soup.find('div', class_="col-md-8")
    if div_data is None:
        # find() returns None when the container is absent; report "no summary"
        # instead of raising AttributeError on None.
        return ''

    summary = []
    for element in div_data.find_all():
        if element.name == 'h2' and element.get_text(strip=True) in stop_headings:
            break
        if element.name == 'p':
            summary.append(element.get_text(strip=True))
    return '\n'.join(summary).rstrip()


def letter_search(url, timeout=30):
    """Scrape one listing page and return a record for each article on it.

    For every <article> whose id matches ``post``, the headline is split on
    an em dash: the part before it becomes ``name`` and the part after it,
    if any, becomes ``incantation``. The description is fetched with
    ``summary_search`` from the article's "read more" link, falling back to
    the article's ``itemprop="url"`` link.

    Args:
        url: Address of the listing page to download.
        timeout: Seconds to wait for the listing-page request itself.

    Returns:
        A list of dicts with keys ``name`` and ``description``, plus
        ``incantation`` when the headline contained one. Articles with no
        headline, no usable link, or an empty summary are skipped.
    """
    spells = []
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')
    articles = soup.find_all('article', id=re.compile("post"))

    for article in articles:
        headline = article.find('span', itemprop="headline")
        if headline is None:
            # Without a headline there is no name to record; skip the article
            # rather than abort the whole crawl with an AttributeError.
            continue
        text_parts = headline.get_text(strip=True).split('—')
        charm_name = text_parts[0].strip()
        spell_name = text_parts[1].strip() if len(text_parts) > 1 else ""

        # Prefer the "read more" link in the description span; either lookup
        # may return None, so each step is guarded.
        descr = article.find('span', class_='descr')
        detail_link = descr.find('a', class_='read-more') if descr is not None else None
        if detail_link is None:
            detail_link = article.find('a', itemprop="url")
        if detail_link is None or not detail_link.get('href'):
            continue

        summary = summary_search(detail_link['href'])
        if summary == '':
            continue

        spell_data = {
            "name": charm_name,
            "description": summary
        }
        if spell_name != '':
            spell_data["incantation"] = spell_name
        spells.append(spell_data)
    return spells
    
def inner_search(url, timeout=30):
    """Crawl a category page by following every link in its pagination list.

    Each <a> inside ``ul.pagination`` supplies a label that is appended to
    the category URL as ``?letter=<label>``; every resulting page is scraped
    with ``letter_search`` and the records are concatenated.

    Args:
        url: Address of the category page.
        timeout: Seconds to wait for the category-page request itself.

    Returns:
        A list with the records from all visited pages, in pagination order.
        If the page has no ``ul.pagination`` element, the category page
        itself is scraped instead.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    pagination = soup.find('ul', class_='pagination')
    if pagination is None:
        # No pagination list on this page. NOTE(review): this assumes such a
        # category lists its articles directly on this page — confirm against
        # the site.
        return letter_search(url)

    datas = []
    for page in tqdm(pagination.find_all('a')):
        letter = page.text.strip()
        relative_url = '?letter=' + letter
        full_url = urljoin(url, relative_url)
        datas.extend(letter_search(full_url))
    return datas

In [6]:
import json
import os

# Root directory for the crawl output. Built with os.path.join instead of the
# literal '.\Crawl_data': "\C" is an invalid escape sequence, and the
# backslash is only a path separator on Windows.
path = os.path.join('.', 'Crawl_data')
os.makedirs(path, exist_ok=True)

# Crawl every category link collected in sections_contents and write one JSON
# file per category under <path>/<section>/.
for section, items in sections_contents.items():
    print(f"Section: {section}")
    section_path = os.path.join(path, section)
    os.makedirs(section_path, exist_ok=True)

    for item in items:
        doc_name, link = item
        print(doc_name)
        file_name = doc_name.lower()+'.json'
        file_path = os.path.join(section_path, file_name)
        rlt = inner_search(link)
        with open(file_path, 'w', encoding='utf-8') as json_file:
            # ensure_ascii=False writes non-ASCII characters as-is instead of \u escapes.
            json.dump(rlt, json_file, ensure_ascii=False, indent=4)

Section: potions
Potion Ingredients


  0%|          | 0/22 [00:01<?, ?it/s]


KeyboardInterrupt: 

In [7]:
# Hand-curated category tables. Every entry is a (display name, category URL)
# pair; the lists are grouped into Datas below.
Wizarding_Culture = [
    ("Wizarding culture", "https://www.hp-lexicon.org/thing-category/culture/"),
    ("Books and Literature", "https://www.hp-lexicon.org/thing-category/books/"),
    ("Food and drinks", "https://www.hp-lexicon.org/thing-category/food-and-drinks/"),
    ("Ministry of Magic", "https://www.hp-lexicon.org/thing-category/ministry-of-magic/"),
    ("Broomsticks", "https://www.hp-lexicon.org/thing-category/broomsticks/"),
    ("Businesses", "https://www.hp-lexicon.org/thing-category/businesses/"),
    ("Clothing", "https://www.hp-lexicon.org/thing-category/clothing/"),
    ("Common items", "https://www.hp-lexicon.org/thing-category/common-items/"),
    ("Communication", "https://www.hp-lexicon.org/thing-category/communication/"),
    ("Diseases and healing", "https://www.hp-lexicon.org/thing-category/healing/"),
    ("Furniture and household items", "https://www.hp-lexicon.org/thing-category/household/"),
    ("Occupations", "https://www.hp-lexicon.org/thing-category/occupations/"),
    ("Publications", "https://www.hp-lexicon.org/thing-category/publications/"),
    ("Rules and laws", "https://www.hp-lexicon.org/thing-category/rules-and-laws/"),
    ("Security", "https://www.hp-lexicon.org/thing-category/security/"),
    ("Sweets", "https://www.hp-lexicon.org/thing-category/sweets/"),
    ("Transportation", "https://www.hp-lexicon.org/thing-category/transportation/")]

Language = [# ("Glossary", "https://www.hp-lexicon.org/thing-category/words-and-terms/"), vocabulary... is it needed?
    ("Insults, Curses, and Interjections", "https://www.hp-lexicon.org/thing-category/insults/"),
    ("Languages", "https://www.hp-lexicon.org/thing-category/languages/"),
    ("Symbols", "https://www.hp-lexicon.org/thing-category/symbols/"),
    ("Titles, nicknames, and honorifics", "https://www.hp-lexicon.org/thing-category/titles-nicknames-and-honorifics/")]

Hogwarts_and_Schools = [
    ("Hogwarts academics", "https://www.hp-lexicon.org/thing-category/hogwarts-academics/"), # needed when building the timetable!!
    ("Schools", "https://www.hp-lexicon.org/placetype/schools/")]

Magic = [
    ("Magical artifacts", "https://www.hp-lexicon.org/thing-category/magical-artifacts/"),
    ("Magical objects", "https://www.hp-lexicon.org/thing-category/magical-objects/"),
    ("Dark magic items","https://www.hp-lexicon.org/thing-category/dark/"),
    ("Magical identities", "https://www.hp-lexicon.org/thing-category/magical-identities/"),
    ("Sentient objects", "https://www.hp-lexicon.org/thing-category/sentient-objects/"),
    ("Wandmaking", "https://www.hp-lexicon.org/thing-category/wand-woods/")]
Sports_and_Recreation = [
    ("Sports and competitions", "https://www.hp-lexicon.org/thing-category/sport/"),
    ("Sports teams","https://www.hp-lexicon.org/thing-category/sports-teams/"),
    ("Games, toys, and jokes", "https://www.hp-lexicon.org/thing-category/games-toys-and-jokes/")]

# (group name, category list) pairs covering the five tables above.
Datas = [("Wizarding Culture", Wizarding_Culture), ("Language", Language), ("Hogwarts and Schools", Hogwarts_and_Schools), ("Magic", Magic), ("Sports and Recreation", Sports_and_Recreation)]

# Not part of Datas.
plants = ("Plants", "https://www.hp-lexicon.org/thing-category/plants/") # this one... entries need filtering against potion ingredients (TODO: confirm exact rule — original note: "must filter the ones where potion ingredients is not included").

In [17]:
# Walk every (group name, category list) pair in Datas, crawl each category
# and store the records as <path>/<group name>/<category_name>.json.
for section, items in Datas:
    print(f"Section: {section}")

    # One output directory per group, created on first use.
    section_path = os.path.join(path, section)
    if not os.path.exists(section_path):
        os.makedirs(section_path)

    for raw_name, link in items:
        # Spaces become underscores for both the progress line and the file name.
        doc_name = raw_name.replace(' ', '_')
        print(doc_name)

        file_name = doc_name.lower()+'.json'
        file_path = os.path.join(section_path, file_name)

        rlt = inner_search(link)
        with open(file_path, 'w', encoding='utf-8') as json_file:
            json.dump(rlt, json_file, ensure_ascii=False, indent=4)

Section: Wizarding Culture
Wizarding_culture


100%|██████████| 19/19 [01:52<00:00,  5.93s/it]


Books_and_Literature


100%|██████████| 24/24 [04:59<00:00, 12.46s/it]


Food_and_drinks


100%|██████████| 20/20 [04:51<00:00, 14.56s/it]


Ministry_of_Magic


 18%|█▊        | 3/17 [00:25<02:01,  8.69s/it]