# 웹사이트 데이터 수집

## 메인 리스트 접근

In [1]:
import requests
from bs4 import BeautifulSoup

# Index page that lists every 5e spell on D&D Wiki.
url = 'https://www.dandwiki.com/wiki/5e_All_Spells'

# A timeout keeps the cell from hanging forever if the server stops responding.
response = requests.get(url, timeout=30)

# Anything other than 200 is treated as a hard failure: the following cells
# need `soup`, and merely printing the status code would leave it undefined
# and surface later as an unrelated NameError.
if response.status_code != 200:
    raise RuntimeError(
        f"Unexpected HTTP status {response.status_code} for {url}"
    )

html = response.text
soup = BeautifulSoup(html, 'html.parser')

## 총 데이터 개수 확인

In [19]:
# Collect the anchor of every list item under the parsed index page's
# main content container; each anchor points at one spell page.
links = soup.select(
    '.mw-parser-output > div > ul > li > a'
)

# Notebook display: number of anchors found.
len(links)

3415

## 서브 URL 방문하여 데이터 수집

In [69]:
import time
from tqdm.notebook import tqdm

# Visit every spell page linked from the index and collect one record per page:
#   name        - the link text from the index page
#   table       - the raw <tr> rows matched by '.d20 > tbody > tr' (parsed later)
#   description - text of the leading content paragraphs
data = []

# A single Session reuses the underlying connection across all requests
# instead of opening a new one for each page.
with requests.Session() as session:
    for link in tqdm(links):
        suburl = f"https://www.dandwiki.com{link['href']}"

        # Timeout so one stalled page cannot hang the whole crawl.
        response = session.get(suburl, timeout=30)

        # Use a distinct name so the index-page `soup` (the source of `links`)
        # is not overwritten by the last spell page visited.
        page_soup = BeautifulSoup(response.text, 'html.parser')

        paragraphs = []
        for ele in page_soup.select('.mw-parser-output > p'):
            text = ele.get_text()
            # Stop at the first paragraph that contains an arrow character.
            # NOTE(review): presumably this is the page's navigation/breadcrumb
            # line that follows the description - confirm against a live page.
            if '→' in text:
                break
            paragraphs.append(text.strip())

        data.append({
            'name': link.text,
            'table': page_soup.select('.d20 > tbody > tr'),
            # Paragraphs are concatenated with no separator, exactly as before.
            # NOTE(review): a space or newline separator may be preferable.
            'description': ''.join(paragraphs),
        })

  0%|          | 0/3415 [00:00<?, ?it/s]

# 필요한 데이터: name, range, duration, description 추출

In [79]:
# The table data is sometimes malformed, so range and duration are extracted
# in this separate preprocessing pass.

import pandas as pd

for record in data:
    rows = record['table']

    # Positional lookup: the third row is taken as the range and the last row
    # as the duration. When the table has exactly three rows these are the
    # same row, so 'duration' ends up repeating 'range'.
    # NOTE(review): matching rows by their label text would be more robust -
    # confirm the table layout before changing this.
    try:
        spell_range = rows[2].get_text().strip()
        duration = rows[-1].get_text().strip()
    except IndexError:
        # Fewer than three rows: leave both keys absent from this record.
        continue

    # `record` is the same dict object stored in `data`, so updating it in
    # place is sufficient; no write-back by index is needed.
    record['range'] = spell_range
    record['duration'] = duration
    

In [81]:
# Build the DataFrame from the list of record dicts, one row per record.
df = pd.DataFrame(data=data)

In [82]:
# Remove the 'table' column from df in place.
df.drop(columns='table', inplace=True)

In [84]:
# Notebook display of the DataFrame.
df

Unnamed: 0,name,description,range,duration
0,10cc of Time Stop,As a reaction to another creature taking an ac...,Self,Instantaneous
1,6/8 Time Beatdown,When you cast this spell you choose a number o...,100 feet,"Concentration, up to 5 minutes"
2,617 Pages,You call upon the foundational magics that sep...,30 feet,"Concentration, up to 1 hour"
3,A Comedy of Errors,This spell requires you to spend an action eve...,150 feet,Concentration 10 minutes
4,A Little Help,You reach into the collective consciousness of...,Touch,1 hour
...,...,...,...,...
3410,Warden's Volley,You fire a piece of non-magical ammunition fro...,Self (40-foot-radius),Self (40-foot-radius)
3411,Weakening Sludge,You throw a lump of disgusting energy to a tar...,60 feet,"concentration, 1 minute"
3412,Wumbo Super-Position,At Higher Levels. When you cast this spell usi...,Self,Instantaneous
3413,Manifold Garden,This spell creates a finite plane with limited...,240 feet (see text),Instantaneous


# 저장

In [85]:
from pathlib import Path

# to_csv does not create missing directories, so make sure the target
# folder exists before writing.
output_path = Path('dataset/custom-spells.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)

# The index is still written (pandas default), so the file layout is unchanged.
df.to_csv(output_path)