### BeautifulSoup 활용법

In [1]:
from bs4 import BeautifulSoup

# Sample document used by the selector examples in this notebook: three
# <p class="name"> fruit entries inside an <h1>, each holding price / inventory /
# store <span> elements and one <a> link.
# NOTE(review): the title attributes are wrapped in typographic quotes (‘…’)
# instead of ASCII quotes, so the quote characters end up inside the parsed
# attribute value (e.g. title="‘Orange’").
html = '''
<html> 
    <head>Test HTML</head> 
    <body> 
        <h1> Market  
            <p id='fruits1' class='name' title=‘Banana’> Banana 
                <span class = 'price'> 3000 </span> 
                <span class = 'inventory'> 500 </span> 
                <span class = 'store'> CU </span> 
                <a href = 'http://test1'> url1 </a> 
            </p> 
            <p id='fruits2' class='name' title=‘Orange’> Orange 
                <span class = 'price'> 2000 </span> 
                <span class = 'inventory'> 100 </span> 
                <span class = 'store'> GS </span> 
                <a href = 'http://test2'> url2 </a> 
            </p> 
            <p id='fruits3' class='name' title=‘Pineapple’> Pineapple 
                <span class = 'price'> 5000 </span> 
                <span class = 'inventory'> 10 </span> 
                <span class = 'store'> CU </span> 
                <a href = 'http://test1'> url1 </a> 
            </p> 
        </h1> 
    </body> 
</html>
'''

# Parse the sample document with the 'html.parser' backend.
soup = BeautifulSoup(html, 'html.parser')  

In [2]:
# Tag selector: select() returns a list of every <span> element in the document.
soup.select('span')

[<span class="price"> 3000 </span>,
 <span class="inventory"> 500 </span>,
 <span class="store"> CU </span>,
 <span class="price"> 2000 </span>,
 <span class="inventory"> 100 </span>,
 <span class="store"> GS </span>,
 <span class="price"> 5000 </span>,
 <span class="inventory"> 10 </span>,
 <span class="store"> CU </span>]

In [3]:
# Class selector: a leading '.' matches elements by class name.
soup.select('.price')

[<span class="price"> 3000 </span>,
 <span class="price"> 2000 </span>,
 <span class="price"> 5000 </span>]

In [4]:
# Id selector: a leading '#' matches the element with that id.
soup.select('#fruits2')

[<p class="name" id="fruits2" title="‘Orange’"> Orange 
                 <span class="price"> 2000 </span>
 <span class="inventory"> 100 </span>
 <span class="store"> GS </span>
 <a href="http://test2"> url2 </a>
 </p>]

In [5]:
# Child combinator: span.price elements that are direct children of a p.name.
soup.select('p.name > span.price')  

[<span class="price"> 3000 </span>,
 <span class="price"> 2000 </span>,
 <span class="price"> 5000 </span>]

In [6]:
# Text of the first match. This is not the cell's last expression, so the
# notebook does not display its value.
soup.select('p > span.price')[0].text  

# Text of every match, printed one per line (surrounding spaces are kept).
prices = soup.select('p > span.price')
for price in prices:        
    print(price.text)   

 3000 
 2000 
 5000 


In [7]:
# href attribute of the first <a> tag. This is not the cell's last expression,
# so the notebook does not display its value.
soup.select('a')[0].attrs['href'] 

# href attribute of every <a> tag, printed one per line.
urls = soup.select('a')
for url in urls:        
    print(url.attrs['href']) 

http://test1
http://test2
http://test1


In [8]:
# find() returns only the first element matching the tag name and attribute
# filter (here: the first <p> whose class is "name"); prettify() renders it with
# one tag / text node per line.
soup2 = soup.find('p', {'class' : 'name'})
print(soup2.prettify())

<p class="name" id="fruits1" title="‘Banana’">
 Banana
 <span class="price">
  3000
 </span>
 <span class="inventory">
  500
 </span>
 <span class="store">
  CU
 </span>
 <a href="http://test1">
  url1
 </a>
</p>



In [9]:
# find_all() returns every match: gather the text of each price <span>
# under the 'Text' key.
price_spans = soup.find_all('span', {'class' : 'price'})
lst = {'Text': [span.getText() for span in price_spans]}

lst

{'Text': [' 3000 ', ' 2000 ', ' 5000 ']}

In [10]:
# Attribute values via find_all(): print the href of every <a> tag.
for el in soup.find_all('a'):
    print(el.attrs['href'])

http://test1
http://test2
http://test1


### 다음 영화 리뷰 수집

In [11]:
from bs4 import BeautifulSoup
import urllib.request
import urllib.parse
import pandas as pd
from tqdm import *

url = 'https://movie.daum.net/moviedb/grade?movieId=93252&type=netizen'

# Download the review page; the parse only needs the bytes already read,
# so it happens after the connection is closed.
with urllib.request.urlopen(url) as response:
    html = response.read()
soup = BeautifulSoup(html, 'html.parser')

# Ratings come from <em class="emph_grade">, review texts from
# <p class="desc_review">; the two lists are paired by position.
rating_tags = soup.find_all('em', {'class' : 'emph_grade'})
review_tags = soup.find_all('p', {'class' : 'desc_review'})

lst = {
    'Rating': [tag.getText() for tag in rating_tags],
    'Text': [tag.getText() for tag in review_tags],
}

df = pd.DataFrame(lst)

In [12]:
# Display the ratings and review texts scraped from the page.
df

Unnamed: 0,Rating,Text
0,10,\n ...
1,10,\n ...
2,7,\n
3,0,\n
4,10,\n ...
5,10,\n ...
6,10,\n ...
7,10,\n ...
8,10,\n ...
9,10,\n ...


In [13]:
# Raw text of the first review: the newline and indentation whitespace around
# the sentence are kept as part of the scraped string.
df.Text[0]

' \n                                            마무리까지 완벽했다 그 전 인피니티워의 감동과 대적할만한 내 인생 히어로물 희대의 명작 중 하나!!\n                                        '

- 모든 페이지 리뷰 및 평점 수집

In [1]:
from bs4 import BeautifulSoup
import urllib.request
import urllib.parse
import pandas as pd
from tqdm import *

url = 'https://movie.daum.net/moviedb/grade?movieId=93252&type=netizen'

# Collect every page into plain lists and build the DataFrame once at the end.
# Growing a DataFrame with pd.concat inside the loop copies all previously
# collected rows again on every page.
ratings = []
texts = []

# Pages 1..453 inclusive; the page count is hard-coded for this movie.
for i in tqdm(range(1,454)):

    url_page = url + '&page=' + str(i)

    with urllib.request.urlopen(url_page) as response:
        html = response.read()
    soup = BeautifulSoup(html, 'html.parser')

    page_ratings = [el.getText() for el in soup.find_all('em', {'class' : 'emph_grade'})]
    page_texts = [el.getText() for el in soup.find_all('p', {'class' : 'desc_review'})]

    # Ratings and texts are paired by position, so a page where the two counts
    # differ must fail loudly instead of misaligning the rows.
    if len(page_ratings) != len(page_texts):
        raise ValueError(
            'page %d: found %d ratings but %d review texts'
            % (i, len(page_ratings), len(page_texts))
        )

    ratings.extend(page_ratings)
    texts.extend(page_texts)

df = pd.DataFrame({'Rating': ratings, 'Text': texts})

df.to_csv('data/endgame.csv', index=False)

100%|██████████| 453/453 [01:43<00:00,  4.39it/s]


- 맨 마지막 페이지 번호를 파악하기 어려운 경우
- 맨 마지막 페이지에는 10개 미만(0~9개)의 리뷰가 있을 것임.
- 이에 착안하여 range에서는 무척 큰 숫자까지 for loop를 돌게 하고, 한 페이지에서 수집된 리뷰가 10개 미만이면 for loop를 멈추는 코드 작성

In [5]:
url = 'https://movie.daum.net/moviedb/grade?movieId=93252&type=netizen'

# A full page holds 10 reviews; a page with fewer is treated as the last one.
reviews_per_page = 10

# Collect every page into plain lists and build the DataFrame once at the end.
# Growing a DataFrame with pd.concat inside the loop copies all previously
# collected rows again on every page.
ratings = []
texts = []

# The upper bound is only a safety cap; the loop normally ends at the break below.
for i in range(1,1000000):

    url_page = url + '&page=' + str(i)

    with urllib.request.urlopen(url_page) as response:
        html = response.read()
    soup = BeautifulSoup(html, 'html.parser')

    page_ratings = [el.getText() for el in soup.find_all('em', {'class' : 'emph_grade'})]
    page_texts = [el.getText() for el in soup.find_all('p', {'class' : 'desc_review'})]

    # Ratings and texts are paired by position, so a page where the two counts
    # differ must fail loudly instead of misaligning the rows.
    if len(page_ratings) != len(page_texts):
        raise ValueError(
            'page %d: found %d ratings but %d review texts'
            % (i, len(page_ratings), len(page_texts))
        )

    # The short (possibly empty) last page is still stored before stopping.
    ratings.extend(page_ratings)
    texts.extend(page_texts)

    if len(page_texts) < reviews_per_page:
        break

df = pd.DataFrame({'Rating': ratings, 'Text': texts})

df.to_csv('data/endgame.csv', index=False)

### 연습문제 

# New York 한 호텔의 리뷰 수집
# Trump International Hotel and Tower New York
#https://www.tripadvisor.com/Hotel_Review-g60763-d93623-Reviews-Trump_International_Hotel_and_Tower_New_York-New_York_City_New_York.html

In [1]:
from bs4 import BeautifulSoup
import urllib.request
import urllib.parse
import pandas as pd
from tqdm import *

In [2]:
# URL of the hotel whose reviews are collected
original_url = "https://www.tripadvisor.com/Hotel_Review-g60763-d93623-Reviews-Trump_International_Hotel_and_Tower_New_York-New_York_City_New_York.html"

# Fetch the first review page to find the number of the last page
with urllib.request.urlopen(original_url) as response:
    html = response.read()
soup = BeautifulSoup(html, 'html.parser')

pages = soup.select('div.pageNumbers')
if not pages:
    # Without this guard pageString would be unbound and fail with a NameError.
    raise ValueError('no div.pageNumbers element found; cannot determine the last page')

# When several pager elements match, the text of the last one is used.
pageString = pages[-1].text

# The pager text is split on the ellipsis; the part after it is read as the
# last page number.
last = pageString.split('…')
if len(last) < 2:
    raise ValueError('unexpected pager text %r; expected an ellipsis before the last page' % pageString)

# Stored as an int so it can be passed straight to range().
last_page = int(last[1])

In [3]:
lst = {'Hotel':[], 'Date':[], 'Stars':[], 'Title':[], 'Text':[]}

# Split the review URL once, outside the loop: the pieces are the same for every page.
#   url1 = everything up to and including 'Reviews-'
#   url3 = the hotel/location slug (also stored in the 'Hotel' column)
#   url4 = the file extension, including the leading '.'
url_parts = original_url.split('Reviews-')
url1 = url_parts[0] + 'Reviews-'
slug_parts = url_parts[1].split('.')
url3 = slug_parts[0]
url4 = '.' + slug_parts[1]

n_pages = 3      # replace with int(last_page) to collect every review page
offset_step = 5  # increment of the 'or<offset>-' URL segment between consecutive pages

for i in tqdm(range(n_pages)):

    # The first page uses the plain URL; later pages insert 'or<offset>-' after 'Reviews-'.
    if i == 0:
        url = url1 + url3 + url4
    else:
        url = url1 + 'or' + str(i * offset_step) + '-' + url3 + url4

    with urllib.request.urlopen(url) as response:
        html = response.read()
    soup = BeautifulSoup(html, 'html.parser')

    # One 'Hotel' entry is added per review title so both columns stay the same length.
    for el_title in soup.select('div.glasR4aX'):
        lst['Hotel'].append(url3)
        lst['Title'].append(el_title.getText())

    # Keep only the part of the text that follows the word 'review'.
    for el_date in soup.select('div._2fxQ4TOx > span'):
        lst['Date'].append(el_date.getText().split('review')[1])

    # The second CSS class ends in '_<number>'; that number divided by 10 is the star rating.
    for el_stars in soup.select('div.nf9vGX55 > span'):
        lst['Stars'].append(int(el_stars.attrs['class'][1].split('_')[1])/10)

    for el in soup.select('q.IRsGHoPm'):
        lst['Text'].append(el.getText())

df = pd.DataFrame(lst)

100%|██████████| 3/3 [00:06<00:00,  2.02s/it]


In [4]:
# Display the collected hotel reviews.
df

Unnamed: 0,Hotel,Date,Stars,Title,Text
0,Trump_International_Hotel_and_Tower_New_York-N...,Sep 2020,2.0,Smelly,Musty mildew smell in all rooms. Got switched ...
1,Trump_International_Hotel_and_Tower_New_York-N...,Feb 2020,5.0,Delightful stay,Truly enjoyed our stay at this wonderful hotel...
2,Trump_International_Hotel_and_Tower_New_York-N...,Sep 2020,3.0,This is a five-star hotel without any five-sta...,Both the check-in and check-out service was di...
3,Trump_International_Hotel_and_Tower_New_York-N...,Aug 2020,5.0,Hope to come back soon!,I cannot find anything or anybody to find faul...
4,Trump_International_Hotel_and_Tower_New_York-N...,Mar 2020,5.0,A Wonderful place to Stay for a short break,"This place was exceptional, from the moment we..."
5,Trump_International_Hotel_and_Tower_New_York-N...,Mar 2020,3.0,Service was terrible,The property was very nice but the view of cen...
6,Trump_International_Hotel_and_Tower_New_York-N...,Mar 2020,2.0,staff need training...lots of it,"the entire staff is a bit rude, always in a ba..."
7,Trump_International_Hotel_and_Tower_New_York-N...,Mar 2020,5.0,Wow. Wasn’t expecting that level of service...,I’m a Marriott guy. Have been for 15 years of ...
8,Trump_International_Hotel_and_Tower_New_York-N...,Feb 2020,5.0,Luxurious,We loved our recent stay. The staff was awesom...
9,Trump_International_Hotel_and_Tower_New_York-N...,Feb 2020,5.0,Fabulous!,Fabulous hotel at a fabulous location with fab...


In [5]:
# Save the collected hotel reviews as CSV.
# NOTE(review): unlike the endgame.csv exports, index=False is not passed here,
# so the row index is written as an extra first column - confirm this is intended.
df.to_csv('data/Trump_Hotel_NY.csv')