# 03. Web Data

## 1. BeautifulSoup for web data

---

## BeautifulSoup Basic
- install
    - conda install -c anaconda beautifulsoup4
    - pip install beautifulsoup4
- data
    - 03.test_first.html

- Docs 
    - https://www.crummy.com/software/BeautifulSoup/bs4/doc/#

In [11]:
# 모듈 import
from bs4 import BeautifulSoup

In [14]:
page = open("../data/03. test_first.html", "r").read()
page

'<!doctype html>\n<html>\n    <head>\n        <title>Very Simple HTML Code by PinkWink</title>\n    </head>\n    <body>\n        <div>\n            <p class="inner-text first-item" id="first">\n                Happy PinkWink.\n                <a href="http://www.pinkwink.kr" id="pw-link">PinkWink</a>\n            </p>\n            <p class="inner-text second-item">\n                Happy Data Science.\n                <a href="https://www.python.org" id="py-link">Python</a>\n            </p>\n        </div>\n        <p class="outer-text first-item" id="second">\n            <b>\n                Data Science is funny.\n            </b>\n        </p>\n        <p class="outer-text">\n            <b>\n                All I need is Love.\n            </b>\n        </p>\n    </body>\n</html>'

In [15]:
soup = BeautifulSoup(page, "html.parser")
soup

<!DOCTYPE html>

<html>
<head>
<title>Very Simple HTML Code by PinkWink</title>
</head>
<body>
<div>
<p class="inner-text first-item" id="first">
                Happy PinkWink.
                <a href="http://www.pinkwink.kr" id="pw-link">PinkWink</a>
</p>
<p class="inner-text second-item">
                Happy Data Science.
                <a href="https://www.python.org" id="py-link">Python</a>
</p>
</div>
<p class="outer-text first-item" id="second">
<b>
                Data Science is funny.
            </b>
</p>
<p class="outer-text">
<b>
                All I need is Love.
            </b>
</p>
</body>
</html>

In [19]:
# head 태그 가져오기
soup.head

<head>
<title>Very Simple HTML Code by PinkWink</title>
</head>

In [20]:
# title 태그 가져오기
soup.title

<title>Very Simple HTML Code by PinkWink</title>

In [21]:
soup.p

<p class="inner-text first-item" id="first">
                Happy PinkWink.
                <a href="http://www.pinkwink.kr" id="pw-link">PinkWink</a>
</p>

In [25]:
# find() 함수
soup.find("p")

<p class="inner-text first-item" id="first">
                Happy PinkWink.
                <a href="http://www.pinkwink.kr" id="pw-link">PinkWink</a>
</p>

In [24]:
# find_all() 함수
soup.find_all("p")

[<p class="inner-text first-item" id="first">
                 Happy PinkWink.
                 <a href="http://www.pinkwink.kr" id="pw-link">PinkWink</a>
 </p>,
 <p class="inner-text second-item">
                 Happy Data Science.
                 <a href="https://www.python.org" id="py-link">Python</a>
 </p>,
 <p class="outer-text first-item" id="second">
 <b>
                 Data Science is funny.
             </b>
 </p>,
 <p class="outer-text">
 <b>
                 All I need is Love.
             </b>
 </p>]

In [26]:
soup.find("p", class_="inner-text second-item")

<p class="inner-text second-item">
                Happy Data Science.
                <a href="https://www.python.org" id="py-link">Python</a>
</p>

In [30]:
data1 = soup.find("p", {"class":"outer-text first-item"})
data1

<p class="outer-text first-item" id="second">
<b>
                Data Science is funny.
            </b>
</p>

In [32]:
data1.text

'\n\n                Data Science is funny.\n            \n'

In [33]:
data1.text.strip()

'Data Science is funny.'

In [35]:
# 다중 조건
soup.find(
    "p", {
        "class":"inner-text first-item", 
        "id":"first"
        }
    )

<p class="inner-text first-item" id="first">
                Happy PinkWink.
                <a href="http://www.pinkwink.kr" id="pw-link">PinkWink</a>
</p>

In [40]:
# find_all()
p_all = soup.find_all("b")
for p in p_all:
    print(p.text.strip())
    print("-"*50)

Data Science is funny.
--------------------------------------------------
All I need is Love.
--------------------------------------------------


In [42]:
# 특정 태그 확인
soup.find_all(class_="outer-text")

[<p class="outer-text first-item" id="second">
 <b>
                 Data Science is funny.
             </b>
 </p>,
 <p class="outer-text">
 <b>
                 All I need is Love.
             </b>
 </p>]

In [44]:
# 특정 태그 확인
for data in soup.find_all(id="pw-link"):
    print(data.text)

PinkWink


In [46]:
p_all = soup.find_all("p")
for p in p_all:
    print(p.text.strip())
    print("-"*50)

Happy PinkWink.
                PinkWink
--------------------------------------------------
Happy Data Science.
                Python
--------------------------------------------------
Data Science is funny.
--------------------------------------------------
All I need is Love.
--------------------------------------------------


In [47]:
# a 태그에서 href 속성값 추출
links = soup.find_all("a")
links

[<a href="http://www.pinkwink.kr" id="pw-link">PinkWink</a>,
 <a href="https://www.python.org" id="py-link">Python</a>]

In [51]:
for link in links:
    print(link.text, "=>", link.get("href"))
    print(link.text, "=>", link["href"])
    print()

PinkWink => http://www.pinkwink.kr
PinkWink => http://www.pinkwink.kr

Python => https://www.python.org
Python => https://www.python.org



## BeautifulSoup 예제 1-1 - 네이버 금융

In [57]:
from urllib.request import urlopen
from bs4 import BeautifulSoup as bs4

In [59]:
url = "https://finance.naver.com/marketindex/"
page = urlopen(url)
soup = bs4(page, "html.parser")
print(soup)


<script language="javascript" src="/template/head_js.naver?referer=info.finance.naver.com&amp;menu=marketindex&amp;submenu=market"></script>
<script src="https://ssl.pstatic.net/imgstock/static.pc/20210916165954/js/info/jindo.min.ns.1.5.3.euckr.js" type="text/javascript"></script>
<script src="https://ssl.pstatic.net/imgstock/static.pc/20210916165954/js/jindo.1.5.3.element-text-patch.js" type="text/javascript"></script>
<div id="container" style="padding-bottom:0px;">
<div class="market_include">
<div class="market_data">
<div class="market1">
<div class="title">
<h2 class="h_market1"><span>환전 고시 환율</span></h2>
</div>
<!-- data -->
<div class="data">
<ul class="data_lst" id="exchangeList">
<li class="on">
<a class="head usd" href="/marketindex/exchangeDetail.naver?marketindexCd=FX_USDKRW" onclick="clickcr(this, 'fr1.usdt', '', '', event);">
<h3 class="h_lst"><span class="blind">미국 USD</span></h3>
<div class="head_info point_up">
<span class="value">1,187.00</span>
<span class="txt_krw

In [64]:
money_datas = soup.find_all("span", {"class":"value"})
for idx, money in enumerate(money_datas):
    print(idx, money.text)

0 1,187.00
1 1,065.15
2 1,386.42
3 183.63
4 110.9700
5 1.1700
6 1.3707
7 93.3800
8 75.45
9 1644.42
10 1750.0
11 66465.57


In [68]:
text_datas = soup.find_all("span", {"class":"blind"})
for idx, text in enumerate(text_datas[::3]):
    print(idx, text.text)

0 미국 USD
1 일본 JPY(100엔)
2 유럽연합 EUR
3 중국 CNY
4 달러/일본 엔
5 유로/달러
6 영국 파운드/달러
7 달러인덱스
8 달러
9 원
10 달러
11 원


## BeautifulSoup 예제 1-2 - 네이버 금융
- find, find_all
- select, select_one
- find, select_one : 단일 선택
- find_all, select_one : 다중 선택

In [72]:
import requests
from bs4 import BeautifulSoup as bs4

url = "https://finance.naver.com/marketindex/"
response = requests.get(url)
text = response.text
soup = bs4(text, "html.parser")
print(soup.prettify())

<script language="javascript" src="/template/head_js.naver?referer=info.finance.naver.com&amp;menu=marketindex&amp;submenu=market">
</script>
<script src="https://ssl.pstatic.net/imgstock/static.pc/20210916165954/js/info/jindo.min.ns.1.5.3.euckr.js" type="text/javascript">
</script>
<script src="https://ssl.pstatic.net/imgstock/static.pc/20210916165954/js/jindo.1.5.3.element-text-patch.js" type="text/javascript">
</script>
<div id="container" style="padding-bottom:0px;">
 <div class="market_include">
  <div class="market_data">
   <div class="market1">
    <div class="title">
     <h2 class="h_market1">
      <span>
       환전 고시 환율
      </span>
     </h2>
    </div>
    <!-- data -->
    <div class="data">
     <ul class="data_lst" id="exchangeList">
      <li class="on">
       <a class="head usd" href="/marketindex/exchangeDetail.naver?marketindexCd=FX_USDKRW" onclick="clickcr(this, 'fr1.usdt', '', '', event);">
        <h3 class="h_lst">
         <span class="blind">
          미국 U

In [78]:
exchange_list = soup.select("#exchangeList > li")
exchange_list

[<li class="on">
 <a class="head usd" href="/marketindex/exchangeDetail.naver?marketindexCd=FX_USDKRW" onclick="clickcr(this, 'fr1.usdt', '', '', event);">
 <h3 class="h_lst"><span class="blind">미국 USD</span></h3>
 <div class="head_info point_up">
 <span class="value">1,187.00</span>
 <span class="txt_krw"><span class="blind">원</span></span>
 <span class="change">6.00</span>
 <span class="blind">상승</span>
 </div>
 </a>
 <a class="graph_img" href="/marketindex/exchangeDetail.naver?marketindexCd=FX_USDKRW" onclick="clickcr(this, 'fr1.usdc', '', '', event);">
 <img alt="" height="153" src="https://ssl.pstatic.net/imgfinance/chart/marketindex/FX_USDKRW.png" width="295"/>
 </a>
 <div class="graph_info">
 <span class="time">2021.09.28 20:04</span>
 <span class="source">하나은행 기준</span>
 <span class="count">고시회차<span class="num">318</span>회</span>
 </div>
 </li>,
 <li class="">
 <a class="head jpy" href="/marketindex/exchangeDetail.naver?marketindexCd=FX_JPYKRW" onclick="clickcr(this, 'fr1.jpyt

In [105]:
title = exchange_list[0].select_one(".h_lst").text
exchange = exchange_list[0].select_one(".value").text
change = exchange_list[0].select_one(".change").text
updown = exchange_list[0].select_one(".head_info.point_up > .blind").text

print(title, exchange, change, updown, sep=" | ")


미국 USD | 1,187.00 | 6.00 | 상승


In [111]:
base_url = "https://finance.naver.com"
for data in exchange_list:
    sub_url = data.select_one("a").get("href")
    title = data.select_one(".h_lst").text
    exchange = data.select_one(".value").text
    change = data.select_one(".change").text
    updown = data.select_one(".head_info.point_up > .blind").text

    print(title, exchange, change, updown, base_url+sub_url ,sep=" | ")

미국 USD | 1,187.00 | 6.00 | 상승 | https://finance.naver.com/marketindex/exchangeDetail.naver?marketindexCd=FX_USDKRW
일본 JPY(100엔) | 1,065.15 | 0.32 | 상승 | https://finance.naver.com/marketindex/exchangeDetail.naver?marketindexCd=FX_JPYKRW
유럽연합 EUR | 1,386.42 | 3.65 | 상승 | https://finance.naver.com/marketindex/exchangeDetail.naver?marketindexCd=FX_EURKRW
중국 CNY | 183.63 | 0.94 | 상승 | https://finance.naver.com/marketindex/exchangeDetail.naver?marketindexCd=FX_CNYKRW


In [None]:

exchange_list = []
base_url = "https://finance.naver.com"

for data in exchange_list:
    data = {
        "title" : data.select_one(".h_lst").text,
        "exchange" : data.select_one(".value").text, 
        "change" : data.select_one(".change").text,  
        "updown" :  data.select_one(".head_info.point_up > .blind").text, 
        "link" : data.select_one("a").get("href")
    }
    exchange_list.append(data)

In [113]:
df = pd.DataFrame(exchange_list)
df.to_excel("./naver_finance.xlsx", encoding="utf-8")

  values = np.array([convert(v) for v in values])


In [114]:
! dir

 D ����̺��� �������� �̸��� �����ϴ�.
 ���� �Ϸ� ��ȣ: A479-D011

 d:\����\������Ʈ\��ī������ - �����ͻ��̾� 1��\nekalakubae_data_science_1st\ds_study\code ���͸�

2021-09-29  ���� 12:32    <DIR>          .
2021-09-29  ���� 12:32    <DIR>          ..
2021-09-28  ���� 10:00    <DIR>          .ipynb_checkpoints
2021-09-17  ���� 02:26         1,073,844 .~01. Analysis Seoul CCTV.ipynb
2021-09-24  ���� 12:15         1,073,844 01. Analysis Seoul CCTV.ipynb
2021-09-28  ���� 05:11         7,242,128 02. Analysis Seoul Crime.ipynb
2021-09-11  ���� 09:50       106,266,277 02. Analysis Seoul Crime.pdf
2021-09-28  ���� 11:34             7,001 03. Web Data.ipynb
2021-09-28  ���� 02:15             3,010 folium.html
2021-09-28  ���� 02:15             3,010 folium2.html
2021-09-27  ���� 07:52               267 google_api_key.zip
2021-09-29  ���� 12:32             5,715 naver_finance.xlsx
               9�� ����         115,675,096 ����Ʈ
               3�� ���͸�  1,943,315,070,976 ����Ʈ ����


---

## BeautifulSoup 예제 2 - 여명의 눈동자(위키백과 데이터 가져오기)
- urllib 을 이용한 URL 인코딩 참고 자료 
    - https://brownbears.tistory.com/501

In [7]:
from urllib import parse
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup as bs4

In [16]:
url = 'https://ko.wikipedia.org/wiki/{search_words}'
req = Request(
        url.format(
            search_words=parse.quote("여명의_눈동자") # 한글을 URL로 인코딩
        )
    )
response = urlopen(req)
soup = bs4(response, "html.parser")

In [38]:
for idx, ul in enumerate(soup.find_all("ul")):
    if "채시라" in ul.text:
        print("="*30 + "> 번 라인 :",idx)
        print(ul.text)

채시라 : 윤여옥 역 (아역: 김민정)
박상원 : 장하림(하리모토 나츠오) 역 (아역: 김태진)
최재성 : 최대치(사카이) 역 (아역: 장덕수)
1991년 MBC 연기대상 남자 최우수상 - 최재성
1991년 MBC 연기대상 여자 최우수상 - 채시라
1992년 제28회 백상예술대상 TV부문 대상
1992년 제28회 백상예술대상 TV부문 작품상
1992년 제28회 백상예술대상 TV부문 남자 연기상 - 최재성
1992년 제28회 백상예술대상 TV부문 여자 연기상 - 채시라
1992년 제28회 백상예술대상 TV부문 연출상 - 김종학
1992년 제28회 백상예술대상 TV부문 기술상(촬영) - 조수현
1992년 제28회 백상예술대상 TV부문 남자 인기상 - 박상원
1992년 제19회 한국방송대상 드라마TV부문 최우수 작품상
1992년 제19회 한국방송대상 TV부문 프로듀서상 - 김종학
1992년 제19회 한국방송대상 TV부문 미술상 - 윤상준
당초 윤여옥 역은 김미숙, 안명지 역은 배종옥이 맡을 뻔 했지만 개인사정으로 고사하였다.
줄곧 KBS 드라마에 출연해 오던  최재성이 1990년 KBS 사태 후 다른 방송사로 옮겨 처음 출연한 드라마이기도 했으며[7] 최재성은 《여명의 눈동자》 이후 타방송사에서만 활동해 오다가 <아씨>로 KBS 복귀를 했다.
총 제작기간이 2년 4개월, 출연자와 엑스트라가 모두 21,000명이다.
처음으로 제주4.3사건, 위안부 등을 다루었으며 또한 연출기법 등에 화제가 되었다.[8]
극중 최대치가 뱀을 뜯어먹는 장면은 실제로 배우 최재성이 살아있는 뱀을 뜯는 법을 배워서 직접 껍질을 벗기면서 먹는 장면을 촬영하였으며, 그 후 뱀의 비린내로 고생하였다 한다.
최대치와 윤여옥이 난징에서 헤어지기 전 철조망을 사이에 두고 나눈 키스신, 소위 '철조망 키스신'은 아직까지도 많은 이들로부터 명장면으로 꼽힌다.[9][10]
1992년 2월 25일 채시라는 모교 동국대학교 졸업식에서 '여옥 역을 맡아 학교의 명예를 드높인 점'을 인정받아 민병천 당시 동국대 

In [44]:
main_actor = soup.find_all("ul")[15]
for li in main_actor.find_all("li"):
    data = li.text.strip().replace("\xa0", "")
    print(data, end="\n")

채시라: 윤여옥 역 (아역: 김민정)
박상원: 장하림(하리모토 나츠오) 역 (아역: 김태진)
최재성: 최대치(사카이) 역 (아역: 장덕수)


- 인스턴트타입 확인하기

In [47]:
isinstance(["\xa0",1,2,3,4,5], list)

True

- 얕은 복사

In [48]:
data = [1,2,3,4,5,6]

In [50]:
data_cp = data
data_cp

[1, 2, 3, 4, 5, 6]

In [51]:
data_cp[2] = "데이터교체"
data_cp

[1, 2, '데이터교체', 4, 5, 6]

In [52]:
data

[1, 2, '데이터교체', 4, 5, 6]

- 깊은 복사

In [53]:
data_2 = [9,10,11,12,13]

In [54]:
data_cp_deep = data_2.copy()
data_cp_deep

[9, 10, 11, 12, 13]

In [56]:
data_cp_deep[2] = "데이터교체"
data_cp_deep

[9, 10, '데이터교체', 12, 13]

In [57]:
data_2

[9, 10, 11, 12, 13]

In [58]:
data_2.insert(2, "인덱스2번에추가")

In [59]:
data_2

[9, 10, '인덱스2번에추가', 11, 12, 13]

---

## 2. 시카고 맛집 데이터 분석 - 개요
- https://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-Chicago/
- chicago magazine the 50 best sandwiches

```
최종목표
총 51개 페이지에서 각 가게의 정보를 가져온다.
- 가게 이름
- 대표 메뉴
- 대표 메뉴의 가격
- 가게 주소
```

## 3. 시카고 맛집 데이터 분석 - 
- HTTP 403 Forbidden 클라이언트 오류 상태 응답 코드는 서버에 요청이 전달되었지만, 권한 때문에 거절되었다는 것을 의미합니다.
- user agent 랜덤 생성
    - conda install -c conda-forge fake-useragent
    - pip install fake-useragent

In [1]:
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup as bs4
from fake_useragent import UserAgent

In [2]:
url_base = "https://www.chicagomag.com/"
url_sub = "Chicago-Magazine/November-2012/Best-Sandwiches-Chicago/"
url = url_base + url_sub
url 

'https://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-Chicago/'

In [9]:
# headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36"}
ua = UserAgent()
headers = {"User-Agent":ua.ie}
req = Request(url, headers=headers)
response = urlopen(req)
soup = bs4(response, "html.parser")

In [19]:
div_all = soup.find_all("div", {"class":"sammy"})
len(div_all)

50

In [21]:
tmp_one = div_all[0]
tmp_one

<div class="sammy" style="position: relative;">
<div class="sammyRank">1</div>
<div class="sammyListing"><a href="/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Old-Oak-Tap-BLT/"><b>BLT</b><br/>
Old Oak Tap<br/>
<em>Read more</em> </a></div>
</div>

In [24]:
tmp_one.find("div", {"class":"sammyRank"}).get_text()

'1'

In [28]:
tmp_one.find("div", class_="sammyListing")

<div class="sammyListing"><a href="/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Old-Oak-Tap-BLT/"><b>BLT</b><br/>
Old Oak Tap<br/>
<em>Read more</em> </a></div>

In [33]:
tmp_one.select_one(".sammyListing").text

'BLT\nOld Oak Tap\nRead more '

In [31]:
tmp_one.find("div", class_="sammyListing").find("a").get("href")

'/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Old-Oak-Tap-BLT/'

In [34]:
import re

tmp_string = tmp_one.find("div", class_="sammyListing").text
re.split(("\n|\r\n"), tmp_string)

['BLT', 'Old Oak Tap', 'Read more ']

In [35]:
print(re.split(("\n|\r\n"), tmp_string)[0])
print(re.split(("\n|\r\n"), tmp_string)[1])

BLT
Old Oak Tap


In [43]:
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup as bs4
from fake_useragent import UserAgent
from urllib.parse import urljoin
import re

url_base = "https://www.chicagomag.com/"

# 필요한 내용을 담을 빈 리스트
# 리스트로 하나씩 컬럼을 만들고, DataFrame으로 합칠 예정
rank = []
main_menu = []
cafe_name = []
url_add = []

list_soup = soup.find_all("div", {"class":"sammy"}) # soup.select(".sammy")
len(list_soup)

50

In [50]:
for item in list_soup:
    # 랭킹
    rank.append(item.find(class_="sammyRank").get_text())
    tmp_string = item.find(class_="sammyListing").get_text()
    tmp_string_split = re.split(("\n|\r\n"), tmp_string)
    # 메인 메뉴 이름
    main_menu.append(tmp_string_split[0])
    # 카페 이름
    cafe_name.append(tmp_string_split[1])
    # URL 정보
    url_add.append(urljoin(url_base, item.find("a").get("href")))

In [51]:
len(rank), len(main_menu), len(cafe_name), len(url_add)

(50, 50, 50, 50)

In [54]:
import pandas as pd

data = {
    "Rank":rank,
    "Menu":main_menu,
    "Cafe":cafe_name,
    "URL":url_add
}
df = pd.DataFrame(data, columns=["Rank", "Cafe", "Menu", "URL"])
df.head()

Unnamed: 0,Rank,Cafe,Menu,URL
0,1,Old Oak Tap,BLT,https://www.chicagomag.com/Chicago-Magazine/No...
1,2,Au Cheval,Fried Bologna,https://www.chicagomag.com/Chicago-Magazine/No...
2,3,Xoco,Woodland Mushroom,https://www.chicagomag.com/Chicago-Magazine/No...
3,4,Al’s Deli,Roast Beef,https://www.chicagomag.com/Chicago-Magazine/No...
4,5,Publican Quality Meats,PB&L,https://www.chicagomag.com/Chicago-Magazine/No...


In [55]:
# 데이터 저장
df.to_csv("../data/03. best_sandwiches_list_chicago.csv", sep=",", encoding="utf-8")

In [63]:
pd.read_csv("../data/03. best_sandwiches_list_chicago.csv", sep=",", encoding="utf-8").head()

Unnamed: 0.1,Unnamed: 0,Rank,Cafe,Menu,URL
0,0,1,Old Oak Tap,BLT,https://www.chicagomag.com/Chicago-Magazine/No...
1,1,2,Au Cheval,Fried Bologna,https://www.chicagomag.com/Chicago-Magazine/No...
2,2,3,Xoco,Woodland Mushroom,https://www.chicagomag.com/Chicago-Magazine/No...
3,3,4,Al’s Deli,Roast Beef,https://www.chicagomag.com/Chicago-Magazine/No...
4,4,5,Publican Quality Meats,PB&L,https://www.chicagomag.com/Chicago-Magazine/No...


---

## 04. 시카고 맛집 데이터 분석 - 하위 페이지

In [28]:
import pandas as pd
from urllib.request import urlopen, Request
from fake_useragent import UserAgent
from bs4 import BeautifulSoup as bs4

In [29]:
df = pd.read_csv("../data/03. best_sandwiches_list_chicago.csv", sep=",", encoding="utf-8", index_col=0)
df.tail()

Unnamed: 0,Rank,Cafe,Menu,URL
45,46,Chickpea,Kufta,https://www.chicagomag.com/Chicago-Magazine/No...
46,47,The Goddess and Grocer,Debbie’s Egg Salad,https://www.chicagomag.com/Chicago-Magazine/No...
47,48,Zenwich,Beef Curry,https://www.chicagomag.com/Chicago-Magazine/No...
48,49,Toni Patisserie,Le Végétarien,https://www.chicagomag.com/Chicago-Magazine/No...
49,50,Phoebe’s Bakery,The Gatsby,https://www.chicagomag.com/Chicago-Magazine/No...


In [30]:
df["URL"][0]

'https://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Old-Oak-Tap-BLT/'

In [31]:
ua = UserAgent()
headers = {"user-agent":ua.ie}
req = Request(df["URL"][0], headers=headers)
html = urlopen(req).read()
soup = bs4(html, "html.parser")

In [32]:
tmp_data = soup.find("p", {"class":"addy"})
tmp_data

<p class="addy">
<em>$10. 2109 W. Chicago Ave., 773-772-0406, <a href="http://www.theoldoaktap.com/">theoldoaktap.com</a></em></p>

### reqular expression
- import re
- re.search
- re.split
- .x : 임의의 한 문자를 표현(x가 마지막으로 끝납니다.)
- x+ : x가 1번 이상 반복합니다.
- x? : x가 존재하거나 존재하지 않을 수 있습니다.
- x* : x가 0번 이상 반복합니다.
- x|y : x또는 y를 찾습니다.(or연산자 의미) 

In [33]:
import re

In [34]:
price_tmp = tmp_data.text
price_tmp

'\n$10. 2109 W. Chicago Ave., 773-772-0406, theoldoaktap.com'

In [35]:
price_tmp = re.split((".,"), price_tmp)
price_tmp

['\n$10. 2109 W. Chicago Ave', ' 773-772-040', ' theoldoaktap.com']

In [36]:
price_tmp = price_tmp[0]
price_tmp

'\n$10. 2109 W. Chicago Ave'

In [37]:
tmp = re.search("\$\d+\.(\d+)?", price_tmp).group()
tmp

'$10.'

In [38]:
address = price_tmp[len(tmp) + 2:]
address 

'2109 W. Chicago Ave'

- DataFrame의 index로 반복문 사용하기

In [39]:
price = []
address = []

for n in df.index[:3]:
    req = Request(df["URL"][n], headers={"user-agent":ua.ie})
    html = urlopen(req).read()
    soup = bs4(html, "html.parser")
    
    gettings = soup.find("p", "addy").get_text()
    
    tmp = re.split(".,", gettings)[0]
    price = re.search("\$\d+\.(\d+)?", tmp).group()
    address = tmp[len(price) + 2:]
#     print(gettings, price, address, sep=' | ')
    print(price, address, sep=" | ")
    

$10. | 2109 W. Chicago Ave
$9. | 800 W. Randolph St
$9.50 |  445 N. Clark St


In [41]:
from tqdm import tqdm

prices = []
addresses = []

for idx, rows in tqdm(df.iterrows()):
    req = Request(rows["URL"], headers={"user-agent":ua.ie})
    html = urlopen(req).read()
    soup = bs4(html, "html.parser")
    
    gettings = soup.find("p", class_="addy").get_text()
    tmp = re.split(".,", gettings)[0]
    price = re.search("\$\d+\.(\d+)?", tmp).group()
    address = tmp[len(price) + 2:]
    
    prices.append(price)
    addresses.append(address)
    
    print(idx, rows["Rank"], rows["Cafe"], rows["Menu"], rows["URL"], sep=" | ")
    print(price, address, sep=" | ")
    print("="*50)

1it [00:01,  1.90s/it]

0 | 1 | Old Oak Tap | BLT | https://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Old-Oak-Tap-BLT/
$10. | 2109 W. Chicago Ave


2it [00:03,  1.49s/it]

1 | 2 | Au Cheval | Fried Bologna | https://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Au-Cheval-Fried-Bologna/
$9. | 800 W. Randolph St


3it [00:04,  1.33s/it]

2 | 3 | Xoco | Woodland Mushroom | https://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Xoco-Woodland-Mushroom/
$9.50 |  445 N. Clark St


4it [00:05,  1.27s/it]

3 | 4 | Al’s Deli | Roast Beef | https://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Als-Deli-Roast-Beef/
$9.40 |  914 Noyes St


5it [00:06,  1.22s/it]

4 | 5 | Publican Quality Meats | PB&L | https://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Publican-Quality-Meats-PB-L/
$10. | 825 W. Fulton Mkt


6it [00:07,  1.22s/it]

5 | 6 | Hendrickx Belgian Bread Crafter | Belgian Chicken Curry Salad | https://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Hendrickx-Belgian-Bread-Crafter-Belgian-Chicken-Curry-Salad/
$7.25 |  100 E. Walton St


7it [00:08,  1.20s/it]

6 | 7 | Acadia | Lobster Roll | https://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Acadia-Lobster-Roll/
$16. | 1639 S. Wabash Ave


8it [00:09,  1.16s/it]

7 | 8 | Birchwood Kitchen | Smoked Salmon Salad | https://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Birchwood-Kitchen-Smoked-Salmon-Salad/
$10. | 2211 W. North Ave


9it [00:11,  1.16s/it]

8 | 9 | Cemitas Puebla | Atomica Cemitas | https://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Cemitas-Puebla-Atomica-Cemitas/
$9. | 3619 W. North Ave


10it [00:12,  1.15s/it]

9 | 10 | Nana | Grilled Laughing Bird Shrimp and Fried Po’ Boy | https://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Nana-Grilled-Laughing-Bird-Shrimp-and-Fried-Oyster-Po-Boy/
$17. | 3267 S. Halsted St


11it [00:13,  1.15s/it]

10 | 11 | Lula Cafe | Ham and Raclette Panino | https://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Lula-Cafe-Ham-and-Raclette-Panino/
$11. | 2537 N. Kedzie Blvd


12it [00:14,  1.16s/it]

11 | 12 | Ricobene’s | Breaded Steak | https://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Ricobenes-Breaded-Steak/
$5.49 |  Multiple location


13it [00:15,  1.15s/it]

12 | 13 | Frog n Snail | The Hawkeye | https://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Frog-n-Snail-The-Hawkeye/
$14. | 3124 N. Broadwa


14it [00:17,  1.27s/it]

13 | 14 | Crosby’s Kitchen | Chicken Dip | https://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Crosbys-Kitchen-Chicken-Dip/
$10. | 3455 N. Southport Ave


15it [00:18,  1.22s/it]

14 | 15 | Longman & Eagle | Wild Boar Sloppy Joe | https://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Longman-and-Eagle-Wild-Boar-Sloppy-Joe/
$13. | 2657 N. Kedzie Ave


16it [00:19,  1.20s/it]

15 | 16 | Bari | Meatball Sub | https://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Bari-Meatball-Sub/
$4.50 |  1120 W. Grand Ave


17it [00:20,  1.19s/it]

16 | 17 | Manny’s | Corned Beef | https://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Mannys-Corned-Beef/
$11.95 |  1141 S. Jefferson St


18it [00:21,  1.19s/it]

17 | 18 | Eggy’s | Turkey Club | https://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Eggys-Turkey-Club/
$11.50 |  333 E. Benton Pl


19it [00:22,  1.16s/it]

18 | 19 | Old Jerusalem | Falafel | https://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Old-Jerusalem-Falafel/
$6.25 |  1411 N. Wells St


20it [00:24,  1.15s/it]

19 | 20 | Mindy’s HotChocolate | Crab Cake | https://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Mindys-HotChocolate-Crab-Cake/
$15. | 1747 N. Damen Ave


21it [00:25,  1.15s/it]

20 | 21 | Olga’s Delicatessen | Chicken Schnitzel | https://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Olgas-Delicatessen-Chicken-Schnitzel/
$5. | 3209 W. Irving Park Rd


22it [00:26,  1.31s/it]

21 | 22 | Dawali Mediterranean Kitchen | Shawarma | https://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Dawali-Mediterranean-Kitchen-Shawarma/
$6. | Multiple location


23it [00:28,  1.47s/it]

22 | 23 | Big Jones | Toasted Pimiento Cheese | https://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Big-Jones-Toasted-Pimiento-Cheese/
$8. | 5347 N. Clark St


24it [00:29,  1.36s/it]

23 | 24 | La Pane | Vegetarian Panino | https://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-La-Pane-Vegetarian-Panino/
$5.99 |  2954 W. Irving Park Rd


25it [00:31,  1.29s/it]

24 | 25 | Pastoral | Cali Chèvre | https://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Pastoral-Cali-Chevre/
$7.52 |  Multiple location


26it [00:32,  1.40s/it]

25 | 26 | Max’s Deli | Pastrami | https://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Maxs-Deli-Pastrami/
$11.95 |  191 Skokie Valley Rd


27it [00:33,  1.31s/it]

26 | 27 | Lucky’s Sandwich Co. | The Fredo | https://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Luckys-Sandwich-Co-The-Fredo/
$7.50 |  Multiple location


28it [00:35,  1.43s/it]

27 | 28 | City Provisions | Smoked Ham | https://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-City-Provisions-Smoked-Ham/
$12.95 |  1818 W. Wilson Ave


29it [00:36,  1.37s/it]

28 | 29 | Papa’s Cache Sabroso | Jibarito | https://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Papas-Cache-Sabroso-Jibarito/
$7. | 2517 W. Division St


30it [00:37,  1.33s/it]

29 | 30 | Bavette’s Bar & Boeuf | Shaved Prime Rib | https://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Bavettes-Bar-and-Boeuf-Shaved-Prime-Rib/
$21. | 218 W. Kinzie St


31it [00:39,  1.37s/it]

30 | 31 | Hannah’s Bretzel | Serrano Ham and Manchego Cheese | https://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Hannahs-Bretzel-Serrano-Ham-and-Manchego-Cheese/
$9.79 |  Multiple location


32it [00:40,  1.33s/it]

31 | 32 | La Fournette | Tuna Salad | https://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-La-Fournette-Tuna-Salad/
$9.75 |  1547 N. Wells St


33it [00:41,  1.28s/it]

32 | 33 | Paramount Room | Paramount Reuben | https://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Paramount-Room-Paramount-Reuben/
$13. | 415 N. Milwaukee Ave


34it [00:43,  1.36s/it]

33 | 34 | Melt Sandwich Shoppe | The Istanbul | https://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Melt-Sandwich-Shoppe-The-Istanbul/
$7.95 |  1840 N. Damen Ave


35it [00:44,  1.35s/it]

34 | 35 | Floriole Cafe & Bakery | B.A.D. | https://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Floriole-Cafe-and-Bakery-BAD/
$9. | 1220 W. Webster Ave


36it [00:45,  1.27s/it]

35 | 36 | First Slice Pie Café | Duck Confit and Mozzarella | https://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-First-Slice-Pie-Cafe-Duck-Confit-and-Mozzarella/
$9. | 5357 N. Ashland Ave


37it [00:46,  1.22s/it]

36 | 37 | Troquet | Croque Monsieur | https://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Troquet-Croque-Monsieur/
$8. | 1834 W. Montrose Ave


38it [00:48,  1.20s/it]

37 | 38 | Grahamwich | Green Garbanzo | https://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Grahamwich-Green-Garbanzo/
$8. | 615 N. State St


39it [00:49,  1.18s/it]

38 | 39 | Saigon Sisters | The Hen House | https://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Saigon-Sisters-The-Hen-House/
$7. | Multiple location


40it [00:50,  1.18s/it]

39 | 40 | Rosalia’s Deli | Tuscan Chicken | https://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Rosalias-Deli-Tuscan-Chicken/
$6. | 241 N. York Rd


41it [00:51,  1.16s/it]

40 | 41 | Z&H MarketCafe | The Marty  | https://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Z-and-H-MarketCafe-The-Marty/
$7.25 |  1323 E. 57th St


42it [00:52,  1.14s/it]

41 | 42 | Market House on the Square | Whitefish | https://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Market-House-on-the-Square-Whitefish/
$11. | 655 Forest Ave


43it [00:53,  1.13s/it]

42 | 43 | Elaine’s Coffee Call | Oat Bread, Pecan Butter, and Fruit Jam | https://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Elaines-Coffee-Call-Oat-Bread-Pecan-Butter-and-Fruit-Jam/
$6. | Hotel Lincol


44it [00:55,  1.26s/it]

43 | 44 | Marion Street Cheese Market | Cauliflower Melt | https://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Marion-Street-Cheese-Market-Cauliflower-Melt/
$9. | 100 S. Marion St


45it [00:56,  1.32s/it]

44 | 45 | Cafecito | Cubana | https://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Cafecito-Cubano/
$5.49 |  26 E. Congress Pkwy


46it [00:57,  1.27s/it]

45 | 46 | Chickpea | Kufta | https://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Chickpea-Kufta/
$8. | 2018 W. Chicago Ave


47it [00:58,  1.23s/it]

46 | 47 | The Goddess and Grocer | Debbie’s Egg Salad | https://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-The-Goddess-and-Grocer-Debbies-Egg-Salad/
$6.50 |  25 E. Delaware Pl


48it [01:00,  1.22s/it]

47 | 48 | Zenwich | Beef Curry | https://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Zenwich-Beef-Curry/
$7.50 |  416 N. York St


49it [01:01,  1.22s/it]

48 | 49 | Toni Patisserie | Le Végétarien | https://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Toni-Patisserie-Le-Vegetarien/
$8.75 |  65 E. Washington St


50it [01:02,  1.25s/it]

49 | 50 | Phoebe’s Bakery | The Gatsby | https://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Phoebes-Bakery-The-Gatsby/
$6.85 |  3351 N. Broadwa





In [42]:
df.head(3)

Unnamed: 0,Rank,Cafe,Menu,URL
0,1,Old Oak Tap,BLT,https://www.chicagomag.com/Chicago-Magazine/No...
1,2,Au Cheval,Fried Bologna,https://www.chicagomag.com/Chicago-Magazine/No...
2,3,Xoco,Woodland Mushroom,https://www.chicagomag.com/Chicago-Magazine/No...


In [43]:
df["Price"] = prices
df["Address"] = addresses

In [44]:
df.head(3)

Unnamed: 0,Rank,Cafe,Menu,URL,Price,Address
0,1,Old Oak Tap,BLT,https://www.chicagomag.com/Chicago-Magazine/No...,$10.,2109 W. Chicago Ave
1,2,Au Cheval,Fried Bologna,https://www.chicagomag.com/Chicago-Magazine/No...,$9.,800 W. Randolph St
2,3,Xoco,Woodland Mushroom,https://www.chicagomag.com/Chicago-Magazine/No...,$9.50,445 N. Clark St


In [45]:
df = df.loc[:, ["Rank", "Cafe", "Menu", "Price", "Address"]]
df.set_index("Rank", inplace=True)
df.head()

Unnamed: 0_level_0,Cafe,Menu,Price,Address
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Old Oak Tap,BLT,$10.,2109 W. Chicago Ave
2,Au Cheval,Fried Bologna,$9.,800 W. Randolph St
3,Xoco,Woodland Mushroom,$9.50,445 N. Clark St
4,Al’s Deli,Roast Beef,$9.40,914 Noyes St
5,Publican Quality Meats,PB&L,$10.,825 W. Fulton Mkt


In [46]:
df.to_csv(
    "../data/03. best_sandwiches_list_chicago2.csv", sep=",", encoding="UTF-8"
)

In [47]:
pd.read_csv("../data/03. best_sandwiches_list_chicago2.csv", index_col=0).head()

Unnamed: 0_level_0,Cafe,Menu,Price,Address
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Old Oak Tap,BLT,$10.,2109 W. Chicago Ave
2,Au Cheval,Fried Bologna,$9.,800 W. Randolph St
3,Xoco,Woodland Mushroom,$9.50,445 N. Clark St
4,Al’s Deli,Roast Beef,$9.40,914 Noyes St
5,Publican Quality Meats,PB&L,$10.,825 W. Fulton Mkt


---

## 5. 시카고 맛집 데이터 지도 시각화


In [48]:
import folium
import pandas as pd
import numpy as np
import googlemaps
from tqdm import tqdm

In [50]:
df = pd.read_csv("../data/03. best_sandwiches_list_chicago2.csv", index_col=0)
df.tail()

Unnamed: 0_level_0,Cafe,Menu,Price,Address
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
46,Chickpea,Kufta,$8.,2018 W. Chicago Ave
47,The Goddess and Grocer,Debbie’s Egg Salad,$6.50,25 E. Delaware Pl
48,Zenwich,Beef Curry,$7.50,416 N. York St
49,Toni Patisserie,Le Végétarien,$8.75,65 E. Washington St
50,Phoebe’s Bakery,The Gatsby,$6.85,3351 N. Broadwa


In [51]:
gmaps_key = "xxxxxxxxxxxxxx"
gmaps = googlemaps.Client(key=gmaps_key)
gmaps

<googlemaps.client.Client at 0x23c23e478b0>

In [61]:
lat = []
lng = []

for idx, rows in tqdm(df.iterrows()):
    if not rows["Address"] == "Multiple location":
        target_name = rows["Address"] + ", Chicago"
        gmaps_output = gmaps.geocode(target_name)[0]
        geometry = gmaps_output["geometry"]
        location = geometry["location"]
        lat.append(location["lat"])
        lng.append(location["lng"])
    else:
        lat.append(np.nan)
        lng.append(np.nan)  

50it [00:18,  2.73it/s]


In [62]:
len(lat), len(lng)

(50, 50)

In [64]:
df["lat"] = lat
df["lng"] = lng

In [66]:
df.tail(3)

Unnamed: 0_level_0,Cafe,Menu,Price,Address,lat,lng
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
48,Zenwich,Beef Curry,$7.50,416 N. York St,41.910583,-87.940488
49,Toni Patisserie,Le Végétarien,$8.75,65 E. Washington St,41.883106,-87.625438
50,Phoebe’s Bakery,The Gatsby,$6.85,3351 N. Broadwa,41.943163,-87.644507


In [79]:
mapping = folium.Map(location=["41.8781136", "-87.6297982"], zoom_start=11)

for idx, rows in df.iterrows():
    if not rows["Address"] == "Multiple location":
        folium.Marker(
            location=[rows["lat"], rows["lng"]],
            popup=rows["Cafe"],
            tooltip=rows["Address"],
            icon=folium.Icon(
                icon="coffee",
                prefix="fa",
            )
        ).add_to(mapping)

mapping

---