In [56]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
from urllib.request import Request # 서버 요청 객체를 생성하는 모듈
import pandas as pd


#### 크롤링 사이트
- https://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-Chicago/

In [57]:
url_base = 'https://www.chicagomag.com/'
url_sub =  'Chicago-Magazine/November-2012/Best-Sandwiches-Chicago/'
url = url_base + url_sub
url


'https://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-Chicago/'

- 서버 요청 객체 : 네트워크 규칙에 맞춰서 서버에게 전달해야 하는 정보를 구성할 수 있는 객체
    - header를 포함 시킬 수 있음

In [58]:
req = Request(url, headers={'User-Agent':'Mozilla/5.0'})
req

<urllib.request.Request at 0x1df6fe81910>

In [59]:
#res = urlopen(req) #H TTPError: HTTP Error 403: Forbidden
res = urlopen(req) # 위코드에서 header 설정을 안하면 HTTPError 발생
res


<http.client.HTTPResponse at 0x1df6f18fa90>

In [60]:
# bs4 객체 생성
soup_obj = BeautifulSoup(res, "html.parser")
#soup_obj


In [61]:
## 랭킹되어 있는 샌드위치 가게 목록 찾아오기
## div calss:sammy
len(soup_obj.find_all('div',{"class":"sammy"}))


50

In [62]:
temp_all = soup_obj.find_all('div',{"class":"sammy"}) # 모든 샌드위치가게 정보 저장


In [63]:
temp_one = temp_all[0]
temp_one

<div class="sammy" style="position: relative;">
<div class="sammyRank">1</div>
<div class="sammyListing"><a href="/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Old-Oak-Tap-BLT/"><b>BLT</b><br/>
Old Oak Tap<br/>
<em>Read more</em> </a></div>
</div>

In [64]:
# 샌드위치 가게 순위
temp_one.find(class_="sammyRank") # class는 예약어이므로 find 함수 내부적으로 class_ 를 사용하고 있음


<div class="sammyRank">1</div>

In [65]:
# 상호명
strn = temp_one.find(class_="sammyListing").get_text()
strn

'BLT\nOld Oak Tap\nRead more '

In [66]:
strn.split('\n')

['BLT', 'Old Oak Tap', 'Read more ']

In [67]:
# 서브페이지 url
temp_one.find('a')['href']

'/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Old-Oak-Tap-BLT/'

#### url 생성
- urljoin() 함수를 이용 : url 형식을 체크

In [68]:
from urllib.parse import urljoin

In [69]:
urljoin(url_base,temp_one.find('a')['href'])

'https://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Old-Oak-Tap-BLT/'

#### 50개 레스토랑 정보 추출
   - list에 저장

In [70]:
rank =  []
main_menu = []
cafe_name = []
url_link = []


In [71]:
url_base = 'https://www.chicagomag.com/'
url_sub =  'Chicago-Magazine/November-2012/Best-Sandwiches-Chicago/'
url = url_base + url_sub

req = Request(url,headers={'User-Agent':'Mozilla/5.0'})
html=urlopen(req)

soup = BeautifulSoup(html,'html.parser')
soup_list = soup.find_all('div','sammy') # 전체 50개 레스토랑 정보 추출 코드


In [72]:
# soup_list안의 레스토랑 각각에 대한 정보를 추출해서 list에 저장하는 코드
for item in soup_list : #레스토랑 1개의 정보가 item에 저장
    rank.append(item.find(class_='sammyRank').get_text())
    tmp_listing = item.find(class_="sammyListing").get_text()
    main_menu.append(tmp_listing.split('\n')[0])
    cafe_name.append(tmp_listing.split('\n')[1])
    url_link.append(urljoin(url_base,item.find('a')['href']))


In [73]:
len(rank),len(main_menu),len(cafe_name),len(url_link)


(50, 50, 50, 50)

In [74]:
# 수집한 자료를 df로 만들어서 csv로 저장
data = {'Rank':rank, 'Cafe':cafe_name, 'Menu':main_menu, 'URL':url_link}

df = pd.DataFrame(data)
# df
df.to_csv('./crawl_data/시카고샌드위치가게.csv',sep=',',encoding='utf-8')


#### 수집한 데이터를 활용해서 샌드위치 가게 지도 시각화



In [75]:
## 데이터 읽어오기
df = pd.read_csv('./crawl_data/시카고샌드위치가게.csv', index_col=0)
df.head()
df.tail()

Unnamed: 0,Rank,Cafe,Menu,URL
45,46,Chickpea,Kufta,https://www.chicagomag.com/Chicago-Magazine/No...
46,47,The Goddess and Grocer,Debbie’s Egg Salad,https://www.chicagomag.com/Chicago-Magazine/No...
47,48,Zenwich,Beef Curry,https://www.chicagomag.com/Chicago-Magazine/No...
48,49,Toni Patisserie,Le Végétarien,https://www.chicagomag.com/Chicago-Magazine/No...
49,50,Phoebe’s Bakery,The Gatsby,https://www.chicagomag.com/Chicago-Magazine/No...


In [76]:
# df url을 활용하여 페이지 요청 후 필요자료  (각 cafe의 주소) 추출하는 코드


In [77]:
req = Request(df['URL'][0], headers={'User-Agent':'Mozilla/5.0'})
res = urlopen(req)
soup_tmp = BeautifulSoup(res,'html.parser')




In [78]:
temp_string = soup_tmp.find('p','addy').get_text()
# print(temp_string)
print(temp_string.split())
# temp_string

['$10.', '2109', 'W.', 'Chicago', 'Ave.,', '773-772-0406,', 'theoldoaktap.com']


In [79]:
# 주소 추출
temp_string.split()[1:-2]


['2109', 'W.', 'Chicago', 'Ave.,']

In [80]:
# 추출 data 결합
a = ' '.join(temp_string.split()[1:-2])
a

'2109 W. Chicago Ave.,'

In [82]:
# 가격 추출
temp_string.split()[0][:-1]
print(temp_string.split()[0])

$10.


- 전체 data 추출


In [83]:
price = []
address = []


In [89]:
df.index


Int64Index([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
            34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49],
           dtype='int64')

In [90]:
for i in df.index :
    req = Request(df['URL'][i],headers={'User-Agent':'Mozilla/5.0'}) # 객체 생성
    html = urlopen(req) # 요청후 응답 반환
    soup_tmp = BeautifulSoup(html,'html.parser') #bs 객체 생성
    temp_string = soup_tmp.find('p','addy').get_text() # 주소와 가격이 포함된 정보 추출
    price.append(temp_string.split()[0][:-1]) # 추출한 정보에서 가격을 분리해서 list에 저장
    address.append(' '.join(temp_string.split()[1:-2])) # 추출한 정보에서 주소를 분리해서 list에 저장


In [30]:
price, address

(['$10',
  '$9',
  '$9.50',
  '$9.40',
  '$10',
  '$7.25',
  '$16',
  '$10',
  '$9',
  '$17',
  '$11',
  '$5.49',
  '$14',
  '$10',
  '$13',
  '$4.50',
  '$11.95',
  '$11.50',
  '$6.25',
  '$15',
  '$5',
  '$6',
  '$8',
  '$5.99',
  '$7.52',
  '$11.95',
  '$7.50',
  '$12.95',
  '$7',
  '$21',
  '$9.79',
  '$9.75',
  '$13',
  '$7.95',
  '$9',
  '$9',
  '$8',
  '$8',
  '$7',
  '$6',
  '$7.25',
  '$11',
  '$6',
  '$9',
  '$5.49',
  '$8',
  '$6.50',
  '$7.50',
  '$8.75',
  '$6.85'],
 ['2109 W. Chicago Ave.,',
  '800 W. Randolph St.,',
  '445 N. Clark St.,',
  '914 Noyes St., Evanston,',
  '825 W. Fulton Mkt.,',
  '100 E. Walton',
  '1639 S. Wabash Ave.,',
  '2211 W. North Ave.,',
  '3619 W. North Ave.,',
  '3267 S. Halsted St.,',
  '2537 N. Kedzie Blvd.,',
  'Multiple',
  '3124 N. Broadway,',
  '3455 N. Southport Ave.,',
  '2657 N. Kedzie Ave.,',
  '1120 W. Grand Ave.,',
  '1141 S. Jefferson St.,',
  '333 E. Benton Pl.,',
  '1411 N. Wells St.,',
  '1747 N. Damen Ave.,',
  '3209 W. Irving P

#### 여러번 반복 접근을 해야 하므로 상태 진행바를 통해 진행상태 확인
- for i in tqdm_notebook(df.index) :


In [91]:
from tqdm import tqdm_notebook # 반복문의 반복 요소에 적용시키면 반복요소가 얼마나 진행되었는지 상태바를 표시

price = []
address = []

for i in tqdm_notebook(df.index) :
    req = Request(df['URL'][i],headers={'User-Agent':'Mozilla/5.0'}) # 객체 생성
    html = urlopen(req) # 요청후 응답 반환
    soup_tmp = BeautifulSoup(html,'html.parser') #bs 객체 생성
    temp_string = soup_tmp.find('p','addy').get_text() # 주소와 가격이 포함된 정보 추출
    price.append(temp_string.split()[0][:-1]) # 추출한 정보에서 가격을 분리해서 list에 저장
    address.append(' '.join(temp_string.split()[1:-2])) # 추출한 정보에서 주소를 분리해서 list에 저장


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for i in tqdm_notebook(df.index) :


  0%|          | 0/50 [00:00<?, ?it/s]

In [93]:
price

['$10',
 '$9',
 '$9.50',
 '$9.40',
 '$10',
 '$7.25',
 '$16',
 '$10',
 '$9',
 '$17',
 '$11',
 '$5.49',
 '$14',
 '$10',
 '$13',
 '$4.50',
 '$11.95',
 '$11.50',
 '$6.25',
 '$15',
 '$5',
 '$6',
 '$8',
 '$5.99',
 '$7.52',
 '$11.95',
 '$7.50',
 '$12.95',
 '$7',
 '$21',
 '$9.79',
 '$9.75',
 '$13',
 '$7.95',
 '$9',
 '$9',
 '$8',
 '$8',
 '$7',
 '$6',
 '$7.25',
 '$11',
 '$6',
 '$9',
 '$5.49',
 '$8',
 '$6.50',
 '$7.50',
 '$8.75',
 '$6.85']

In [94]:
### 수집된 각 cafe의 price와 address를 df에 추가
df['price'] = price
df['address'] = address


In [95]:
df.head()

Unnamed: 0,Rank,Cafe,Menu,URL,price,address
0,1,Old Oak Tap,BLT,https://www.chicagomag.com/Chicago-Magazine/No...,$10,"2109 W. Chicago Ave.,"
1,2,Au Cheval,Fried Bologna,https://www.chicagomag.com/Chicago-Magazine/No...,$9,"800 W. Randolph St.,"
2,3,Xoco,Woodland Mushroom,https://www.chicagomag.com/Chicago-Magazine/No...,$9.50,"445 N. Clark St.,"
3,4,Al’s Deli,Roast Beef,https://www.chicagomag.com/Chicago-Magazine/No...,$9.40,"914 Noyes St., Evanston,"
4,5,Publican Quality Meats,PB&L,https://www.chicagomag.com/Chicago-Magazine/No...,$10,"825 W. Fulton Mkt.,"


In [96]:
# Rank 컬럼을 index로 생성 : set_index(inplace=True) - 원본수정

df.set_index("Rank", inplace=True)

In [97]:
df.tail()

Unnamed: 0_level_0,Cafe,Menu,URL,price,address
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
46,Chickpea,Kufta,https://www.chicagomag.com/Chicago-Magazine/No...,$8,"2018 W. Chicago Ave.,"
47,The Goddess and Grocer,Debbie’s Egg Salad,https://www.chicagomag.com/Chicago-Magazine/No...,$6.50,"25 E. Delaware Pl.,"
48,Zenwich,Beef Curry,https://www.chicagomag.com/Chicago-Magazine/No...,$7.50,"416 N. York St., Elmhurst,"
49,Toni Patisserie,Le Végétarien,https://www.chicagomag.com/Chicago-Magazine/No...,$8.75,"65 E. Washington St.,"
50,Phoebe’s Bakery,The Gatsby,https://www.chicagomag.com/Chicago-Magazine/No...,$6.85,"3351 N. Broadway,"


In [98]:
## 시카고샌드위치_주소.csv
df.to_csv('./crawl_data/시카고샌드위치_주소.csv',sep=',',encoding='utf-8')


### 수집된 주소를 이용해서 각 상점의 위경도 찾아오고 FOLIUM에 cafe 마커 표시


In [100]:
# 필요패키지 import
import googlemaps #install
import folium
import pandas as pd


In [101]:
## 데이터 읽어오기
df = pd.read_csv('./crawl_data/시카고샌드위치_주소.csv',index_col=0)
df.head()


Unnamed: 0_level_0,Cafe,Menu,URL,price,address
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,Old Oak Tap,BLT,https://www.chicagomag.com/Chicago-Magazine/No...,$10,"2109 W. Chicago Ave.,"
2,Au Cheval,Fried Bologna,https://www.chicagomag.com/Chicago-Magazine/No...,$9,"800 W. Randolph St.,"
3,Xoco,Woodland Mushroom,https://www.chicagomag.com/Chicago-Magazine/No...,$9.50,"445 N. Clark St.,"
4,Al’s Deli,Roast Beef,https://www.chicagomag.com/Chicago-Magazine/No...,$9.40,"914 Noyes St., Evanston,"
5,Publican Quality Meats,PB&L,https://www.chicagomag.com/Chicago-Magazine/No...,$10,"825 W. Fulton Mkt.,"


In [104]:
# 구글 클라이언트 등록키를 이용해서 client 객체 생성
gmapsKey = 'AIzaSyDaavIigsdXYCOaBIG_Gt-S0mScya5TWbE'
gmaps = googlemaps.Client(key = gmapsKey)

In [108]:
df['address'][1]

'2109 W. Chicago Ave.,'

In [109]:
# 첫번째 상점의 위경도 찾기
# 미국 주 이름앞에는 , 가 와야함(두번있어도 상관 없음 단, 없으면 못찾는다)
target_name = df['address'][1] + "," + 'Chicago'
target_name

'2109 W. Chicago Ave.,,Chicago'

In [113]:
# 위경도 찾기
g_info = gmaps.geocode(target_name)


g_lo = g_info[0].get("geometry")['location']
g_lo['lat'], g_lo['lng']

(41.8956049, -87.67996149999999)

In [120]:
# 50개 위경도 찾아오기
lat=[]
lng=[]

from tqdm import tqdm_notebook
for n in tqdm_notebook(df.index) :
    target_name = df['address'][n] +','+ 'Chicago'
    g_info = gmaps.geocode(target_name)
    g_lo =g_info[0].get("geometry")['location']
    lat.append(g_lo['lat'])
    lng.append(g_lo['lng'])


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for n in tqdm_notebook(df.index) :


  0%|          | 0/50 [00:00<?, ?it/s]

In [121]:
len(lat), len(lng)

(50, 50)

In [122]:
df['lat'] = lat
df['lng'] = lng
df.head()

Unnamed: 0_level_0,Cafe,Menu,URL,price,address,lat,lng
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,Old Oak Tap,BLT,https://www.chicagomag.com/Chicago-Magazine/No...,$10,"2109 W. Chicago Ave.,",41.895605,-87.679961
2,Au Cheval,Fried Bologna,https://www.chicagomag.com/Chicago-Magazine/No...,$9,"800 W. Randolph St.,",41.884639,-87.64759
3,Xoco,Woodland Mushroom,https://www.chicagomag.com/Chicago-Magazine/No...,$9.50,"445 N. Clark St.,",41.890523,-87.630783
4,Al’s Deli,Roast Beef,https://www.chicagomag.com/Chicago-Magazine/No...,$9.40,"914 Noyes St., Evanston,",42.058322,-87.683748
5,Publican Quality Meats,PB&L,https://www.chicagomag.com/Chicago-Magazine/No...,$10,"825 W. Fulton Mkt.,",41.886604,-87.648536


In [46]:
df.to_csv('./crawl_data/시카고샌드위치위경도포함.csv')

### 지도 시각화

In [123]:
lat_c = df['lat'].mean()
lng_c = df['lng'].mean()



In [128]:
map = folium.Map(location=[lat_c,lng_c], zoom_start= 11)
folium.Marker([lat_c,lng_c], popup='Center').add_to(map)
map

In [125]:
# 전체 cafe의 위치에 Marker 표시하기

In [130]:
map_fin = folium.Map(location=[lat_c,lng_c], zoom_start=11)
map_fin

In [51]:
for n in df.index :
    folium.Marker([df['lat'][n],df['lng'][n]],
                  popup=df['Cafe'][n]).add_to(map_fin)


In [52]:
map_fin

In [53]:
## 지도 저장
map_fin.save('./crawl_data/시카고카페.html')