In [1]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

# 1. Data crawling function

In [2]:
def crawling(add, brand, timeout=10):
    """Fetch one listing page and extract the product fields found on it.

    Parameters
    ----------
    add : str
        URL of the page to download.
    brand : str
        Brand slug; it is spliced into the CSS class strings
        ``"float-left font-<brand> font-weight-bold"`` and
        ``"badge bg-<brand> text-white"`` used to locate the store name and
        the event-type badges.
    timeout : float, optional
        Seconds passed to ``requests.get`` so that a stalled connection fails
        instead of blocking the notebook forever. Defaults to 10.

    Returns
    -------
    tuple
        ``(name, item, type_, price, event_type, img)`` where ``name`` is the
        store-name text and the remaining five entries are lists (product
        names, category labels, integer unit prices, event-type labels and
        image ``src`` values). The lists are collected independently of each
        other, so equal lengths are not guaranteed by this function.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    ValueError
        If the store-name tag is missing from the page, or if a price text
        cannot be converted to ``int``.
    """
    resp = requests.get(add, timeout=timeout)
    # Fail on HTTP errors here rather than parsing an error page below.
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'html.parser')

    # Store name: the <small> tag carrying the brand-specific class string,
    # e.g. "float-left font-cu font-weight-bold".
    name_tag = soup.find("small", "float-left font-" + brand + " font-weight-bold")
    if name_tag is None:
        raise ValueError("store name tag for brand %r not found at %s" % (brand, add))
    name = name_tag.get_text()

    # Product categories.
    type_ = [tag.get_text() for tag in soup.find_all('small', 'float-right font-weight-bold')]
    size = len(type_)

    # Product names: skip the first <strong> on the page and keep as many
    # names as there are category labels.
    # NOTE(review): presumably the first <strong> is not a product - confirm.
    item = [tag.get_text() for tag in soup.find_all('strong')][1:size + 1]

    # Unit prices: drop the first character and the last two characters of
    # the text, then remove thousands separators.
    # NOTE(review): presumably the text looks like "(1,400<unit>)" - confirm.
    price = []
    for tag in soup.find_all('span', 'text-muted small'):
        digits = tag.get_text()[1:-2].replace(",", "")
        price.append(int(digits))

    # Event type badges, e.g. class "badge bg-cu text-white".
    event_type = [tag.get_text() for tag in soup.find_all('span', 'badge bg-' + brand + ' text-white')]

    # Product image URLs.
    img = [tag.get('src') for tag in soup.find_all('img', 'prod_img')]

    return name, item, type_, price, event_type, img

In [3]:
def appendData(data, name, item, type_, price, event_type, img):
    """Return ``data`` extended with one row per scraped product.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame collected so far; it is not modified.
    name : str
        Store name written into every new row.
    item, type_, price, event_type, img : sequence
        Per-product values, read by position. One row is produced for each
        entry of ``img``; the other sequences must be at least that long.

    Returns
    -------
    pandas.DataFrame
        ``data`` followed by the new rows, with a fresh 0..n-1 index. When
        ``img`` is empty, ``data`` itself is returned.

    Raises
    ------
    IndexError
        If any of the other sequences is shorter than ``img``.
    """
    # Build all rows first and concatenate once: DataFrame.append no longer
    # exists in pandas 2.x, and appending row by row copies the frame each time.
    records = [
        {
            '편의점 명': name,
            '상품 명': item[i],
            '분류': type_[i],
            '개당 가격': price[i],
            '행사 유형': event_type[i],
            '이미지': img[i],
        }
        for i in range(len(img))
    ]
    if not records:
        return data

    return pd.concat([data, pd.DataFrame(records)], ignore_index=True)

In [4]:
def getData(brand):
    """Crawl every listing page of one brand and return the rows as a DataFrame.

    ``brand`` is the brand slug; it is used as the key into the module-level
    ``brand_dic`` (which supplies the last page number), as part of the page
    URL, and is forwarded to ``crawling``. Pages 1 through the last page are
    fetched in order and their rows accumulated with ``appendData``.
    """
    collected = pd.DataFrame(columns = ('편의점 명', '상품 명', '분류', '개당 가격', '행사 유형', '이미지'))
    last_page = int(brand_dic[brand])

    for page_no in range(1, last_page + 1):
        # Same query string for every page; only the page number varies.
        page_url = 'https://pyony.com/brands/' + brand + '/?page=' + str(page_no) + '&event_type=&category=&item=100&sort=&q='
        store, names, kinds, prices, events, images = crawling(page_url, brand)
        collected = appendData(collected, store, names, kinds, prices, events, images)

    return collected
    

In [5]:
# Last listing-page index per convenience-store brand, keyed by brand slug.
# Values are kept as strings; getData converts them with int().
brand_dic = {
    'cu': '6',
    'gs25': '9',
    'seven': '9',
    'ministop': '5',
    'emart24': '9',
}

# 2. Collecting data using the functions above

#### CU data

In [6]:
dataCU = getData('cu')
# Overwrite (mode='w') instead of appending: with mode='a' every re-run of this
# cell added another header line and a duplicate copy of the rows to the file.
dataCU.to_csv("dataCU.csv", mode = 'w', header = True, encoding = 'ms949') # CSV is created in the same directory as the notebook
dataCU.head()

Unnamed: 0,편의점 명,상품 명,분류,개당 가격,행사 유형,이미지
0,CU(씨유),2080)칫솔,생활용품,1400,1+1,http://bgf-cu.xcache.kinxcdn.com/product/88010...
1,CU(씨유),46cm)초극세모칫솔,생활용품,1750,1+1,http://bgf-cu.xcache.kinxcdn.com/product/88010...
2,CU(씨유),46cm)쿨민트치약,생활용품,1750,1+1,http://bgf-cu.xcache.kinxcdn.com/product/88010...
3,CU(씨유),CJ)NEW컨디션환,생활용품,1450,1+1,http://bgf-cu.xcache.kinxcdn.com/product/88094...
4,CU(씨유),CJ)NEW컨디션환3입,생활용품,4250,1+1,http://bgf-cu.xcache.kinxcdn.com/product/88094...


#### GS25 data

In [7]:
dataGS25 = getData('gs25')
# Overwrite (mode='w') instead of appending: with mode='a' every re-run of this
# cell added another header line and a duplicate copy of the rows to the file.
dataGS25.to_csv("dataGS25.csv", mode = 'w', header = True, encoding = 'ms949') # CSV is created in the same directory as the notebook
dataGS25.head()

Unnamed: 0,편의점 명,상품 명,분류,개당 가격,행사 유형,이미지
0,GS25(지에스25),CJ)6년근홍삼진골드100ML,음료,667,2+1,http://gs25appimg.gsretail.com/imgsvr/item/GD_...
1,GS25(지에스25),CJ)가쓰오우동(용기),식품,1933,2+1,http://gs25appimg.gsretail.com/imgsvr/item/GD_...
2,GS25(지에스25),CJ)맛밤80G,식품,2333,2+1,http://gs25appimg.gsretail.com/imgsvr/item/GD_...
3,GS25(지에스25),CJ)맥스봉갈릭후랑크90G,식품,1333,2+1,http://gs25appimg.gsretail.com/imgsvr/item/GD_...
4,GS25(지에스25),CJ)맥스봉소시지150G,식품,2000,2+1,http://gs25appimg.gsretail.com/imgsvr/item/GD_...


#### 7-Eleven data

In [8]:
data711 = getData('seven')
# Overwrite (mode='w') instead of appending: with mode='a' every re-run of this
# cell added another header line and a duplicate copy of the rows to the file.
data711.to_csv("data711.csv", mode = 'w', header = True, encoding = 'ms949') # CSV is created in the same directory as the notebook
data711.head()

Unnamed: 0,편의점 명,상품 명,분류,개당 가격,행사 유형,이미지
0,7-ELEVEN(세븐일레븐),CJ)갈릭후랑크90g,식품,1333,2+1,http://www.7-eleven.co.kr/upload/product/88010...
1,7-ELEVEN(세븐일레븐),CJ)강된장보리비빔밥(컵반),식품,2333,2+1,http://www.7-eleven.co.kr/upload/product/88010...
2,7-ELEVEN(세븐일레븐),CJ)고추장나물비빔밥(컵반),식품,2333,2+1,http://www.7-eleven.co.kr/upload/product/88010...
3,7-ELEVEN(세븐일레븐),CJ)고추장제육덮밥(컵반),식품,2333,2+1,http://www.7-eleven.co.kr/upload/product/88010...
4,7-ELEVEN(세븐일레븐),CJ)낙지콩나물비빔밥(컵반),식품,3000,2+1,http://www.7-eleven.co.kr/upload/product/88010...


#### Ministop data

In [9]:
dataMini = getData('ministop')
# Overwrite (mode='w') instead of appending: with mode='a' every re-run of this
# cell added another header line and a duplicate copy of the rows to the file.
dataMini.to_csv("dataMini.csv", mode = 'w', header = True, encoding = 'ms949') # CSV is created in the same directory as the notebook
dataMini.head()

Unnamed: 0,편의점 명,상품 명,분류,개당 가격,행사 유형,이미지
0,MINISTOP(미니스톱),2%복숭아240캔,음료,500,1+1,https://www.ministop.co.kr/MiniStopHomePage/pa...
1,MINISTOP(미니스톱),2%아쿠아500ml,음료,1133,2+1,https://www.ministop.co.kr/MiniStopHomePage/pa...
2,MINISTOP(미니스톱),2080베이킹소다치약,생활용품,1750,1+1,https://www.ministop.co.kr/MiniStopHomePage/pa...
3,MINISTOP(미니스톱),2080식스라인칫솔,생활용품,1950,1+1,https://www.ministop.co.kr/MiniStopHomePage/pa...
4,MINISTOP(미니스톱),2080칫솔,생활용품,1400,1+1,https://www.ministop.co.kr/MiniStopHomePage/pa...


#### Emart 24 data

In [10]:
dataEmart = getData('emart24')
# Overwrite (mode='w') instead of appending: with mode='a' every re-run of this
# cell added another header line and a duplicate copy of the rows to the file.
dataEmart.to_csv("dataEmart.csv", mode = 'w', header = True, encoding = 'ms949') # CSV is created in the same directory as the notebook
dataEmart.head()

Unnamed: 0,편의점 명,상품 명,분류,개당 가격,행사 유형,이미지
0,EMART24(이마트24),2080)치약오리지날125g,생활용품,1750,1+1,https://www.emart24.co.kr/upload/eventProduct/...
1,EMART24(이마트24),2080)칫솔오리지날탄력모,생활용품,1400,1+1,https://www.emart24.co.kr/upload/eventProduct/...
2,EMART24(이마트24),2080)퓨어핑크솔트치약120g,생활용품,1750,1+1,https://www.emart24.co.kr/upload/eventProduct/...
3,EMART24(이마트24),CJ)NEW컨디션환,생활용품,1450,1+1,https://www.emart24.co.kr/upload/eventProduct/...
4,EMART24(이마트24),CJ)NEW컨디션환3입,생활용품,4250,1+1,https://www.emart24.co.kr/upload/eventProduct/...


# 3. Merging data

In [11]:
# Stack the five per-brand frames into one table with a fresh 0..n-1 index.
data = pd.concat([dataCU, dataGS25, data711, dataMini, dataEmart], ignore_index = True) 
# Overwrite (mode='w') instead of appending: with mode='a' every re-run of this
# cell added another header line and a duplicate copy of the rows to the file.
data.to_csv("data.csv", mode = 'w', header = True, encoding = 'ms949') # CSV is created in the same directory as the notebook
data

Unnamed: 0,편의점 명,상품 명,분류,개당 가격,행사 유형,이미지
0,CU(씨유),2080)칫솔,생활용품,1400,1+1,http://bgf-cu.xcache.kinxcdn.com/product/88010...
1,CU(씨유),46cm)초극세모칫솔,생활용품,1750,1+1,http://bgf-cu.xcache.kinxcdn.com/product/88010...
2,CU(씨유),46cm)쿨민트치약,생활용품,1750,1+1,http://bgf-cu.xcache.kinxcdn.com/product/88010...
3,CU(씨유),CJ)NEW컨디션환,생활용품,1450,1+1,http://bgf-cu.xcache.kinxcdn.com/product/88094...
4,CU(씨유),CJ)NEW컨디션환3입,생활용품,4250,1+1,http://bgf-cu.xcache.kinxcdn.com/product/88094...
5,CU(씨유),CJ)가쓰오우동컵,식품,1933,2+1,http://bgf-cu.xcache.kinxcdn.com/product/88010...
6,CU(씨유),CJ)맥스봉갈릭후랑크,식품,1333,2+1,http://bgf-cu.xcache.kinxcdn.com/product/88010...
7,CU(씨유),CJ)맥스봉리치치즈50g,식품,1200,2+1,http://cdn2.bgfretail.com/bgfbrand/files/produ...
8,CU(씨유),CJ)맥스봉마늘핫바,식품,1333,2+1,http://bgf-cu.xcache.kinxcdn.com/product/88010...
9,CU(씨유),CJ)맥스봉숯불갈비후랑크,식품,1333,2+1,http://bgf-cu.xcache.kinxcdn.com/product/88010...
