# 爬蟲程式

## 1. Google_API
這部分採用 Google 提供的 Places API，參考文件如下：https://developers.google.com/places/web-service/overview
透過API提供的資料，我們收集了不同地區內的各個餐廳店家名稱、地址、電話、評分、價位、連結、評價數、經緯度等資訊。

In [1]:
import requests, json, csv
from urllib.parse import urlencode
api_key = "這裡要放入從google platform申請的API金鑰"

#### 1.1 店家清單搜尋
這部分利用了place API的nearby search功能。

In [None]:
#Places API
def searching_restaurants(content, key, rad_input = 100,
                          location = "25.012606331913954, 121.53534692174303"):
    """Return the place IDs that a Places API Nearby Search finds for a keyword.

    Args:
        content: Keyword to search for (sent as the ``keyword`` parameter).
        key: Google Places API key.
        rad_input: Search radius around ``location`` (sent as ``radius``).
        location: "lat, lng" string used as the search centre. The default is
            the coordinate pair this script originally hard-coded.

    Returns:
        A list with the ``place_id`` of every entry in the response's
        ``results`` array. Only the first response is read; no further pages
        are requested. The list is empty when the response has no ``results``.

    Raises:
        requests.HTTPError: If the HTTP response status signals an error.
    """
    base_endpoint_places = "https://maps.googleapis.com/maps/api/place/nearbysearch/json"
    params = {
        "key" : key,
        "location" : location,
        "radius" : rad_input,
        "keyword" : content
    }
    places_endpoint = f"{base_endpoint_places}?{urlencode(params)}"

    # A timeout keeps one stalled connection from hanging the whole crawl.
    req = requests.get(places_endpoint, timeout = 30)
    req.raise_for_status()

    # An error payload may lack "results"; treat that as "nothing found".
    results = req.json().get("results", [])
    return [place['place_id'] for place in results]

# Keyword search: query once per food type and keep the place IDs found for each type.
Keyword_list = ["義大利麵", "咖哩", "牛排", "粥", "冰品", "飲料", "火鍋", "水餃", "小吃", "炒飯", "日式料理", "韓式料理", "泰式料理", "滷味", "鹹酥雞", "便當", "早午餐", "低GI", "拉麵", "素食", "速食"]
place_dictionary = {
    food_type: searching_restaurants(food_type, api_key, rad_input = 500)
    for food_type in Keyword_list
}
print(place_dictionary)

#### 1.2 店家詳細資料搜尋
這部分則是根據1.1 傳回的清單，用place API中的details功能，找尋店家的詳細資料

In [None]:
# 各店家細節csv
def search_for_details(place, goo_key):
    """Fetch one place's details from the Places API as a flat row.

    Args:
        place: The place ID to look up.
        goo_key: Google Places API key.

    Returns:
        A list of nine values: name, formatted_address,
        formatted_phone_number, rating, price_level, url, user_ratings_total,
        latitude, longitude. Any of the first seven that the response does not
        contain is replaced by the string 'null'.

    Raises:
        KeyError: If the response has no ``result`` or no geometry location.
    """
    detail_base_endpoint = "https://maps.googleapis.com/maps/api/place/details/json"
    detail_params = {
        "place_id" : f"{place}",
        "language" : "zh-TW",
        # Request exactly the fields read below. The previous list asked for
        # "rating" twice, never asked for "price_level" (so that column was
        # always 'null'), and requested opening_hours/review without using them.
        "fields" : "name,formatted_address,formatted_phone_number,rating,price_level,url,user_ratings_total,geometry",
        "key" : goo_key
    }
    detail_url = f"{detail_base_endpoint}?{urlencode(detail_params)}"
    req_detail = requests.get(detail_url, timeout = 30)
    detail_dict = req_detail.json()['result']

    # Optional fields fall back to 'null' so every row has the same width.
    wanted = ('name', 'formatted_address', 'formatted_phone_number', 'rating',
              'price_level', 'url', 'user_ratings_total')
    for_return = [detail_dict.get(field, 'null') for field in wanted]

    coordinates = detail_dict['geometry']['location']
    for_return.append(coordinates['lat'])
    for_return.append(coordinates['lng'])
    return for_return

# Load the place IDs that were collected per keyword.
import ast

Keyword_list = ["義大利麵", "咖哩", "牛排", "粥", "冰品", "飲料", "火鍋", "水餃", "小吃", "炒飯", "日式料理", "韓式料理", "泰式料理", "滷味", "鹹酥雞", "便當", "早午餐", "低GI", "拉麵", "素食", "速食"]
# Maps each search keyword to the broader cuisine category written to the CSV.
type_dict = {"義大利麵":'西式/美式', "咖哩":'日式料理', "牛排":'西式/美式', "粥":"台式/小吃", "冰品":'飲料/甜點', "飲料":'飲料/甜點', "火鍋":'火鍋', "水餃":'台式/小吃', "小吃":'台式/小吃', "炒飯":'台式/小吃', "日式料理":"日式料理", "韓式料理":"韓式料理", "泰式料理":"東南亞", "滷味":"台式/小吃", "鹹酥雞":'台式/小吃', "便當":"台式/小吃", "早午餐":'西式/美式', "低GI":'素食/健康', "拉麵":'日式料理', "素食":"素食/健康", "速食":'速食'}
with open('gongguan_place_id.txt', 'r', encoding = 'utf-8') as fh1:
    fhstr = fh1.read()
# literal_eval accepts only Python literals, so the file content cannot run
# arbitrary code the way eval() would.
fhdic = ast.literal_eval(fhstr)

# Fetch the details of every place and collect them as a 2-D list (one row per place).
total_list = []
for i in Keyword_list:
    for aplace in fhdic[i]:
        total_list.append(
            [type_dict[i], i, aplace, *search_for_details(aplace, api_key)]
        )

# Write the header and all collected rows to the CSV file.
with open('gongguan_new_details.csv', 'w', newline='', errors = 'ignore') as csvfile:
    header = ['general_type', 'specific_type', 'place_id', '店名', '地址', '電話', '評分', '價位', '連結', '評價數', '緯度', '經度']
    writer = csv.writer(csvfile)
    writer.writerow(header)
    writer.writerows(total_list)

#### 1.3 店家開放時間、繁忙時段 
在google地圖當中，能夠看到各個店家的開放時間與每個小時的繁忙時段。然而，google目前並未直接透過API提供各店家的繁忙時段資訊。因此，這邊使用了另一個populartimes的套件，抓取不同店家的繁忙時段資料，而營業時段的資料則仍維持使用place API。
source: https://github.com/m-wrzr/populartimes

In [None]:
pip install git+https://github.com/m-wrzr/populartimes

In [None]:
import populartimes
#各店家開放時間、populartime
def search_for_times(place, goo_key):
    """Fetch a place's opening hours and popular times as one flat row.

    Opening hours come from the Places API details endpoint; popular times
    come from ``populartimes.get_id``.

    Args:
        place: The place ID to look up.
        goo_key: Google API key, passed to both services.

    Returns:
        A list of ten values: name, opening-hours ``periods``, opening-hours
        ``weekday_text``, then seven popular-times ``data`` entries in the
        order populartimes returns them. Missing opening-hours values and
        missing popular-times entries are the string 'null'.

    Raises:
        KeyError: If the details response has no ``result`` or no ``name``.
    """
    # Places API: name and opening hours.
    detail_base_endpoint = "https://maps.googleapis.com/maps/api/place/details/json"
    detail_params = {
        "place_id" : f"{place}",
        "language" : "zh-TW",
        "fields" : "name,opening_hours",
        "key" : goo_key
    }
    detail_url = f"{detail_base_endpoint}?{urlencode(detail_params)}"
    req_detail = requests.get(detail_url, timeout = 30)
    detail_dict = req_detail.json()['result']

    # populartimes: take at most seven entries and pad with 'null' so the row
    # always has seven columns, even when fewer (or no) entries are available.
    pop_det = populartimes.get_id(goo_key, place)
    daily_data = [day['data'] for day in pop_det.get('populartimes', [])[:7]]
    daily_data.extend(['null'] * (7 - len(daily_data)))

    opening_hours = detail_dict.get('opening_hours', {})
    return [
        detail_dict['name'],
        opening_hours.get('periods', 'null'),
        opening_hours.get('weekday_text', 'null'),
        *daily_data,
    ]

# Load the place IDs that were collected per keyword.
import ast

Keyword_list = ["義大利麵", "咖哩", "牛排", "粥", "冰品", "飲料", "火鍋", "水餃", "小吃", "炒飯", "日式料理", "韓式料理", "泰式料理", "滷味", "鹹酥雞", "便當", "早午餐", "低GI", "拉麵", "素食", "速食"]
# Maps each search keyword to the broader cuisine category written to the CSV.
type_dict = {"義大利麵":'西式/美式', "咖哩":'日式料理', "牛排":'西式/美式', "粥":"台式/小吃", "冰品":'飲料/甜點', "飲料":'飲料/甜點', "火鍋":'火鍋', "水餃":'台式/小吃', "小吃":'台式/小吃', "炒飯":'台式/小吃', "日式料理":"日式料理", "韓式料理":"韓式料理", "泰式料理":"東南亞", "滷味":"台式/小吃", "鹹酥雞":'台式/小吃', "便當":"台式/小吃", "早午餐":'西式/美式', "低GI":'素食/健康', "拉麵":'日式料理', "素食":"素食/健康", "速食":'速食'}
with open('gongguan_new_id.txt', 'r', encoding = 'utf-8') as fh1:
    fhstr = fh1.read()
# literal_eval accepts only Python literals, so the file content cannot run
# arbitrary code the way eval() would.
fhdic = ast.literal_eval(fhstr)
# Fetch opening hours / popular times for every place and collect them as a 2-D list.
total_list = []
for i in Keyword_list:
    for aplace in fhdic[i]:
        total_list.append(
            [type_dict[i], i, aplace, *search_for_times(aplace, api_key)]
        )

# Write the header and all collected rows to the CSV file.
with open('gongguan_time_details.csv', 'w', newline='', errors = 'ignore') as csvfile:
    header = ['general_type', 'specific_type', 'place_id', '店名', 'periods', 'weekday_text', 'Mon', 'Tues', 'Wed', 'Thur', 'Fri', 'Sat', 'Sun']
    writer = csv.writer(csvfile)
    writer.writerow(header)
    writer.writerows(total_list)

#### 1.4 google評論

In [None]:
#reviews_from_google
def search_for_google_reviews(place, goo_key):
    """Fetch a place's name and the text of its Google reviews.

    Args:
        place: The place ID to look up.
        goo_key: Google Places API key.

    Returns:
        A list whose first item is the place name, followed by one item per
        review in the response (the review's ``text``, or 'null' if that
        review has no text). If the response has no ``reviews`` entry at all,
        the list is ``[name, 'null']``.

    Raises:
        KeyError: If the response has no ``result`` or no ``name``.
    """
    # Places API: name and reviews.
    detail_base_endpoint = "https://maps.googleapis.com/maps/api/place/details/json"
    detail_params = {
        "place_id" : f"{place}",
        "language" : "zh-TW",
        "fields" : "name,review",
        "key" : goo_key
    }
    detail_url = f"{detail_base_endpoint}?{urlencode(detail_params)}"
    req_detail = requests.get(detail_url, timeout = 30)
    detail_dict = req_detail.json()['result']

    for_return = [detail_dict['name']]
    reviews = detail_dict.get('reviews')
    if reviews is None:
        for_return.append('null')
    else:
        # A single review without text no longer cuts the row short.
        for_return.extend(are.get('text', 'null') for are in reviews)
    return for_return

# Load the place IDs that were collected per keyword.
import ast

Keyword_list = ["義大利麵", "咖哩", "牛排", "粥", "冰品", "飲料", "火鍋", "水餃", "小吃", "炒飯", "日式料理", "韓式料理", "泰式料理", "滷味", "鹹酥雞", "便當", "早午餐", "低GI", "拉麵", "素食", "速食"]
# Maps each search keyword to the broader cuisine category written to the CSV.
type_dict = {"義大利麵":'西式/美式', "咖哩":'日式料理', "牛排":'西式/美式', "粥":"台式/小吃", "冰品":'飲料/甜點', "飲料":'飲料/甜點', "火鍋":'火鍋', "水餃":'台式/小吃', "小吃":'台式/小吃', "炒飯":'台式/小吃', "日式料理":"日式料理", "韓式料理":"韓式料理", "泰式料理":"東南亞", "滷味":"台式/小吃", "鹹酥雞":'台式/小吃', "便當":"台式/小吃", "早午餐":'西式/美式', "低GI":'素食/健康', "拉麵":'日式料理', "素食":"素食/健康", "速食":'速食'}
with open('gongguan_new_id.txt', 'r', encoding = 'utf-8') as fh1:
    fhstr = fh1.read()
# literal_eval accepts only Python literals, so the file content cannot run
# arbitrary code the way eval() would.
fhdic = ast.literal_eval(fhstr)

# Fetch the reviews of every place and collect them as a 2-D list (one row per place).
total_list = []
for i in Keyword_list:
    for aplace in fhdic[i]:
        total_list.append(
            [type_dict[i], i, aplace, *search_for_google_reviews(aplace, api_key)]
        )

# Write all collected rows to the CSV file (this file has no header row).
with open('gongguan_google_reviews.csv', 'w', newline='', errors = 'ignore') as csvfile:
    csv.writer(csvfile).writerows(total_list)

## 2. Instagram的評論、圖片搜尋
這部分主要是使用selenium幫助我們自動化的抓取資料(各個地點的ig評論、照片、連結)，並進行一些基礎的資料清理後，儲存成csv檔案

In [None]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
import csv

#### 2.1 IG的位置連結 
根據商家的名稱，搜尋INSTAGRAM的各個地點連結！

In [None]:
# Look up the Instagram location link for each place name.
# This part opens Instagram in a new Chrome session and logs in; the driver is
# then reused by search_for_href below.
# NOTE(review): find_element_by_xpath is the older Selenium API style; newer
# Selenium releases may require find_element(By.XPATH, ...) - confirm against
# the installed version.
URI='https://www.instagram.com'
driver=webdriver.Chrome()
driver.get(URI)

# Log in. The sleeps between steps presumably give the page time to render.
time.sleep(1)
accounts = driver.find_element_by_xpath('//*[@name="username"]')
time.sleep(1)
accounts.send_keys("")  # your account name
time.sleep(1)
pw = driver.find_element_by_xpath('//*[@name="password"]')
time.sleep(1)
pw.send_keys("")  # your password
# NOTE(review): the class-based XPaths below look tied to one specific build of
# the Instagram page - confirm they still match before running.
login_btn = driver.find_element_by_xpath("//button[@class='sqdOP  L3NKy   y3zKF     ']")
login_btn.click()
time.sleep(3)

# "Save login info?" prompt: choose "Not now".
later_btn = driver.find_element_by_xpath("//button[@class='sqdOP yWX7d    y3zKF     ']")
later_btn.click()
time.sleep(2)

# "Turn on notifications?" prompt: choose "Not now".
later_btn_2 = driver.find_element_by_xpath("//button[@class='aOOlW   HoLwm ']")
later_btn_2.click()
time.sleep(2)

def search_for_href(place_name):
    """Search Instagram for a place name and return the first location link.

    Uses the module-level ``driver`` and ``URI``.

    Args:
        place_name: Text typed into Instagram's search box.

    Returns:
        The ``href`` of the first location search result, or the string
        "null" when no location result is found (after one retry) or its
        link cannot be read.
    """
    location_xpath = "//div[@class='nebtz coreSpriteLocation']/ancestor::a"

    driver.get(URI)
    time.sleep(1)

    # Type the name into the search box.
    search = driver.find_element_by_xpath("//input[@placeholder='搜尋']")
    search.clear()
    search.send_keys(place_name)
    search.send_keys(Keys.ENTER)
    time.sleep(1)

    # Grab the first location result; retry once after a longer wait.
    # "except Exception" (rather than a bare except) keeps this best-effort
    # while still letting Ctrl-C interrupt a long crawl.
    try:
        first_re = driver.find_element_by_xpath(location_xpath)
    except Exception:
        try:
            time.sleep(5)
            first_re = driver.find_element_by_xpath(location_xpath)
        except Exception:
            return "null"

    try:
        link = first_re.get_attribute("href")
    except Exception:
        link = 'null'
    return link

# Run the search for every place listed in the details CSV.
list_of_href = []
with open('./crawling/gongguan/gongguan_details.csv', newline='') as file_cont:  # source of the place names
    # Read the whole CSV; the first row is the header and is skipped.
    content = list(csv.reader(file_cont))
    for record in content[1:]:
        single_id, single_name = record[2], record[3]
        list_of_href.append([single_id, single_name, search_for_href(single_name)])
        print(list_of_href[-1])
# Show everything that was collected.
print(list_of_href)

# Write the header and the collected links to the CSV file.
with open('gongguan_IG_links.csv', 'w', newline='', errors = 'ignore') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['place_id', 'name', 'IG_link'])
    writer.writerows(list_of_href)


#### 2.2 IG圖片與連結 
找尋前述連結內的熱門貼文，並從中抓取圖片、附上圖片來源。

In [None]:
# Search for images inside each Instagram location page.
# This part opens Instagram in a new Chrome session and logs in.
import getpass
import os

URI='https://www.instagram.com'
driver=webdriver.Chrome()
driver.get(URI)

# Credentials must not live in the source file. Read them from the
# IG_USERNAME / IG_PASSWORD environment variables, or prompt when unset.
# The account that used to be hard-coded here should have its password changed.
ig_username = os.environ.get("IG_USERNAME") or input("Instagram username: ")
ig_password = os.environ.get("IG_PASSWORD") or getpass.getpass("Instagram password: ")

# Log in.
time.sleep(1)
accounts = driver.find_element_by_xpath('//*[@name="username"]')
time.sleep(1)
accounts.send_keys(ig_username)
time.sleep(1)
pw = driver.find_element_by_xpath('//*[@name="password"]')
time.sleep(1)
pw.send_keys(ig_password)
login_btn = driver.find_element_by_xpath("//button[@class='sqdOP  L3NKy   y3zKF     ']")
login_btn.click()
time.sleep(3)

# "Save login info?" prompt: choose "Not now".
later_btn = driver.find_element_by_xpath("//button[@class='sqdOP yWX7d    y3zKF     ']")
later_btn.click()
time.sleep(2)

# "Turn on notifications?" prompt: choose "Not now".
later_btn_2 = driver.find_element_by_xpath("//button[@class='aOOlW   HoLwm ']")
later_btn_2.click()
time.sleep(2)


def search_for_post_link(location):
    """Return the link of the first post found on an Instagram location page.

    Uses the module-level ``driver``.

    Args:
        location: URL of the location page to open.

    Returns:
        The ``href`` of the first matching post anchor, or the string "None"
        when no post is found (after one retry) or its link cannot be read.
    """
    post_xpath = "//div[@class='v1Nh3 kIKUG  _bz0w']/a"

    driver.get(location)
    time.sleep(1)

    # Grab the first post anchor; retry once after a longer wait.
    # "except Exception" (rather than a bare except) keeps this best-effort
    # while still letting Ctrl-C interrupt a long crawl.
    try:
        link_ob = driver.find_element_by_xpath(post_xpath)
    except Exception:
        try:
            time.sleep(5)
            link_ob = driver.find_element_by_xpath(post_xpath)
        except Exception:
            return "None"

    try:
        href = link_ob.get_attribute("href")
    except Exception:
        href = 'None'
    return href

def search_for_img(place_id, post_link):
    """Download the image of an Instagram post to ``<place_id>.jpg``.

    Uses the module-level ``driver`` and ``requests``.

    Args:
        place_id: Used as the base name of the saved file.
        post_link: URL of the post to open. The "None" placeholder (or an
            empty value) is treated as "no post".

    Returns:
        True when the image was downloaded and written, False otherwise.
    """
    # shutil is not imported at file level, so bring it in here.
    import shutil

    if not post_link or post_link == "None":
        return False

    try:
        # Open the post passed in by the caller (previously a module-level
        # variable was read instead of this parameter).
        driver.get(post_link)
        time.sleep(3)
        time.sleep(1)
        img_tag = driver.find_element_by_xpath("//div[@class='eLAPa RzuR0']/div/img")
        img_href = img_tag.get_attribute("src")
        resp = requests.get(img_href, stream=True, timeout=30)
        # Do not save an HTTP error page as if it were the image.
        resp.raise_for_status()
        with open(str(place_id) + '.jpg', 'wb') as local_file:
            shutil.copyfileobj(resp.raw, local_file)
        print("completed!")
        time.sleep(3)
        return True
    except Exception:
        # Best effort: any failure for this post simply means "no image".
        return False

# Read the Instagram location file; the first row is the header and is skipped.
with open('118_link.csv', newline='') as csvFile:
    rows = list(csv.reader(csvFile))
    # Keep place_id, name and IG link (the first three columns) of each row.
    location_list = [[row[0], row[1], row[2]] for row in rows[1:]]

# For every location, find a post, download its image, and record where the
# image came from ("None" when there is no location link, no post, or the
# download failed).
new_list = []
for a_loc in location_list:
    img_source = "None"
    if a_loc[2] != 'null':
        # Check that the location really has a post.
        try:
            a_link = search_for_post_link(a_loc[2])
        except Exception:
            a_link = "None"

        # "None" is the no-post placeholder, not a URL, so never hand it to
        # the downloader.
        if a_link not in (None, "None") and search_for_img(a_loc[0], a_link):
            img_source = a_link

    a_loc.append(img_source)
    new_list.append(a_loc)
# Write the header and the image-source rows to the CSV file.
with open('118_img.csv','w', newline='') as csvFile:
    writer = csv.writer(csvFile)
    writer.writerow(['place_id', 'name', 'IG_link', 'img_source'])
    writer.writerows(new_list)
        

#### 2.3 抓取ig評論資料
抓取熱門貼文(前九則)的內文，並儲存成CSV檔

In [None]:
# Search for the post links inside each Instagram location page.
# This part opens Instagram in a new Chrome session and logs in.
# NOTE(review): find_element_by_xpath is the older Selenium API style; newer
# Selenium releases may require find_element(By.XPATH, ...) - confirm against
# the installed version.
URI='https://www.instagram.com'
driver=webdriver.Chrome()
driver.get(URI)

# Log in. The sleeps between steps presumably give the page time to render.
# The two send_keys strings are placeholders for the account name and password.
time.sleep(1)
accounts = driver.find_element_by_xpath('//*[@name="username"]')
time.sleep(1)
accounts.send_keys("帳號名稱")
time.sleep(1)
pw = driver.find_element_by_xpath('//*[@name="password"]')
time.sleep(1)
pw.send_keys("密碼")
# NOTE(review): the class-based XPaths below look tied to one specific build of
# the Instagram page - confirm they still match before running.
login_btn = driver.find_element_by_xpath("//button[@class='sqdOP  L3NKy   y3zKF     ']")
login_btn.click()
time.sleep(3)

# "Save login info?" prompt: choose "Not now".
later_btn = driver.find_element_by_xpath("//button[@class='sqdOP yWX7d    y3zKF     ']")
later_btn.click()
time.sleep(2)

# "Turn on notifications?" prompt: choose "Not now".
later_btn_2 = driver.find_element_by_xpath("//button[@class='aOOlW   HoLwm ']")
later_btn_2.click()
time.sleep(2)


def search_for_post_links(location):
    """Return the links of the posts found on an Instagram location page.

    Uses the module-level ``driver``.

    Args:
        location: URL of the location page to open.

    Returns:
        A list of post ``href`` strings. The list is empty when the lookup
        fails (after one retry) or no anchor yields a link. A list is always
        returned so callers can take its length and index it safely.
    """
    post_xpath = "//div[@class='v1Nh3 kIKUG  _bz0w']/a"

    driver.get(location)
    time.sleep(1)

    # Collect the post anchors; retry once after a longer wait.
    try:
        a_list = driver.find_elements_by_xpath(post_xpath)
    except Exception:
        try:
            time.sleep(5)
            a_list = driver.find_elements_by_xpath(post_xpath)
        except Exception:
            return []

    link_list = []
    for anchor in a_list:
        try:
            href = anchor.get_attribute("href")
        except Exception:
            # Skip an anchor that cannot be read instead of recording a
            # placeholder the caller would later try to open as a URL.
            continue
        if href:
            link_list.append(href)
    return link_list

def search_for_details(a_post): 
    """Open a post and return the elements that hold its text.

    Uses the module-level ``driver``.

    Args:
        a_post: URL of the post to open.

    Returns:
        The list of elements matched by the text XPath (callers read each
        element's ``.text``). The list is empty when the lookup fails after
        one retry, so the result can always be iterated.
    """
    text_xpath = "//div[@class='C4VMK']/span"

    driver.get(a_post)
    time.sleep(1)

    # Collect the text spans; retry once after a longer wait.
    try:
        a_span = driver.find_elements_by_xpath(text_xpath)
    except Exception:
        try:
            time.sleep(5)
            a_span = driver.find_elements_by_xpath(text_xpath)
        except Exception:
            return []
    return a_span

def preprocessing(text):
    """Return ``str(text)`` with unwanted fragments removed.

    Newlines, the literal ``<class 'list'>`` and hyphens are deleted, in
    that order.
    """
    cleaned = str(text)
    for unwanted in ("\n", "<class 'list'>", "-"):
        cleaned = cleaned.replace(unwanted, "")
    return cleaned


# Read the Instagram location file; the first row is the header and is skipped.
with open('p2.csv', newline='') as csvFile:
    rows = list(csv.reader(csvFile))
    # Keep place_id (column 0) and IG link (column 2) of each row.
    location_list = [[row[0], row[2]] for row in rows[1:]]

# For every location with an Instagram link, save the text of up to nine of
# its posts into "<place_id>.txt".
for a_loc in location_list:
    if a_loc[1] == 'null':
        continue

    links = search_for_post_links(a_loc[1])
    # A string result is a failure placeholder, not a sequence of links.
    if isinstance(links, str):
        links = []

    # "with" closes the file even if a post fails to load part-way through.
    with open(str(a_loc[0]) + '.txt', 'w', newline='', errors = 'ignore') as f:
        # Read at most the first nine posts.
        for post_link in links[:9]:
            if post_link == 'null':
                continue
            texts = search_for_details(post_link)
            if isinstance(texts, str):
                continue
            for j in texts:
                f.write(preprocessing(j.text))

#### 2.4 資料清理與合併
將原始的文字檔去除換行、表情符號等等，最後存成CSV FILE

In [None]:
#整併txt檔案成csv
def preprocessing(text):
    """Return ``str(text)`` with unwanted fragments removed.

    Newlines, the literal ``<class 'list'>``, hyphens and commas are
    deleted, in that order.
    """
    cleaned = str(text)
    for unwanted in ("\n", "<class 'list'>", "-", ","):
        cleaned = cleaned.replace(unwanted, "")
    return cleaned

# Read the IG link file; the first row is the header and is skipped.
with open('wenzhou_IG_links.csv', newline='') as csvFile:
    rows = list(csv.reader(csvFile))
    # Keep place_id, name and IG link (the first three columns) of each row.
    location_list = [[row[0], row[1], row[2]] for row in rows[1:]]

# Merge the per-place text files into one CSV (one row per place).
# "with" guarantees the output file is flushed and closed even on error.
with open('wenzhou_IG_content.csv', 'w', newline='', errors = 'ignore') as fh1:
    writer = csv.writer(fh1)
    writer.writerow(['place_id', 'name', 'IG_link', 'content'])
    for a_restaurant in location_list:
        res_id = a_restaurant[0]
        try:
            with open(str(res_id) + '.txt', 'r', newline='', errors = 'ignore') as fh_res:
                content = preprocessing(fh_res.read())
        except OSError:
            # No readable text file for this place: record the placeholder.
            content = 'None'
        writer.writerow([*a_restaurant, content])

## 3 其他資料處理
包含圖片轉檔、文字轉拼音兩個部分。

#### 3.1 圖片轉檔
由於抓下來的圖片是 JPG 檔（jpg file），此處轉檔成 PNG 以便前端使用，並同時壓縮檔案。
這邊使用了PIL的套件，進行圖片處理。

In [None]:
from PIL import Image
import pandas as pd
import numpy as np
# Re-save the image of every listed place ID from food_old/ into food/.
import os

df = pd.read_csv('https://raw.githubusercontent.com/chhtwhc/FindFoodEatFood/main/TextMining.csv',usecols=[4])
ids = np.array(df)
id_list=ids.tolist()
print(id_list)

# NOTE(review): the source files are opened with a ".png" name - confirm that
# matches what is actually stored in food_old/.
# NOTE(review): "quality" may have no effect when saving PNG - confirm against
# the Pillow documentation.
for i in id_list:
    file_name = str(i[0]) + '.png'
    # os.path.join builds a valid path on every OS (the previous raw string
    # produced a doubled backslash and only worked on Windows).
    source_path = os.path.join('food_old', file_name)
    target_path = os.path.join('food', file_name)
    try:
        with Image.open(source_path) as im1:
            im1.save(target_path, optimize = True, quality = 15)
    except (OSError, ValueError):
        # Missing or unreadable image: skip this ID and carry on.
        continue

#### 3.2 文字轉拼音
這邊使用了pypinyin的套件，把漢字轉為拼音資料。

In [None]:
pip install pypinyin

In [None]:
import csv
from pypinyin import pinyin

# Read the details file; the first row is the header and is skipped.
with open('gongguan_details.csv', newline='') as csvFile:
    rows = list(csv.reader(csvFile))
    # Keep place_id (column 2) and name (column 3) of each row.
    location_list = [[row[2], row[3]] for row in rows[1:]]

def convert(name):
    """Convert text to pinyin.

    Calls ``pinyin(name)`` and joins the first item of every returned entry,
    each followed by a single space (so the result ends with a space when it
    is not empty).
    """
    return "".join(str(syllable[0]) + " " for syllable in pinyin(name))

# Write place_id, Chinese name and pinyin of every place to the CSV file.
with open('gongguan_pinyin.csv', 'w', newline='', encoding = 'utf-8', errors = 'ignore') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['place_id', 'chinese_name', 'pinyin'])
    for entry in location_list:
        # The pinyin is appended to the entry itself, which is then written.
        entry.append(convert(entry[1]))
        writer.writerow(entry)
