In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import csv
import time
import random
from urllib.parse import urljoin, urlparse

In [3]:
class DistillerScraper:
    def __init__(self):
        self.base_url = "https://distiller.com"
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
        })
        self.spirits_data = []
        
    def get_category_urls(self):
        """獲取所有烈酒類別的 URL"""
        categories = [
            'whiskey', 'tequila-mezcal', 'rum', 
            'brandy', 'gin', 'vodka', 'liqueurs-bitters'
        ]
        return [f"{self.base_url}/search?category={cat}" for cat in categories]
    
    def scrape_spirit_page(self, spirit_url):
        """爬取單個烈酒頁面的詳細資訊"""
        try:
            response = self.session.get(spirit_url)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            
            # 提取基本資訊
            spirit_data = {
                'name': self.extract_name(soup),
                'category': self.extract_category(soup),
                'origin': self.extract_origin(soup),
                'age': self.extract_age(soup),
                'expert_score': self.extract_expert_score(soup),
                'community_score': self.extract_community_score(soup),
                'flavor_profile': self.extract_flavor_profile(soup)
            }
            
            return spirit_data
            
        except Exception as e:
            print(f"爬取 {spirit_url} 時發生錯誤: {e}")
            return None
    
    def extract_name(self, soup):
        """提取烈酒名稱"""
        name_elem = soup.find('h1', class_='spirit-name') or soup.find('h1')
        return name_elem.get_text().strip() if name_elem else 'N/A'
    
    def extract_category(self, soup):
        """提取烈酒類別"""
        category_elem = soup.find('span', class_='category') or soup.find('div', class_='spirit-type')
        return category_elem.get_text().strip() if category_elem else 'N/A'
    
    def extract_origin(self, soup):
        """提取產地資訊"""
        origin_elem = soup.find('span', class_='origin') or soup.find('div', class_='location')
        return origin_elem.get_text().strip() if origin_elem else 'N/A'
    
    def extract_age(self, soup):
        """提取年份資訊"""
        age_elem = soup.find('span', class_='age') or soup.find('div', class_='age-statement')
        return age_elem.get_text().strip() if age_elem else 'N/A'
    
    def extract_expert_score(self, soup):
        """提取專家評分"""
        expert_score_elem = soup.find('div', class_='expert-score') or soup.find('span', class_='distiller-score')
        return expert_score_elem.get_text().strip() if expert_score_elem else 'N/A'
    
    def extract_community_score(self, soup):
        """提取社群評分"""
        community_score_elem = soup.find('div', class_='user-rating') or soup.find('span', class_='community-score')
        return community_score_elem.get_text().strip() if community_score_elem else 'N/A'
    
    def extract_flavor_profile(self, soup):
        """提取風味圖譜"""
        flavor_elems = soup.find_all('span', class_='flavor-tag') or soup.find_all('div', class_='flavor')
        flavors = [elem.get_text().strip() for elem in flavor_elems]
        return ', '.join(flavors) if flavors else 'N/A'
    
    def scrape_all_spirits(self):
        """爬取所有烈酒資訊"""
        category_urls = self.get_category_urls()
        
        for category_url in category_urls:
            print(f"正在爬取類別: {category_url}")
            self.scrape_category(category_url)
            # 添加延遲避免過度請求
            time.sleep(random.uniform(1, 3))
    
    def scrape_category(self, category_url):
        """爬取特定類別的所有烈酒"""
        page = 1
        while True:
            url = f"{category_url}&page={page}"
            response = self.session.get(url)
            
            if response.status_code != 200:
                break
                
            soup = BeautifulSoup(response.content, 'html.parser')
            spirit_links = soup.find_all('a', href=True)
            
            # 過濾出烈酒頁面連結
            spirit_urls = []
            for link in spirit_links:
                href = link['href']
                if '/spirits/' in href:
                    full_url = urljoin(self.base_url, href)
                    spirit_urls.append(full_url)
            
            if not spirit_urls:
                break
                
            for spirit_url in spirit_urls:
                spirit_data = self.scrape_spirit_page(spirit_url)
                if spirit_data:
                    self.spirits_data.append(spirit_data)
                    print(f"已爬取: {spirit_data['name']}")
                
                # 添加隨機延遲
                time.sleep(random.uniform(0.5, 2))
            
            page += 1
    
    def save_to_csv(self, filename='distiller_spirits.csv'):
        """將資料儲存為 CSV 檔案"""
        if not self.spirits_data:
            print("沒有資料可儲存")
            return
        
        df = pd.DataFrame(self.spirits_data)
        df.to_csv(filename, index=False, encoding='utf-8-sig')
        print(f"資料已儲存至 {filename}，共 {len(self.spirits_data)} 條記錄")

In [5]:
# 使用範例
if __name__ == "__main__":
    scraper = DistillerScraper()
    scraper.scrape_all_spirits()
    scraper.save_to_csv('distiller_spirits_reviews.csv')

正在爬取類別: https://distiller.com/search?category=whiskey
已爬取: Monkey 47 Dry Gin
已爬取: Eric Bordelet Selection "Henri Bernard Beudin" 1997 Single Cask Calvados
已爬取: Rémy Martin Louis XIII Cognac
已爬取: Cognac Dudognon Heritage
已爬取: Hibiki 21 Year
已爬取: Highland Park 18 Year


KeyboardInterrupt: 