## Import Packages

In [None]:
# === BEAUTIFULSOUP ===
from bs4 import BeautifulSoup

# === OS ===
import os

# === PANDAS ===
import pandas as pd

# === PICKLE ===
import pickle

# === SELENIUM ===
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# === TIME ===
import time

## Save Login Details
Helper functions that save the browser session's cookies to a file and load them back, so that a previous login can be reused

In [None]:
# Default file used to persist the browser cookies between runs
COOKIES_FILE_PATH = 'cookies.pkl'

def save_cookies(driver, path=COOKIES_FILE_PATH):
    """Pickle the cookies returned by ``driver.get_cookies()`` into *path*.

    The file is opened in binary write mode, so an existing file at *path*
    is overwritten.
    """
    with open(path, 'wb') as cookie_file:
        serialized = pickle.dumps(driver.get_cookies())
        cookie_file.write(serialized)

def load_cookies(driver, path=COOKIES_FILE_PATH):
    """Unpickle the cookie list stored at *path* and add each cookie to *driver*.

    NOTE(review): ``pickle.load`` can execute arbitrary code from a tampered
    file; only point *path* at files this notebook wrote itself.
    """
    with open(path, 'rb') as cookie_file:
        for stored_cookie in pickle.load(cookie_file):
            driver.add_cookie(stored_cookie)

## Run Web Scraper

In [None]:
# MANUAL STEPS
# ============
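# 1. Verify the CAPTCHA in the browser window
# 2. After verifying the CAPTCHA, press the Enter key in the browser to submit the login form (the script fills in the fields but does not submit them)
# 3. Press the "Enter" key in the VSCode terminal to let the script continue (ensures that login has been successful)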

# =============================================================================

# Constants
BASE_URL = "https://untappd.com"
LOGIN_URL = f"{BASE_URL}/login"
HOME_URL = f"{BASE_URL}/home"  # page the login flow waits to be redirected to
# Brewery search for "singapore"; the sort parameter is left empty
SEARCH_URL = f"{BASE_URL}/search?q=singapore&type=brewery&sort="

# Credentials
# Read from the environment so that real credentials never have to be written
# into this file. Both fall back to an empty string when the variable is unset.
USERNAME = os.environ.get("UNTAPPD_USERNAME", "")
PASSWORD = os.environ.get("UNTAPPD_PASSWORD", "")

# Initialize Selenium WebDriver
# One module-level Chrome session is created here and shared by login(),
# get_soup(), click_show_more() and scrape_untappd(), which all read `driver`
# as a global.
driver = webdriver.Chrome()
driver.maximize_window()

def login():
    """Log in to Untappd and leave the browser on the brewery search page.

    Cookies saved by a previous run are tried first. If the browser does not
    end up on ``HOME_URL`` with them, the username and password fields are
    filled in and the user is expected to solve the CAPTCHA (if any) and
    submit the form by hand; this function never submits the form itself.
    Once the browser reaches ``HOME_URL`` the session cookies are saved for
    the next run and the search page is opened.

    Raises:
        selenium.common.exceptions.TimeoutException: if the username field
            does not appear, or the browser does not reach ``HOME_URL``,
            within the 10-second wait.
    """
    # Imported locally so the function does not depend on the file-level
    # import block being changed.
    from selenium.common.exceptions import TimeoutException

    driver.get(LOGIN_URL)
    wait = WebDriverWait(driver, 10)

    # Load cookies if they exist and check if already logged in
    if os.path.exists(COOKIES_FILE_PATH):
        load_cookies(driver)
        driver.get(HOME_URL)  # Navigate to home to check if login was successful
        if driver.current_url == HOME_URL:
            print("Logged in using saved cookies")
            driver.get(SEARCH_URL)
            return
        print("Failed to log in with saved cookies. Proceeding with manual login.")
        # The home-page check may have left the browser on another page, so
        # reload the login form before looking for its fields.
        driver.get(LOGIN_URL)

    # Enter username
    username_input = wait.until(EC.presence_of_element_located((By.ID, 'username')))
    username_input.send_keys(USERNAME)

    # Enter password
    password_input = driver.find_element(By.ID, 'password')
    password_input.send_keys(PASSWORD)

    try:
        # Wait for the CAPTCHA frame; only a timeout means "no CAPTCHA"
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'iframe[title="reCAPTCHA"]')))
    except TimeoutException:
        # No CAPTCHA present, continue with login
        pass
    else:
        # Kept outside the try so Ctrl+C at the prompt is not swallowed
        print("Please solve the CAPTCHA manually if it appears, then press Enter to continue...")
        input()

    # Wait until redirected to HOME_URL
    wait.until(EC.url_to_be(HOME_URL))
    print("Login successful, redirected to home")

    # Save cookies after successful login
    save_cookies(driver)

    # Redirect to SEARCH_URL
    driver.get(SEARCH_URL)
    print("Navigated to search page")


def get_soup():
    """Return the shared driver's current page parsed with BeautifulSoup.

    Sleeps for two seconds before reading the page source to give the page
    time to finish loading.
    """
    time.sleep(2)  # fixed pause; no explicit readiness check is performed
    page_html = driver.page_source
    return BeautifulSoup(page_html, 'html.parser')

def click_show_more(class_name):
    """Click the "Show More" button repeatedly until it can no longer be clicked.

    An "announcement" modal, if one is present, is dismissed first on a
    best-effort basis. The page is then scrolled to the bottom and the button
    with the given class is clicked, over and over, until waiting for or
    clicking it fails.

    Args:
        class_name: class name of the "Show More" button to look for.
    """
    # Imported locally so the function does not depend on the file-level
    # import block being changed. WebDriverException is the base class of
    # Selenium's own errors (timeout, missing element, intercepted click, ...).
    from selenium.common.exceptions import WebDriverException

    time.sleep(2)  # Wait for new content to load

    # Dismiss the "announcement" modal if there is one
    try:
        announcement_modal = driver.find_element(By.CLASS_NAME, 'announcement')
        announcement_modal.find_element(By.CLASS_NAME, 'track-click').click()
    except WebDriverException:
        # No modal (or it could not be clicked): carry on without it
        pass

    while True:
        try:
            # Scroll to the bottom of the page
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)  # Wait for new content to load

            # Find and click the "Show More" button
            show_more_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.CLASS_NAME, class_name))
            )
            ActionChains(driver).move_to_element(show_more_button).click().perform()
            time.sleep(2)  # Ensure the new content has loaded
        except WebDriverException:
            # The button is gone or no longer clickable: stop clicking
            break

def parse_breweries(soup):
    """Extract a ``(name, link)`` pair for every brewery listed in *soup*.

    Args:
        soup: parsed page in which each result is a ``div.beer-details``
            element containing a ``p.name a`` link.

    Returns:
        List of ``(name, link)`` tuples in page order. ``name`` is the
        stripped link text and ``link`` is the link's ``href`` prefixed with
        ``BASE_URL`` (the href is presumably site-relative).
    """
    breweries = []
    for brewery in soup.select('div.beer-details'):
        # Look the link up once. Entries without a usable link are skipped
        # instead of aborting the whole scrape.
        name_link = brewery.select_one('p.name a')
        if name_link is None or not name_link.get('href'):
            continue
        breweries.append((name_link.text.strip(), BASE_URL + name_link['href']))
    print(f"Found {len(breweries)} breweries")
    return breweries

def parse_breweries_details(soup):
    """Return the stripped text of the first ``p.brewery`` element in *soup*.

    An empty string is returned when the page has no such element. The caller
    stores the result as the brewery's country of origin.
    """
    brewery_tag = soup.select_one('p.brewery')
    if not brewery_tag:
        return ""
    return brewery_tag.text.strip()

def parse_beers(soup):
    """Extract a ``(name, link)`` pair for every beer listed in *soup*.

    Args:
        soup: parsed page in which each beer is a ``div.beer-details``
            element containing a ``p.name a`` link.

    Returns:
        List of ``(name, link)`` tuples in page order. ``name`` is the
        stripped link text and ``link`` is the link's ``href`` prefixed with
        ``BASE_URL`` (the href is presumably site-relative).
    """
    beers = []
    for beer in soup.select('div.beer-details'):
        # Look the link up once. Entries without a usable link are skipped
        # instead of aborting the whole scrape.
        name_link = beer.select_one('p.name a')
        if name_link is None or not name_link.get('href'):
            continue
        beers.append((name_link.text.strip(), BASE_URL + name_link['href']))
    print(f"Found {len(beers)} beers")
    return beers

def parse_beer_details(soup):
    """Extract the detail fields of a single beer page.

    Each selector is queried once. A field falls back to an empty string when
    its element is missing or, for the attribute-based fields, when the
    element lacks the attribute.

    Args:
        soup: parsed beer page.

    Returns:
        Tuple ``(abv, drink_category, latest_review_date,
        official_description, photo_link)`` of strings.
    """
    def _text(selector):
        # Stripped text of the first match, or "" when nothing matches
        tag = soup.select_one(selector)
        return tag.text.strip() if tag is not None else ""

    def _attr(selector, attribute):
        # Attribute value of the first match, or "" when tag/attribute is absent
        tag = soup.select_one(selector)
        return tag.get(attribute, "") if tag is not None else ""

    abv = _text('p.abv')
    drink_category = _text('div.name p.style')
    latest_review_date = _attr('div.bottom a.time', 'data-gregtime')
    # NOTE(review): "descrption" is spelled as in the original selector;
    # presumably it mirrors the site's markup - confirm before "fixing" it.
    official_description = _text('div.beer-descrption-read-less')
    show_less_suffix = " Show Less"
    if official_description.endswith(show_less_suffix):
        # Drop the trailing toggle label that is part of the element's text
        official_description = official_description[:-len(show_less_suffix)]
    photo_link = _attr('a.label.image-big', 'data-image')
    return abv, drink_category, latest_review_date, official_description, photo_link

def scrape_untappd():
    """Scrape every brewery on the search page and every beer of each brewery.

    After logging in, the brewery list is expanded and parsed. For each
    brewery its ``/beer`` page is expanded and parsed, each beer page is
    visited for its details, and one row per beer is collected. The rows
    gathered so far are written to ``untappd_beers.csv`` after every brewery.
    """
    login()  # leaves the browser on the search page

    rows = []

    # Step A: collect all breweries from the fully expanded search page
    click_show_more('more_search')
    breweries = parse_breweries(get_soup())
    brewery_total = len(breweries)

    for brewery_no, (brewery_name, brewery_url) in enumerate(breweries, start=1):
        print("==================================================")
        print(f"Scraping brewery {brewery_no}/{brewery_total}: {brewery_name}")

        # Step B: collect the brewery's beers from its fully expanded beer page
        driver.get(brewery_url + "/beer")
        click_show_more('more-list-items ')
        brewery_soup = get_soup()
        beers = parse_beers(brewery_soup)
        beer_total = len(beers)

        # Step C: brewery-level detail, read from the same page
        country = parse_breweries_details(brewery_soup)

        for beer_no, (beer_name, beer_url) in enumerate(beers, start=1):
            print(f"Scraping beer {beer_no}/{beer_total}: {beer_name}")

            # Step D: visit the beer page and pull its details
            driver.get(beer_url)
            details = parse_beer_details(get_soup())
            abv, drink_category, latest_review_date, official_description, photo_link = details

            # One output row per beer; key order defines the CSV column order
            row = {
                'Brewery Index': brewery_no,
                'Beer Index': beer_no,
                'Scraping Remarks / Errors': '',  # no remarks are recorded yet
                'Date of latest review': latest_review_date,
                'Expression Name': beer_name,
                'Producer': brewery_name,
                'Bottler': '',  # left blank for now
                'Country of Origin': country,
                'Drink Type': 'Beer',
                'Drink Category': drink_category,
                'Age': '',
                'ABV': abv,
                '88B Website Review Link': '',  # left blank for now
                'Official Description': official_description,
                'Producer Website Link': '',
                'Photo Link': photo_link,
            }
            rows.append(row)

        # Step E: rewrite the CSV with everything collected so far, once per brewery
        pd.DataFrame(rows).to_csv('untappd_beers.csv', index=False)

# Run the scraper
try:
    scrape_untappd()
finally:
    # Close the driver even when scraping raises, so that no browser
    # session is left running after a failure.
    driver.quit()