# Install Relevant Packages

In [2]:
# pip install python-dotenv

---

## Web Scraper 1: Scrape Search Results
Scrape from `https://untappd.com/search?q={country}&type=brewery&sort=`

In [3]:
# IMPORT PACKAGES
# ============
# === BEAUTIFULSOUP ===
from bs4 import BeautifulSoup

# === DOTENV ===
from dotenv import load_dotenv

# === OS ===
import os

# === PANDAS ===
import pandas as pd

# === PICKLE ===
import pickle

# === SELENIUM ===
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# === TIME ===
import time

# =============================================================================

# SAVE LOGIN DETAILS
# ============

COOKIES_FILE_PATH = 'cookies.pkl'

def save_cookies(driver, path=COOKIES_FILE_PATH):
    """Pickle the cookies reported by ``driver.get_cookies()`` into ``path``.

    Args:
        driver: Object exposing ``get_cookies()`` (the Selenium WebDriver here).
        path: Destination file, opened in binary write mode.
    """
    with open(path, 'wb') as cookie_file:
        pickle.dump(driver.get_cookies(), cookie_file)

def load_cookies(driver, path=COOKIES_FILE_PATH):
    """Unpickle cookies from ``path`` and hand each one to ``driver.add_cookie``.

    Args:
        driver: Object exposing ``add_cookie(cookie)``.
        path: Pickle file previously produced by ``save_cookies``.
    """
    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # load cookie files this notebook wrote itself.
    with open(path, 'rb') as cookie_file:
        for stored_cookie in pickle.load(cookie_file):
            driver.add_cookie(stored_cookie)

# =============================================================================

# Constants
BASE_URL = "https://untappd.com"
LOGIN_URL = f"{BASE_URL}/login"
HOME_URL = f"{BASE_URL}/home"

# Credentials
# override=True makes the values in .env win over variables that already exist
# in the process environment. Without it, an OS-provided USERNAME (for example
# the account name Windows exports under that variable) would shadow the
# Untappd login stored in .env.
load_dotenv(override=True)
USERNAME = os.getenv('USERNAME')
PASSWORD = os.getenv('PASSWORD')

# Countries
# The order matters: the scraper walks this list in order and resumes from the
# last country in the list that already has a CSV in DIRECTORY.
COUNTRIES = [
    # "singapore",
    # "laos",
    # "malaysia",
    # "cambodia",
    # "indonesia",
    # "hong kong",
    "philippines",
    "taiwan",
    "vietnam",
    "israel",
    "india",
    "thailand",
    "korea",
    "japan",
    "china",
    "new zealand",
    "australia"
]

# Define the directory containing the CSV files
DIRECTORY = "../Results/Untappd Search"

def last_csv(countries, directory=None):
    """Return the path of the last country CSV that already exists on disk.

    Countries are checked in the order given; the result is the path for the
    latest entry in ``countries`` whose ``<country>.csv`` exists.

    Args:
        countries: Iterable of country names (file stems).
        directory: Folder to look in. Defaults to the module-level DIRECTORY,
            which keeps existing ``last_csv(countries)`` calls working.

    Returns:
        The full path (``os.path.join(directory, f"{country}.csv")``) of the
        last existing file, or None if none of the files exist.
    """
    if directory is None:
        directory = DIRECTORY

    latest_path = None
    for country in countries:
        candidate = os.path.join(directory, f"{country}.csv")
        if os.path.exists(candidate):
            # Later countries overwrite earlier hits, so the final value is
            # the last existing file in list order.
            latest_path = candidate
    return latest_path

def last_record(file_path):
    """Return the final row of the CSV at ``file_path``.

    Args:
        file_path: Path to a CSV file, or None.

    Returns:
        The last row as a pandas Series, or None when ``file_path`` is None,
        the loaded frame is empty, or reading fails (the error is printed).
    """
    if file_path is None:
        return None

    try:
        frame = pd.read_csv(file_path)
        # An empty frame has no last row to resume from.
        return None if frame.empty else frame.iloc[-1]
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return None

# Initialize Selenium WebDriver
# Starts a Chrome session as soon as this cell runs. login(), get_soup() and
# click_show_more() below use this module-level `driver` directly instead of
# receiving it as an argument.
driver = webdriver.Chrome()
driver.maximize_window()

def login():
    """Log the module-level ``driver`` in to Untappd, reusing saved cookies when possible.

    Flow:
        1. Open LOGIN_URL. If COOKIES_FILE_PATH exists, load the cookies and
           open HOME_URL; if the browser stays on HOME_URL the session is
           considered logged in and the function returns.
        2. Otherwise type USERNAME and PASSWORD into the login form.
        3. If a reCAPTCHA iframe shows up within the wait timeout, pause until
           the user presses Enter in the terminal.
        4. Wait for the browser to reach HOME_URL, then save the cookies.

    Raises:
        RuntimeError: USERNAME or PASSWORD is missing when the form must be
            filled in.
        selenium.common.exceptions.TimeoutException: the login form or the
            redirect to HOME_URL does not appear within 10 seconds.
    """
    from selenium.common.exceptions import TimeoutException

    # MANUAL STEPS
    # ============
    # 1. Verify the CAPTCHA
    # 2. Press the Enter key to continue after verifying the CAPTCHA
    # 3. Press "Enter" key in VSCode terminal to continue (ensures that login has been successful)
    # NOTE(review): this function never submits the form itself; it relies on
    # the manual steps above to trigger the redirect to HOME_URL.

    driver.get(LOGIN_URL)
    wait = WebDriverWait(driver, 10)

    # Load cookies if they exist and check if already logged in
    if os.path.exists(COOKIES_FILE_PATH):
        load_cookies(driver)
        driver.get(HOME_URL)  # Navigate to home to check if login was successful
        if driver.current_url == HOME_URL:
            print("Logged in using saved cookies")
            return
        print("Failed to log in with saved cookies. Proceeding with manual login.")
        # The failed check may have left the browser on some other page, so
        # go back to the login form before looking for its fields.
        driver.get(LOGIN_URL)

    # Fail with a clear message instead of the obscure error send_keys raises
    # when it is given None.
    if not USERNAME or not PASSWORD:
        raise RuntimeError("USERNAME and PASSWORD must be set (e.g. in .env) to log in manually.")

    # Enter username
    username_input = wait.until(EC.presence_of_element_located((By.ID, 'username')))
    username_input.send_keys(USERNAME)

    # Enter password
    password_input = driver.find_element(By.ID, 'password')
    password_input.send_keys(PASSWORD)

    # Only the "no CAPTCHA appeared" timeout is expected here; anything else
    # (including Ctrl-C while waiting) must propagate.
    try:
        captcha_frame = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'iframe[title="reCAPTCHA"]')))
    except TimeoutException:
        captcha_frame = None

    if captcha_frame:
        print("Please solve the CAPTCHA manually if it appears, then press Enter to continue...")
        input()

    # Wait until redirected to HOME_URL
    wait.until(EC.url_to_be(HOME_URL))
    print("Login successful, redirected to home")

    # Save cookies after successful login
    save_cookies(driver)

def get_soup():
    """Pause two seconds, then parse the driver's current page source.

    Returns:
        A BeautifulSoup tree built with the 'html.parser' backend from
        ``driver.page_source``.
    """
    time.sleep(2)  # Give the page time to finish loading
    page_html = driver.page_source
    return BeautifulSoup(page_html, 'html.parser')

def click_show_more(class_name):
    """Expand a paginated list by clicking its "Show More" button until it is gone.

    First tries to dismiss an "announcement" modal by clicking its
    'track-click' element, then repeatedly scrolls to the bottom of the page
    and clicks the element with CSS class ``class_name``. The loop ends on the
    first failure, which includes the 10-second wait for a clickable button
    timing out.

    Args:
        class_name: CSS class of the "Show More" button on the current page.
    """
    time.sleep(2)  # Wait for new content to load

    # Best-effort dismissal of the "announcement" modal. ``except Exception``
    # (not a bare ``except``) so Ctrl-C / kernel interrupts still propagate.
    try:
        announcement_modal = driver.find_element(By.CLASS_NAME, 'announcement')
        announcement_modal.find_element(By.CLASS_NAME, 'track-click').click()
    except Exception:
        pass

    while True:
        try:
            # Scroll to the bottom of the page
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)  # Wait for new content to load

            # Find and click the "Show More" button
            show_more_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.CLASS_NAME, class_name))
            )
            ActionChains(driver).move_to_element(show_more_button).click().perform()
            time.sleep(2)  # Ensure the new content has loaded
        except Exception:
            # Any failure to find or click the button is treated as "nothing
            # more to load".
            break

def parse_breweries(soup):
    """Extract (name, absolute URL) pairs for each brewery card in ``soup``.

    Each ``div.beer-details`` element is expected to contain a ``p.name a``
    link. Cards without that link, or whose link has no ``href``, are skipped
    instead of raising.

    Args:
        soup: Parsed page exposing ``select``.

    Returns:
        List of ``(name, link)`` tuples, where ``link`` is BASE_URL + href.
    """
    breweries = []
    for brewery in soup.select('div.beer-details'):
        anchor = brewery.select_one('p.name a')
        if anchor is None or not anchor.get('href'):
            continue  # Malformed card: nothing to follow
        breweries.append((anchor.text.strip(), BASE_URL + anchor['href']))
    print(f"Found {len(breweries)} breweries")
    return breweries

def parse_breweries_details(soup):
    """Return the stripped text of the first ``p.brewery`` element, or "" if absent.

    Args:
        soup: Parsed page exposing ``select_one``.
    """
    brewery_node = soup.select_one('p.brewery')
    if brewery_node:
        return brewery_node.text.strip()
    return ""

def parse_beers(soup):
    """Extract (name, absolute URL) pairs for each beer card in ``soup``.

    Each ``div.beer-details`` element is expected to contain a ``p.name a``
    link. Cards without that link, or whose link has no ``href``, are skipped
    instead of raising.

    Args:
        soup: Parsed page exposing ``select``.

    Returns:
        List of ``(name, link)`` tuples, where ``link`` is BASE_URL + href.
    """
    beers = []
    for beer in soup.select('div.beer-details'):
        anchor = beer.select_one('p.name a')
        if anchor is None or not anchor.get('href'):
            continue  # Malformed card: nothing to follow
        beers.append((anchor.text.strip(), BASE_URL + anchor['href']))
    print(f"Found {len(beers)} beers")
    return beers

def parse_beer_details(soup):
    """Pull the detail fields out of a beer page.

    Every field falls back to "" when its element is not found.

    Args:
        soup: Parsed page exposing ``select_one``.

    Returns:
        Tuple ``(abv, drink_category, latest_review_date,
        official_description, photo_link)``.
    """
    abv_node = soup.select_one('p.abv')
    abv = abv_node.text.strip() if abv_node else ""

    style_node = soup.select_one('div.name p.style')
    drink_category = style_node.text.strip() if style_node else ""

    time_node = soup.select_one('div.bottom a.time')
    latest_review_date = time_node['data-gregtime'] if time_node else ""

    description_node = soup.select_one('div.beer-descrption-read-less')
    official_description = description_node.text.strip() if description_node else ""
    # The expanded description carries a trailing " Show Less" toggle label.
    toggle_suffix = " Show Less"
    if official_description.endswith(toggle_suffix):
        official_description = official_description[:-len(toggle_suffix)]

    label_node = soup.select_one('a.label.image-big')
    photo_link = label_node['data-image'] if label_node else ""

    return abv, drink_category, latest_review_date, official_description, photo_link

def _append_row_to_csv(row, csv_file_path):
    """Append one record to ``csv_file_path``, creating the folder/file if needed."""
    os.makedirs(os.path.dirname(csv_file_path) or ".", exist_ok=True)
    # Write the header only when starting a new (or empty) file.
    needs_header = not os.path.exists(csv_file_path) or os.path.getsize(csv_file_path) == 0
    pd.DataFrame([row]).to_csv(csv_file_path, mode="a", header=needs_header, index=False)


def scrape_untappd(max_breweries=None, max_beers=None):
    """Scrape breweries and their beers from Untappd search results, one CSV per country.

    For every country in COUNTRIES (starting at the resume point, if any) the
    search page is opened and fully expanded. Each brewery's beer list is then
    visited, and each beer's detail page is parsed and appended to
    ``DIRECTORY/<country>.csv`` immediately, so an interrupted run can resume.

    Resume point: the last row of the CSV returned by ``last_csv(COUNTRIES)``.
    Its 'Brewery Index' (1-based) identifies the brewery to continue with and
    its 'Beer Index' (1-based) equals the 0-based index of the next beer.

    Args:
        max_breweries: Optional cap on breweries processed per country
            (replaces the former hard-coded one-brewery testing limit).
            None means no limit.
        max_beers: Optional cap on beers processed per brewery (replaces the
            former hard-coded one-beer testing limit). None means no limit.
    """
    login()  # Perform login

    # Extract last scraped details
    last_csv_file = last_csv(COUNTRIES)  # Get the last CSV file path
    last_row = last_record(last_csv_file)  # Get the last row from the last CSV file

    start_country_idx = 0
    start_brewery_idx = 0
    start_beer_idx = 0

    if last_csv_file is not None and last_row is not None:
        # Continue from last scraped brewery and beer
        start_brewery_idx = last_row['Brewery Index'] - 1
        start_beer_idx = last_row['Beer Index']
        # Match the country by exact file stem. A substring test against the
        # whole path can select the wrong entry when one country name is
        # contained in another name or in the directory name.
        last_country = os.path.splitext(os.path.basename(last_csv_file))[0]
        if last_country in COUNTRIES:
            start_country_idx = COUNTRIES.index(last_country)

    for country_idx, search_country in enumerate(COUNTRIES[start_country_idx:], start=start_country_idx):
        # Skip logic only applies to the country the previous run stopped in.
        resuming_country = country_idx == start_country_idx

        # Spaces in the country name become "+" in the query string
        search_country_formatted = search_country.replace(" ", "+")
        print("----------------------------------------")
        print(f"Scraping country: {search_country}")

        search_url = f"{BASE_URL}/search?q={search_country_formatted}&type=brewery&sort="
        driver.get(search_url)
        click_show_more('more_search')  # Ensure all breweries are loaded
        breweries = parse_breweries(get_soup())
        if max_breweries is not None:
            breweries = breweries[:max_breweries]

        csv_file_path = os.path.join(DIRECTORY, f"{search_country}.csv")

        for brewery_idx, (brewery_name, brewery_url) in enumerate(breweries):
            if resuming_country and brewery_idx < start_brewery_idx:
                continue

            print("==================================================")
            print(f"Scraping brewery {brewery_idx + 1}/{len(breweries)}: {brewery_name}")

            # Step B: Scrape beers from each brewery
            driver.get(f"{brewery_url}/beer")
            click_show_more('more-list-items')  # Ensure all beers are loaded
            brewery_soup = get_soup()
            beers = parse_beers(brewery_soup)
            if max_beers is not None:
                beers = beers[:max_beers]

            # Step C: Scrape brewery details
            country = parse_breweries_details(brewery_soup)

            for beer_idx, (beer_name, beer_url) in enumerate(beers):
                if resuming_country and brewery_idx == start_brewery_idx and beer_idx < start_beer_idx:
                    continue

                print(f"Scraping beer {beer_idx + 1}/{len(beers)}: {beer_name}")

                # Step D: Scrape beer details
                driver.get(beer_url)
                abv, drink_category, latest_review_date, official_description, photo_link = parse_beer_details(get_soup())

                # Collect beer data
                beer_data = {
                    'Brewery Index': brewery_idx + 1,
                    'Beer Index': beer_idx + 1,
                    'Scraping Remarks / Errors': '',  # Add any remarks if needed
                    'Date of latest review': latest_review_date,
                    'Expression Name': beer_name,
                    'Producer': brewery_name,
                    'Bottler': '', # Leave blank for now
                    'Country of Origin': country,
                    'Drink Type': 'Beer',
                    'Drink Category': drink_category,
                    'Age': '',
                    'ABV': abv,
                    '88B Website Review Link': '', # Leave blank for now
                    'Official Description': official_description,
                    'Producer Website Link': '',
                    'Photo Link': photo_link
                }

                # Step E: Save after every beer so progress survives an
                # interruption. Appending avoids re-reading and rewriting the
                # whole file each time.
                _append_row_to_csv(beer_data, csv_file_path)

# Run the scraper, and always close the browser afterwards: without the
# try/finally an exception or a kernel interrupt during the scrape would skip
# driver.quit() and leave the Chrome session running.
try:
    scrape_untappd()
finally:
    # Close the driver
    driver.quit()

---

## Web Scraper 2: Scrape Top Rated Results
Scrape from `https://untappd.com/brewery/top_rated?country={country}&brewery_type={brewery_type}`

In [4]:
# IMPORT PACKAGES
# ============
# === BEAUTIFULSOUP ===
from bs4 import BeautifulSoup

# === DOTENV ===
from dotenv import load_dotenv

# === OS ===
import os

# === PANDAS ===
import pandas as pd

# === PICKLE ===
import pickle

# === SELENIUM ===
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# === TIME ===
import time

# =============================================================================

# SAVE LOGIN DETAILS
# ============

COOKIES_FILE_PATH = 'cookies.pkl'

def save_cookies(driver, path=COOKIES_FILE_PATH):
    """Pickle the cookies reported by ``driver.get_cookies()`` into ``path``.

    Args:
        driver: Object exposing ``get_cookies()`` (the Selenium WebDriver here).
        path: Destination file, opened in binary write mode.
    """
    with open(path, 'wb') as cookie_file:
        pickle.dump(driver.get_cookies(), cookie_file)

def load_cookies(driver, path=COOKIES_FILE_PATH):
    """Unpickle cookies from ``path`` and hand each one to ``driver.add_cookie``.

    Args:
        driver: Object exposing ``add_cookie(cookie)``.
        path: Pickle file previously produced by ``save_cookies``.
    """
    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # load cookie files this notebook wrote itself.
    with open(path, 'rb') as cookie_file:
        for stored_cookie in pickle.load(cookie_file):
            driver.add_cookie(stored_cookie)

# =============================================================================

# Constants
BASE_URL = "https://untappd.com"
LOGIN_URL = f"{BASE_URL}/login"
HOME_URL = f"{BASE_URL}/home"

# Credentials
# override=True makes the values in .env win over variables that already exist
# in the process environment. Without it, an OS-provided USERNAME (for example
# the account name Windows exports under that variable) would shadow the
# Untappd login stored in .env.
load_dotenv(override=True)
USERNAME = os.getenv('USERNAME')
PASSWORD = os.getenv('PASSWORD')

# Countries
# The order matters: the scraper walks this list in order and resumes from the
# last country in the list that already has a CSV in DIRECTORY.
COUNTRIES = [
    "united states",
    "canada",
    "england",
    "scotland",
    "ireland",
    "northern ireland",
    # "wales",
    # "germany",
    # "belgium",
    # "austria",
    # "netherlands",
    # "poland",
    # "czech republic",
    # "norway",
    # "france",
    # "denmark",
    # "sweden"
]

# Define the directory containing the CSV files
DIRECTORY = "../Results/Untappd Top Rated"

def last_csv(countries, directory=None):
    """Return the path of the last country CSV that already exists on disk.

    Countries are checked in the order given; the result is the path for the
    latest entry in ``countries`` whose ``<country>.csv`` exists.

    Args:
        countries: Iterable of country names (file stems).
        directory: Folder to look in. Defaults to the module-level DIRECTORY,
            which keeps existing ``last_csv(countries)`` calls working.

    Returns:
        The full path (``os.path.join(directory, f"{country}.csv")``) of the
        last existing file, or None if none of the files exist.
    """
    if directory is None:
        directory = DIRECTORY

    latest_path = None
    for country in countries:
        candidate = os.path.join(directory, f"{country}.csv")
        if os.path.exists(candidate):
            # Later countries overwrite earlier hits, so the final value is
            # the last existing file in list order.
            latest_path = candidate
    return latest_path

def last_record(file_path):
    """Return the final row of the CSV at ``file_path``.

    Args:
        file_path: Path to a CSV file, or None.

    Returns:
        The last row as a pandas Series, or None when ``file_path`` is None,
        the loaded frame is empty, or reading fails (the error is printed).
    """
    if file_path is None:
        return None

    try:
        frame = pd.read_csv(file_path)
        # An empty frame has no last row to resume from.
        return None if frame.empty else frame.iloc[-1]
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return None

# Initialize Selenium WebDriver
# Starts a Chrome session as soon as this cell runs. login(), get_soup() and
# click_show_more() below use this module-level `driver` directly instead of
# receiving it as an argument.
driver = webdriver.Chrome()
driver.maximize_window()

def login():
    """Log the module-level ``driver`` in to Untappd, reusing saved cookies when possible.

    Flow:
        1. Open LOGIN_URL. If COOKIES_FILE_PATH exists, load the cookies and
           open HOME_URL; if the browser stays on HOME_URL the session is
           considered logged in and the function returns.
        2. Otherwise type USERNAME and PASSWORD into the login form.
        3. If a reCAPTCHA iframe shows up within the wait timeout, pause until
           the user presses Enter in the terminal.
        4. Wait for the browser to reach HOME_URL, then save the cookies.

    Raises:
        RuntimeError: USERNAME or PASSWORD is missing when the form must be
            filled in.
        selenium.common.exceptions.TimeoutException: the login form or the
            redirect to HOME_URL does not appear within 10 seconds.
    """
    from selenium.common.exceptions import TimeoutException

    # MANUAL STEPS
    # ============
    # 1. Verify the CAPTCHA
    # 2. Press the Enter key to continue after verifying the CAPTCHA
    # 3. Press "Enter" key in VSCode terminal to continue (ensures that login has been successful)
    # NOTE(review): this function never submits the form itself; it relies on
    # the manual steps above to trigger the redirect to HOME_URL.

    driver.get(LOGIN_URL)
    wait = WebDriverWait(driver, 10)

    # Load cookies if they exist and check if already logged in
    if os.path.exists(COOKIES_FILE_PATH):
        load_cookies(driver)
        driver.get(HOME_URL)  # Navigate to home to check if login was successful
        if driver.current_url == HOME_URL:
            print("Logged in using saved cookies")
            return
        print("Failed to log in with saved cookies. Proceeding with manual login.")
        # The failed check may have left the browser on some other page, so
        # go back to the login form before looking for its fields.
        driver.get(LOGIN_URL)

    # Fail with a clear message instead of the obscure error send_keys raises
    # when it is given None.
    if not USERNAME or not PASSWORD:
        raise RuntimeError("USERNAME and PASSWORD must be set (e.g. in .env) to log in manually.")

    # Enter username
    username_input = wait.until(EC.presence_of_element_located((By.ID, 'username')))
    username_input.send_keys(USERNAME)

    # Enter password
    password_input = driver.find_element(By.ID, 'password')
    password_input.send_keys(PASSWORD)

    # Only the "no CAPTCHA appeared" timeout is expected here; anything else
    # (including Ctrl-C while waiting) must propagate.
    try:
        captcha_frame = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'iframe[title="reCAPTCHA"]')))
    except TimeoutException:
        captcha_frame = None

    if captcha_frame:
        print("Please solve the CAPTCHA manually if it appears, then press Enter to continue...")
        input()

    # Wait until redirected to HOME_URL
    wait.until(EC.url_to_be(HOME_URL))
    print("Login successful, redirected to home")

    # Save cookies after successful login
    save_cookies(driver)

def get_soup():
    """Pause two seconds, then parse the driver's current page source.

    Returns:
        A BeautifulSoup tree built with the 'html.parser' backend from
        ``driver.page_source``.
    """
    time.sleep(2)  # Give the page time to finish loading
    page_html = driver.page_source
    return BeautifulSoup(page_html, 'html.parser')

def click_show_more(class_name):
    """Expand a paginated list by clicking its "Show More" button until it is gone.

    First tries to dismiss an "announcement" modal by clicking its
    'track-click' element, then repeatedly scrolls to the bottom of the page
    and clicks the element with CSS class ``class_name``. The loop ends on the
    first failure, which includes the 10-second wait for a clickable button
    timing out.

    Args:
        class_name: CSS class of the "Show More" button on the current page.
    """
    time.sleep(2)  # Wait for new content to load

    # Best-effort dismissal of the "announcement" modal. ``except Exception``
    # (not a bare ``except``) so Ctrl-C / kernel interrupts still propagate.
    try:
        announcement_modal = driver.find_element(By.CLASS_NAME, 'announcement')
        announcement_modal.find_element(By.CLASS_NAME, 'track-click').click()
    except Exception:
        pass

    while True:
        try:
            # Scroll to the bottom of the page
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)  # Wait for new content to load

            # Find and click the "Show More" button
            show_more_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.CLASS_NAME, class_name))
            )
            ActionChains(driver).move_to_element(show_more_button).click().perform()
            time.sleep(2)  # Ensure the new content has loaded
        except Exception:
            # Any failure to find or click the button is treated as "nothing
            # more to load".
            break

def parse_breweries(soup):
    """Extract (name, absolute URL) pairs for each brewery card in ``soup``.

    Each ``div.beer-details`` element is expected to contain a ``p.name a``
    link. Cards without that link, or whose link has no ``href``, are skipped
    instead of raising.

    Args:
        soup: Parsed page exposing ``select``.

    Returns:
        List of ``(name, link)`` tuples, where ``link`` is BASE_URL + href.
    """
    breweries = []
    for brewery in soup.select('div.beer-details'):
        anchor = brewery.select_one('p.name a')
        if anchor is None or not anchor.get('href'):
            continue  # Malformed card: nothing to follow
        breweries.append((anchor.text.strip(), BASE_URL + anchor['href']))
    print(f"Found {len(breweries)} breweries")
    return breweries

def parse_breweries_details(soup):
    """Return the stripped text of the first ``p.brewery`` element, or "" if absent.

    Args:
        soup: Parsed page exposing ``select_one``.
    """
    brewery_node = soup.select_one('p.brewery')
    if brewery_node:
        return brewery_node.text.strip()
    return ""

def parse_beers(soup):
    """Extract (name, absolute URL) pairs for each beer card in ``soup``.

    Each ``div.beer-details`` element is expected to contain a ``p.name a``
    link. Cards without that link, or whose link has no ``href``, are skipped
    instead of raising.

    Args:
        soup: Parsed page exposing ``select``.

    Returns:
        List of ``(name, link)`` tuples, where ``link`` is BASE_URL + href.
    """
    beers = []
    for beer in soup.select('div.beer-details'):
        anchor = beer.select_one('p.name a')
        if anchor is None or not anchor.get('href'):
            continue  # Malformed card: nothing to follow
        beers.append((anchor.text.strip(), BASE_URL + anchor['href']))
    print(f"Found {len(beers)} beers")
    return beers

def parse_beer_details(soup):
    """Pull the detail fields out of a beer page.

    Every field falls back to "" when its element is not found.

    Args:
        soup: Parsed page exposing ``select_one``.

    Returns:
        Tuple ``(abv, drink_category, latest_review_date,
        official_description, photo_link)``.
    """
    abv_node = soup.select_one('p.abv')
    abv = abv_node.text.strip() if abv_node else ""

    style_node = soup.select_one('div.name p.style')
    drink_category = style_node.text.strip() if style_node else ""

    time_node = soup.select_one('div.bottom a.time')
    latest_review_date = time_node['data-gregtime'] if time_node else ""

    description_node = soup.select_one('div.beer-descrption-read-less')
    official_description = description_node.text.strip() if description_node else ""
    # The expanded description carries a trailing " Show Less" toggle label.
    toggle_suffix = " Show Less"
    if official_description.endswith(toggle_suffix):
        official_description = official_description[:-len(toggle_suffix)]

    label_node = soup.select_one('a.label.image-big')
    photo_link = label_node['data-image'] if label_node else ""

    return abv, drink_category, latest_review_date, official_description, photo_link

def _append_row_to_csv(row, csv_file_path):
    """Append one record to ``csv_file_path``, creating the folder/file if needed."""
    os.makedirs(os.path.dirname(csv_file_path) or ".", exist_ok=True)
    # Write the header only when starting a new (or empty) file.
    needs_header = not os.path.exists(csv_file_path) or os.path.getsize(csv_file_path) == 0
    pd.DataFrame([row]).to_csv(csv_file_path, mode="a", header=needs_header, index=False)


def scrape_untappd():
    """Scrape top-rated breweries and their beers from Untappd, one CSV per country.

    The brewery-type slugs are read from the filter dropdown of the top-rated
    page. For every country in COUNTRIES (starting at the resume point, if
    any) and every brewery type, the top-rated list is parsed. Each brewery's
    beer list is then visited, and each beer's detail page is parsed and
    appended to ``DIRECTORY/<country>.csv`` immediately, so an interrupted run
    can resume.

    'Brewery Index' is a 1-based running count of breweries within one country
    (across all brewery types). The resume point is read back from a
    per-country CSV, so the counter is restarted for every country.
    'Beer Index' (1-based) equals the 0-based index of the next beer to scrape.
    """
    login()  # Perform login

    # Get all brewery types except "all"
    driver.get(f"{BASE_URL}/brewery/top_rated?country=&brewery_type=")
    soup = get_soup()
    # .get() tolerates <option> elements lacking either attribute instead of
    # raising KeyError.
    brewery_types = [
        option['data-value-slug']
        for option in soup.select('select#filter_picker option')
        if option.get('value') != 'all' and option.get('data-value-slug')
    ]

    # Extract last scraped details
    last_csv_file = last_csv(COUNTRIES)  # Get the last CSV file path
    last_row = last_record(last_csv_file)  # Get the last row from the last CSV file

    start_country_idx = 0
    start_brewery_idx = 0
    start_beer_idx = 0

    if last_csv_file is not None and last_row is not None:
        # Continue from last scraped brewery and beer
        start_brewery_idx = last_row['Brewery Index'] - 1
        start_beer_idx = last_row['Beer Index']
        # Match the country by exact file stem. A substring test against the
        # path would pick "ireland" for "northern ireland.csv".
        last_country = os.path.splitext(os.path.basename(last_csv_file))[0]
        if last_country in COUNTRIES:
            start_country_idx = COUNTRIES.index(last_country)

    for country_idx, search_country in enumerate(COUNTRIES[start_country_idx:], start=start_country_idx):
        # Skip logic only applies to the country the previous run stopped in.
        resuming_country = country_idx == start_country_idx

        # Spaces in the country name become "-" in the query string
        search_country_formatted = search_country.replace(" ", "-")
        print("----------------------------------------")
        print(f"Scraping country: {search_country}")

        csv_file_path = os.path.join(DIRECTORY, f"{search_country}.csv")

        # Running brewery count for this country, across all brewery types.
        country_brewery_idx = 0

        for brewery_type in brewery_types:
            print("++++++++++++++++++++++++++++++++++++++++")
            print(f"Scraping brewery type: {brewery_type}")
            search_url = f"{BASE_URL}/brewery/top_rated?country={search_country_formatted}&brewery_type={brewery_type}"
            driver.get(search_url)
            time.sleep(2)  # Wait for the page to load
            breweries = parse_breweries(get_soup())

            # Skip a whole brewery type that was already fully scraped
            if resuming_country and country_brewery_idx + len(breweries) <= start_brewery_idx:
                country_brewery_idx += len(breweries)
                continue

            for brewery_idx, (brewery_name, brewery_url) in enumerate(breweries):
                if resuming_country and country_brewery_idx < start_brewery_idx:
                    country_brewery_idx += 1
                    continue

                print("==================================================")
                print(f"Scraping brewery {brewery_idx + 1}: {brewery_name}")

                # Step B: Scrape beers from each brewery
                driver.get(f"{brewery_url}/beer")
                click_show_more('more-list-items')  # Ensure all beers are loaded
                brewery_soup = get_soup()
                beers = parse_beers(brewery_soup)

                # Step C: Scrape brewery details
                country = parse_breweries_details(brewery_soup)

                for beer_idx, (beer_name, beer_url) in enumerate(beers):
                    if (
                        resuming_country
                        and country_brewery_idx == start_brewery_idx
                        and beer_idx < start_beer_idx
                    ):
                        continue

                    print(f"Scraping beer {beer_idx + 1}/{len(beers)}: {beer_name}")

                    # Step D: Scrape beer details
                    driver.get(beer_url)
                    abv, drink_category, latest_review_date, official_description, photo_link = parse_beer_details(get_soup())

                    # Collect beer data
                    beer_data = {
                        'Brewery Index': country_brewery_idx + 1,
                        'Beer Index': beer_idx + 1,
                        'Scraping Remarks / Errors': '',  # Add any remarks if needed
                        'Date of latest review': latest_review_date,
                        'Expression Name': beer_name,
                        'Producer': brewery_name,
                        'Bottler': '', # Leave blank for now
                        'Country of Origin': country,
                        'Drink Type': 'Beer',
                        'Drink Category': drink_category,
                        'Age': '',
                        'ABV': abv,
                        '88B Website Review Link': '', # Leave blank for now
                        'Official Description': official_description,
                        'Producer Website Link': '',
                        'Photo Link': photo_link
                    }

                    # Step E: Save after every beer so progress survives an
                    # interruption. Appending avoids re-reading and rewriting
                    # the whole file each time.
                    _append_row_to_csv(beer_data, csv_file_path)

                # Count this brewery as done for the current country
                country_brewery_idx += 1

# Run the scraper, and always close the browser afterwards: without the
# try/finally an exception or a kernel interrupt during the scrape (as in the
# KeyboardInterrupt recorded in this cell's output) would skip driver.quit()
# and leave the Chrome session running.
try:
    scrape_untappd()
finally:
    # Close the driver
    driver.quit()

Logged in using saved cookies
----------------------------------------
Scraping country: united states
++++++++++++++++++++++++++++++++++++++++
Scraping brewery type: macro_brewery
Found 50 breweries
++++++++++++++++++++++++++++++++++++++++
Scraping brewery type: micro_brewery
Found 50 breweries
Scraping brewery 1: Brujos Brewing
Found 50 beers
Scraping beer 7/50: TDH Mulciber (2024)
Scraping beer 8/50: Magister Ignis (2023)
Scraping beer 9/50: Lord of the Scorched Church (2023)
Scraping beer 10/50: Sitra Achra (2022)
Scraping beer 11/50: Thou (2024)
Scraping beer 12/50: Sixes
Scraping beer 13/50: Loneliness (2024)
Scraping beer 14/50: San Nosferatu (2023)
Scraping beer 15/50: Mulciber (2023)
Scraping beer 16/50: Signum Flavum
Scraping beer 17/50: El Loro de Oro
Scraping beer 18/50: The Pact
Scraping beer 19/50: Secular (2023)
Scraping beer 20/50: Cuatro Jinetes: Nelson


KeyboardInterrupt: 