In [50]:
import requests
from bs4 import BeautifulSoup
import time

def get_review_links(base_url):
    page = 1
    review_links = []

    while True:
        url = f"{base_url}page/{page}/" if page > 1 else base_url
        print(f"Scraping page: {url}")
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find all h2.review-title tags
        titles = soup.find_all('h2', class_='review-title')
        if not titles:
            break  # No more review blocks

        for h2 in titles:
            a_tag = h2.find('a', href=True)
            if a_tag:
                review_links.append(a_tag['href'])

        page += 1
        time.sleep(0.1)

    return review_links



In [51]:
# urls = get_review_links("https://www.coffeereview.com/coffee-origins/ethiopia/")
# print("\n".join(urls[:5]))  # Print first few review URLs

In [52]:
def parse_review(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    content = soup.find('div', class_='entry-content')
    if not content:
        return None

    lines = content.get_text(separator="\n").split("\n")
    lines = [line.strip() for line in lines if line.strip()]

    data = {"URL": url}

    # First few lines: Rating, Company, Coffee Name
    try:
        data["Rating"] = lines[0]
        data["Company"] = lines[1]
        data["Coffee Name"] = lines[2]
    except IndexError:
        return None

    # Label-value parsing
    for i, line in enumerate(lines):
        if line.startswith("Roaster Location:"):
            data["Roaster Location"] = lines[i + 1]
        elif line.startswith("Coffee Origin:"):
            data["Coffee Origin"] = lines[i + 1]
        elif line.startswith("Roast Level:"):
            data["Roast Level"] = lines[i + 1]
        elif line.startswith("Agtron:"):
            data["Agtron"] = lines[i + 1]
        elif line.startswith("Est. Price:"):
            data["Est. Price"] = lines[i + 1]
        elif line.startswith("Review Date:"):
            data["Review Date"] = lines[i + 1]
        elif line.startswith("Aroma:"):
            data["Aroma"] = lines[i + 1]
        elif line.startswith("Acidity/Structure:"):
            data["Acidity"] = lines[i + 1]
        elif line.startswith("Body:"):
            data["Body"] = lines[i + 1]
        elif line.startswith("Flavor:"):
            data["Flavor"] = lines[i + 1]
        elif line.startswith("Aftertaste:"):
            data["Aftertaste"] = lines[i + 1]
        elif line.startswith("With Milk:"):
            data["With Milk"] = lines[i + 1]
        elif line == "Notes":
            data["Notes"] = lines[i + 1]
        elif line.startswith("Bottom Line"):
            data["Bottom Line"] = line[len("Bottom Line"):].strip()
        elif line == "Blind Assessment":
            data["Blind Assessment"] = lines[i + 1]

    return data


In [53]:
import pandas as pd

base_url = "https://www.coffeereview.com/coffee-origins/ethiopia/"
review_urls = get_review_links(base_url)

all_reviews = []
for url in review_urls:
    print(f"Parsing review: {url}")
    review = parse_review(url)
    if review:
        all_reviews.append(review)
    time.sleep(0.1)

df = pd.DataFrame(all_reviews)
df.to_csv("ethiopian_coffee_reviews.csv", index=False)
print("Saved to ethiopian_coffee_reviews.csv")


Scraping page: https://www.coffeereview.com/coffee-origins/ethiopia/
Scraping page: https://www.coffeereview.com/coffee-origins/ethiopia/page/2/
Scraping page: https://www.coffeereview.com/coffee-origins/ethiopia/page/3/
Scraping page: https://www.coffeereview.com/coffee-origins/ethiopia/page/4/
Scraping page: https://www.coffeereview.com/coffee-origins/ethiopia/page/5/
Scraping page: https://www.coffeereview.com/coffee-origins/ethiopia/page/6/
Scraping page: https://www.coffeereview.com/coffee-origins/ethiopia/page/7/
Scraping page: https://www.coffeereview.com/coffee-origins/ethiopia/page/8/
Scraping page: https://www.coffeereview.com/coffee-origins/ethiopia/page/9/
Scraping page: https://www.coffeereview.com/coffee-origins/ethiopia/page/10/
Scraping page: https://www.coffeereview.com/coffee-origins/ethiopia/page/11/
Scraping page: https://www.coffeereview.com/coffee-origins/ethiopia/page/12/
Scraping page: https://www.coffeereview.com/coffee-origins/ethiopia/page/13/
Scraping page: 

In [None]:
worst_coffee = df[df["Rating"] == df['Rating'].min()]
display(worst_coffee)
display(worst_coffee["URL"])

Unnamed: 0,URL,Rating,Company,Coffee Name,Roaster Location,Coffee Origin,Roast Level,Agtron,Est. Price,Review Date,Aroma,Body,Flavor,Aftertaste,With Milk,Blind Assessment,Notes,Bottom Line,Acidity
559,https://www.coffeereview.com/review/original-g...,80,Golden Ratio,Original Gold Coffee Pouches,"Austin, Texas",Ethiopia,,0/0,$14.99/seven single-serve pouches,April 2021,6,7,6,5,,Evaluated at a steeping time of 6 minutes. Thi...,The Golden Ratio company aims at selling North...,,6


559    https://www.coffeereview.com/review/original-g...
Name: URL, dtype: object

In [65]:
best_coffee = df[df["Rating"] == df['Rating'].max()]
display(best_coffee)
display(best_coffee["URL"])

Unnamed: 0,URL,Rating,Company,Coffee Name,Roaster Location,Coffee Origin,Roast Level,Agtron,Est. Price,Review Date,Aroma,Body,Flavor,Aftertaste,With Milk,Blind Assessment,Notes,Bottom Line,Acidity
510,https://www.coffeereview.com/review/testi-ayla...,97,Barrington Coffee Roasting,Testi Ayla Double Ethiopia,"Lee, Massachusetts","Sidama Zone, Oromia Region, Ethiopia",Light,64/86,$54.95/12 ounces,August 2021,9,9,10,9,,"High-toned, floral-driven, lyrically sweet. Wi...",This exceptional coffee was selected as the No...,,10
608,https://www.coffeereview.com/review/ardent-eth...,97,JBC Coffee Roasters,Ardent Ethiopia Natural,"Madison, Wisconsin","Sidamo (also Sidama) growing region, south-cen...",Medium-Light,57/77,$35.00/8 ounces,November 2020,10,9,10,9,,"Delicately sweet-tart, richly and intricately ...",This exceptional coffee was selected as the No...,,9
617,https://www.coffeereview.com/review/ethiopia-n...,97,Kakalove Cafe,Ethiopia Natural Guji D Minor Special Lot,"Chia-Yi, Taiwan","Guji Zone, Oromia Region, southern Ethiopia",Medium-Light,60/76,NT $400/8 ounces,October 2020,10,9,10,9,,"Luminous, clear, confident, lively. Passion fr...",This exceptional coffee was selected as the No...,,9


510    https://www.coffeereview.com/review/testi-ayla...
608    https://www.coffeereview.com/review/ardent-eth...
617    https://www.coffeereview.com/review/ethiopia-n...
Name: URL, dtype: object

In [66]:
base_url = "https://www.coffeereview.com/coffee-origins/yemen/"
review_urls = get_review_links(base_url)

all_reviews = []
for url in review_urls:
    print(f"Parsing review: {url}")
    review = parse_review(url)
    if review:
        all_reviews.append(review)
    time.sleep(0.1)

df = pd.DataFrame(all_reviews)
df.to_csv("yemen_coffee_reviews.csv", index=False)
print("Saved to yemen_coffee_reviews.csv")

Scraping page: https://www.coffeereview.com/coffee-origins/yemen/
Scraping page: https://www.coffeereview.com/coffee-origins/yemen/page/2/
Scraping page: https://www.coffeereview.com/coffee-origins/yemen/page/3/
Parsing review: https://www.coffeereview.com/review/yemen-al-obrah/
Parsing review: https://www.coffeereview.com/review/yemen-ismaili/
Parsing review: https://www.coffeereview.com/review/yemen-haraaz-3/
Parsing review: https://www.coffeereview.com/review/yemen-al-kawlah-special-reserve-natural/
Parsing review: https://www.coffeereview.com/review/yemen-natural-hamoud-abdullah-al-khisheni/
Parsing review: https://www.coffeereview.com/review/yemen-natural-sanaa-manakhah-jaadi/
Parsing review: https://www.coffeereview.com/review/yemen-haraaz-red-mahal-aqeeq-ul-station-natural/
Parsing review: https://www.coffeereview.com/review/yemen-al-mashtal-al-burhani-2/
Parsing review: https://www.coffeereview.com/review/haraaz-red-yemen/
Parsing review: https://www.coffeereview.com/review/yem

In [67]:
base_url = "https://www.coffeereview.com/coffee-origins/kenya/"
review_urls = get_review_links(base_url)

all_reviews = []
for url in review_urls:
    print(f"Parsing review: {url}")
    review = parse_review(url)
    if review:
        all_reviews.append(review)
    time.sleep(0.1)

df = pd.DataFrame(all_reviews)
df.to_csv("kenya_coffee_reviews.csv", index=False)
print("Saved to kenya_coffee_reviews.csv")

Scraping page: https://www.coffeereview.com/coffee-origins/kenya/
Scraping page: https://www.coffeereview.com/coffee-origins/kenya/page/2/
Scraping page: https://www.coffeereview.com/coffee-origins/kenya/page/3/
Scraping page: https://www.coffeereview.com/coffee-origins/kenya/page/4/
Scraping page: https://www.coffeereview.com/coffee-origins/kenya/page/5/
Scraping page: https://www.coffeereview.com/coffee-origins/kenya/page/6/
Scraping page: https://www.coffeereview.com/coffee-origins/kenya/page/7/
Scraping page: https://www.coffeereview.com/coffee-origins/kenya/page/8/
Scraping page: https://www.coffeereview.com/coffee-origins/kenya/page/9/
Scraping page: https://www.coffeereview.com/coffee-origins/kenya/page/10/
Scraping page: https://www.coffeereview.com/coffee-origins/kenya/page/11/
Parsing review: https://www.coffeereview.com/review/kenya-gondo-peaberry/
Parsing review: https://www.coffeereview.com/review/kenya-kirinyaga-cimba-geisha-washed/
Parsing review: https://www.coffeerevie

In [70]:
origins = ["uganda", "rwanda", "burundi", "tanzania", "zimbabwe", "malawi", "zambia", # africa
           "hawaii", "mexico", "guatemala", "el-salvador", "honduras", "nicaragua", "costa-rica", "panama",
           "columbia", "brazil", "peru", "ecuador", "bolivia", "india", "sumatra", "sulawesi-celebes",
           "java", "papua-new-guinea", "thailand", "vietnam", "east-timor", "jamaica", "haiti", "dominican-republic", "puerto-rico"]

In [72]:
failed = []
for origin in origins:
    base_url = f"https://www.coffeereview.com/coffee-origins/{origin}/"
    review_urls = get_review_links(base_url)

    all_reviews = []
    for url in review_urls:
        print(f"Parsing review: {url}")
        review = parse_review(url)
        if review:
            all_reviews.append(review)
        # time.sleep(0.1)

    df = pd.DataFrame(all_reviews)
    if df.shape[0] == 0:
        failed.append(origin)
    else:
        df.to_csv(f"coffee-reviews\{origin}_coffee_reviews.csv", index=False)
        print(f"Saved to {origin}_coffee_reviews.csv")

Scraping page: https://www.coffeereview.com/coffee-origins/uganda/
Scraping page: https://www.coffeereview.com/coffee-origins/rwanda/
Scraping page: https://www.coffeereview.com/coffee-origins/burundi/
Scraping page: https://www.coffeereview.com/coffee-origins/tanzania/
Scraping page: https://www.coffeereview.com/coffee-origins/zimbabwe/
Scraping page: https://www.coffeereview.com/coffee-origins/malawi/
Scraping page: https://www.coffeereview.com/coffee-origins/zambia/
Scraping page: https://www.coffeereview.com/coffee-origins/hawaii/
Scraping page: https://www.coffeereview.com/coffee-origins/hawaii/page/2/
Scraping page: https://www.coffeereview.com/coffee-origins/hawaii/page/3/
Scraping page: https://www.coffeereview.com/coffee-origins/hawaii/page/4/
Scraping page: https://www.coffeereview.com/coffee-origins/hawaii/page/5/
Scraping page: https://www.coffeereview.com/coffee-origins/hawaii/page/6/
Scraping page: https://www.coffeereview.com/coffee-origins/hawaii/page/7/
Scraping page: 

In [None]:
# import requests
# from bs4 import BeautifulSoup

# url = 'https://www.coffeereview.com/coffee-origins/ethiopia/page/2/'
# response = requests.get(url)
# soup = BeautifulSoup(response.text, 'html.parser')

# articles = soup.find_all('article')
# for article in articles:
#     print(article.prettify())
#     break  # Just one for inspection


In [75]:
# base = "https://www.coffeereview.com/advanced-search/"
# params = "?keyword&search=Search+Now&locations=all&score_all=on&score_96_100=on&score_93_95=on&score_90_92=on&score_85_89=on&score_85=on"

# for page in range(1, 6):  # Change 6 to however many pages you want
#     if page == 1:
#         url = base + params
#     else:
#         url = f"{base}page/{page}/{params}"
    
#     print("Scraping:", url)

# base = "https://www.coffeereview.com/advanced-search/"
# params = "?keyword&search=Search+Now&locations=all&score_all=on&score_96_100=on&score_93_95=on&score_90_92=on&score_85_89=on&score_85=on"

def get_review_links(base, params):
    page = 1
    review_links = []

    while True:
        # url = f"{base_url}page/{page}/" if page > 1 else base_url
        if page == 1:
            url = base + params
        else:
            url = f"{base}page/{page}/{params}"
        print(f"Scraping page: {url}")
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find all h2.review-title tags
        titles = soup.find_all('h2', class_='review-title')
        if not titles:
            break  # No more review blocks

        for h2 in titles:
            a_tag = h2.find('a', href=True)
            if a_tag:
                review_links.append(a_tag['href'])

        page += 1
        # time.sleep(0.1)

    return review_links

In [76]:
base = "https://www.coffeereview.com/advanced-search/"
params = "?keyword&search=Search+Now&locations=all&score_all=on&score_96_100=on&score_93_95=on&score_90_92=on&score_85_89=on&score_85=on"
review_urls = get_review_links(base, params)

all_reviews = []
for url in review_urls:
    print(f"Parsing review: {url}")
    review = parse_review(url)
    if review:
        all_reviews.append(review)
    # time.sleep(0)

df = pd.DataFrame(all_reviews)
df.to_csv("all_coffee_reviews.csv", index=False)
print("Saved to all_coffee_reviews.csv")

Scraping page: https://www.coffeereview.com/advanced-search/?keyword&search=Search+Now&locations=all&score_all=on&score_96_100=on&score_93_95=on&score_90_92=on&score_85_89=on&score_85=on
Scraping page: https://www.coffeereview.com/advanced-search/page/2/?keyword&search=Search+Now&locations=all&score_all=on&score_96_100=on&score_93_95=on&score_90_92=on&score_85_89=on&score_85=on
Scraping page: https://www.coffeereview.com/advanced-search/page/3/?keyword&search=Search+Now&locations=all&score_all=on&score_96_100=on&score_93_95=on&score_90_92=on&score_85_89=on&score_85=on
Scraping page: https://www.coffeereview.com/advanced-search/page/4/?keyword&search=Search+Now&locations=all&score_all=on&score_96_100=on&score_93_95=on&score_90_92=on&score_85_89=on&score_85=on
Scraping page: https://www.coffeereview.com/advanced-search/page/5/?keyword&search=Search+Now&locations=all&score_all=on&score_96_100=on&score_93_95=on&score_90_92=on&score_85_89=on&score_85=on
Scraping page: https://www.coffeerevi

In [77]:
df.shape[0]

2899

In [78]:
df.head()

Unnamed: 0,URL,Rating,Company,Coffee Name,Roaster Location,Coffee Origin,Roast Level,Agtron,Est. Price,Review Date,Aroma,Acidity,Body,Flavor,Aftertaste,Blind Assessment,Notes,Bottom Line,With Milk
0,https://www.coffeereview.com/review/teapot-rui...,91,Teapot Coffee & Tea,Teapot Ruili Geisha,"Meishan Township, Alishan, Taiwan","Ruili Village, Meishan Township, Alishan, Chia...",Medium-Light,61/79,"NT $1,200/8 ounces",May 2025,8.0,8.0,8,9,8,"Crisply sweet, delicately tart. Dried red plum...",Produced by,,
1,https://www.coffeereview.com/review/ethiopia-b...,94,Caoban Coffee,Ethiopia Buku Abel Bastet Station Geisha Washe...,"Taipei, Taiwan","Guji Zone, Oromia region, southern Ethiopia",Medium,48/64,NT $600/226 grams,May 2025,9.0,,9,9,8,"Evaluated as espresso. Sweetly tart, very choc...","Produced by smallholding farmers, entirely of ...",,9.0
2,https://www.coffeereview.com/review/nicaragua-...,93,Rusty Dog Coffee,Nicaragua Jinotega Natural Microlot,"Madison, Wisconsin","Jinotega Department, Nicaragua",Medium-Light,57/75,$24.50/12 ounces,May 2025,,8.0,9,9,8,"Richly sweet, gently tart. Pomegranate, baking...",Produced by Angelina Lopez of Finca Los Papale...,,
3,https://www.coffeereview.com/review/rwanda-huy...,93,Rusty Dog Coffee,Rwanda Huye Peaberry,"Madison, Wisconsin","Huye District, Rwanda",Medium-Light,58/76,$24.50/12 ounces,May 2025,9.0,8.0,9,9,8,"Citrus-sweet. Tangerine, cocoa nib, rose hips,...",Produced by smallholding farmers surrounding H...,,
4,https://www.coffeereview.com/review/colombia-e...,95,Utopian Coffee,Colombia Edwin Noreña Geisha,"Fort Wayne, Indiana","Quindio Department, Colombia",Light,62/84,$29.00/8 ounces,May 2025,9.0,9.0,9,9,9,"Richly floral, deeply fruit-forward. Cherry co...",Produced by Edwin Noreña of Finca Campo Hermos...,,


In [79]:
df[df["Rating"] == df["Rating"].min()]

Unnamed: 0,URL,Rating,Company,Coffee Name,Roaster Location,Coffee Origin,Roast Level,Agtron,Est. Price,Review Date,Aroma,Acidity,Body,Flavor,Aftertaste,Blind Assessment,Notes,Bottom Line,With Milk
147,https://www.coffeereview.com/review/pikes-plac...,78,Starbucks Coffee,Pike’s Place Roast,"Seattle, Washington",Latin America,Dark,36/42,$9.99/12 ounces,February 2025,6,5,6,6,5,"Harsh, bitter. Charcoal, snap pea, soapy flora...",No origin information provided except “Latin A...,,
465,https://www.coffeereview.com/review/fighting-f...,78,Rudy,Fighting for Justice Bold Coffee,"Miami, Florida",Honduras,Dark,32/38,$29.99/32 ounces,August 2024,6,5,6,6,6,"Burnt, acrid, harsh, sweetly composty. Charcoa...","Comprised of 100 percent Honduras coffee, cert...",,


In [82]:
df[df["Rating"] == df["Rating"].max()]["URL"]

324     https://www.coffeereview.com/review/kahiko-ora...
874     https://www.coffeereview.com/review/gw03-princ...
1136    https://www.coffeereview.com/review/wilton-ben...
2631    https://www.coffeereview.com/review/finca-soph...
2648    https://www.coffeereview.com/review/gw01-finca...
Name: URL, dtype: object