In [None]:
import os
import re
import time
import pandas as pd
from bs4 import BeautifulSoup
import requests

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}


def extract_bandcamp_info(band_name):
    """
    Search Bandcamp for a band and scrape the first search result.

    Args:
        band_name: Name to search for. It is sent as the ``q`` query
            parameter together with ``item_type=b``.

    Returns:
        A dict with the keys ``band``, ``bandcamp_url``, ``location``,
        ``genre`` and ``tags``. ``band`` is always ``band_name``; the other
        fields keep their defaults (``None`` / empty list) when the request
        fails, returns a non-success status, or the page contains no
        ``div.result-info`` element.
    """
    info = {
        "band": band_name,
        "bandcamp_url": None,
        "location": None,
        "genre": None,
        "tags": []
    }

    try:
        # Let requests build the query string so that characters such as
        # "&", "#" or "+" inside a band name are encoded instead of
        # corrupting the URL. The timeout keeps one stalled connection from
        # hanging the whole enrichment run.
        response = requests.get(
            "https://bandcamp.com/search",
            params={"q": band_name, "item_type": "b"},
            headers=HEADERS,
            timeout=10,
        )
    except requests.RequestException as e:
        print(f"[DEBUG] Bandcamp search request failed for {band_name}: {e}")
        return info
    finally:
        time.sleep(2)  # polite delay, applied after failed attempts as well

    if not response.ok:
        print(f"[DEBUG] Bandcamp search returned HTTP {response.status_code} for: {band_name}")
        return info

    soup = BeautifulSoup(response.text, "html.parser")
    # Only the first matching result on the page is used.
    result = soup.find("div", class_="result-info")

    if not result:
        print(f"[DEBUG] No result-info found for: {band_name}")
        return info

    subhead = result.find("div", class_="subhead")
    info["location"] = subhead.text.strip() if subhead else None

    itemurl = result.find("div", class_="itemurl")
    raw_url = itemurl.a['href'] if itemurl and itemurl.a else None
    # Drop any query string appended to the result link.
    info["bandcamp_url"] = raw_url.split('?')[0] if raw_url else None

    genre_tag = result.find("div", class_="genre")
    info["genre"] = genre_tag.text.replace("genre:", "").strip() if genre_tag else None

    tags_div = result.find("div", class_="tags")
    if tags_div:
        # Joining text nodes with "," lets a single split handle both the
        # markup boundaries and commas already present in the text.
        raw_tags = tags_div.get_text(separator=",").replace("tags:", "")
        info["tags"] = [tag.strip() for tag in raw_tags.split(",") if tag.strip()]

    print(f"[ENRICHED] {info}")
    return info


def fetch_band_photo(bandcamp_url):
    """
    Fetch the band photo image URL from a Bandcamp artist page.

    Best-effort: request failures (including timeouts, invalid URLs and
    non-success HTTP statuses) are reported on stdout and yield ``None``.

    Args:
        bandcamp_url: URL of the page to download.

    Returns:
        The ``src`` attribute of the first ``<img class="band-photo">``
        element, or ``None`` when the page cannot be fetched or has no such
        image.
    """
    try:
        # The timeout prevents a stalled connection from blocking the caller.
        resp = requests.get(bandcamp_url, headers=HEADERS, timeout=10)
        # Treat HTTP error pages as failures rather than parsing them.
        resp.raise_for_status()
    except requests.RequestException as e:
        print(f"[DEBUG] Failed to fetch band photo from {bandcamp_url}: {e}")
        return None

    soup = BeautifulSoup(resp.text, "html.parser")
    img = soup.find("img", class_="band-photo")
    if img and img.get('src'):
        return img['src']
    return None


def enrich_missing_band(band_name, bandcamp_df, csv_path="bandcamp.csv"):
    """
    Scrape Bandcamp info for one band and append it to the cached table.

    Args:
        band_name: Band to look up via ``extract_bandcamp_info``.
        bandcamp_df: DataFrame of bands scraped so far; it is not modified
            in place.
        csv_path: File the extended table is written to on every call.

    Returns:
        ``(new_row, updated_df)`` where ``new_row`` is the dict that was
        appended (tags joined into one comma-separated string) and
        ``updated_df`` is ``bandcamp_df`` with that row added.
    """
    scraped = extract_bandcamp_info(band_name)

    # Build the row in the same column order as the CSV schema.
    new_row = {"band": band_name}
    for field in ("location", "bandcamp_url", "genre"):
        new_row[field] = scraped[field]
    new_row["tags"] = ", ".join(scraped["tags"])

    addition = pd.DataFrame([new_row])
    updated_df = pd.concat([bandcamp_df, addition], ignore_index=True)
    # Persist right away so earlier lookups survive a later failure.
    updated_df.to_csv(csv_path, index=False)
    return new_row, updated_df


def _is_blank(value):
    """Return True for None, NaN/NA, or an empty / whitespace-only string."""
    if value is None:
        return True
    if isinstance(value, str):
        return not value.strip()
    try:
        return bool(pd.isna(value))
    except (TypeError, ValueError):
        # pd.isna returns an array for list-likes; treat those as present.
        return False


def enrich(df, bandcamp_lookup, csv_path="bandcamp.csv", progress_callback=None):
    """
    Enrich the DataFrame with Bandcamp info and fill missing event images.

    For every row, the "Headliner" and "Supporting Band 1..3" cells are
    looked up in ``bandcamp_lookup``. Bands that are absent, or present
    without a Bandcamp URL, are scraped via ``enrich_missing_band``. A text
    summary per band is appended to the row's "More Info" cell. If the row
    has no "Image URL", the photo of the first band with a Bandcamp URL is
    fetched via ``fetch_band_photo``.

    Args:
        df: Events table; modified in place and also returned.
        bandcamp_lookup: Dict mapping band name to an info dict (keys read
            here: ``bandcamp_url``, ``genre``, ``tags``, ``location``).
            Newly scraped bands are added to it.
        csv_path: Cache file of scraped bands; read at start if it exists
            and rewritten by ``enrich_missing_band``.
        progress_callback: Optional callable invoked once per row as
            ``progress_callback(position, total, headliner)`` where
            ``position`` counts rows from 1.

    Returns:
        The same ``df`` object with "More Info" and "Image URL" updated.
    """
    if os.path.exists(csv_path):
        bandcamp_df = pd.read_csv(csv_path)
    else:
        bandcamp_df = pd.DataFrame(columns=["band", "location", "bandcamp_url", "genre", "tags"])

    # A text column that is entirely NaN is stored as float64, and recent
    # pandas versions reject assigning a string into it. Cast such columns
    # to object before writing.
    for col in ("More Info", "Image URL"):
        if col in df.columns and pd.api.types.is_numeric_dtype(df[col]):
            df[col] = df[col].astype(object)

    def show(value):
        # Missing values (None / NaN / "") are displayed as "N/A".
        return "N/A" if _is_blank(value) else value

    def enrich_band_info(band_name):
        # bandcamp_df is rebound after each scrape, hence nonlocal.
        nonlocal bandcamp_df

        if _is_blank(band_name):
            return "", None
        band_name = str(band_name).strip()

        band_info = bandcamp_lookup.get(band_name)

        if not band_info or _is_blank(band_info.get("bandcamp_url")):
            new_info, bandcamp_df = enrich_missing_band(band_name, bandcamp_df, csv_path)
            bandcamp_lookup[band_name] = new_info
            band_info = new_info

        url = band_info.get("bandcamp_url")
        if _is_blank(url):
            return f"\n{band_name}: No Bandcamp found", None

        text = (
            f"\n{band_name}: {url}"
            f"\nGenre: {show(band_info.get('genre'))}"
            f"\nTags: {show(band_info.get('tags'))}"
            f"\nLocation: {show(band_info.get('location'))}"
        )
        return text, url

    total = len(df)
    # Count rows by position: index labels need not be 0..n-1 (or even ints).
    for position, (idx, row) in enumerate(df.iterrows(), start=1):
        if progress_callback:
            progress_callback(position, total, row.get("Headliner", "Unknown"))

        # Collect Bandcamp enrichment and first bandcamp URL
        parts = []
        first_bc_url = None
        for col in ["Headliner", "Supporting Band 1", "Supporting Band 2", "Supporting Band 3"]:
            text, url = enrich_band_info(row.get(col, ""))
            parts.append(text)
            if not first_bc_url and url:
                first_bc_url = url

        # Update More Info. NaN is truthy, so it needs an explicit check
        # rather than `value or ""`.
        existing = row.get("More Info", "")
        if isinstance(existing, str):
            base = existing
        else:
            base = "" if _is_blank(existing) else str(existing)
        df.at[idx, "More Info"] = base + "".join(parts)

        # If event has no image (None, NaN or empty), try fetching from Bandcamp
        if _is_blank(row.get("Image URL")) and first_bc_url:
            photo = fetch_band_photo(first_bc_url)
            if photo:
                df.at[idx, "Image URL"] = photo

    return df

# Project-local modules. `combine` is imported but not used in the cells visible here.
from combine import combine
from scrape_Tamarack import scrape_Tamarack

# Scrape the events table. Presumably these are listings for the Tamarack
# venue, judging by the captured output; confirm against scrape_Tamarack.
df = scrape_Tamarack()

# Bare expression: displays the DataFrame as the notebook cell's output.
df

<Response [200]>


Unnamed: 0,Venue,Date,Start Time,End Time,Start DateTime,End DateTime,Title,Image URL,Event URL,Headliner,Supporting Band 1,Supporting Band 2,Supporting Band 3
0,Tamarack,"Fri, May 30",9:00 PM,12:00 AM,2025-05-30 21:00:00-07:00,2025-05-31 00:00:00-07:00,"taxi, pocket full of crumbs, first day back",,https://www.google.com/calendar/event?eid=cnAy...,taxi,pocket full of crumbs,first day back,
1,Tamarack,"Sat, June 07",7:00 PM,11:00 PM,2025-06-07 19:00:00-07:00,2025-06-07 23:00:00-07:00,"Amity St, Celadon, Typoe, Poser Emo",,https://www.google.com/calendar/event?eid=ZWdn...,Amity St,Celadon,Typoe,Poser Emo
2,Tamarack,"Thu, June 12",8:00 PM,10:00 PM,2025-06-12 20:00:00-07:00,2025-06-12 22:00:00-07:00,comedy,,https://www.google.com/calendar/event?eid=YWNu...,comedy,,,
3,Tamarack,"Thu, June 19",6:00 PM,10:00 PM,2025-06-19 18:00:00-07:00,2025-06-19 22:00:00-07:00,"Doghouse, Gramma, Mugslug, Parisian Mansluts",,https://www.google.com/calendar/event?eid=Mm01...,Doghouse,Gramma,Mugslug,Parisian Mansluts
4,Tamarack,"Sat, July 05",7:00 PM,11:30 PM,2025-07-05 19:00:00-07:00,2025-07-05 23:30:00-07:00,Joey Molinaro (Pittsburgh grind violin) and fr...,,https://www.google.com/calendar/event?eid=bjl1...,Joey Molinaro (Pittsburgh grind violin) and fr...,,,
5,Tamarack,"Sat, August 02",6:00 PM,10:00 PM,2025-08-02 18:00:00-07:00,2025-08-02 22:00:00-07:00,The Band Cope - Live! with Hazy Portraits,,https://www.google.com/calendar/event?eid=MGw0...,The Band Cope - Live! with Hazy Portraits,,,


In [2]:
def load_bandcamp_lookup(csv_path="bandcamp.csv", key="band"):
    """
    Load a CSV into a dict-of-dicts lookup keyed by one column.

    Args:
        csv_path: CSV file to read. Defaults to the Bandcamp cache file.
        key: Column whose values become the lookup keys. Rows with a missing
            key are dropped, and only the first row per key is kept.

    Returns:
        ``{key_value: {other_column: value, ...}}``. An empty dict is
        returned when ``csv_path`` does not exist, so a first run without a
        cache file does not fail.
    """
    # Function-scope import keeps this cell self-contained.
    import os

    if not os.path.exists(csv_path):
        return {}

    table = pd.read_csv(csv_path)
    # A NaN key would otherwise become a `nan` dictionary key.
    table = table.dropna(subset=[key])
    table = table.drop_duplicates(subset=key, keep="first")
    return table.set_index(key).to_dict(orient="index")

# Build the in-memory band -> info lookup from the Bandcamp CSV cache.
bandcamp_lookup = load_bandcamp_lookup()

# Enrich the scraped events. enrich() modifies df in place and adds any
# newly scraped bands to bandcamp_lookup.
df = enrich(df,bandcamp_lookup=bandcamp_lookup)
# Bare expression: displays the enriched DataFrame as the cell's output.
df

[DEBUG] No result-info found for: taxi
[DEBUG] No result-info found for: pocket full of crumbs
[DEBUG] No result-info found for: first day back
[DEBUG] No result-info found for: Amity St
[DEBUG] No result-info found for: Celadon
[DEBUG] No result-info found for: Typoe
[DEBUG] No result-info found for: Poser Emo
[DEBUG] No result-info found for: comedy
[DEBUG] No result-info found for: Doghouse
[DEBUG] No result-info found for: Gramma
[DEBUG] No result-info found for: Mugslug
[DEBUG] No result-info found for: Parisian Mansluts
[DEBUG] No result-info found for: Joey Molinaro (Pittsburgh grind violin) and friends
[DEBUG] No result-info found for: The Band Cope - Live! with Hazy Portraits


Unnamed: 0,Venue,Date,Start Time,End Time,Start DateTime,End DateTime,Title,Image URL,Event URL,Headliner,Supporting Band 1,Supporting Band 2,Supporting Band 3,More Info
0,Tamarack,"Fri, May 30",9:00 PM,12:00 AM,2025-05-30 21:00:00-07:00,2025-05-31 00:00:00-07:00,"taxi, pocket full of crumbs, first day back",,https://www.google.com/calendar/event?eid=cnAy...,taxi,pocket full of crumbs,first day back,,\ntaxi: No Bandcamp found\npocket full of crum...
1,Tamarack,"Sat, June 07",7:00 PM,11:00 PM,2025-06-07 19:00:00-07:00,2025-06-07 23:00:00-07:00,"Amity St, Celadon, Typoe, Poser Emo",,https://www.google.com/calendar/event?eid=ZWdn...,Amity St,Celadon,Typoe,Poser Emo,\nAmity St: No Bandcamp found\nCeladon: No Ban...
2,Tamarack,"Thu, June 12",8:00 PM,10:00 PM,2025-06-12 20:00:00-07:00,2025-06-12 22:00:00-07:00,comedy,,https://www.google.com/calendar/event?eid=YWNu...,comedy,,,,\ncomedy: No Bandcamp found
3,Tamarack,"Thu, June 19",6:00 PM,10:00 PM,2025-06-19 18:00:00-07:00,2025-06-19 22:00:00-07:00,"Doghouse, Gramma, Mugslug, Parisian Mansluts",,https://www.google.com/calendar/event?eid=Mm01...,Doghouse,Gramma,Mugslug,Parisian Mansluts,\nDoghouse: No Bandcamp found\nGramma: No Band...
4,Tamarack,"Sat, July 05",7:00 PM,11:30 PM,2025-07-05 19:00:00-07:00,2025-07-05 23:30:00-07:00,Joey Molinaro (Pittsburgh grind violin) and fr...,,https://www.google.com/calendar/event?eid=bjl1...,Joey Molinaro (Pittsburgh grind violin) and fr...,,,,\nJoey Molinaro (Pittsburgh grind violin) and ...
5,Tamarack,"Sat, August 02",6:00 PM,10:00 PM,2025-08-02 18:00:00-07:00,2025-08-02 22:00:00-07:00,The Band Cope - Live! with Hazy Portraits,,https://www.google.com/calendar/event?eid=MGw0...,The Band Cope - Live! with Hazy Portraits,,,,\nThe Band Cope - Live! with Hazy Portraits: N...
