In [7]:
import requests
from bs4 import BeautifulSoup
import time

In [3]:
def scrape_event_urls(base_url, pages):
    event_urls = []
    for page in range(1, pages + 1):
        url = f"{base_url}/page/{page}"
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        for link in soup.find_all('a', href=True):
            if '/events/' in link['href']:
                event_urls.append(link['href'])
    return event_urls

base_url = "https://visitseattle.org/events"
pages = 10  # Number of pages to scrape
event_urls = scrape_event_urls(base_url, pages)
for url in event_urls:
    print(url)
time.sleep(1)

https://visitseattle.org/things-to-do/events/
https://visitseattle.org/things-to-do/events/
https://visitseattle.org/things-to-do/events/festivals/
https://visitseattle.org/things-to-do/events/submit-your-event/
https://visitseattle.org/events/corteo/
https://visitseattle.org/events/corteo/
https://visitseattle.org/events/bewitcher/
https://visitseattle.org/events/bewitcher/
https://visitseattle.org/events/burn-burn-burn/
https://visitseattle.org/events/burn-burn-burn/
https://visitseattle.org/events/carly-ann-calbero/
https://visitseattle.org/events/carly-ann-calbero/
https://visitseattle.org/events/el-primer-instinto/
https://visitseattle.org/events/el-primer-instinto/
https://visitseattle.org/events/gary-janetti/
https://visitseattle.org/events/gary-janetti/
https://visitseattle.org/events/jammah/
https://visitseattle.org/events/jammah/
https://visitseattle.org/events/saturday-family-concert-the-harmonica-pocket/
https://visitseattle.org/events/saturday-family-concert-the-harmonica-

In [None]:
def scrape_event_details(event_urls):
    events = []
    for url in event_urls:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')

        def get_text_bs(selector):
            element = soup.select_one(selector)
            return element.get_text(strip=True) if element else 'Not Available'

        name = get_text_bs('#body > div.global-wrapper > div.container-event-detail.padding-top-bottom > div:nth-child(1) > div.medium-6.columns.event-top > h1')
        date = get_text_bs('body > div.global-wrapper > div.container-event-detail.padding-top-bottom > div:nth-child(1) > div.medium-6.columns.event-top > h4 > span:nth-child(1)')
        location = get_text_bs('body > div.global-wrapper > div.container-event-detail.padding-top-bottom > div:nth-child(1) > div.medium-6.columns.event-top > h4 > span:nth-child(2)')
        type = get_text_bs('#body > div.global-wrapper > div.container-event-detail.padding-top-bottom > div:nth-child(1) > div.medium-6.columns.event-top > a:nth-child(3)')
        region = get_text_bs('#body > div.global-wrapper > div.container-event-detail.padding-top-bottom > div:nth-child(1) > div.medium-6.columns.event-top > a:nth-child(4)')
        
        events.append({
            "name": name,
            "date": date,
            "location": location,
            "type": type,
            "region": region
        })
    return events

event_details = scrape_event_details(event_urls)


In [None]:
import csv

def save_events_to_csv(events, filename):
    with open(filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=["name", "date", "location", "type", "region"])
        writer.writeheader()
        for event in events:
            writer.writerow(event)

save_events_to_csv(event_details, 'events.csv')


In [5]:
import pandas as pd

events_df = pd.read_csv('events.csv')

print(events_df.head())


            name                   date              location           type  \
0  Not Available          Not Available         Not Available  Not Available   
1  Not Available          Not Available         Not Available  Not Available   
2  Not Available          Not Available         Not Available  Not Available   
3  Not Available          Not Available         Not Available  Not Available   
4         Corteo  Now through 1/20/2024  Climate Pledge Arena        Theatre   

                        region  
0                Not Available  
1                Not Available  
2                Not Available  
3                Not Available  
4  Queen Anne / Seattle Center  


In [None]:
def get_location_coordinates(location_name):
    url = f"https://nominatim.openstreetmap.org/search?q={location_name}&format=json&limit=1"
    response = requests.get(url)
    if response.ok:
        data = response.json()
        return (data[0]['lat'], data[0]['lon']) if data else (None, None)
    return (None, None)

# 添加新列用于存储经纬度
events_df['latitude'] = None
events_df['longitude'] = None

# 对每个事件地点进行 API 调用
for index, row in events_df.iterrows():
    if row['location'] != 'Not Available':
        lat, lon = get_location_coordinates(row['location'])
        events_df.at[index, 'latitude'] = lat
        events_df.at[index, 'longitude'] = lon


In [9]:
import pandas as pd
import requests 
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

def create_session_with_retry():
    session = requests.Session()
    retries = Retry(
        total=5, 
        backoff_factor=1,  
        status_forcelist=[500, 502, 503, 504]  
    )
    session.mount('http://', HTTPAdapter(max_retries=retries))
    session.mount('https://', HTTPAdapter(max_retries=retries))
    return session


session_with_retry = create_session_with_retry()

def get_location_coordinates(location_name):
    url = f"https://nominatim.openstreetmap.org/search?q={location_name}&format=json&limit=1"
    response = session_with_retry.get(url)
    if response.ok:
        data = response.json()
        return (float(data[0]['lat']), float(data[0]['lon'])) if data else (None, None)
    return (None, None)

def get_weather_data(lat, lon):
    gridpoint_url = f"https://api.weather.gov/points/{lat},{lon}"
    response = session_with_retry.get(gridpoint_url)
    if response.ok:
        gridpoint_data = response.json()
        forecast_url = gridpoint_data['properties']['forecast']
        forecast_response = session_with_retry.get(forecast_url)
        if forecast_response.ok:
            forecast_data = forecast_response.json()
            return forecast_data['properties']['periods'][0]['detailedForecast']
    return "Weather data not available"

df = pd.read_csv('events.csv')

df['latitude'] = None
df['longitude'] = None
df['weather'] = None

for index, row in df.iterrows():
    location = f"{row['location']}, {row['region']}"
    lat, lon = get_location_coordinates(location)
    if lat and lon:
        df.at[index, 'latitude'] = lat
        df.at[index, 'longitude'] = lon
        weather = get_weather_data(lat, lon)
        df.at[index, 'weather'] = weather

df.to_csv('updated_events.csv', index=False)


KeyboardInterrupt: 