In [12]:
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd
import csv
from datetime import datetime

In [55]:
## Traverse all pages
# Walk the paginated event listing and collect the URL of every event page.
base_url = "https://visitseattle.org/events/page/"
page = 1
selector = "div.search-result-preview > div > h3 > a"
events_link = []

while True:
    # A timeout keeps a stalled connection from hanging the cell forever.
    response = requests.get(base_url + str(page), timeout=30)
    # Exit the loop if no next page is found
    if response.status_code != 200:
        break

    soup = BeautifulSoup(response.content, 'html.parser')

    # Scrape data; anchors without an href are ignored instead of raising KeyError.
    page_links = [a['href'] for a in soup.select(selector) if a.has_attr('href')]
    if not page_links:
        # A 200 response with no events means we ran past the last page;
        # without this check the loop would never terminate.
        break
    events_link.extend(page_links)

    page += 1  # Increment to the next page
    time.sleep(0.5)  # Be polite to the server between listing requests

In [89]:
## Get the detail of event
# CSS selectors for the fields shown in the header of an event detail page.
name_selector = "div:nth-child(1) > div.medium-6.columns.event-top > h1"
date_selector = "div:nth-child(1) > div.medium-6.columns.event-top > h4 > span:nth-child(1)"
location_selector = "div:nth-child(1) > div.medium-6.columns.event-top > h4 > span:nth-child(2)"
type_selector = "div:nth-child(1) > div.medium-6.columns.event-top > a:nth-child(3)"
region_selector = "div:nth-child(1) > div.medium-6.columns.event-top > a:nth-child(4)"


def _text_or_default(soup, selector, default):
    """Return the stripped text of the first element matching selector, or default."""
    node = soup.select_one(selector)  # query the document once, not twice
    return node.get_text().strip() if node is not None else default


data = []

for event in events_link:
    # A timeout keeps one stalled event page from hanging the whole scrape.
    res = requests.get(event, timeout=30)
    soup = BeautifulSoup(res.content, 'html.parser')
    data.append({
        "Name": _text_or_default(soup, name_selector, 'Name Not Found'),
        "Date": _text_or_default(soup, date_selector, 'Date Not Found'),
        "Location": _text_or_default(soup, location_selector, 'Location Not Found'),
        "Type": _text_or_default(soup, type_selector, 'Type Not Found'),
        "Region": _text_or_default(soup, region_selector, 'Region Not Found'),
    })

# Passing the columns explicitly keeps the header row in the CSV even when
# no events were scraped.
df = pd.DataFrame(data, columns=["Name", "Date", "Location", "Type", "Region"])
csv_file = 'events.csv'
df.to_csv(csv_file, index=False)


In [24]:
def preprocess_csv(input_file, output_file):
    """Keep only dated, upcoming events and expand short region names.

    The header row is copied through unchanged. A data row is dropped when its
    'Date' column (index 1) is 'ongoing', contains 'through' (a date range),
    cannot be parsed as ``%m/%d/%Y`` (for example the scraper's
    'Date Not Found' placeholder), or is earlier than today. Rows with fewer
    than two columns (blank lines) are dropped as well.

    For surviving rows the 'Region' column (index 4) is rewritten from
    'Downtown' / 'South' / 'North' to 'Seattle Downtown' / 'South Seattle' /
    'North Seattle'; any other value is left as is.

    Args:
        input_file: Path of the CSV to read.
        output_file: Path of the CSV to write (overwritten if it exists).
    """
    region_names = {
        'Downtown': 'Seattle Downtown',
        'South': 'South Seattle',
        'North': 'North Seattle',
    }
    updated_rows = []
    today = datetime.now().date()

    # newline='' is what the csv module expects; utf-8 matches pandas' to_csv default.
    with open(input_file, mode='r', newline='', encoding='utf-8') as infile:
        reader = csv.reader(infile)
        headers = next(reader, None)  # None for an empty file instead of StopIteration
        if headers is not None:
            updated_rows.append(headers)

        for row in reader:
            if len(row) < 2:
                continue  # blank or truncated line: no date column to inspect

            date_text = row[1]
            lowered = date_text.lower()
            # Skip open-ended and period events (case-insensitively)
            if lowered == 'ongoing' or 'through' in lowered:
                continue

            # Skip anything that is not a single m/d/Y date rather than crashing
            try:
                event_date = datetime.strptime(date_text, '%m/%d/%Y').date()
            except ValueError:
                continue

            # Skip dates before today
            if event_date < today:
                continue

            # Update the 'Region' value
            if len(row) > 4:
                row[4] = region_names.get(row[4], row[4])

            updated_rows.append(row)

    with open(output_file, mode='w', newline='', encoding='utf-8') as outfile:
        writer = csv.writer(outfile)
        writer.writerows(updated_rows)

# Filter the scraped events and expand the region names, writing the result to a new CSV.
preprocess_csv('events.csv', 'preprocessed_events.csv')

In [30]:
# Use the OpenStreetMap Nominatim API to get (lat, lon) for a location name
def get_lat_lon(location):
    """Geocode a place name with the OpenStreetMap Nominatim search endpoint.

    Args:
        location: Free-text place name to search for.

    Returns:
        A ``(lat, lon)`` tuple taken from the first search result, exactly as
        the API returned the values, or ``(None, None)`` when the location is
        empty, the request fails, the response is not valid JSON, or no result
        is found.
    """
    if not location:
        print(f"No data found for location: {location}")
        return None, None

    query_params = {
        "q": location,
        "format": "jsonv2"
    }
    # Nominatim's usage policy asks clients to identify themselves; requests
    # sent with a generic library User-Agent may be rejected.
    headers = {"User-Agent": "seattle-events-weather-notebook/1.0"}
    try:
        response = requests.get(
            "https://nominatim.openstreetmap.org/search.php",
            params=query_params,
            headers=headers,
            timeout=30,  # never hang the notebook on a stalled connection
        )
        response.raise_for_status()
        results = response.json()
    except (requests.RequestException, ValueError) as exc:
        # ValueError covers a non-JSON body on older requests versions.
        print(f"Geocoding request failed for location: {location} ({exc})")
        return None, None

    if results:  # Check that at least one match came back
        return results[0]['lat'], results[0]['lon']

    print(f"No data found for location: {location}")
    return None, None

# Try the geocoder on one of the region names produced by preprocess_csv.
get_lat_lon("Seattle Downtown")

('47.5989361', '-122.3279543')

In [31]:
# Look up the weather forecast for a location and date
def get_weather(lat, lon, date):
    """Look up the api.weather.gov forecast text for a point on a given date.

    The points endpoint is queried first to discover the forecast URL for the
    coordinates; the forecast periods are then scanned for the first one whose
    start date equals ``date``.

    Args:
        lat: Latitude of the point (``None`` when geocoding failed).
        lon: Longitude of the point (``None`` when geocoding failed).
        date: Target date as a ``%m/%d/%Y`` string, e.g. ``"1/30/2024"``.

    Returns:
        The ``detailedForecast`` string of the first matching period, or
        ``None`` when coordinates are missing, the lookup fails, or the
        forecast has no period starting on that date. A short message is
        printed for every ``None`` outcome.
    """
    if lat is None or lon is None:
        # Geocoding failed upstream; do not query the API for "None,None".
        print("No coordinates available; skipping weather lookup.")
        return None

    # api.weather.gov asks clients to identify themselves via User-Agent.
    headers = {"User-Agent": "seattle-events-weather-notebook/1.0"}
    timeout = 30  # seconds; never hang the notebook on a stalled connection

    try:
        # Convert date string to a date object for comparison
        target_date = datetime.strptime(date, '%m/%d/%Y').date()

        # Get the API URL for the specific point
        url = f"https://api.weather.gov/points/{lat},{lon}"
        res = requests.get(url, headers=headers, timeout=timeout)
        point_dict = res.json()

        # Check if 'properties' key is present
        if 'properties' not in point_dict:
            print("Failed to retrieve forecast URL.")
            return None

        forecast_url = point_dict['properties']['forecast']
        res = requests.get(forecast_url, headers=headers, timeout=timeout)
        forecast_data = res.json()

        # Search for the weather on the target date
        if 'properties' in forecast_data and 'periods' in forecast_data['properties']:
            for period in forecast_data['properties']['periods']:
                period_start = datetime.strptime(period['startTime'].split('T')[0], '%Y-%m-%d')
                if period_start.date() == target_date:
                    return period['detailedForecast']  # Forecast for the matching date

        # Reached both when the payload has no periods and when none of the
        # periods starts on the requested date.
        print("Weather data not available for the specified date.")
    except Exception as e:
        # Deliberately best-effort: this is called once per event in a bulk
        # loop, so a single failed lookup is reported and yields None.
        print(f"An error occurred: {e}")
    return None

# Try the two lookups together for a single region and date.
# NOTE(review): the date is hard-coded; once it no longer matches any period
# returned by the forecast endpoint, get_weather returns None and this prints None.
lat, lon = get_lat_lon("Seattle Downtown")
weather_data = get_weather(lat, lon, "1/30/2024")
print(weather_data)

A chance of rain. Mostly cloudy, with a high near 58. East wind 3 to 9 mph. Chance of precipitation is 40%. New rainfall amounts less than a tenth of an inch possible.


In [33]:
# Read the events CSV file
with open('preprocessed_events.csv', mode='r', newline='', encoding='utf-8') as file:
    reader = csv.reader(file)
    events = list(reader)

# Name the new column so the header has as many fields as the data rows
if events:
    events[0].append('Weather')

# Add weather data to the events
coords_by_region = {}   # region name -> (lat, lon); many events share a region
weather_by_key = {}     # (region, date) -> forecast text or None
for event in events[1:]:  # Skip the header
    region, date = event[4], event[1]

    if region not in coords_by_region:
        coords_by_region[region] = get_lat_lon(region)
        time.sleep(1)  # Nominatim's usage policy allows at most one request per second
    lat, lon = coords_by_region[region]

    if (region, date) not in weather_by_key:
        weather_by_key[(region, date)] = get_weather(lat, lon, date)
    weather = weather_by_key[(region, date)]

    event.append(weather)

# Write the updated events to a new CSV file
with open('updated_events.csv', mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerows(events)