In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime, timedelta
from tqdm import tqdm
import calendar
import os

In [2]:
def get_weather_data(url_prefix, date_str):
    """
    Scrape the daily observation table of a Weather Underground history page.

    The page is loaded through the module-level Selenium ``driver``; the table's
    HTML is then parsed with BeautifulSoup.

    Args:
        url_prefix (str): Station-specific URL prefix; ``/date/<date_str>`` is appended to it
        date_str (str): Date in YYYY-M-D format (e.g., '2025-3-13')

    Returns:
        pd.DataFrame | None: One row per table row, with an extra ``date`` column
        holding ``date_str``; ``None`` if anything fails while waiting for,
        reading or parsing the table (the error is printed).
    """
    driver.get(f"{url_prefix}/date/{date_str}")

    observations_xpath = '//*[@id="inner-content"]/div[2]/div[1]/div[5]/div[1]/div/lib-city-history-observation/div/div[2]/table'

    try:
        locator = (By.XPATH, observations_xpath)

        # Block (up to 10 s) until the observation table exists in the DOM
        WebDriverWait(driver, 10).until(EC.presence_of_element_located(locator))

        # Hand the table markup over to BeautifulSoup
        markup = driver.find_element(*locator).get_attribute("outerHTML")
        soup = BeautifulSoup(markup, "html.parser")

        # Column names come from every <th>; the first <tr> is the header row and is skipped
        column_names = [th.text.strip() for th in soup.find_all("th")]
        records = [
            [cell.get_text(strip=True) for cell in tr.find_all("td")]
            for tr in soup.find_all("tr")[1:]
        ]

        frame = pd.DataFrame(records, columns=column_names)
        frame["date"] = date_str  # Keep the requested date alongside each observation
        return frame

    except Exception as e:
        print(f"Error retrieving data for {date_str}: {e}")
        return None

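# # Example usage for one day (needs the global `driver` to be initialized first)
# df_weather = get_weather_data("https://www.wunderground.com/history/daily/es/zaragoza/LEZG", "2025-3-13")
# if df_weather is not None:
#     print(df_weather.head())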

In [3]:
def _extract_number(series, pattern):
    """Return the first regex match of each cell as float (NaN where nothing matches)."""
    return series.astype(str).str.extract(pattern, expand=False).astype(float)


def convert_weather_data(df):
    """
    Cleans the scraped text columns and converts them to metric units, in place.

    - Temperature / Dew Point: °F to °C, rounded to whole degrees.
    - Wind Speed / Wind Gust: mph to km/h, rounded to 1 decimal.
    - Pressure: inHg to hPa, rounded to 1 decimal.
    - Precip.: inches to mm, rounded to 1 decimal.
    - Humidity: integer percentage (nullable, so missing readings stay missing).

    Args:
        df (pd.DataFrame): Frame with the columns listed above holding raw text.

    Returns:
        pd.DataFrame: The same ``df`` object, with the columns overwritten.
    """
    signed_integer = r"(-?\d+)"   # keep the sign: sub-zero °F readings must stay negative
    unsigned_integer = r"(\d+)"
    decimal_number = r"([\d.]+)"

    # Whole-degree rounding is relied upon by fix_temperature_errors, which looks
    # for the exact value -18.0 (what a 0 °F reading converts to).
    for col in ("Temperature", "Dew Point"):
        fahrenheit = _extract_number(df[col], signed_integer)
        df[col] = ((fahrenheit - 32) * 5 / 9).round(0)

    # Nullable Int64 instead of int: a plain int cast raises on missing readings (NaN).
    df["Humidity"] = _extract_number(df["Humidity"], unsigned_integer).astype("Int64")

    for col in ("Wind Speed", "Wind Gust"):
        df[col] = (_extract_number(df[col], unsigned_integer) * 1.60934).round(1)

    df["Pressure"] = (_extract_number(df["Pressure"], decimal_number) * 33.8639).round(1)
    df["Precip."] = (_extract_number(df["Precip."], decimal_number) * 25.4).round(1)

    return df


def fix_temperature_errors(df, columns=['Temperature', 'Dew Point']):
    """
    Replaces suspicious runs of -18.0 with NaN, in place.

    A run of consecutive -18.0 values is treated as erroneous when the value
    just before it or just after it differs from -18.0 by more than 5 degrees,
    or when the run touches the first or last row (no neighbour to compare
    with). Each run is judged on its own, so a suspicious run no longer causes
    other, plausible -18.0 readings in the same column to be discarded.

    Args:
        df (pd.DataFrame): Frame containing the columns named in ``columns``.
        columns (list[str]): Columns to inspect. The default list is never mutated.

    Returns:
        pd.DataFrame: The same ``df`` object.
    """
    error_value = -18.0
    max_jump = 5

    for col in columns:
        values = df[col].tolist()
        total = len(values)
        bad_positions = []

        pos = 0
        while pos < total:
            if values[pos] != error_value:
                pos += 1
                continue

            # Walk to the end of this run of consecutive error values
            start = pos
            while pos < total and values[pos] == error_value:
                pos += 1
            stop = pos  # first position after the run

            # A NaN neighbour compares False here, i.e. it does not count as a jump
            jump_before = start == 0 or abs(values[start - 1] - error_value) > max_jump
            jump_after = stop == total or abs(values[stop] - error_value) > max_jump

            if jump_before or jump_after:
                bad_positions.extend(range(start, stop))

        if bad_positions:
            # Positional assignment, so the result does not depend on the index labels
            df.iloc[bad_positions, df.columns.get_loc(col)] = float('nan')

    return df

def _first_time_rollback(times, start):
    """Return the first position >= start whose time is earlier than the one before it, else None."""
    for pos in range(start, len(times)):
        if times[pos] < times[pos - 1]:
            return pos
    return None


def correct_data(df):
    """
    Builds a full ``Datetime`` column for one day of observations and cleans temperature errors.

    The frame is expected to carry a ``Time`` column ('%I:%M %p' text) and a
    ``date`` column holding the requested date, and to be indexed 0..n-1
    (label-based slicing is used below).

    Date assignment:
    - Rows before the first backward jump of the clock are dated the previous day.
    - Rows from a second backward jump onwards are dated the next day.
    - Everything else keeps the requested date.

    NOTE(review): a single backward jump is always read as "leading rows belong
    to the previous day"; if a page instead ends with next-day rows only, they
    would be misdated. Confirm against real pages.

    Args:
        df (pd.DataFrame): Converted observations for a single requested date.

    Returns:
        pd.DataFrame: The same frame (modified in place) with ``Datetime`` added
        and the helper columns ``Time``, ``Time_dt``, ``date``, ``Date_dt`` removed.
    """
    # A day without observations has no date to read; return the same output
    # schema instead of failing on the first-row lookup.
    if df.empty:
        df['Datetime'] = pd.Series(index=df.index, dtype='datetime64[ns]')
        df.drop(columns=['Time', 'date'], inplace=True)
        return df

    # Requested date (strptime also accepts the non-padded 'YYYY-M-D' form)
    given_date_dt = datetime.strptime(df["date"].iloc[0], "%Y-%m-%d")

    # Start by assuming every row belongs to the requested date
    df['Date_dt'] = given_date_dt

    # Time of day only, used to detect where the clock goes backwards
    df['Time_dt'] = pd.to_datetime(df['Time'], format='%I:%M %p').dt.time
    times = df['Time_dt'].tolist()

    # First backward jump: rows before it come from the previous day
    split_index = _first_time_rollback(times, 1)

    if split_index is not None:
        df.loc[:split_index - 1, 'Date_dt'] = given_date_dt - timedelta(days=1)

        # A second backward jump marks the start of next-day observations
        next_day_start = _first_time_rollback(times, split_index + 1)
        if next_day_start is not None:
            df.loc[next_day_start:, 'Date_dt'] = given_date_dt + timedelta(days=1)

    # Create the full Datetime column
    df['Datetime'] = pd.to_datetime(df['Date_dt'].dt.strftime('%Y-%m-%d') + ' ' + df['Time'], format='%Y-%m-%d %I:%M %p')

    df = fix_temperature_errors(df)

    # Drop helper columns that are now encoded in Datetime
    df.drop(columns=['Time', 'Time_dt', 'date', 'Date_dt'], inplace=True)

    return df



In [4]:
# Global variable to store the WebDriver instance.
# Keep whatever is already there when this cell is re-executed: resetting it to
# None unconditionally would forget a browser that is still running (leaking the
# Chrome process) and make the next call start a second one.
selenium_driver = globals().get("selenium_driver")

def get_selenium_driver():
    """
    Return the shared headless Chrome WebDriver, creating it on first use.

    The instance is cached in the module-level ``selenium_driver`` so repeated
    calls reuse the same browser.

    Returns:
        The cached (or newly created) Selenium WebDriver instance.
    """
    global selenium_driver
    print(f"Current selenium_driver: {selenium_driver}")  # Debug print

    if selenium_driver is None:
        # Initialize Selenium WebDriver
        options = webdriver.ChromeOptions()
        options.add_argument("--headless")  # Run in headless mode (no GUI)
        selenium_driver = webdriver.Chrome(options=options)
        print("Selenium WebDriver initialized.")
    else:
        print("Selenium WebDriver already initialized.")

    return selenium_driver

# Example usage: `driver` is the name get_weather_data reads from module scope
driver = get_selenium_driver()

Current selenium_driver: None
Selenium WebDriver initialized.


In [None]:
# Specify start and end year/month
start_year, start_month = 2025, 3
end_year, end_month = 2025, 3

# URL prefix for the Weather Underground station you want to scrape
url_prefix = "https://www.wunderground.com/history/daily/es/zaragoza/LEZG"

# Generate list of (year, month) pairs to iterate over
months_list = []
current_year, current_month = start_year, start_month

while (current_year, current_month) <= (end_year, end_month):
    months_list.append((current_year, current_month))
    if current_month == 12:
        current_month = 1
        current_year += 1
    else:
        current_month += 1

# Create the "data" folder if it doesn't exist
output_folder = "data"
os.makedirs(output_folder, exist_ok=True)

# Reference date for the whole run: today and future dates are not scraped
today = datetime.today().date()

# One CSV per month
for year, month in months_list:

    # Get the number of days in the current month
    num_days = calendar.monthrange(year, month)[1]

    # Generate all dates for the current month excluding today and future
    dates = [datetime(year, month, day) for day in range(1, num_days + 1)
             if datetime(year, month, day).date() < today]

    all_data = []  # List to store DataFrames for this month

    # Loop through the dates with tqdm for progress tracking
    for current_date in tqdm(dates, desc=f"Processing {year}-{month:02d} weather data"):
        # Non-padded YYYY-M-D, as expected by get_weather_data (works on all OS)
        date_str = f"{current_date.year}-{current_date.month}-{current_date.day}"
        df_weather = get_weather_data(url_prefix, date_str)

        if df_weather is None:
            continue  # The error was already reported by get_weather_data

        # Remove rows where the 'Time' column is None or empty
        df_weather = df_weather[df_weather["Time"].notna()]  # Remove None values
        df_weather = df_weather[df_weather["Time"].str.strip() != ""]  # Remove empty strings

        # Fresh 0..n-1 index (correct_data relies on it); a new object rather than
        # an in-place change of the filtered slice
        df_weather = df_weather.reset_index(drop=True)

        if df_weather.empty:
            continue  # Table without usable rows: nothing to convert for this day

        df_weather = convert_weather_data(df_weather)
        df_weather_clean = correct_data(df_weather)

        all_data.append(df_weather_clean)

    # pd.concat raises on an empty list, e.g. when every day of the month failed
    # or the month has no past days yet
    if not all_data:
        print(f"No observations retrieved for {year}-{month:02d}; no file written.")
        continue

    # Concatenate all daily DataFrames into one
    df_weather_month = pd.concat(all_data, ignore_index=True)

    # Save to CSV
    file_name = os.path.join(output_folder, f"{year}_{month:02d}.csv")
    df_weather_month.to_csv(file_name, index=False)

In [None]:
# Display last month downloaded
# (df_weather_month is assigned by the scraping loop, so that cell must have run first)
df_weather_month

In [None]:
# Represent last month downloaded

import plotly.express as px

# Sort a separate frame for plotting. df_weather_month itself is left exactly as
# scraped, so re-running this cell is harmless and the ascending-order check on
# df_weather_month still inspects the original row order instead of always passing.
df_weather_plot = df_weather_month.sort_values(by="Datetime")

# Create interactive plot
fig = px.line(
    df_weather_plot,
    x="Datetime",
    y="Temperature",
    title="Monthly Temperature Trend",
    labels={"Datetime": "Date", "Temperature": "Temperature (°C)"},
    markers=False  # Adds points on the line or not
)

# Customize layout
fig.update_layout(
    xaxis_title="Date",
    yaxis_title="Temperature (°C)",
    xaxis=dict(showgrid=True),
    yaxis=dict(showgrid=True),
    template="plotly_dark"  # Optional: use "plotly_white" for light theme
)

# Show plot
fig.show()

In [None]:
# Report whether the Datetime values of the last downloaded month never go backwards
print(
    "✅ The datetime column is in ascending order."
    if df_weather_month["Datetime"].is_monotonic_increasing
    else "❌ The datetime column is NOT in ascending order."
)

In [None]:
def close_selenium_driver():
    """
    Quit the shared WebDriver stored in ``selenium_driver`` and forget it.

    Prints a message either way. If ``quit()`` raises, the exception propagates,
    but the cached reference is still cleared so a later ``get_selenium_driver()``
    call starts a fresh browser instead of handing back a dead one.
    """
    global selenium_driver
    if selenium_driver is None:
        print("No active Selenium WebDriver to close.")
        return

    try:
        selenium_driver.quit()
        print("Selenium WebDriver closed.")
    finally:
        selenium_driver = None  # Reset to None whether or not quit() succeeded

# Release the WebDriver cached in selenium_driver, if there is one
close_selenium_driver()