## Get Weather Data, Web Scraping

In [31]:
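# Get historical weather data for a specific geographical location over a
# given period of time. Note: the data is fetched from Open-Meteo's JSON REST
# API with plain HTTP GET requests, not scraped from HTML pages.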
# https://open-meteo.com/
# https://archive-api.open-meteo.com/v1/era5


In [32]:
# Import Pandas data manipulation and analysis library
import pandas as pd

# Import requests library
# The requests library is one of the most popular Python libraries for making
# HTTP requests (like GET, POST, PUT, DELETE). It allows Python programs to
# send and receive data from web servers
import requests


In [33]:
# A query to Open-Meteo's historical weather API.
# Base URL: https://archive-api.open-meteo.com/v1/era5
# This endpoint provides ERA5 reanalysis weather data.
# Query Parameters:
# latitude=52.52 and longitude=13.41 - these coordinates point to Berlin, Germany.
# start_date=2021-01-01 and end_date=2021-12-31 - request data for the entire
#   year of 2021.
# hourly=temperature_2m - request the hourly temperature at 2 meters above ground.
url = "https://archive-api.open-meteo.com/v1/era5?latitude=52.52&longitude=13.41&start_date=2021-01-01&end_date=2021-12-31&hourly=temperature_2m"

# Send the request. requests applies no timeout by default, so set one
# explicitly (in seconds) to keep the cell from hanging if the server never
# answers.
response = requests.get(url, timeout=30)

# Fail loudly on an HTTP error status (4xx/5xx) instead of parsing an error body
response.raise_for_status()

# Parse the JSON response body and display it
response.json()


{'latitude': 52.54833,
 'longitude': 13.407822,
 'generationtime_ms': 0.11730194091796875,
 'utc_offset_seconds': 0,
 'timezone': 'GMT',
 'timezone_abbreviation': 'GMT',
 'elevation': 38.0,
 'hourly_units': {'time': 'iso8601', 'temperature_2m': '°C'},
 'hourly': {'time': ['2021-01-01T00:00',
   '2021-01-01T01:00',
   '2021-01-01T02:00',
   '2021-01-01T03:00',
   '2021-01-01T04:00',
   '2021-01-01T05:00',
   '2021-01-01T06:00',
   '2021-01-01T07:00',
   '2021-01-01T08:00',
   '2021-01-01T09:00',
   '2021-01-01T10:00',
   '2021-01-01T11:00',
   '2021-01-01T12:00',
   '2021-01-01T13:00',
   '2021-01-01T14:00',
   '2021-01-01T15:00',
   '2021-01-01T16:00',
   '2021-01-01T17:00',
   '2021-01-01T18:00',
   '2021-01-01T19:00',
   '2021-01-01T20:00',
   '2021-01-01T21:00',
   '2021-01-01T22:00',
   '2021-01-01T23:00',
   '2021-01-02T00:00',
   '2021-01-02T01:00',
   '2021-01-02T02:00',
   '2021-01-02T03:00',
   '2021-01-02T04:00',
   '2021-01-02T05:00',
   '2021-01-02T06:00',
   '2021-01-02T07

In [34]:
# A query to Open-Meteo's historical weather API, with the query string built
# from a params dictionary instead of being hard-coded into the URL.
# url - base URL to the ERA5 historical weather API endpoint.
# date - the single day to fetch weather data for
# params - parameters for the API request
#   latitude=41.85, longitude=-87.65 - location of Chicago, IL
#   start_date & end_date - both set to `date`, limiting the data to June 1, 2024
#   hourly - comma-separated list of the four hourly weather variables requested:
#     temperature_2m - temperature 2 meters above ground
#     windspeed_10m - wind speed at 10 meters
#     rain - hourly rain in millimeters
#     precipitation - total precipitation (rain + snow, etc.)
url = "https://archive-api.open-meteo.com/v1/era5"
date = "2024-06-01"
params = {
    "latitude": 41.85,
    "longitude": -87.65,
    "start_date": date,
    "end_date": date,
    "hourly": "temperature_2m,windspeed_10m,rain,precipitation",
}

# requests applies no timeout by default, so set one explicitly (in seconds) to
# keep the cell from hanging if the server never answers.
response = requests.get(url, params=params, timeout=30)

# Fail loudly on an HTTP error status (4xx/5xx) instead of parsing an error body
response.raise_for_status()

# Parse the JSON response body and display it
response.json()


{'latitude': 41.862915,
 'longitude': -87.64877,
 'generationtime_ms': 0.12004375457763672,
 'utc_offset_seconds': 0,
 'timezone': 'GMT',
 'timezone_abbreviation': 'GMT',
 'elevation': 179.0,
 'hourly_units': {'time': 'iso8601',
  'temperature_2m': '°C',
  'windspeed_10m': 'km/h',
  'rain': 'mm',
  'precipitation': 'mm'},
 'hourly': {'time': ['2024-06-01T00:00',
   '2024-06-01T01:00',
   '2024-06-01T02:00',
   '2024-06-01T03:00',
   '2024-06-01T04:00',
   '2024-06-01T05:00',
   '2024-06-01T06:00',
   '2024-06-01T07:00',
   '2024-06-01T08:00',
   '2024-06-01T09:00',
   '2024-06-01T10:00',
   '2024-06-01T11:00',
   '2024-06-01T12:00',
   '2024-06-01T13:00',
   '2024-06-01T14:00',
   '2024-06-01T15:00',
   '2024-06-01T16:00',
   '2024-06-01T17:00',
   '2024-06-01T18:00',
   '2024-06-01T19:00',
   '2024-06-01T20:00',
   '2024-06-01T21:00',
   '2024-06-01T22:00',
   '2024-06-01T23:00'],
  'temperature_2m': [21.2,
   18.8,
   18.3,
   17.7,
   17.2,
   16.7,
   16.1,
   15.9,
   15.7,
   15.

In [35]:
# Extract part: fetch one day of hourly weather data for latitude 41.85,
# longitude -87.65, using a date computed at run time. Because the date depends
# on when the cell is executed, the response differs from run to run.
from datetime import datetime
from dateutil.relativedelta import relativedelta

# Despite its name, current_datetime is the moment two calendar months before
# now. NOTE(review): presumably chosen so the requested day is already available
# in the historical archive - confirm.
current_datetime = datetime.now() - relativedelta(months=2)

# The request carries the date as a YYYY-MM-DD string
formatted_datetime = current_datetime.strftime("%Y-%m-%d")

url = "https://archive-api.open-meteo.com/v1/era5"
params = {
    "latitude": 41.85,
    "longitude": -87.65,
    # Same value for both bounds, i.e. a single day of data
    "start_date": formatted_datetime,
    "end_date": formatted_datetime,
    "hourly": "temperature_2m,wind_speed_10m,rain,precipitation",
}

# requests applies no timeout by default, so set one explicitly (in seconds) to
# keep the cell from hanging if the server never answers.
response = requests.get(url, params=params, timeout=30)

# Fail loudly on an HTTP error status (4xx/5xx); otherwise a failed request
# would only surface later as a confusing KeyError on weather_data["hourly"].
response.raise_for_status()

# Keep the parsed JSON response for the transformation cells, then display it
weather_data = response.json()
weather_data


{'latitude': 41.862915,
 'longitude': -87.64877,
 'generationtime_ms': 0.08404254913330078,
 'utc_offset_seconds': 0,
 'timezone': 'GMT',
 'timezone_abbreviation': 'GMT',
 'elevation': 179.0,
 'hourly_units': {'time': 'iso8601',
  'temperature_2m': '°C',
  'wind_speed_10m': 'km/h',
  'rain': 'mm',
  'precipitation': 'mm'},
 'hourly': {'time': ['2025-02-23T00:00',
   '2025-02-23T01:00',
   '2025-02-23T02:00',
   '2025-02-23T03:00',
   '2025-02-23T04:00',
   '2025-02-23T05:00',
   '2025-02-23T06:00',
   '2025-02-23T07:00',
   '2025-02-23T08:00',
   '2025-02-23T09:00',
   '2025-02-23T10:00',
   '2025-02-23T11:00',
   '2025-02-23T12:00',
   '2025-02-23T13:00',
   '2025-02-23T14:00',
   '2025-02-23T15:00',
   '2025-02-23T16:00',
   '2025-02-23T17:00',
   '2025-02-23T18:00',
   '2025-02-23T19:00',
   '2025-02-23T20:00',
   '2025-02-23T21:00',
   '2025-02-23T22:00',
   '2025-02-23T23:00'],
  'temperature_2m': [-2.3,
   -2.0,
   -2.7,
   -3.2,
   -3.7,
   -4.1,
   -4.4,
   -4.6,
   -4.8,
   -5

In [36]:
# Get the "hourly" section of the weather data: a dictionary holding the list
# of timestamps ("time") plus one list of values per requested weather variable
weather_data["hourly"]


{'time': ['2025-02-23T00:00',
  '2025-02-23T01:00',
  '2025-02-23T02:00',
  '2025-02-23T03:00',
  '2025-02-23T04:00',
  '2025-02-23T05:00',
  '2025-02-23T06:00',
  '2025-02-23T07:00',
  '2025-02-23T08:00',
  '2025-02-23T09:00',
  '2025-02-23T10:00',
  '2025-02-23T11:00',
  '2025-02-23T12:00',
  '2025-02-23T13:00',
  '2025-02-23T14:00',
  '2025-02-23T15:00',
  '2025-02-23T16:00',
  '2025-02-23T17:00',
  '2025-02-23T18:00',
  '2025-02-23T19:00',
  '2025-02-23T20:00',
  '2025-02-23T21:00',
  '2025-02-23T22:00',
  '2025-02-23T23:00'],
 'temperature_2m': [-2.3,
  -2.0,
  -2.7,
  -3.2,
  -3.7,
  -4.1,
  -4.4,
  -4.6,
  -4.8,
  -5.1,
  -5.2,
  -5.4,
  -5.3,
  -4.9,
  -3.8,
  -2.1,
  0.0,
  1.5,
  2.6,
  3.6,
  4.3,
  4.5,
  5.0,
  4.2],
 'wind_speed_10m': [12.0,
  10.1,
  9.9,
  10.7,
  10.6,
  10.4,
  9.6,
  9.4,
  10.0,
  10.2,
  10.6,
  9.9,
  9.6,
  9.3,
  10.8,
  11.2,
  10.2,
  8.5,
  8.7,
  6.3,
  5.1,
  4.2,
  1.7,
  3.8],
 'rain': [0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0

In [37]:
# Get the list of hourly timestamps (ISO 8601 strings) from the weather data
weather_data["hourly"]["time"]


['2025-02-23T00:00',
 '2025-02-23T01:00',
 '2025-02-23T02:00',
 '2025-02-23T03:00',
 '2025-02-23T04:00',
 '2025-02-23T05:00',
 '2025-02-23T06:00',
 '2025-02-23T07:00',
 '2025-02-23T08:00',
 '2025-02-23T09:00',
 '2025-02-23T10:00',
 '2025-02-23T11:00',
 '2025-02-23T12:00',
 '2025-02-23T13:00',
 '2025-02-23T14:00',
 '2025-02-23T15:00',
 '2025-02-23T16:00',
 '2025-02-23T17:00',
 '2025-02-23T18:00',
 '2025-02-23T19:00',
 '2025-02-23T20:00',
 '2025-02-23T21:00',
 '2025-02-23T22:00',
 '2025-02-23T23:00']

In [38]:
# Get the list of hourly temperature readings at 2 meters above ground (°C)
# from the weather data
weather_data["hourly"]["temperature_2m"]


[-2.3,
 -2.0,
 -2.7,
 -3.2,
 -3.7,
 -4.1,
 -4.4,
 -4.6,
 -4.8,
 -5.1,
 -5.2,
 -5.4,
 -5.3,
 -4.9,
 -3.8,
 -2.1,
 0.0,
 1.5,
 2.6,
 3.6,
 4.3,
 4.5,
 5.0,
 4.2]

In [39]:
# Build weather_data_filtered: a flat dictionary that keeps only the hourly
# series of interest from weather_data, stored under shorter names.
# NOTE(review): the "tempretaure" key is a misspelling of "temperature". It is
# kept unchanged here because it becomes the DataFrame/CSV column name further
# down; rename it only together with everything that reads that column.
hourly_series = weather_data["hourly"]

# New (short) name -> name of the series inside the API response
renamed_fields = {
    "datetime": "time",
    "tempretaure": "temperature_2m",
    "wind_speed": "wind_speed_10m",
    "rain": "rain",
    "precipitation": "precipitation",
}

weather_data_filtered = {
    new_name: hourly_series[api_name]
    for new_name, api_name in renamed_fields.items()
}
weather_data_filtered


{'datetime': ['2025-02-23T00:00',
  '2025-02-23T01:00',
  '2025-02-23T02:00',
  '2025-02-23T03:00',
  '2025-02-23T04:00',
  '2025-02-23T05:00',
  '2025-02-23T06:00',
  '2025-02-23T07:00',
  '2025-02-23T08:00',
  '2025-02-23T09:00',
  '2025-02-23T10:00',
  '2025-02-23T11:00',
  '2025-02-23T12:00',
  '2025-02-23T13:00',
  '2025-02-23T14:00',
  '2025-02-23T15:00',
  '2025-02-23T16:00',
  '2025-02-23T17:00',
  '2025-02-23T18:00',
  '2025-02-23T19:00',
  '2025-02-23T20:00',
  '2025-02-23T21:00',
  '2025-02-23T22:00',
  '2025-02-23T23:00'],
 'tempretaure': [-2.3,
  -2.0,
  -2.7,
  -3.2,
  -3.7,
  -4.1,
  -4.4,
  -4.6,
  -4.8,
  -5.1,
  -5.2,
  -5.4,
  -5.3,
  -4.9,
  -3.8,
  -2.1,
  0.0,
  1.5,
  2.6,
  3.6,
  4.3,
  4.5,
  5.0,
  4.2],
 'wind_speed': [12.0,
  10.1,
  9.9,
  10.7,
  10.6,
  10.4,
  9.6,
  9.4,
  10.0,
  10.2,
  10.6,
  9.9,
  9.6,
  9.3,
  10.8,
  11.2,
  10.2,
  8.5,
  8.7,
  6.3,
  5.1,
  4.2,
  1.7,
  3.8],
 'rain': [0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
 

In [40]:
# Load the filtered weather dictionary into a pandas DataFrame (one column per
# dictionary key) and preview its first 5 rows
weather_df = pd.DataFrame(data=weather_data_filtered)
weather_df.head(n=5)


Unnamed: 0,datetime,tempretaure,wind_speed,rain,precipitation
0,2025-02-23T00:00,-2.3,12.0,0.0,0.0
1,2025-02-23T01:00,-2.0,10.1,0.0,0.0
2,2025-02-23T02:00,-2.7,9.9,0.0,0.0
3,2025-02-23T03:00,-3.2,10.7,0.0,0.0
4,2025-02-23T04:00,-3.7,10.6,0.0,0.0


In [41]:
# Print a concise summary of weather_df: index range, column names,
# non-null counts, dtypes and memory usage
weather_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24 entries, 0 to 23
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   datetime       24 non-null     object 
 1   tempretaure    24 non-null     float64
 2   wind_speed     24 non-null     float64
 3   rain           24 non-null     float64
 4   precipitation  24 non-null     float64
dtypes: float64(4), object(1)
memory usage: 1.1+ KB


In [42]:
# Convert the datetime column from strings (dtype object) to proper
# datetime64[ns] values; this overwrites the column on the existing weather_df
weather_df["datetime"] = pd.to_datetime(weather_df["datetime"])
# Preview the first rows to check the converted column
weather_df.head()


Unnamed: 0,datetime,tempretaure,wind_speed,rain,precipitation
0,2025-02-23 00:00:00,-2.3,12.0,0.0,0.0
1,2025-02-23 01:00:00,-2.0,10.1,0.0,0.0
2,2025-02-23 02:00:00,-2.7,9.9,0.0,0.0
3,2025-02-23 03:00:00,-3.2,10.7,0.0,0.0
4,2025-02-23 04:00:00,-3.7,10.6,0.0,0.0


In [43]:
# Weather data transformation, all steps together in one cell:
# - pick the hourly series of interest out of the original JSON response
# - load them into a DataFrame
# - parse the datetime column into proper datetime64[ns] values
# - export the resulting weather_df to CSV, without the index column
# NOTE(review): the "tempretaure" key is a misspelling of "temperature" and ends
# up as a column name in the exported CSV. It is kept unchanged here; rename it
# only together with everything that reads that column.

hourly_values = weather_data["hourly"]

weather_data_filtered = {
    "datetime": hourly_values["time"],
    "tempretaure": hourly_values["temperature_2m"],
    "wind_speed": hourly_values["wind_speed_10m"],
    "rain": hourly_values["rain"],
    "precipitation": hourly_values["precipitation"],
}

# Build the frame and parse the timestamp strings in one chained expression
weather_df = pd.DataFrame(weather_data_filtered).assign(
    datetime=lambda frame: pd.to_datetime(frame["datetime"])
)

weather_df.to_csv("weather_data_date.csv", index=False)
