# Data Quality and Data Wrangling 
## Course Code: DLBDSDQDW01

## Task 2: Scrape the web

This notebook describes the implementation of Task 2 of the Data Quality and Data Wrangling course (DLBDSDQDW01). It contains the code used for experimentation and for creating the visualization according to the requirements in the task description.

### Data sources
The data was collected from the following sources:

1. [OpenWeather](https://openweathermap.org/api/one-call-3#concept): for weather data such as temperature, humidity, pressure, etc.
2. USGS Earthquake data: https://earthquake.usgs.gov/fdsnws/event/1/
3. geoBoundaries
4. XXX

In [None]:
# used packages
import os
import re
import time
from datetime import date, datetime, timezone, timedelta
from pathlib import Path
from zoneinfo import ZoneInfo

import folium
import geopandas as gpd
import h5py
import matplotlib.pyplot as plt
import pandas as pd
import requests
import seaborn as sns
from dotenv import load_dotenv
from shapely.geometry import Point

# Define location for time series

In [None]:
# German cities for which the time series is collected
cities = [
    "Berlin",
    "Munich",
    "Hamburg",
    "Baden-Baden",
]

# Weather data

Since the API provides data for a single timestamp only, and not a summary for the whole day, the request is run every 2 hours to get 12 readings per day.

In [None]:
def fetch(url, headers:dict=None, params:dict=None, timeout:float=30):
    """Send a GET request and return the decoded JSON body.

    Args:
        url: Address of the endpoint to call.
        headers: Optional HTTP headers (e.g. an API key header).
        params: Optional query-string parameters.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the notebook indefinitely.

    Returns:
        The JSON payload of the response, parsed into Python objects.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If no answer arrives within ``timeout`` seconds.
    """
    # headers used to be accepted but silently dropped; forward them now
    response = requests.get(url, headers=headers, params=params, timeout=timeout)
    response.raise_for_status()
    return response.json()

In [None]:
# geocoding endpoint - to get coordinates of the cities
# The API key is read from the environment; a local .env file is loaded first if present.
# NOTE(review): the environment variable name is assumed to match the constant - adjust if your .env differs
load_dotenv()
OPENWEATHER_API_KEY = os.environ["OPENWEATHER_API_KEY"]

# HTTPS, so the API key in the query string is not sent in clear text
url_geocoding = "https://api.openweathermap.org/geo/1.0/direct"
params_geocoding = {"q": None, "limit": 1, "appid": OPENWEATHER_API_KEY}

In [None]:
# Geocode every city, one request per city; the shared parameter dict is updated in place
responses_cities = []
for city in cities:
    params_geocoding.update(q=city)
    responses_cities.append(fetch(url_geocoding, params=params_geocoding))
    time.sleep(1)  # wait one second between consecutive calls

In [None]:
# Inspect the raw geocoding responses (one entry per requested city)
responses_cities

In [None]:
# Every API response is a list of dictionaries (one per matched place);
# flatten all of them into a single list before picking the fields
matches = [match for response_list in responses_cities for match in response_list]

city_coordinates = {
    "city": [match["name"] for match in matches],
    "country": [match["country"] for match in matches],
    "lat": [match["lat"] for match in matches],  # Y
    "lon": [match["lon"] for match in matches],  # X
}

# Keep a copy of the geocoding result on disk
df_geolocations = pd.DataFrame(city_coordinates)
df_geolocations.to_csv("geocoding_openweather.csv", index=False)

In [None]:
# Turn the lon/lat columns into point geometries (x = longitude, y = latitude).
# points_from_xy builds all points in one vectorised call and also copes with an empty table.
df_geolocations = gpd.GeoDataFrame(
    df_geolocations,
    geometry=gpd.points_from_xy(df_geolocations["lon"], df_geolocations["lat"]),
    crs="EPSG:4326",  # WGS 84 geographic coordinates
)
df_geolocations

In [None]:
# Show the geocoded cities on an interactive map to check the coordinates visually
df_geolocations.explore()

In [None]:
# weather endpoint - to get the weather at the geocoded locations
# NOTE(review): the "pro." host presumably requires a paid OpenWeather plan - confirm
url_weather = "https://pro.openweathermap.org/data/2.5/weather"

In [None]:
# Time zone used to time-stamp the weather requests
berlin = ZoneInfo("Europe/Berlin")

In [None]:
# Time of this collection run: Berlin time, second precision, ISO 8601 string
today = datetime.now(berlin).replace(microsecond=0).isoformat()

# One request per geocoded city; the raw response objects are kept for later parsing
responses_weather = []
for lon, lat in zip(df_geolocations["lon"], df_geolocations["lat"]):
    # NOTE(review): "date" is sent as in the original request - confirm that this
    # endpoint actually evaluates it
    params = {"units": "metric",
              "lon": lon,
              "lat": lat,
              "date": today,
              "appid": OPENWEATHER_API_KEY}
    # timeout: a stalled connection must not block the notebook forever
    response = requests.get(url_weather, params=params, timeout=30)
    response.raise_for_status()
    responses_weather.append(response)
    time.sleep(1)  # wait one second between consecutive calls

In [None]:
# Inspect the JSON payload of the first weather response
responses_weather[0].json()

In [None]:
# Column order of the resulting table (also used when there are no responses)
weather_columns = ["name", "temperature", "temperature_max", "temperature_min",
                   "feels_like", "humidity", "wind_speed", "wind_direction",
                   "description", "timestamp"]

weather_records = []
for response in responses_weather:
    weather_data = response.json()
    main = weather_data["main"]
    wind = weather_data["wind"]
    weather_records.append({
        # city name as reported by the weather API - presumably it can differ from the geocoded name
        "name": weather_data["name"],
        "temperature": main["temp"],
        "temperature_max": main["temp_max"],  # max temp at the moment
        "temperature_min": main["temp_min"],  # min temp at the moment
        "feels_like": main["feels_like"],  # human perception of the weather
        "humidity": main["humidity"],
        "wind_speed": wind["speed"],  # presumably m/s, since units=metric is requested
        "wind_direction": wind["deg"],
        "description": weather_data["weather"][0]["description"],
        # "dt" is converted to Berlin wall-clock time explicitly, so the stored value
        # no longer depends on the time zone of the machine running the notebook.
        # The tzinfo is dropped again to keep the naive format used so far.
        "timestamp": datetime.fromtimestamp(weather_data["dt"], tz=berlin).replace(tzinfo=None),
    })

df_weather = pd.DataFrame(weather_records, columns=weather_columns)
df_weather

In [None]:
# Combine the geocoding columns and the weather readings side by side into one dataframe
# (rows are matched on the index of the two frames)
df_weather = pd.concat([df_geolocations, df_weather], axis="columns")
df_weather

In [None]:
# If stored weather data exists, merge it with the latest readings;
# in both cases the result is written back so that the history can build up.
weather_file = Path("weather_data.csv")
if weather_file.exists():
    print("loading latest data")
    # parse the timestamps so they are comparable with the datetimes of the new readings
    history_df = pd.read_csv(weather_file, parse_dates=["timestamp"])
    # this dataframe is the final weather data. Store in staging area to combine later with further data
    df_weather = (
        pd.concat([history_df, df_weather], axis=0)
        # re-running this cell must not store the same reading twice
        .drop_duplicates(subset=["city", "timestamp"])
        .sort_values(by=["city", "timestamp"], ascending=False)
    )
else:
    print("weather_data.csv does not exist. Latest data will be stored")

# place the data in the staging area (previously skipped when the file did not exist yet)
df_weather.to_csv(weather_file, index=False)

# Air quality

In [None]:
# The OpenAQ key is read from the environment; a local .env file is loaded first if present.
# NOTE(review): the environment variable name is assumed to match the constant - adjust if your .env differs
load_dotenv()
OPENAQ_API_KEY = os.environ["OPENAQ_API_KEY"]

# The key is sent as a request header
headers = {"X-API-Key": OPENAQ_API_KEY}

In [None]:
# Step 1: Find nearby locations
# NOTE(review): the coordinates are hard-coded (presumably central Berlin) - confirm
resp_loc = requests.get(
    "https://api.openaq.org/v3/locations",
    headers=headers,
    params={"coordinates": "52.5200,13.4050", "radius": 5000, "limit": 5},
    timeout=30,  # a stalled connection must not block the notebook forever
)
# Fail here on an HTTP error (e.g. a rejected API key) instead of later while parsing
resp_loc.raise_for_status()

In [None]:
# Decode the JSON body of the locations response
locations = resp_loc.json()

In [None]:
# Top-level keys of the decoded response
locations.keys()

In [None]:
# First location in the result list
locations["results"][0]

In [None]:
# First sensor of the first location
locations["results"][0]["sensors"][0]

In [None]:
# Describe every sensor of the first location: its id plus the measured parameter and units
sensors = locations["results"][0]["sensors"]
sensor_info = {
    "sensor_id": [sensor["id"] for sensor in sensors],
    "param_display_name": [sensor["parameter"]["displayName"] for sensor in sensors],
    "param_name": [sensor["parameter"]["name"] for sensor in sensors],
    "units": [sensor["parameter"]["units"] for sensor in sensors],
}
df_sensor_info = pd.DataFrame(sensor_info)
df_sensor_info

# Earthquake data

Parameters according to the API documentation:

In [None]:
# Base URL of the USGS earthquake event web service
url = r"https://earthquake.usgs.gov/fdsnws/event/1/"

In [None]:
# According to the API documentation all times use UTC
# The query window is therefore built from timezone-aware datetimes (Tokyo time here);
# `berlin` is defined but not used in this cell
berlin = ZoneInfo("Europe/Berlin")
tokyo = ZoneInfo("Asia/Tokyo")
now = datetime.now(tokyo).replace(microsecond=0)
# NOTE: despite its name, `yesterday` lies five days before `now`
yesterday = now - timedelta(days=5)

In [None]:
# Query parameters
params = {"method" : "query", # submit a data request
          "format" : "geojson", # response format
          "minlatitude" : 24.0, # Get earthquakes in Japan
          "maxlatitude" : 46.0,
          "minlongitude" : 122.0,
          "maxlongitude" : 146,
          "limit" : 100, # Limit results to this value
          "starttime": yesterday.isoformat(), # the API expects ISO time format; start of the query window
          "endtime" : now.isoformat(), # end of the query window
          "orderby" : "time"} # sort the results from most recent to oldest

In [None]:
# Base map centred on [40, 140] (latitude, longitude) at zoom level 5
m = folium.Map(location=[40,140], zoom_start=5)
m

In [None]:
# get the data from the API
# The API method ("query") belongs to the URL path of the service, not to the
# query string, so it is taken out of the parameters and appended to the URL.
request_params = dict(params)
method = request_params.pop("method", "query")
response = requests.get(f"{url.rstrip('/')}/{method}", params=request_params, timeout=60)
# Stop on an HTTP error instead of trying to decode an error page as JSON
response.raise_for_status()
earthquakes = response.json()

In [None]:
# Fields collected for every earthquake. The names are identical to the ones in the
# API response, so they can be used directly as lookup keys while iterating.
record_fields = (
    "time",         # time when the event occurred - in milliseconds since the epoch
    "mag",          # magnitude of the event - combine with magType for interpretation
    "magType",      # magnitude types are described in the API documentation - must be mapped to a name easier to understand
    "alert",
    "tsunami",
    "place",
    "coordinates",
)
# one independent, initially empty list per field
records = {field: [] for field in record_fields}

In [None]:
# Copy the requested fields of every event into the column lists.
# A field is taken from the event's "properties" when present there, otherwise from its "geometry".
for quake in earthquakes["features"]:
    for field, values in records.items():
        section = "properties" if field in quake["properties"] else "geometry"
        values.append(quake[section][field])

In [None]:
# Human-readable names for the magnitude type codes (keys are case-sensitive)
mag_type_description = {
    # general magnitude scales
    "Mw": "Moment Magnitude",
    "Ms": "Surface Wave Magnitude",
    "Mb": "Body Wave Magnitude",
    "ML": "Local (Richter) Magnitude",
    "mB": "Broad-band Body Wave Magnitude",
    "Mb_Lg": "Lg-Wave Magnitude",
    "MD": "Duration Magnitude",
    "MH": "Hand-calculated Magnitude",
    "MI": "Intensity-derived Magnitude",
    "Me": "Energy Magnitude",
    "Mg": "Surface Wave from Ground Displacement",
    # variants of the moment magnitude
    "MWb": "Moment Magnitude from Body Waves",
    "Mwr": "Regional Moment Magnitude",
    "MwC": "Centroid Moment Magnitude",
    "MwB": "Body-wave Derived Moment Magnitude",
    "mww": "Moment Magnitude from W-phase",
}



In [None]:
#TODO: set alert to black in case no alert
df = pd.DataFrame(records)
df["geometry"] = df.coordinates.apply(lambda coord: Point(coord[:2]))
df["depth"] = df.coordinates.apply(lambda coord: coord[-1])
df = df.drop("coordinates", axis=1)
df = df.rename(columns={"time":"timestamp", "mag":"magnitude", "magType":"scale"})
df = df.replace(mag_type_description)
df["timestamp"] = pd.to_datetime(df["timestamp"], unit="ms")

In [None]:
# Inspect the cleaned earthquake table
df

In [None]:
# Wrap the table in a GeoDataFrame (coordinates in EPSG:4326, i.e. WGS 84) so it can be mapped
geodf = gpd.GeoDataFrame(df, geometry="geometry", crs="EPSG:4326")
geodf

In [None]:
# Show the earthquakes on an interactive map
geodf.explore()