In [1]:
import requests
from bs4 import BeautifulSoup
from JapanHorseRaceAnalytics.data.weather import get_session_id

# Station

In [2]:
import functools


@functools.lru_cache()
def get_prefecture_codes():
    """Scrape the prefecture name -> prefecture code mapping from the JMA station page.

    Posts ``pd=00`` to the station endpoint and reads every
    ``<div class="prefecture">`` in the returned HTML.

    Returns:
        dict: the text of each prefecture div mapped to the ``value`` of the
        ``prid`` input found inside it. The result is memoized by ``lru_cache``,
        so repeated calls share the same dict object.

    Raises:
        requests.HTTPError: when the endpoint answers with an error status.
    """
    response = requests.post(
        "https://www.data.jma.go.jp/gmd/risk/obsdl/top/station", data={"pd": "00"}
    )
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    # One entry per prefecture div: visible text -> hidden "prid" form value.
    return {
        div.text: div.find("input", {"name": "prid"}).get("value")
        for div in soup.find_all("div", {"class": "prefecture"})
    }


# Display the prefecture name -> code mapping (the call result is memoized by lru_cache).
get_prefecture_codes()

{'宗谷': '11',
 '留萌': '13',
 '上川': '12',
 '網走・北見・紋別': '17',
 '空知': '15',
 '後志': '16',
 '石狩': '14',
 '日高': '22',
 '十勝': '20',
 '根室': '18',
 '檜山': '24',
 '胆振': '21',
 '釧路': '19',
 '渡島': '23',
 '青森': '31',
 '秋田': '32',
 '岩手': '33',
 '石川': '56',
 '山形': '35',
 '宮城': '34',
 '富山': '55',
 '新潟': '54',
 '福島': '36',
 '山口': '81',
 '島根': '68',
 '鳥取': '69',
 '兵庫': '63',
 '京都': '61',
 '福井': '57',
 '岐阜': '52',
 '長野': '48',
 '群馬': '42',
 '栃木': '41',
 '茨城': '40',
 '広島': '67',
 '岡山': '66',
 '滋賀': '60',
 '山梨': '49',
 '埼玉': '43',
 '長崎': '84',
 '佐賀': '85',
 '福岡': '82',
 '大阪': '62',
 '奈良': '64',
 '三重': '53',
 '愛知': '51',
 '静岡': '50',
 '神奈川': '46',
 '東京': '44',
 '千葉': '45',
 '熊本': '86',
 '大分': '83',
 '愛媛': '73',
 '香川': '72',
 '和歌山': '65',
 '鹿児島': '88',
 '宮崎': '87',
 '高知': '74',
 '徳島': '71',
 '沖縄': '91',
 '南極': '99'}

In [3]:
import re


def _search_station_title(pattern: str, title: str) -> "re.Match":
    """Return the match of ``pattern`` in ``title`` or raise a descriptive ``ValueError``."""
    match = re.search(pattern, title)
    if match is None:
        raise ValueError(f"Pattern {pattern!r} not found in station title {title!r}")
    return match


def _parse_station_coordinate(title: str, positive: str, negative: str) -> float:
    """Convert a ``<label>：<deg>度<min>分`` title field into signed decimal degrees.

    ``positive`` / ``negative`` are the two hemisphere labels accepted for the
    field; a value introduced by the ``negative`` label is returned with a
    minus sign. The fractional part of the minutes is optional.
    """
    hemisphere, degrees, minutes = _search_station_title(
        rf"({positive}|{negative})：(\d+)度(\d+(?:\.\d+)?)分", title
    ).groups()
    value = float(degrees) + float(minutes) / 60
    return -value if hemisphere == negative else value


@functools.lru_cache()
def get_stations(*, prefecture_code: str):
    """Scrape the observation stations listed for one prefecture.

    Args:
        prefecture_code: value posted as ``pd`` to the station endpoint,
            e.g. ``"45"``.

    Returns:
        list[dict]: one record per ``<div class="station">`` with the keys
        ``station_code``, ``station_name``, ``prefecture_code``,
        ``observations`` (bool flags ``rain``/``wind``/``temp``/``sun``/``snow``),
        ``latitude``, ``longitude`` (decimal degrees; 南緯/西経 values are
        negative), ``station_name_kana`` and ``altitude_meters``.
        The list is memoized per ``prefecture_code`` by ``lru_cache``.

    Raises:
        requests.HTTPError: when the endpoint answers with an error status.
        ValueError: when a station's ``title`` attribute lacks one of the
            expected fields (previously an opaque ``AttributeError`` on ``None``).
    """
    # Position of each observation flag inside the "kansoku" form value.
    observation_names = ("rain", "wind", "temp", "sun", "snow")

    req = requests.post(
        "https://www.data.jma.go.jp/gmd/risk/obsdl/top/station",
        data={"pd": prefecture_code},
    )
    req.raise_for_status()
    soup = BeautifulSoup(req.content, "html.parser")

    result = []
    for station in soup.find_all("div", {"class": "station"}):
        title = station["title"]

        # A flag character of "1" marks the observation as available.
        kansoku_raw = station.find("input", {"name": "kansoku"}).get("value")
        observations = {
            name: kansoku_raw[position] == "1"
            for position, name in enumerate(observation_names)
        }

        altitude = _search_station_title(r"標高：(.+)m", title).groups()[0]
        kana = _search_station_title(r"カナ:(.+)", title).groups()[0]

        result.append(
            {
                "station_code": station.find("input", {"name": "stid"}).get("value"),
                "station_name": station.find("input", {"name": "stname"}).get("value"),
                "prefecture_code": station.find("input", {"name": "prid"}).get("value"),
                "observations": observations,
                # Accept either hemisphere label so stations outside the
                # north/east quadrant parse instead of crashing the scrape.
                "latitude": _parse_station_coordinate(title, "北緯", "南緯"),
                "longitude": _parse_station_coordinate(title, "東経", "西経"),
                "station_name_kana": kana,
                "altitude_meters": float(altitude),
            }
        )

    return result


@functools.lru_cache()
def get_station_by_name(*, name: str, prefecture_code: "str | None" = None):
    """Find the station record whose ``station_name`` equals ``name``.

    Args:
        name: exact station name to look for, e.g. ``"船橋"``.
        prefecture_code: optional prefecture code (a value of
            ``get_prefecture_codes()``). When given, only that prefecture is
            searched, which both avoids querying every prefecture and lets the
            caller pick the intended station when a name occurs in more than
            one prefecture. When omitted, every prefecture is searched in the
            iteration order of ``get_prefecture_codes()`` and the FIRST match
            is returned.

    Returns:
        dict: the matching record produced by ``get_stations``.

    Raises:
        ValueError: when no station with that name is found.
    """
    if prefecture_code is not None:
        codes_to_search = [prefecture_code]
    else:
        codes_to_search = get_prefecture_codes().values()

    for code in codes_to_search:
        for station in get_stations(prefecture_code=code):
            if station["station_name"] == name:
                return station
    raise ValueError(f"Station {name} not found")


# Example lookup: display the station record whose name is "船橋".
get_station_by_name(name="船橋")

{'station_code': 'a1236',
 'station_name': '船橋',
 'prefecture_code': '45',
 'observations': {'rain': True,
  'wind': True,
  'temp': True,
  'sun': False,
  'snow': False},
 'latitude': 35.711666666666666,
 'longitude': 140.04333333333332,
 'station_name_kana': 'フナバシ',
 'altitude_meters': 28.0}

# Periods

In [4]:
def get_period_codes():
    """Scrape the aggregation-period label -> code mapping from the JMA element page.

    Only ``<label>`` tags inside ``<div id="aggrgPeriod">`` that contain both an
    ``<input>`` and a ``<span>`` are read; choices offered through a ``<select>``
    tag are not handled.

    Returns:
        dict: stripped span text mapped to the input's ``value`` attribute.

    Raises:
        requests.HTTPError: when the endpoint answers with an error status.
        ValueError: when the page has no ``aggrgPeriod`` container.
    """
    req = requests.get("https://www.data.jma.go.jp/gmd/risk/obsdl/top/element")
    # Fail on HTTP errors here, like the station scrapers do, instead of
    # trying to parse an error page.
    req.raise_for_status()
    soup = BeautifulSoup(req.content, "html.parser")

    container = soup.find("div", {"id": "aggrgPeriod"})
    if container is None:
        raise ValueError("aggrgPeriod container not found in the JMA element page")

    result = {}
    for label in container.find_all("label"):
        input_tag = label.find("input")
        span_tag = label.find("span")
        if input_tag and span_tag:
            result[span_tag.text.strip()] = input_tag.get("value")
    return result


# Display the period label -> code mapping scraped from the element page.
get_period_codes()

{'時別値': '9', '日別値': '1', '半旬別値': '2', '旬別値': '4', '月別値': '5', '3か月別値': '6'}

# Elements

In [5]:
def get_element_codes(*, period_code: str):
    """Scrape the element name -> element code mapping for one aggregation period.

    Args:
        period_code: value posted as ``aggrgPeriod``, e.g. ``"9"``.

    Returns:
        dict: the ``id`` attribute of every ``<input type="checkbox"
        name="element">`` mapped to its ``value`` attribute.

    Raises:
        requests.HTTPError: when the endpoint answers with an error status.
        KeyError: when a matching checkbox lacks an ``id`` or ``value`` attribute.
    """
    req = requests.post(
        "https://www.data.jma.go.jp/gmd/risk/obsdl/top/element",
        data={"aggrgPeriod": period_code, "isTypeNumber": "1"},
    )
    # Fail on HTTP errors here, like the station scrapers do, instead of
    # silently returning an empty mapping scraped from an error page.
    req.raise_for_status()
    soup = BeautifulSoup(req.content, "html.parser")
    return {
        input_tag["id"]: input_tag["value"]
        for input_tag in soup.find_all("input", {"type": "checkbox", "name": "element"})
    }


# Display the element name -> code mapping for the "時別値" period.
get_element_codes(period_code=get_period_codes()["時別値"])

{'気温': '201',
 '降水量': '101',
 '降雪の深さ': '503',
 '積雪の深さ': '501',
 '日照時間': '401',
 '風向・風速': '301',
 '全天日射量': '610',
 '現地気圧': '601',
 '海面気圧': '602',
 '相対湿度': '605',
 '蒸気圧': '604',
 '露点温度': '612',
 '天気': '703',
 '雲量': '607',
 '視程': '704'}

# Single CSV Download

In [104]:
import datetime
import io
import json
import os
import re
from typing import List

import pandas as pd
import pytz


def download_hourly_csv(
    *,
    station_code: str,
    element_codes: List[str],
    start_date: datetime.date,
    end_date: datetime.date,
):
    """Download one CSV from the JMA ``show/table`` endpoint as a DataFrame.

    Args:
        station_code: station identifier, e.g. the ``station_code`` of a
            ``get_stations`` record.
        element_codes: element codes to request, e.g. ``["201", "101", "503"]``.
        start_date: first requested day (only ``year``/``month``/``day`` are read).
        end_date: last requested day (only ``year``/``month``/``day`` are read).

    Returns:
        pandas.DataFrame: the CSV body with its three header rows flattened
        into single column names joined by ``"_"`` (parenthesised parts and
        ``Unnamed:`` placeholders removed), ``年月日時`` parsed to
        ``datetime64[ms]``, plus two added columns: ``download_timestamp``
        (Asia/Tokyo-aware, parsed from the first line of the response) and
        ``station_name`` (first field of the third line). Rows whose calendar
        date falls outside ``[start_date, end_date]`` are dropped.

    Raises:
        requests.HTTPError: when the endpoint answers with an error status.
        ValueError: when the response does not start with the download
            timestamp line, i.e. it is not the expected CSV.
    """
    ymd_list = [
        str(start_date.year),
        str(end_date.year),
        str(start_date.month),
        str(end_date.month),
        str(start_date.day),
        str(end_date.day),
    ]
    # List-valued form fields are sent as JSON text. json.dumps yields the same
    # text as the former str(...).replace("'", '"') for plain codes, and stays
    # valid if a value ever contains a quote character.
    params = {
        "PHPSESSID": get_session_id(),
        "aggrgPeriod": 9,
        "csvFlag": 1,
        "disconnectFlag": 1,
        "downloadFlag": "true",
        "elementNumList": json.dumps(
            [[code, ""] for code in element_codes], ensure_ascii=False
        ),
        "huukouFlag": 0,
        "interAnnualFlag": 1,
        "jikantaiFlag": 0,
        "jikantaiList": json.dumps([1, 24]),
        "kijiFlag": 0,
        "optionNumList": [],
        "rmkFlag": 1,
        "stationNumList": json.dumps([station_code], ensure_ascii=False),
        "ymdList": json.dumps(ymd_list),
        "ymdLiteral": 1,
        "youbiFlag": 0,
    }
    req = requests.post("https://www.data.jma.go.jp/risk/obsdl/show/table", data=params)
    req.raise_for_status()

    # Decode the body once; every later step works on this text.
    body = req.content.decode("cp932")
    lines = body.splitlines()

    stamp_match = (
        re.search(r"ダウンロードした時刻：(.+)", lines[0]) if lines else None
    )
    if stamp_match is None:
        raise ValueError(
            f"Unexpected response for station {station_code}: "
            "download timestamp line not found"
        )
    download_timestamp = datetime.datetime.strptime(
        stamp_match.groups()[0], "%Y/%m/%d %H:%M:%S"
    )
    download_timestamp = pytz.timezone("Asia/Tokyo").localize(download_timestamp)

    # The first three lines are preamble; the next three form a multi-row header.
    df = pd.read_csv(io.StringIO(body), skiprows=[0, 1, 2], header=[0, 1, 2])
    df.columns = [
        "_".join(
            [re.sub(r"\(.+\)", "", x) for x in col if not x.startswith("Unnamed:")]
        )
        for col in df.columns.to_flat_index()
    ]
    df["年月日時"] = pd.to_datetime(df["年月日時"]).astype("datetime64[ms]")
    df["download_timestamp"] = download_timestamp
    df["station_name"] = lines[2].strip(",").split(",")[0]

    # Keep only rows whose calendar date lies inside the requested range.
    first_day = pd.Timestamp(start_date).date()
    last_day = pd.Timestamp(end_date).date()
    row_dates = df["年月日時"].dt.date
    df = df[(row_dates >= first_day) & (row_dates <= last_day)]
    return df

In [35]:
# Demo: download 22 calendar days (2001-01-01 .. 2001-01-22) of data for "船橋"
# with every element code listed for the "時別値" period.
# NOTE(review): the output recorded for this cell is a NameError for
# get_station_by_name, i.e. it was last run in a kernel where the station cell
# had not been executed. Re-run the notebook top to bottom to refresh it.
station_code = get_station_by_name(name="船橋")["station_code"]
start_date = datetime.date(2001, 1, 1)
end_date = start_date + datetime.timedelta(days=21)
element_codes = list(get_element_codes(period_code=get_period_codes()["時別値"]).values())

result = download_hourly_csv(
    station_code=station_code,
    element_codes=element_codes,
    start_date=start_date,
    end_date=end_date,
)

# Last expression of the cell: renders the downloaded frame.
result

NameError: name 'get_station_by_name' is not defined

# Bulk download

In [105]:
import time


def bulk_download(
    station_name,
    download_start_date,
    download_end_date,
    output_dir,
    delay_seconds=1,
    chunk_days=25,
):
    """Download a station's data in fixed-size chunks, one CSV file per chunk.

    Args:
        station_name: station name resolved through ``get_station_by_name``.
        download_start_date: first day to download.
        download_end_date: last day to download; no request goes past it.
        output_dir: directory receiving the CSV files (created when missing).
        delay_seconds: pause after each request.
        chunk_days: distance in days between chunk starts (default 25, the
            previously hard-coded value).

    Each chunk covers ``start .. start + chunk_days`` clamped to
    ``download_end_date``, so consecutive chunks share their boundary day and
    that day appears in two files.

    Raises:
        ValueError: when ``chunk_days`` is smaller than 1, or when the station
            name is unknown (raised by ``get_station_by_name``).
    """
    if chunk_days < 1:
        raise ValueError(f"chunk_days must be >= 1, got {chunk_days}")

    station_code = get_station_by_name(name=station_name)["station_code"]
    period_code = get_period_codes()["時別値"]
    element_codes = list(get_element_codes(period_code=period_code).values())

    # Create the target directory up front so the first to_csv cannot fail
    # after a download has already been made.
    os.makedirs(output_dir, exist_ok=True)

    last_day = pd.Timestamp(download_end_date)
    for start_date in pd.date_range(
        start=download_start_date, end=download_end_date, freq=f"{chunk_days}D"
    ):
        # Clamp the final chunk: without this it would request up to
        # chunk_days beyond download_end_date.
        end_date = min(start_date + datetime.timedelta(days=chunk_days), last_day)
        result = download_hourly_csv(
            station_code=station_code,
            element_codes=element_codes,
            start_date=start_date,
            end_date=end_date,
        )
        if result.empty:
            # No first row to read the timestamp from; report instead of crashing.
            print(f"No rows for {station_name} {start_date.date()}-{end_date.date()}")
        else:
            # Make the timestamp safe for use inside a file name.
            download_timestamp = (
                str(result["download_timestamp"].iloc[0])
                .replace(" ", "_")
                .replace(":", "-")
                .replace("+", "-")
            )
            output_file = os.path.join(
                output_dir,
                f"station_name={station_name}&start_date={start_date.date()}&end_date={end_date.date()}&download_timestamp={download_timestamp}.csv",
            )
            result.to_csv(output_file, index=False)
            print(f"Downloaded {station_name} {start_date.date()}-{end_date.date()}")
        time.sleep(delay_seconds)

In [9]:
# Stations to download. Commented-out entries are skipped by the loop below.
# The trailing comment on each entry presumably names the racecourse that the
# station stands in for — TODO confirm.
station_names = [
    # "札幌",  # Sapporo
    # "函館",  # Hakodate
    # "福島",  # Fukushima
    # "新潟",  # Niigata
    # "府中",  # Tokyo
    # "船橋",  # Nakayama
    # "名古屋",  # Chukyo
    # "京都",  # Kyoto
    "大阪",  # Hanshin
    "八幡",  # Kokura
]

# Output location (relative to the working directory) and overall date range.
output_dir = "data/jma"
download_start_date = datetime.date(2001, 1, 1)
download_end_date = datetime.date(2024, 1, 1)

# Download station after station, pausing 10 seconds between requests.
for station_name in station_names:
    bulk_download(
        station_name,
        download_start_date,
        download_end_date,
        output_dir,
        delay_seconds=10,
    )

Downloaded 大阪 2001-01-01-2001-01-26
Downloaded 大阪 2001-01-26-2001-02-20
Downloaded 大阪 2001-02-20-2001-03-17
Downloaded 大阪 2001-03-17-2001-04-11
Downloaded 大阪 2001-04-11-2001-05-06
Downloaded 大阪 2001-05-06-2001-05-31
Downloaded 大阪 2001-05-31-2001-06-25
Downloaded 大阪 2001-06-25-2001-07-20
Downloaded 大阪 2001-07-20-2001-08-14
Downloaded 大阪 2001-08-14-2001-09-08
Downloaded 大阪 2001-09-08-2001-10-03
Downloaded 大阪 2001-10-03-2001-10-28
Downloaded 大阪 2001-10-28-2001-11-22
Downloaded 大阪 2001-11-22-2001-12-17
Downloaded 大阪 2001-12-17-2002-01-11
Downloaded 大阪 2002-01-11-2002-02-05
Downloaded 大阪 2002-02-05-2002-03-02
Downloaded 大阪 2002-03-02-2002-03-27
Downloaded 大阪 2002-03-27-2002-04-21
Downloaded 大阪 2002-04-21-2002-05-16
Downloaded 大阪 2002-05-16-2002-06-10
Downloaded 大阪 2002-06-10-2002-07-05
Downloaded 大阪 2002-07-05-2002-07-30
Downloaded 大阪 2002-07-30-2002-08-24
Downloaded 大阪 2002-08-24-2002-09-18
Downloaded 大阪 2002-09-18-2002-10-13
Downloaded 大阪 2002-10-13-2002-11-07
Downloaded 大阪 2002-11-07-200

# Load files to database

The business key is the combination of `station_name` and `年月日時`.

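On conflict, take the record with the latest `download_timestamp`. Note that the load cell below writes every row from the CSV files as-is, so this rule is not applied at load time.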

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as f


import os  # cell-local: this cell runs in its own kernel session (see its imports above)

# Spark session with the PostgreSQL JDBC driver jar on the classpath.
spark = SparkSession.builder.config("spark.jars", "postgresql-42.7.1.jar").getOrCreate()

# Read every CSV under data/jma (first line as header) and add a surrogate key.
# monotonically_increasing_id yields unique, increasing but not consecutive ids.
df_weather_hourly = spark.read.csv("data/jma", header=True).withColumn(
    "weather_hourly_sk", f.monotonically_increasing_id()
)

# Connection settings can be overridden through environment variables so that
# real credentials never have to be written into the notebook; the fallbacks
# are the local development values used so far.
options = {
    "url": os.environ.get("JRDB_JDBC_URL", "jdbc:postgresql://localhost:5432/jrdb"),
    "user": os.environ.get("JRDB_DB_USER", "admin"),
    "password": os.environ.get("JRDB_DB_PASSWORD", "admin"),
    "driver": "org.postgresql.Driver",
    "dbtable": "jrdb_raw.weather_hourly",
}

# "overwrite" replaces the target table with the rows read above.
df_weather_hourly.write.mode("overwrite").format("jdbc").options(**options).save()

24/01/20 09:31:23 WARN Utils: Your hostname, Hanks-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.40.105 instead (on interface en0)
24/01/20 09:31:23 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
24/01/20 09:31:23 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/01/20 09:31:26 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                