<a href="https://colab.research.google.com/github/imabari/covid19-data/blob/master/hyougo%5Chyougo_json_isoformat.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install jsonschema



In [2]:
!pip install pycurl
!pip install retry

Collecting pycurl
[?25l  Downloading https://files.pythonhosted.org/packages/ef/05/4b773f74f830a90a326b06f9b24e65506302ab049e825a3c0b60b1a6e26a/pycurl-7.43.0.5.tar.gz (216kB)
[K     |█▌                              | 10kB 16.8MB/s eta 0:00:01[K     |███                             | 20kB 2.1MB/s eta 0:00:01[K     |████▌                           | 30kB 2.8MB/s eta 0:00:01[K     |██████                          | 40kB 3.1MB/s eta 0:00:01[K     |███████▌                        | 51kB 2.5MB/s eta 0:00:01[K     |█████████                       | 61kB 2.8MB/s eta 0:00:01[K     |██████████▋                     | 71kB 3.0MB/s eta 0:00:01[K     |████████████                    | 81kB 3.4MB/s eta 0:00:01[K     |█████████████▋                  | 92kB 3.5MB/s eta 0:00:01[K     |███████████████                 | 102kB 3.3MB/s eta 0:00:01[K     |████████████████▋               | 112kB 3.3MB/s eta 0:00:01[K     |██████████████████▏             | 122kB 3.3MB/s eta 0:00:01[K

In [3]:
# Remote inputs that the notebook downloads with get_file().
PCR_XLSX = "https://web.pref.hyogo.lg.jp/kk03/documents/pcr.xlsx"
YOUSEI_XLSX = "https://web.pref.hyogo.lg.jp/kk03/documents/yousei.xlsx"
# HTML page that is scraped for the link to the patient workbook.
KANJA_HTML = "https://web.pref.hyogo.lg.jp/kk03/corona_kanjyajyokyo.html"

# Local folders: DOWNLOAD_DIR receives the downloads, DATA_DIR the generated JSON.
DOWNLOAD_DIR = "download"
DATA_DIR = "data"

In [4]:
from retry import retry
import pathlib

# ダウンロード

In [5]:
@retry(tries=5, delay=5, backoff=3)
def get_file(url, dir="."):
    """Download ``url`` into ``dir`` and return the path of the saved file.

    The local file name is the last path component of ``url``. ``dir`` and any
    missing parent folders are created on demand.

    Args:
        url: Address of the file to download.
        dir: Destination folder (defaults to the current directory).

    Returns:
        pathlib.Path of the file that was written.

    Raises:
        requests.RequestException: on connection problems, timeouts or an
            HTTP error status. Any exception makes the ``@retry`` decorator
            run the function again (tries=5, delay=5, backoff=3).
    """

    # A timeout turns a stalled connection into an exception that can be retried.
    r = requests.get(url, timeout=60)

    # requests does not raise on 4xx/5xx by itself; without this check an error
    # page would be saved as if it were the requested file and never retried.
    r.raise_for_status()

    p = pathlib.Path(dir, pathlib.PurePath(url).name)
    p.parent.mkdir(parents=True, exist_ok=True)

    with p.open(mode="wb") as fw:
        fw.write(r.content)

    return p

# SCHEMA

In [6]:
# JSON Schema for the age payload: "data" maps arbitrary keys to integers and
# "last_update" is a date-time string; both keys are required.
AGE_SCHEMA = {
    "$schema": "http://json-schema.org/draft-07/schema#",
    "type": "object",
    "properties": {
        "data": {
            "type": "object",
            "additionalProperties": {
                "default": 0,
                "type": "integer"
            }
        },
        "last_update": {
            "format": "date-time",
            "type": "string"
        }
    },
    "required": [
        "data",
        "last_update"
    ],
}

In [7]:
# JSON Schema for the age summary payload: "data" maps each key to a list of
# integers, "labels" holds "M/D" strings and "last_update" is a date-time string.
AGE_SUMMARY_SCHEMA = {
    "$schema": "http://json-schema.org/draft-07/schema#",
    "type": "object",
    "additionalProperties": False,
    "properties": {
        "data": {
            "type": "object",
            "additionalProperties": {
                "type": "array",
                "items": {"default": 0, "type": "integer"},
            },
        },
        "labels": {
            "type": "array",
            "items": {"pattern": "^[0-9]{1,2}/[0-9]{1,2}$", "type": "string"},
        },
        "last_update": {
            "format": "date-time",
            "type": "string",
        },
    },
    "required": ["data", "labels", "last_update"],
}

In [8]:
# JSON Schema for the clusters payload: "data" is a list of objects and
# "last_update" a date-time string. Each item must satisfy exactly one of the
# two "oneOf" sub-schemas ("日付" is a string, or every property is an integer).
CLUSTERS_SCHEMA = {
    "$schema": "http://json-schema.org/draft-07/schema#",
    "type": "object",
    "additionalProperties": False,
    "properties": {
        "data": {
            "type": "array",
            "items": {
                "type": "object",
                "oneOf": [
                    {
                        "properties": {"日付": {"type": "string", "format": "date-time"}},
                    },
                    {
                        "additionalProperties": {"type": "integer"},
                    }
                ],
            },
        },
        "last_update": {"type": "string", "format": "date-time"},
    },
    "required": ["data", "last_update"],
}

In [9]:
# JSON Schema for the clusters summary payload: "data" maps arbitrary keys to
# integers and "last_update" is a date-time string.
CLUSTERS_SUMMARY_SCHEMA = {
    "$schema": "http://json-schema.org/draft-07/schema#",
    "type": "object",
    "additionalProperties": False,
    "properties": {
        "data": {
            "type": "object",
            "additionalProperties": {"default": 0, "type": "integer"},
        },
        "last_update": {
            "format": "date-time",
            "type": "string",
        },
    },
    "required": ["data", "last_update"],
}

In [10]:
# JSON Schema for the inspections payload: "data" is a list of per-day records
# with a date string, two count fields (one of them a nested object of
# integers) and the number of confirmed positives.
INSPECTIONS_SCHEMA = {
    "$schema": "http://json-schema.org/draft-07/schema#",
    "type": "object",
    "additionalProperties": False,
    "properties": {
        "data": {
            "type": "array",
            "items": {
                "type": "object",
                "additionalProperties": False,
                "properties": {
                    "判明日": {"type": "string", "format": "date"},
                    "地方衛生研究所等": {"type": "integer"},
                    "民間検査機関等": {
                        "type": "object",
                        "additionalProperties": {"type": "integer"},
                    },
                    "陽性確認": {"type": "integer"},
                },
                "required": ["判明日", "地方衛生研究所等", "民間検査機関等", "陽性確認"],
            },
        },
        "last_update": {"type": "string", "format": "date-time"},
    },
    "required": ["data", "last_update"],
}

In [11]:
# JSON Schema for the inspections summary payload: "data" maps each key to a
# list of integers, "labels" holds "M/D" strings and "last_update" is a
# date-time string.
INSPECTIONS_SUMMARY_SCHEMA = {
    "$schema": "http://json-schema.org/draft-07/schema#",
    "type": "object",
    "additionalProperties": False,
    "properties": {
        "data": {
            "type": "object",
            "additionalProperties": {
                "type": "array",
                "items": {"default": 0, "type": "integer"},
            },
        },
        "labels": {
            "type": "array",
            "items": {"pattern": r"^[0-9]{1,2}/[0-9]{1,2}$", "type": "string"},
        },
        "last_update": {
            "format": "date-time",
            "type": "string",
        },
    },
    "required": ["data", "labels", "last_update"],
}

In [12]:
# JSON Schema for the main summary payload: a tree of {"attr", "value",
# "children"} nodes, four levels deep (Main -> Inspections -> Patients ->
# Symptoms). Only the root carries "last_update".
# NOTE(review): this schema declares draft-06 while the other schemas in this
# notebook declare draft-07 — confirm whether that is intentional.
MAIN_SUMMARY_SCHEMA = {
    "$schema": "http://json-schema.org/draft-06/schema#",
    "$ref": "#/definitions/Main",
    "definitions": {
        "Main": {
            "type": "object",
            "additionalProperties": False,
            "properties": {
                "attr": {"type": "string"},
                "value": {"type": "integer", "default": 0},
                "children": {
                    "type": "array",
                    "items": {"$ref": "#/definitions/Inspections"},
                },
                "last_update": {
                    "format": "date-time",
                    "type": "string",
                },
            },
            "required": ["attr", "children", "last_update", "value"],
            "title": "Main",
        },
        "Inspections": {
            "type": "object",
            "additionalProperties": False,
            "properties": {
                "attr": {"type": "string"},
                "value": {"type": "integer", "default": 0},
                "children": {
                    "type": "array",
                    "items": {"$ref": "#/definitions/Patients"},
                },
            },
            "required": ["attr", "children", "value"],
            "title": "Inspections",
        },
        "Patients": {
            "type": "object",
            "additionalProperties": False,
            "properties": {
                "attr": {"type": "string"},
                "value": {"type": "integer", "default": 0},
                "children": {
                    "type": "array",
                    "items": {"$ref": "#/definitions/Symptoms"},
                },
            },
            "required": ["attr", "value"],
            "title": "Patients",
        },
        "Symptoms": {
            "type": "object",
            "additionalProperties": False,
            "properties": {
                "attr": {"type": "string"},
                "value": {"type": "integer", "default": 0},
            },
            "required": ["attr", "value"],
            "title": "Symptoms",
        },
    },
}

In [13]:
# JSON Schema for the patients payload: "data" is a list of per-patient
# records (Datum) and "last_update" a date-time string.
PATIENTS_SCHEMA = {
    "$schema": "http://json-schema.org/draft-07/schema#",
    "$ref": "#/definitions/Main",
    "definitions": {
        "Main": {
            "type": "object",
            "additionalProperties": False,
            "properties": {
                "data": {"type": "array", "items": {"$ref": "#/definitions/Datum"}},
                # Same declaration as "last_update" in the other schemas.
                "last_update": {"type": "string", "format": "date-time"},
            },
            "required": ["data", "last_update"],
            "title": "Main",
        },
        "Datum": {
            "type": "object",
            "additionalProperties": False,
            "properties": {
                "No": {"type": "integer"},
                "居住地": {"type": "string"},
                "年代": {"$ref": "#/definitions/Age"},
                "性別": {"$ref": "#/definitions/Sex"},
                "備考": {"type": "string"},
                "退院": {"type": "null"},
                # "date" is a plain YYYY-MM-DD string ...
                "date": {"type": "string", "format": "date"},
                # ... while "リリース日" is written as a full ISO 8601 timestamp
                # with a UTC offset, so it is a date-time, not a date.
                "リリース日": {"type": "string", "format": "date-time"},
                "曜日": {"$ref": "#/definitions/Week"},
            },
            "required": ["date", "リリース日", "備考", "居住地", "年代", "性別", "曜日", "No", "退院"],
            "title": "Datum",
        },
        "Age": {
            "type": "string",
            "enum": [
                "10歳未満",
                "10代",
                "20代",
                "30代",
                "40代",
                "50代",
                "60代",
                "70代",
                "80代",
                "90歳以上",
                "非公表",
            ],
            "title": "Age",
        },
        "Sex": {"type": "string", "enum": ["男性", "女性", "非公表"], "title": "Sex"},
        "Week": {
            "type": "string",
            "enum": ["月", "火", "水", "木", "金", "土", "日"],
            "title": "Week",
        },
    },
}

In [14]:
# JSON Schema for the daily-count payloads (patients summary and current
# patients): "data" is a list of {"日付", "小計"} records and "last_update" a
# date-time string.
PATIENTS_SUMMARY_SCHEMA = {
    "$schema": "http://json-schema.org/draft-07/schema#",
    "type": "object",
    "additionalProperties": False,
    "properties": {
        "data": {
            "type": "array",
            "items": {
                "type": "object",
                "additionalProperties": False,
                "properties": {
                    # The payloads carry a full ISO 8601 timestamp with a UTC
                    # offset here, so the format is date-time rather than date.
                    "日付": {"type": "string", "format": "date-time"},
                    "小計": {"default": 0, "type": "integer"},
                },
                "required": ["小計", "日付"],
            },
        },
        "last_update": {
            "format": "date-time",
            "type": "string",
        },
    },
    "required": ["data", "last_update"],
}

# データラングリング

In [15]:
import datetime
import json

import jsonschema
import pandas as pd
import requests

In [16]:
from bs4 import BeautifulSoup
from urllib.parse import urljoin

In [17]:
def dumps_json(file_name, json_data, dir=DATA_DIR):
    """Write ``json_data`` as pretty-printed JSON to ``dir/file_name``.

    Args:
        file_name: Name of the JSON file to create.
        json_data: JSON-serialisable object to dump.
        dir: Destination folder (defaults to ``DATA_DIR``); created if missing.
    """

    p = pathlib.Path(dir, file_name)

    p.parent.mkdir(parents=True, exist_ok=True)

    # ensure_ascii=False emits non-ASCII text verbatim, so the encoding must be
    # fixed to UTF-8 instead of depending on the platform's default locale.
    with p.open(mode="w", encoding="utf-8") as fw:
        json.dump(json_data, fw, ensure_ascii=False, indent=4)

In [18]:
def json_check(fn, d):
    """Print ``fn`` when ``d`` differs from the published file of that name.

    The reference copy is fetched from the gh-pages branch of the
    stop-covid19-hyogo/covid19-scraping repository and decoded as JSON.
    Nothing is printed when both objects compare equal.
    """
    published = requests.get(
        "https://raw.githubusercontent.com/stop-covid19-hyogo/covid19-scraping/gh-pages/"
        + fn
    ).json()

    is_same = d == published
    if is_same:
        return

    print(fn)

In [19]:
# Last-update timestamp: current time in a fixed UTC+9 zone.
JST = datetime.timezone(offset=datetime.timedelta(hours=9))
dt_now = datetime.datetime.now(tz=JST)

In [20]:
# Truncate "now" to midnight of the same day; this value is stamped into every
# generated JSON file as "last_update".
last_update = dt_now.replace(hour=0, minute=0, second=0, microsecond=0)
# last_update -= datetime.timedelta(days=1)

## pcr.xlsx

In [21]:
# Download the PCR workbook into DOWNLOAD_DIR; get_file returns the local path.
pcr_path = get_file(PCR_XLSX, DOWNLOAD_DIR)

In [22]:
# Load the sheet indexed by "年月日"; blanks become 0 and every column is cast to int.
df_pcr = pd.read_excel(pcr_path, index_col="年月日").fillna(0).astype(int)

In [23]:
# Replace the long spreadsheet headers with the short names used downstream.
df_pcr = df_pcr.rename(
    columns={
        "検査件数（合計）": "合計",
        "うち地方衛生研究所等によるPCR検査件数": "地方衛生研究所等",
        "うち民間検査機関等によるPCR検査件数": "民間検査機関等_PCR検査",
        "うち民間検査機関等による抗原検査件数": "民間検査機関等_抗原検査",
        "陽性件数": "陽性確認",
    }
)

In [24]:
# Private-lab total = PCR tests + antigen tests.
df_pcr["民間検査機関等"] = df_pcr["民間検査機関等_PCR検査"].add(
    df_pcr["民間検査機関等_抗原検査"]
)

In [25]:
# ISO 8601 timestamp in the Asia/Tokyo zone for every index value.
df_pcr["日付"] = df_pcr.index.map(lambda d: pd.Timestamp(d, tz='Asia/Tokyo').isoformat())

In [26]:
# inspections_summary

# Daily counts for the two laboratory categories.
df_insp_sum = df_pcr.loc[:, ["地方衛生研究所等", "民間検査機関等"]].copy()

# One "month/day" label per row.
labels = df_insp_sum.index.map(lambda x: f"{x.month}/{x.day}")

In [27]:
# Payload: one list of values per column, the shared labels and the update stamp.
inspections_summary = {
    "data": df_insp_sum.to_dict(orient="list"),
    "labels": labels.tolist(),
    "last_update": last_update.isoformat(),
}

In [28]:
# Check the payload against its schema, then write it out with dumps_json.
jsonschema.validate(inspections_summary, INSPECTIONS_SUMMARY_SCHEMA)

dumps_json("inspections_summary.json", inspections_summary)

In [29]:
# inspections

df_insp = df_pcr.loc[:, ["地方衛生研究所等", "民間検査機関等_PCR検査", "民間検査機関等_抗原検査", "陽性確認"]].copy()
# Date-only string ("YYYY-MM-DD") taken from the index.
df_insp["判明日"] = df_insp.index.strftime("%Y-%m-%d")

In [30]:
# Order the rows by their index before building the records.
df_insp.sort_index(inplace=True)

In [31]:
# One record per row; the two private-lab columns are nested under "民間検査機関等".
insp_dict = [
    {
        "判明日": row["判明日"],
        "地方衛生研究所等": row["地方衛生研究所等"],
        "民間検査機関等": {"PCR検査": row["民間検査機関等_PCR検査"], "抗原検査": row["民間検査機関等_抗原検査"]},
        "陽性確認": row["陽性確認"],
    }
    for _, row in df_insp.iterrows()
]

In [32]:
# Payload: the per-day records plus the update stamp.
inspections = {
    "data": insp_dict,
    "last_update": last_update.isoformat(),
}

In [33]:
# NOTE(review): schema validation is commented out for this payload, unlike the
# other outputs — confirm whether that is intentional.
# jsonschema.validate(inspections, INSPECTIONS_SCHEMA)
dumps_json("inspections.json", inspections)

In [34]:
# patients_summary

# Date stamp and confirmed positives per day, with the count exposed as "小計".
df_pts = df_pcr.loc[:, ["日付", "陽性確認"]].rename(columns={"陽性確認": "小計"})

In [35]:
# Payload: one {"日付", "小計"} record per row plus the update stamp.
patients_summary = {
    "data": df_pts.to_dict(orient="records"),
    "last_update": last_update.isoformat(),
}

In [36]:
# Check the payload against its schema, then write it out with dumps_json.
jsonschema.validate(patients_summary, PATIENTS_SUMMARY_SCHEMA)

dumps_json("patients_summary.json", patients_summary)

## kanjya.xlsx

In [37]:
# Download the patient status page and parse it.
p = get_file(KANJA_HTML, DOWNLOAD_DIR)

# read_text opens and closes the file itself, so no file handle is left open.
soup = BeautifulSoup(p.read_text(encoding="utf-8"), "html.parser")

In [38]:
# First anchor carrying the "icon_excel" class.
tag = soup.find("a", class_="icon_excel")

# Fail with a clear message if the page layout changed: without an href,
# urljoin would return the page URL and the HTML would be fetched as a workbook.
if tag is None or not tag.get("href"):
    raise RuntimeError(f"No Excel link (a.icon_excel) found on {KANJA_HTML}")

# The href may be relative, so resolve it against the page URL.
link = urljoin(KANJA_HTML, tag.get("href"))

In [39]:
# Download the workbook behind the scraped link.
kanja_path = get_file(link, DOWNLOAD_DIR)

In [40]:
# Read the sheet without a header row, skipping the first 3 rows and dropping
# columns that are entirely empty.
df_head = pd.read_excel(kanja_path, header=None, skiprows=3).dropna(how="all", axis=1)

# The column titles are spread over the first two rows: join them per column.
df_head.columns = ["".join(i).strip() for i in df_head.head(2).fillna("").T.values]
# Everything below those two title rows is data.
df_tmp = df_head.iloc[2:, :].copy().reset_index(drop=True)

# Keep only rows that carry a value in "番号".
df_kanja = df_tmp[df_tmp["番号"].notnull()].copy()

df_kanja.dropna(how="all", axis=1, inplace=True)

# Remove line breaks embedded in the column names.
df_kanja.columns = df_kanja.columns.map(lambda s: s.replace("\n", ""))

df_kanja["番号"] = df_kanja["番号"].astype(int)
df_kanja["年代"] = df_kanja["年代"].astype(str)
# Normalise the age labels with regex replacements: variants of "under 10",
# "90" -> "90歳以上", and a trailing 10..80 gets "代" appended.
df_kanja["年代"] = df_kanja["年代"].replace({"10?[歳代]未満": "10歳未満", "90": "90歳以上", "([1-8]0$)": r"\1代"}, regex=True)

# "発表日" holds two kinds of values: all-digit entries are converted as day
# counts from 1899-12-30 (presumably Excel serial dates — confirm), the rest
# are parsed as date strings.
flg_is_serial = df_kanja["発表日"].astype("str").str.isdigit()

fromSerial = pd.to_datetime(df_kanja.loc[flg_is_serial, "発表日"].astype(float), unit="D", origin=pd.Timestamp("1899/12/30"))
fromString = pd.to_datetime(df_kanja.loc[~flg_is_serial, "発表日"])

# Both parts keep their original index, so the assignment realigns them by row.
df_kanja["発表日"] = pd.concat([fromString, fromSerial])

df_kanja["備考欄"] = df_kanja["備考欄"].str.replace("\n", "")

df_kanja.set_index("番号", inplace=True)

In [41]:
# Missing number: entry 738 is removed from the table.
# errors="ignore" keeps the cell re-runnable (a second run would otherwise
# raise KeyError) and tolerates the row disappearing from the source.
df_kanja = df_kanja.drop(738, errors="ignore")

In [42]:
# Check the residence values: show how often each one occurs.
df_kanja["居住地"].value_counts()

神戸市             311
西宮市             133
調査中              97
尼崎市              94
伊丹健康福祉事務所管内      81
姫路市              64
宝塚市              49
芦屋市              39
伊丹市              36
明石市              26
加古川市             22
川西市              18
宝塚健康福祉事務所管内      17
三田市              17
加東健康福祉事務所管内      13
神戸市外             12
高砂市              10
淡路市               9
稲美町               7
西宮市外              7
大阪市               6
洲本健康福祉事務所管内       4
芦屋健康福祉事務所管内       4
大阪府               3
赤穂健康福祉事務所管内       3
三木市               2
赤穂市               2
龍野健康福祉事務所管内       2
西脇市               2
中播磨健康福祉事務所管内      2
東京都               2
加西市               2
丹波健康福祉事務所管内       2
福崎町               1
神戸市内              1
播磨町               1
加古川健康事務所管内        1
猪名川町              1
丹波市               1
会社員               1
Name: 居住地, dtype: int64

In [43]:
# Residence values that neither end with one of the listed place suffixes nor
# equal "調査中".
nlist = df_kanja.loc[~df_kanja["居住地"].str.endswith(("都", "府", "県", "市", "町", "市内", "市外", "事務所管内")) & ~(df_kanja["居住地"] == "調査中"), "居住地"].unique()

In [44]:
# Show the rows whose residence value is in nlist.
df_kanja[df_kanja["居住地"].isin(nlist)]

Unnamed: 0_level_0,発表日,年代,性別,管轄,居住地,職業,発症日,渡航歴,備考欄,認定こども園,北播磨総合医療センター,宝塚第一病院,仁恵病院,神戸市中央市民病院,神戸赤十字病院,神戸西警察署,神戸市環境局,グリーンアルス関係,介護保険通所事業所,ライブ関係,海外渡航関係,その他,行動歴調査中,特定できず
番号,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
829,2020-07-19,20代,女性,尼崎,会社員,尼崎市,44025,なし,行動歴調査中,,,,,,,,,,,,,,○,


In [45]:
# Where the residence is in nlist and "職業" ends with a place suffix, take the
# residence from "職業"; other rows keep their residence.
s = df_kanja["居住地"].mask(df_kanja["居住地"].isin(nlist) & df_kanja["職業"].str.endswith(("都", "府", "県", "市", "町", "市内", "市外", "事務所管内")), df_kanja["職業"])

In [46]:
# Copy the residence value into "職業" for every row whose residence is in nlist.
# NOTE(review): unlike the mask used to build `s`, this one does not require
# "職業" to end with a place suffix — confirm that overwriting it is intended.
df_kanja["職業"] = df_kanja["職業"].mask(df_kanja["居住地"].isin(nlist), df_kanja["居住地"])

In [47]:
# Store the corrected residence values.
df_kanja["居住地"] = s

In [48]:
# Dump the cleaned table as a tab-separated file (relative path "kanja.tsv").
df_kanja.to_csv("kanja.tsv", sep="\t")

In [49]:
# Positive patients (cumulative): number of rows in the cleaned table.

len(df_kanja)

1105

In [50]:
# Positive patients (per day).
# NOTE: the code below is disabled by wrapping it in a string literal; the cell
# only evaluates to that string and assigns nothing.
"""
df_pts = (
    df_kanja["発表日"]
    .value_counts()
    .sort_index()
    .asfreq("D", fill_value=0)
    .reset_index()
)

df_pts["日付"] = df_pts["index"].dt.strftime("%Y-%m-%d")

df_pts.rename(columns={"発表日": "小計"}, inplace=True)

df_pts.drop("index", axis=1, inplace=True)

df_pts
"""

'\ndf_pts = (\n    df_kanja["発表日"]\n    .value_counts()\n    .sort_index()\n    .asfreq("D", fill_value=0)\n    .reset_index()\n)\n\ndf_pts["日付"] = df_pts["index"].dt.strftime("%Y-%m-%d")\n\ndf_pts.rename(columns={"発表日": "小計"}, inplace=True)\n\ndf_pts.drop("index", axis=1, inplace=True)\n\ndf_pts\n'

In [51]:
# NOTE: disabled code kept as a string literal; nothing is assigned here.
"""
patients_summary = {
    "data": df_pts.to_dict(orient="records"),
    "last_update": last_update.strftime("%Y-%m-%d %H:%M"),
}
"""

'\npatients_summary = {\n    "data": df_pts.to_dict(orient="records"),\n    "last_update": last_update.strftime("%Y-%m-%d %H:%M"),\n}\n'

In [52]:
# jsonschema.validate(patients_summary, PATIENTS_SUMMARY_SCHEMA)

# dumps_json("patients_summary.json", patients_summary)

In [53]:
# Positive patient information: one record per patient.

df_pt = df_kanja.loc[:, ["発表日", "居住地", "年代", "性別", "備考欄"]].sort_index().reset_index()

# Every record carries an explicit null for "退院".
df_pt["退院"] = None

# The announcement date in two forms: date-only and ISO 8601 with UTC offset.
df_pt["date"] = df_pt["発表日"].dt.strftime("%Y-%m-%d")
df_pt["リリース日"] = df_pt["発表日"].apply(lambda d: pd.Timestamp(d, tz='Asia/Tokyo').isoformat())

# dayofweek is 0 for Monday, which lines up with the order of this list.
week = ["月", "火", "水", "木", "金", "土", "日"]

df_pt["曜日"] = df_pt["発表日"].dt.dayofweek.apply(lambda x: week[x])

# Normalise the spellings of "No." in the remarks. The pattern is an explicit
# regular expression: recent pandas treats the pattern as a literal string by
# default, and the "." is matched literally so it cannot swallow the character
# that follows "NO"/"N0".
df_pt["備考欄"] = df_pt["備考欄"].str.replace(r"N[O0][.,]|No,", "No.", regex=True)
df_pt["備考欄"] = df_pt["備考欄"].str.replace("・", "、", regex=False)
df_pt["備考欄"] = df_pt["備考欄"].fillna("")
df_pt.rename(columns={"番号": "No", "備考欄": "備考"}, inplace=True)

df_pt.drop("発表日", axis=1, inplace=True)

df_pt

Unnamed: 0,No,居住地,年代,性別,備考,退院,date,リリース日,曜日
0,1,西宮市,40代,男性,特定できず,,2020-03-01,2020-03-01T00:00:00+09:00,日
1,2,神戸市,40代,男性,感染経路確認済,,2020-03-03,2020-03-03T00:00:00+09:00,火
2,3,神戸市,40代,女性,2/15、16に大阪のライブハウスArcに参加,,2020-03-03,2020-03-03T00:00:00+09:00,火
3,4,福崎町,50代,女性,2/15、16に大阪のライブハウスArcに参加,,2020-03-05,2020-03-05T00:00:00+09:00,木
4,5,姫路市,40代,男性,2/19に大阪のSoap operaライブに参加,,2020-03-06,2020-03-06T00:00:00+09:00,金
...,...,...,...,...,...,...,...,...,...
1100,1102,加東健康福祉事務所管内,10歳未満,男性,陽性患者の濃厚接触者,,2020-07-29,2020-07-29T00:00:00+09:00,水
1101,1103,加東健康福祉事務所管内,10歳未満,男性,陽性患者の濃厚接触者,,2020-07-29,2020-07-29T00:00:00+09:00,水
1102,1104,加東健康福祉事務所管内,20代,男性,陽性患者の濃厚接触者,,2020-07-29,2020-07-29T00:00:00+09:00,水
1103,1105,龍野健康福祉事務所管内,40代,男性,行動歴調査中,,2020-07-29,2020-07-29T00:00:00+09:00,水


In [54]:
# Payload: one record per patient plus the update stamp.
patients = {
    "data": df_pt.to_dict(orient="records"),
    "last_update": last_update.isoformat(),
}

In [55]:
# Check the payload against its schema, then write it out with dumps_json.
jsonschema.validate(patients, PATIENTS_SCHEMA)

dumps_json("patients.json", patients)

In [56]:
# Aggregate by age group

# Fixed output order of the age groups; groups without patients get a 0.
age_list = ["10歳未満","10代", "20代", "30代", "40代", "50代", "60代", "70代", "80代", "90歳以上", "非公表"]

df_age = (
    df_kanja["年代"]
    .value_counts()
    .sort_index()
    .reindex(age_list, fill_value=0)
    .astype(int)
)

In [57]:
# Payload: count per age group plus the update stamp.
age = {
    "data": df_age.to_dict(),
    "last_update": last_update.isoformat(),
}

In [58]:
# Check the payload against its schema, then write it out with dumps_json.
jsonschema.validate(age, AGE_SCHEMA)

dumps_json("age.json", age)

In [59]:
# Count patients per announcement date and age group, columns in age_list order.
df_ages = pd.crosstab(df_kanja["発表日"], df_kanja["年代"]).reindex(
    age_list, axis=1, fill_value=0
)

# Add an all-zero row for the last index value of df_pcr when it is missing,
# so the daily series below extends to that date.
if df_pcr.index[-1] not in df_ages.index:
    df_ages.loc[df_pcr.index[-1]] = 0

df_ages = df_ages.astype(int)
df_ages.sort_index(inplace=True)

# Insert zero rows for the calendar days that have no entry.
df_agesum = df_ages.asfreq("D", fill_value=0)

df_agesum

年代,10歳未満,10代,20代,30代,40代,50代,60代,70代,80代,90歳以上,非公表
発表日,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2020-03-01,0,0,0,0,1,0,0,0,0,0,0
2020-03-02,0,0,0,0,0,0,0,0,0,0,0
2020-03-03,0,0,0,0,2,0,0,0,0,0,0
2020-03-04,0,0,0,0,0,0,0,0,0,0,0
2020-03-05,0,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
2020-07-25,2,5,7,2,2,3,2,0,1,0,0
2020-07-26,0,5,17,4,7,3,6,3,3,0,1
2020-07-27,0,0,3,4,2,1,2,0,0,0,0
2020-07-28,0,2,6,11,6,4,1,2,1,0,0


In [60]:
# One "month/day" label per row (rebinds the notebook-level name `labels`).
labels = df_agesum.index.map(lambda d: f"{d.month}/{d.day}")

In [61]:
# Payload: one list of daily counts per age group, the labels and the update stamp.
age_summary = {
    "data": df_agesum.to_dict(orient="list"),
    "labels": labels.tolist(),
    "last_update": last_update.isoformat(),
}

In [62]:
# Check the payload against its schema, then write it out with dumps_json.
jsonschema.validate(age_summary, AGE_SUMMARY_SCHEMA)

dumps_json("age_summary.json", age_summary)

In [63]:
# Cluster summary

# Number of non-empty cells in each cluster column.
cluster_flags = df_kanja.loc[:, "認定こども園":"特定できず"].notnull()
df_cluster_sum = cluster_flags.sum()

In [64]:
# Payload: total per cluster column plus the update stamp.
clusters_summary = {
    "data": df_cluster_sum.to_dict(),
    "last_update": last_update.isoformat(),
}

In [65]:
# Check the payload against its schema, then write it out with dumps_json.
jsonschema.validate(clusters_summary, CLUSTERS_SUMMARY_SCHEMA)

dumps_json("clusters_summary.json", clusters_summary)

In [66]:
# Clusters

# The cluster columns, with empty cells replaced by 0.
df_clusters = df_kanja.loc[:, "認定こども園":"特定できず"].copy().fillna(0)

In [67]:
# Any non-zero cell becomes 1.
df_clusters[df_clusters != 0] = 1

In [68]:
# Attach the announcement date so the flags can be grouped by it.
df_clusters["発表日"] = df_kanja["発表日"]

In [69]:
# Sum the flags per announcement date.
pv_clusters = df_clusters.pivot_table(index="発表日", aggfunc="sum")

In [70]:
# Add an all-zero row for the last index value of df_pcr when it is missing.
if df_pcr.index[-1] not in pv_clusters.index:
    pv_clusters.loc[df_pcr.index[-1]] = 0

pv_clusters.sort_index(inplace=True)

In [71]:
# Insert zero rows for the calendar days that have no entry.
pv_clusters = pv_clusters.asfreq("D", fill_value=0)

# ISO 8601 timestamp in the Asia/Tokyo zone for every index value.
pv_clusters["日付"] = pv_clusters.index.map(lambda d: pd.Timestamp(d, tz='Asia/Tokyo').isoformat())

In [72]:
# Payload: one record per day plus the update stamp.
# The orient is spelled out as "records"; the misspelling "recodes" is rejected
# by current pandas releases.
clusters = {
    "data": pv_clusters.to_dict(orient="records"),
    "last_update": last_update.isoformat(),
}

In [73]:
# Check the payload against its schema, then write it out with dumps_json.
jsonschema.validate(clusters, CLUSTERS_SCHEMA)
dumps_json("clusters.json", clusters)

In [74]:
# Duplicates: rows with more than one non-empty cell from "認定こども園" onwards.

df_kanja.loc[:, "認定こども園":].notnull().sum(axis=1).gt(1).sum()

4

## yousei.xlsx

In [75]:
# Download the yousei workbook into DOWNLOAD_DIR; get_file returns the local path.
yousei_path = get_file(YOUSEI_XLSX, DOWNLOAD_DIR)

In [76]:
# Load the sheet indexed by the announcement date.
df_yousei = pd.read_excel(yousei_path, index_col="発表年月日")

# Strip the "(cumulative)" suffix and surrounding whitespace from the headers.
df_yousei.columns = df_yousei.columns.map(lambda s: s.replace("（累計）", "").strip())

# df_yousei.index += pd.to_timedelta("1 days")

# Shorten three headers and discard the announcement-time column.
df_yousei = df_yousei.rename(
    columns={"入院中（宿泊療養を含む）": "入院中", "中等症以下": "軽症・中等症", "陽性者数": "陽性患者数"}
).drop(columns="発表時間")

df_yousei

Unnamed: 0_level_0,検査実施人数,陽性患者数,入院中,軽症・中等症,重症,死亡,退院
発表年月日,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-03-10,444,25,24,20,4,0,1
2020-03-11,540,37,35,31,4,1,1
2020-03-12,650,46,44,40,4,1,1
2020-03-13,744,56,54,50,4,1,1
2020-03-14,824,67,64,57,7,1,2
...,...,...,...,...,...,...,...
2020-07-25,23180,965,124,123,1,45,796
2020-07-26,23551,1014,158,154,4,45,811
2020-07-27,24011,1026,170,166,4,45,811
2020-07-28,24520,1059,203,195,8,45,811


In [77]:
# Last row of the table as a plain dict keyed by column name.
d = df_yousei.iloc[-1].to_dict()

In [78]:
# Display the values of that last row.
d

{'入院中': 204,
 '検査実施人数': 25096,
 '死亡': 45,
 '軽症・中等症': 198,
 '退院': 905,
 '重症': 6,
 '陽性患者数': 1105}

In [79]:
# Nested summary tree built from the values in `d`: tested -> positive ->
# (hospitalised -> mild-or-moderate / severe), deceased, discharged.
main_summary = {
    "attr": "検査実施人数",
    "value": d["検査実施人数"],
    "children": [
        {
            "attr": "陽性患者数",
            "value": d["陽性患者数"],
            "children": [
                {
                    "attr": "入院中",
                    "value": d["入院中"],
                    "children": [
                        {"attr": "軽症・中等症", "value": d["軽症・中等症"]},
                        {"attr": "重症", "value": d["重症"]},
                    ],
                },
                {"attr": "死亡", "value": d["死亡"]},
                {"attr": "退院", "value": d["退院"]},
            ],
        }
    ],
    "last_update": last_update.isoformat(),
}

In [80]:
# Check the payload against its schema, then write it out with dumps_json.
jsonschema.validate(main_summary, MAIN_SUMMARY_SCHEMA)

dumps_json("main_summary.json", main_summary)

In [81]:
# Align the "入院中" column with the index of df_pcr; index values missing from
# df_yousei become NaN.
ser_cur = df_yousei["入院中"].reindex(df_pcr.index)

In [82]:
# Fill the NaN gaps with the running total of confirmed positives, then take
# the row-to-row difference; the first row (no predecessor) becomes 0.
df_current = pd.DataFrame({"小計": ser_cur.combine_first(df_pcr["陽性確認"].cumsum())}).diff().fillna(0).astype(int)

In [83]:
# ISO 8601 timestamp in the Asia/Tokyo zone for every index value.
df_current["日付"] = df_current.index.map(lambda d: pd.Timestamp(d, tz='Asia/Tokyo').isoformat())

In [84]:
# Copy of the two output columns, date first.
df_cur_pts = df_current.loc[:, ["日付", "小計"]].copy()

In [85]:
# Payload: one {"日付", "小計"} record per row plus the update stamp.
current_patients = {
    "data": df_cur_pts.to_dict(orient="records"),
    "last_update": last_update.isoformat(),
}

In [86]:
# This payload is checked against the patients-summary schema, then written out.
jsonschema.validate(current_patients, PATIENTS_SUMMARY_SCHEMA)

dumps_json("current_patients.json", current_patients)

# チェック

In [87]:
# Compare the inspection payloads with their published counterparts.
for _fn, _payload in (
    ("inspections.json", inspections),
    ("inspections_summary.json", inspections_summary),
):
    json_check(_fn, _payload)

inspections.json
inspections_summary.json


In [88]:
# Compare the age payloads with their published counterparts.
for _fn, _payload in (
    ("age.json", age),
    ("age_summary.json", age_summary),
):
    json_check(_fn, _payload)

age.json
age_summary.json


In [89]:
# Compare the patient payloads with their published counterparts.
for _fn, _payload in (
    ("patients_summary.json", patients_summary),
    ("patients.json", patients),
    ("current_patients.json", current_patients),
):
    json_check(_fn, _payload)

patients_summary.json
patients.json
current_patients.json


In [90]:
# Compare the cluster payloads with their published counterparts.
for _fn, _payload in (
    ("clusters.json", clusters),
    ("clusters_summary.json", clusters_summary),
):
    json_check(_fn, _payload)

clusters.json
clusters_summary.json


In [91]:
# Compare the main summary with its published counterpart.
json_check("main_summary.json", main_summary)

main_summary.json
