In [1]:
import time
import json
import logging
import requests
import pandas as pd
from pprint import pprint
from bs4 import BeautifulSoup

In [2]:
LOG_FILENAME = "lastlog.log"
JSON_FILENAME = "ncov_{ctime}.json".format(ctime=time.strftime('%Y%m%d-%H%M%S'))
LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
with open(LOG_FILENAME, 'w') as f:
    f.writelines([LOG_FORMAT, '\n'])
logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG, format=LOG_FORMAT)

logging.debug("This is a debug log.")
logging.info("This is a info log.")
logging.warning("This is a warning log.")
logging.error("This is a error log.")
logging.critical("This is a critical log.")

In [3]:
response = requests.get("https://ncov.dxy.cn/ncovh5/view/pneumonia")
HTML = response.text.encode(response.encoding).decode()
soup = BeautifulSoup(HTML)
soup.prettify

<bound method Tag.prettify of <!DOCTYPE html>
<html lang="zh-cn" style="filter: none;" xmlns:layout="http://www.ultraq.net.nz/web/thymeleaf/layout"><head>
<link href="//assets.dxycdn.com/gitrepo/ncov-mobile/dist/umi.bundle.css?t=1581081930848" rel="stylesheet"/>
<meta charset="utf-8"/>
<meta content="width=device-width,initial-scale=1,user-scalable=0,viewport-fit=cover" name="viewport"/>
<meta content="#000000" name="theme-color"/>
<title>全国新型肺炎疫情实时动态 - 丁香园·丁香医生</title>
<script>
        window.routerBase = "/ncovh5/view";
    </script>
<script charset="utf-8" src="//assets.dxycdn.com/gitrepo/ncov-mobile/dist/vendors~p__Pneumonia~p__Pneumonia__recommend-list~p__Pneumonia__rumor-list~p__Pneumonia__timeline.async.5ccb549d.js"></script><script charset="utf-8" src="//assets.dxycdn.com/gitrepo/ncov-mobile/dist/vendors~p__Pneumonia~p__Pneumonia__rumor-list.async.9184546f.js"></script><link href="//assets.dxycdn.com/gitrepo/ncov-mobile/dist/vendors~p__Pneumonia.async.3091a051.css" rel="stylesh

In [4]:
text = soup.contents[1].text
def parse_text(text):
    subtext = text[text.find("window.getAreaStat")::]
    subtext = subtext[subtext.find("[{")::]
    # print(subtext)
    num = 0
    chars = []
    for c in subtext:
        chars.append(c)
        if c == '[':
            num += 1
        if c == ']':
            num -= 1
        if num == 0:
            break
    return ''.join(chars)
counting_json = json.loads(parse_text(text))
print(counting_json)

[{'provinceName': '湖北省', 'provinceShortName': '湖北', 'confirmedCount': 24953, 'suspectedCount': 0, 'curedCount': 1115, 'deadCount': 699, 'comment': '', 'locationId': 420000, 'cities': [{'cityName': '武汉', 'confirmedCount': 13603, 'suspectedCount': 0, 'curedCount': 698, 'deadCount': 545, 'locationId': 420100}, {'cityName': '孝感', 'confirmedCount': 2313, 'suspectedCount': 0, 'curedCount': 38, 'deadCount': 26, 'locationId': 420900}, {'cityName': '黄冈', 'confirmedCount': 2041, 'suspectedCount': 0, 'curedCount': 105, 'deadCount': 36, 'locationId': 421100}, {'cityName': '随州', 'confirmedCount': 953, 'suspectedCount': 0, 'curedCount': 23, 'deadCount': 9, 'locationId': 421300}, {'cityName': '荆州', 'confirmedCount': 941, 'suspectedCount': 0, 'curedCount': 30, 'deadCount': 11, 'locationId': 421000}, {'cityName': '襄阳', 'confirmedCount': 907, 'suspectedCount': 0, 'curedCount': 31, 'deadCount': 5, 'locationId': 420600}, {'cityName': '黄石', 'confirmedCount': 703, 'suspectedCount': 0, 'curedCount': 43, 'dea

In [5]:
country_df = pd.read_json(json.dumps(counting_json))
country_df.to_json(open(JSON_FILENAME, 'w'))
country_df

Unnamed: 0,provinceName,provinceShortName,confirmedCount,suspectedCount,curedCount,deadCount,comment,locationId,cities
0,湖北省,湖北,24953,0,1115,699,,420000,"[{'cityName': '武汉', 'confirmedCount': 13603, '..."
1,浙江省,浙江,1048,0,127,0,,330000,"[{'cityName': '温州', 'confirmedCount': 438, 'su..."
2,广东省,广东,1034,0,88,1,,440000,"[{'cityName': '深圳', 'confirmedCount': 339, 'su..."
3,河南省,河南,914,0,86,3,参考卫健委统计口径，部分县区与地级市合并,410000,"[{'cityName': '信阳', 'confirmedCount': 176, 'su..."
4,湖南省,湖南,772,0,112,0,,430000,"[{'cityName': '长沙', 'confirmedCount': 190, 'su..."
5,安徽省,安徽,733,0,47,0,,340000,"[{'cityName': '合肥', 'confirmedCount': 128, 'su..."
6,江西省,江西,698,0,55,0,,360000,"[{'cityName': '南昌', 'confirmedCount': 168, 'su..."
7,江苏省,江苏,439,0,43,0,,320000,"[{'cityName': '苏州', 'confirmedCount': 72, 'sus..."
8,重庆市,重庆,426,0,31,2,,500000,"[{'cityName': '万州区', 'confirmedCount': 78, 'su..."
9,山东省,山东,407,0,38,0,,370000,"[{'cityName': '青岛', 'confirmedCount': 46, 'sus..."


In [6]:
provinces_df = dict()
for idx in country_df.index:
    se = country_df.iloc[idx]
    if se['cities']:
        provinces_df[se['provinceName']] = pd.read_json(json.dumps(se['cities']))
pprint(provinces_df)

{'上海市':    cityName  confirmedCount  suspectedCount  curedCount  deadCount  locationId
0    外地来沪人员              94               0          14          1          -1
1      浦东新区              52               0           6          0      310115
2       徐汇区              16               0           0          0      310104
3       宝山区              16               0           0          0      310113
4       静安区              15               0           2          0      310106
5       松江区              13               0           0          0      310117
6       闵行区              12               0           1          0      310112
7       长宁区              11               0           0          0      310105
8       奉贤区               9               0           0          0      310120
9       虹口区               7               0           0          0      310109
10      杨浦区               7               0           0          0      310110
11      普陀区               7               0 