In [1]:
import time
import json
import logging
import requests
import pandas as pd
from pprint import pprint
from bs4 import BeautifulSoup

In [2]:
LOG_FILENAME = "lastlog.log"
JSON_FILENAME = "ncov_{ctime}.json".format(ctime=time.strftime('%Y%m%d-%H%M%S'))
LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
with open(LOG_FILENAME, 'w') as f:
    f.writelines([LOG_FORMAT, '\n'])
logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG, format=LOG_FORMAT)

logging.debug("This is a debug log.")
logging.info("This is a info log.")
logging.warning("This is a warning log.")
logging.error("This is a error log.")
logging.critical("This is a critical log.")

In [3]:
response = requests.get("https://ncov.dxy.cn/ncovh5/view/pneumonia")
HTML = response.text.encode(response.encoding).decode()
soup = BeautifulSoup(HTML)
soup.prettify

<bound method Tag.prettify of <!DOCTYPE html>
<html lang="zh-cn" xmlns:layout="http://www.ultraq.net.nz/web/thymeleaf/layout"><head>
<link href="//assets.dxycdn.com/gitrepo/ncov-mobile/dist/umi.bundle.css?t=1580864327599" rel="stylesheet"/>
<meta charset="utf-8"/>
<meta content="width=device-width,initial-scale=1,user-scalable=0,viewport-fit=cover" name="viewport"/>
<meta content="#000000" name="theme-color"/>
<title>全国新型肺炎疫情实时动态 - 丁香园·丁香医生</title>
<script>
        window.routerBase = "/ncovh5/view";
    </script>
<script charset="utf-8" src="//assets.dxycdn.com/gitrepo/ncov-mobile/dist/vendors~p__Pneumonia~p__Pneumonia__recommend-list~p__Pneumonia__rumor-list~p__Pneumonia__timeline.async.5ccb549d.js"></script><script charset="utf-8" src="//assets.dxycdn.com/gitrepo/ncov-mobile/dist/vendors~p__Pneumonia~p__Pneumonia__rumor-list.async.9184546f.js"></script><link href="//assets.dxycdn.com/gitrepo/ncov-mobile/dist/vendors~p__Pneumonia.async.9f1a94c3.css" rel="stylesheet" type="text/css"/>

In [4]:
text = soup.contents[1].text
def parse_text(text):
    subtext = text[text.find("window.getAreaStat")::]
    subtext = subtext[subtext.find("[{")::]
    # print(subtext)
    num = 0
    chars = []
    for c in subtext:
        chars.append(c)
        if c == '[':
            num += 1
        if c == ']':
            num -= 1
        if num == 0:
            break
    return ''.join(chars)
counting_json = json.loads(parse_text(text))
print(counting_json)

[{'provinceName': '湖北省', 'provinceShortName': '湖北', 'confirmedCount': 16678, 'suspectedCount': 0, 'curedCount': 522, 'deadCount': 479, 'comment': '', 'locationId': 420000, 'cities': [{'cityName': '武汉', 'confirmedCount': 8351, 'suspectedCount': 0, 'curedCount': 371, 'deadCount': 362, 'locationId': 420100}, {'cityName': '黄冈', 'confirmedCount': 1645, 'suspectedCount': 0, 'curedCount': 52, 'deadCount': 25, 'locationId': 421100}, {'cityName': '孝感', 'confirmedCount': 1462, 'suspectedCount': 0, 'curedCount': 6, 'deadCount': 18, 'locationId': 420900}, {'cityName': '襄阳', 'confirmedCount': 735, 'suspectedCount': 0, 'curedCount': 7, 'deadCount': 2, 'locationId': 420600}, {'cityName': '荆州', 'confirmedCount': 713, 'suspectedCount': 0, 'curedCount': 10, 'deadCount': 9, 'locationId': 421000}, {'cityName': '随州', 'confirmedCount': 706, 'suspectedCount': 0, 'curedCount': 9, 'deadCount': 8, 'locationId': 421300}, {'cityName': '黄石', 'confirmedCount': 509, 'suspectedCount': 0, 'curedCount': 17, 'deadCount'

In [5]:
country_df = pd.read_json(json.dumps(counting_json))
country_df.to_json(open(JSON_FILENAME, 'w'))
country_df

Unnamed: 0,provinceName,provinceShortName,confirmedCount,suspectedCount,curedCount,deadCount,comment,locationId,cities
0,湖北省,湖北,16678,0,522,479,,420000,"[{'cityName': '武汉', 'confirmedCount': 8351, 's..."
1,浙江省,浙江,829,0,62,0,,330000,"[{'cityName': '温州', 'confirmedCount': 340, 'su..."
2,广东省,广东,813,0,30,0,,440000,"[{'cityName': '深圳', 'confirmedCount': 271, 'su..."
3,河南省,河南,764,0,41,2,待明确地区：治愈 16；参考卫健委统计口径，部分县区与地级市合并,410000,"[{'cityName': '信阳', 'confirmedCount': 138, 'su..."
4,湖南省,湖南,593,0,31,0,,430000,"[{'cityName': '长沙', 'confirmedCount': 148, 'su..."
5,安徽省,安徽,530,0,20,0,,340000,"[{'cityName': '合肥', 'confirmedCount': 93, 'sus..."
6,江西省,江西,476,0,20,0,,360000,"[{'cityName': '南昌', 'confirmedCount': 121, 'su..."
7,重庆市,重庆,366,0,14,2,,500000,"[{'cityName': '万州区', 'confirmedCount': 65, 'su..."
8,江苏省,江苏,308,0,12,0,,320000,"[{'cityName': '苏州', 'confirmedCount': 55, 'sus..."
9,四川省,四川,301,0,23,1,,510000,"[{'cityName': '成都', 'confirmedCount': 92, 'sus..."


In [6]:
provinces_df = dict()
for idx in country_df.index:
    se = country_df.iloc[idx]
    if se['cities']:
        provinces_df[se['provinceName']] = pd.read_json(json.dumps(se['cities']))
pprint(provinces_df)

{'上海市':    cityName  confirmedCount  suspectedCount  curedCount  deadCount  locationId
0    外地来沪人员              84               0           6          1          -1
1      浦东新区              45               0           0          0      310115
2       徐汇区              12               0           0          0      310104
3       宝山区              12               0           0          0      310113
4       静安区              12               0           0          0      310106
5       闵行区              11               0           0          0      310112
6       长宁区               9               0           0          0      310105
7       松江区               9               0           0          0      310117
8       普陀区               6               0           0          0      310107
9       杨浦区               6               0           0          0      310110
10      奉贤区               6               0           0          0      310120
11      虹口区               5               0 

In [7]:
with open(LOG_FILENAME, 'r') as f:
    for e in f.readlines():
        print(e, end='')

%(asctime)s - %(levelname)s - %(message)s
2020-02-05 09:17:22,960 - DEBUG - This is a debug log.
2020-02-05 09:17:22,960 - INFO - This is a info log.
2020-02-05 09:17:22,961 - ERROR - This is a error log.
2020-02-05 09:17:22,961 - CRITICAL - This is a critical log.
2020-02-05 09:17:22,977 - DEBUG - Starting new HTTPS connection (1): ncov.dxy.cn:443
2020-02-05 09:17:23,132 - DEBUG - https://ncov.dxy.cn:443 "GET /ncovh5/view/pneumonia HTTP/1.1" 200 None
