In [None]:
### 第二步：获取资源

### 2.1 在线下载Github上BlankerL整合的疫情数据资源

    - 每日更新，最新的疫情数据  
    - requests库进行下载  
    - tqdm库下载进度条展示  

In [4]:
# 导包
import requests as requests
import datetime
from tqdm import tqdm # 显示进度条， 不同的终端中可能加载显示的方式会有点不一样


In [None]:
# DXYArea.csv每日3-5:00am更新，获取当日的下载链接（且当日的新生成的数据作者有帮我们做js-proxy加速下载）
StrToday = datetime.datetime.today().strftime('%Y.%m.%d')
url = "https://github.91chifun.workers.dev//https://github.com/BlankerL/DXY-COVID-19-Data/releases/download/"+StrToday+"/DXYArea.csv"       # 最新疫情文件文件地址
# url = "https://github.91chifun.workers.dev//https://github.com/BlankerL/DXY-COVID-19-Data/releases/download/2021.01.13/DXYRumors.csv"     # 用于测试脚本是否可行，能下载
# url = "https://img.ithome.com/newsuploadfiles/2020/10/20201010_100849_225.jpg"                                                            # 用于测试脚本是否可行，能下载

res = requests.get(url, timeout = 5000, stream=True)            # 大文件以流的模式进行下载
file_size = int(res.headers.get('Content-Length'))              # 获取视频的总大小
res.encoding="UTF-8"                                            # 直接用requests 返回的Response对象的encoding属性调整编码 
pbar = tqdm(total=file_size)                                    # 设置进度条的长度
with open("AAA.csv","wb") as f:                                 # wb：以二进制方式写入文件
    for chunk in res.iter_content(1024 * 1024):                 # 变量名定义成：chunk，是因为采用HTTP协议chunk编码(分块传输编码)
        f.write(chunk)                                          # r.content：以二进制方式读取文件
        pbar.set_description('Downloading......')               
        pbar.update(1024 * 1024)                                # 更新进度条长度
    pbar.set_description('Download completed!!!')               # 下载完提示
    pbar.close()                                                # 下载完，关闭pbar，不关闭下载打印进度条会出现：嵌套打印

In [None]:
### <font color=green>为了演示方便，本地已经下载好了，直接使用pandas库读取csv文件</font>


In [1]:
import pandas as pd
df = pd.read_csv("\GitHub\Visualization-and-Analysis-of-COVID-19-Global-Epidemic\Raw_data\DXYArea.csv")

In [2]:
# 查看数据情况
df.info()

# 重新确定一下缺失值的列
df.isnull().any()

# 获取含有空值的行数据
df[df.isnull().T.any()]

df.describe()
# 查看数学统计中，一些值是否正常，比如最小值的正负性合理否？

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 410436 entries, 0 to 410435
Data columns (total 19 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   continentName            410392 non-null  object 
 1   continentEnglishName     410392 non-null  object 
 2   countryName              410436 non-null  object 
 3   countryEnglishName       385806 non-null  object 
 4   provinceName             410436 non-null  object 
 5   provinceEnglishName      385806 non-null  object 
 6   province_zipCode         410436 non-null  int64  
 7   province_confirmedCount  410436 non-null  int64  
 8   province_suspectedCount  410433 non-null  float64
 9   province_curedCount      410436 non-null  int64  
 10  province_deadCount       410436 non-null  int64  
 11  cityName                 120031 non-null  object 
 12  cityEnglishName          114970 non-null  object 
 13  city_zipCode             118765 non-null  float64
 14  city

In [2]:
# updateTime的类型是object类型，先对其进行转换
df['updateTime'] = pd.to_datetime(df['updateTime'],format='%Y-%m-%d')
# 修改updateTime的时间格式，normalize和date输出的格式都是2020-01-01，无法良好的剔除年份
# 但是strftime性能极差，33w条数据用了2.8秒，normalize()函数只需要0.04秒，date需要0.15秒，但是都保留了年份
df['updateTime'] = df['updateTime'].dt.normalize()
# df['updateTime'] = df['updateTime'].dt.date
# df['updateTime'] = df['updateTime'].apply(lambda x : x.strftime('%m-%d'))

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 337329 entries, 0 to 337328
Data columns (total 19 columns):
 #   Column                   Non-Null Count   Dtype         
---  ------                   --------------   -----         
 0   continentName            337285 non-null  object        
 1   continentEnglishName     337285 non-null  object        
 2   countryName              337329 non-null  object        
 3   countryEnglishName       318863 non-null  object        
 4   provinceName             337329 non-null  object        
 5   provinceEnglishName      318863 non-null  object        
 6   province_zipCode         337329 non-null  int64         
 7   province_confirmedCount  337329 non-null  int64         
 8   province_suspectedCount  337326 non-null  float64       
 9   province_curedCount      337329 non-null  int64         
 10  province_deadCount       337329 non-null  int64         
 11  updateTime               337329 non-null  datetime64[ns]
 12  cityName        

In [3]:
# 数据清洗
# 根据疫情每日更新时间，和省份这两列来判断重复值，并删除重复值，仅保留第一列数据
df.drop_duplicates(subset=['updateTime','provinceName'],keep='first',inplace=True)
df

Unnamed: 0,continentName,continentEnglishName,countryName,countryEnglishName,provinceName,provinceEnglishName,province_zipCode,province_confirmedCount,province_suspectedCount,province_curedCount,province_deadCount,updateTime,cityName,cityEnglishName,city_zipCode,city_confirmedCount,city_suspectedCount,city_curedCount,city_deadCount
0,北美洲,North America,美国,United States of America,美国,United States of America,971002,8338091,0.0,3323354,222210,2020-10-22,,,,,,,
1,欧洲,Europe,法国,France,法国,France,961002,957421,0.0,107652,34048,2020-10-22,,,,,,,
2,亚洲,Asia,印度,India,印度,India,953003,7706946,0.0,6795103,116616,2020-10-22,,,,,,,
3,欧洲,Europe,英国,United Kingdom,英国,United Kingdom,961007,789229,0.0,539,44158,2020-10-22,,,,,,,
4,南美洲,South America,巴西,Brazil,巴西,Brazil,973003,5300649,0.0,4721593,155459,2020-10-22,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
337311,亚洲,Asia,中国,China,云南省,Yunnan,530000,1,0.0,0,0,2020-01-22,,,,,,,
337319,亚洲,Asia,中国,China,吉林省,Jilin,220000,0,1.0,0,0,2020-01-22,,,,,,,
337325,亚洲,Asia,中国,China,台湾,Taiwan,710000,1,0.0,0,0,2020-01-22,,,,,,,
337326,亚洲,Asia,中国,Hongkong,香港,Hongkong,810000,0,117.0,0,0,2020-01-22,,,,,,,


In [4]:
# 查看需要补全英文名的国家，绘制疫情地图时需要，一共需要收集补全34个国家英文名
df1 = df[df['provinceEnglishName'].isnull().values==True]
df1.drop_duplicates(subset=['provinceName'],keep='first',inplace=True)
# df1['provinceName'],df1.shape[0]
# 将其导入到另一个数组中
df1_provinceName = df1['provinceName']
df1_provinceName.reset_index(drop=True,inplace=True)
df1_provinceName

0         吉尔吉斯斯坦
1             黑山
2          瓜德罗普岛
3            阿鲁巴
4            马约特
5         赞比亚共和国
6            佛得角
7            南苏丹
8        美属维尔京群岛
9          刚果（布）
10         荷属圣马丁
11     特克斯和凯科斯群岛
12           百慕大
13       荷兰加勒比地区
14          库拉索岛
15         几内亚比绍
16         也门共和国
17        布隆迪共和国
18           科摩罗
19         厄立特里亚
20     北马里亚纳群岛联邦
21        圣巴泰勒米岛
22       英属维尔京群岛
23          多米尼克
24          格林那达
25           东帝汶
26           格陵兰
27        新喀里多尼亚
28    圣皮埃尔和密克隆群岛
29       圣其茨和尼维斯
30         蒙特塞拉特
31         福克兰群岛
32           安圭拉
33           科索沃
Name: provinceName, dtype: object

In [5]:
df1_provinceEnglishName = pd.Series(['Kyrgyzstan', 'Montenegro', 'Guadeloupe', 'Aruba', 'Mayotte', 'Republic of Zambia', 'Cape Verde', 'South Sudan', 'United States Virgin Islands', 'Congo (Brazzaville)', 'Sint Maarten', 'Turks and Caicos Islands', 'Bermuda', 'Netherlands Caribbean', 'Curacao Island', 'Guinea-Bissau', 'Republic of Yemen', 'Republic of Burundi', 'Comoros', 'Eritrean', 'Saint-Pierre et Miquelon', 'Saint-Barthelemy', 'British Virgin Islands', 'dominica', 'Grinnada', 'Timor-Leste', 'Greenland', 'New Caledonia', 'St. Pierre and Miquelon Islands', 'Saint Kitts and Nevis', 'Montserrat', 'Falklands', 'Anguilla', 'Kosovo'], name= 'provinceEnglishName', index = df1_provinceName)
df1_provinceEnglishName
# for i in range(len(df1_provinceEnglishName)):
#     # df1.iloc[i,3] = df1.iloc[i,5] = df1_provinceEnglishName[df1.iloc[i,4]]
#     print(df1_provinceEnglishName.iloc[0,i])
# df1

# type(df1_provinceEnglishName)
# df2 = pd.concat([df1_provinceName, df1_provinceEnglishName],axis=1)
# df2


provinceName
吉尔吉斯斯坦                             Kyrgyzstan
黑山                                 Montenegro
瓜德罗普岛                              Guadeloupe
阿鲁巴                                     Aruba
马约特                                   Mayotte
赞比亚共和国                     Republic of Zambia
佛得角                                Cape Verde
南苏丹                               South Sudan
美属维尔京群岛          United States Virgin Islands
刚果（布）                     Congo (Brazzaville)
荷属圣马丁                            Sint Maarten
特克斯和凯科斯群岛            Turks and Caicos Islands
百慕大                                   Bermuda
荷兰加勒比地区                 Netherlands Caribbean
库拉索岛                           Curacao Island
几内亚比绍                           Guinea-Bissau
也门共和国                       Republic of Yemen
布隆迪共和国                    Republic of Burundi
科摩罗                                   Comoros
厄立特里亚                                Eritrean
北马里亚纳群岛联邦            Saint-Pierre et Miquelon
圣巴泰勒米岛               

In [6]:
# 查看有多少个countryEnglishName无的数据是空缺值还是空值；
# 空值：在pandas中的空值是""
# 缺失值：在dataframe中为nan或者naT（缺失时间），在series中为none或者nan即可
df1 = df[df['provinceEnglishName'].isnull().values==True]

# 遍历空缺值数组 - 按行数遍历
for i in range(len(df1)):
    df1.iloc[i,3] = df1.iloc[i,5] = df1_provinceEnglishName[df1.iloc[i,4]]
df1

# iloc 性能太差了
# for i in range(len(df1)):
#     df.loc[df1.iloc[i,0],:] = df1.iloc[i,:]

'''
小问题：性能改进
1. 使用Pandas库中自由度和想象力最高的apply函数;
2. intertuples相较于interrow的效率更高，遍历速度更快。

'''
for i,v in df1.iterrows():
    if pd.isnull(df.loc[i,'countryEnglishName']) & pd.isnull(df.loc[i,'provinceEnglishName']):
        df.loc[i,'countryEnglishName'] = df.loc[i,'provinceEnglishName'] = df1.loc[i,'countryEnglishName']

df

# '''
# 注意
# iterrows()遍历上万行数组后，并修改数据，无法保存最后的结果
# iterrows()返回的的只是dataframe的view 而不是copy。所以修改无法保存。
# '''
# # for index,row in df1.iterrows():
# #     # getattr(row,'provinceName')
# #     row['provinceEnglishName'] = df1_provinceEnglishName[row['provinceName']]
# # df1

# '''
# # demo
# test = {'a':[1,20,20],'b':[2,10,10],'c':[4,14,14],'d':[0,0,0]}
# df11= pd.DataFrame(test)
# for index,row in df11.iterrows():
#     row['d'] = row['a']*0.5 + row['b']*0.4
# df11
# '''

Unnamed: 0,continentName,continentEnglishName,countryName,countryEnglishName,provinceName,provinceEnglishName,province_zipCode,province_confirmedCount,province_suspectedCount,province_curedCount,province_deadCount,updateTime,cityName,cityEnglishName,city_zipCode,city_confirmedCount,city_suspectedCount,city_curedCount,city_deadCount
0,北美洲,North America,美国,United States of America,美国,United States of America,971002,8338091,0.0,3323354,222210,2020-10-22,,,,,,,
1,欧洲,Europe,法国,France,法国,France,961002,957421,0.0,107652,34048,2020-10-22,,,,,,,
2,亚洲,Asia,印度,India,印度,India,953003,7706946,0.0,6795103,116616,2020-10-22,,,,,,,
3,欧洲,Europe,英国,United Kingdom,英国,United Kingdom,961007,789229,0.0,539,44158,2020-10-22,,,,,,,
4,南美洲,South America,巴西,Brazil,巴西,Brazil,973003,5300649,0.0,4721593,155459,2020-10-22,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
337311,亚洲,Asia,中国,China,云南省,Yunnan,530000,1,0.0,0,0,2020-01-22,,,,,,,
337319,亚洲,Asia,中国,China,吉林省,Jilin,220000,0,1.0,0,0,2020-01-22,,,,,,,
337325,亚洲,Asia,中国,China,台湾,Taiwan,710000,1,0.0,0,0,2020-01-22,,,,,,,
337326,亚洲,Asia,中国,Hongkong,香港,Hongkong,810000,0,117.0,0,0,2020-01-22,,,,,,,


In [9]:
# 输出Excel文件进行校验
df.to_csv('./out4Test.csv')
# df.to_excel('./cleanedData.xlsx')

# 清洗后的数据，查看空值
df.info()
df.isnull().any()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 47561 entries, 0 to 337327
Data columns (total 19 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   continentName            47523 non-null  object        
 1   continentEnglishName     47523 non-null  object        
 2   countryName              47561 non-null  object        
 3   countryEnglishName       47561 non-null  object        
 4   provinceName             47561 non-null  object        
 5   provinceEnglishName      47561 non-null  object        
 6   province_zipCode         47561 non-null  int64         
 7   province_confirmedCount  47561 non-null  int64         
 8   province_suspectedCount  47560 non-null  float64       
 9   province_curedCount      47561 non-null  int64         
 10  province_deadCount       47561 non-null  int64         
 11  updateTime               47561 non-null  datetime64[ns]
 12  cityName                 3214 n

continentName               True
continentEnglishName        True
countryName                False
countryEnglishName         False
provinceName               False
provinceEnglishName        False
province_zipCode           False
province_confirmedCount    False
province_suspectedCount     True
province_curedCount        False
province_deadCount         False
updateTime                 False
cityName                    True
cityEnglishName             True
city_zipCode                True
city_confirmedCount         True
city_suspectedCount         True
city_curedCount             True
city_deadCount              True
dtype: bool

In [7]:
date_list = df.drop_duplicates(subset=['updateTime'],keep='first')
date_list = date_list[['updateTime']]
date_list = date_list.reset_index(drop=True)


cyENName_list = df.drop_duplicates(subset=['countryEnglishName'],keep='first')
cyENName_list = cyENName_list[['countryEnglishName']].reset_index(drop=True)


In [97]:
'''
pyecharts v0.3.2以后，pyecharts 将不再自带地图 js 文件。如用户需要用到地图图表，可自行安装对应的地图文件包。

地图文件被分成了三个 Python 包，分别为：

全球国家地图: echarts-countries-pypkg (1.9MB)
中国省级地图: echarts-china-provinces-pypkg (730KB)
中国市级地图: echarts-china-cities-pypkg (3.8MB)

直接使用python的pip安装：

选择自己需要的安装的地图
    pip install echarts-countries-pypkg
    pip install echarts-china-provinces-pypkg
    pip install echarts-china-cities-pypkg
    pip install echarts-china-counties-pypkg
    pip install echarts-china-misc-pypkg
    pip install echarts-united-kingdom-pypkg

生成图片：

$ pip install snapshot-selenium
或
$ pip install snapshot-phantomjs
'''

from pyecharts.charts import Map  # 注意这里与老版本pyecharts调用的区别
from pyecharts import options as opts

# 每个国家地区取最前面的一行数据,并删除其他行
df4WorldMap = df.drop_duplicates(subset=['countryEnglishName'], keep='first')
# 这一步实际上作用没有发生，但是作用是为了绘制国家地图，提出中国省份
df4WorldMap = df4WorldMap[df4WorldMap['countryEnglishName'] == df4WorldMap['provinceEnglishName']]
df4WorldMap.replace('United States of America', 'United States',inplace = True)

# df4WorldMap = df4WorldMap[['province_confirmedCount', 'countryEnglishName']]
# df4WorldMap01 = pd.Series(df4WorldMap['province_confirmedCount'].values, index=df4WorldMap['countryEnglishName'])

# # 构建一个list，用于pyecharts库中Map遍历并绘图，方法一
countryEnglishName = df4WorldMap['countryEnglishName'].values.tolist()
province_confirmedCount = df4WorldMap['province_confirmedCount'].values.tolist()
# countryEnglishName

# 构建一个list，用于pyecharts库中Map遍历并绘图，方法二，不成功
# df4WorldMap01 = [(df4WorldMap.iloc[i,3], df4WorldMap.iloc[i,7]) for i in range(10)]
# df4WorldMap01

In [100]:
from pyecharts.charts import Map  # 注意这里与老版本pyecharts调用的区别
from pyecharts import options as opts
import random
# country = ['China', 'Canada', 'France', 'Japan', 'Russia', 'USA']
# data_world = [(i, random.randint(100, 200)) for i in country]
# data_world

world = (
    Map()
    .add('Global Covid-19 Map (# confirmed cases)', # 此处没取名，所以空着
        [list(z) for z in zip(countryEnglishName, province_confirmedCount)], # 数据
        'world',is_map_symbol_show=False) # 地图类型
    # .set_series_opts(label_opts=opts.LabelOpts(is_show=True))
    .set_global_opts(
        title_opts=opts.TitleOpts(title='World Map'),
        visualmap_opts=opts.VisualMapOpts(
            max_=5000000,
            is_piecewise=True,
            pieces=[{"max": 100, "min": 0, "label": "<=100","color":"#FFFFFF"},
                  {"max": 10000, "min": 100, "label": "100~10000","color":"#FFE4E1"},
                  {"max": 100000, "min": 10000, "label": "10000~100000","color":"#FF7F50"},
                  {"max": 1000000, "min": 100000, "label": "100000~1000000","color":"#F08080"},
                  {"max": 10000000, "min": 1000000, "label": "1000000~10000000","color":"#CD5C5C"},
                  {"max": 100000000, "min": 10000000, "label": ">=10000000", "color":"#8B0000"}])  # 定义图例为分段型，默认为连续的图例
    )
    .set_series_opts(label_opts=opts.LabelOpts(is_show=False))
    .render(path='世界地图.html')
)