In [73]:
import altair as alt
import pandas as pd
import numpy as np

In [74]:
# Parse every HTML table on the worldometers page; the first table is the
# per-country summary used throughout this notebook.
worldometers_tables = pd.read_html('https://www.worldometers.info/coronavirus/')
data = worldometers_tables[0]

In [75]:
# Keep a local snapshot of the scraped table next to the notebook.
data.to_csv(path_or_buf = 'coronavirus.csv')

In [76]:
# Preview the first five rows of the scraped table.
data.head(n = 5)

Unnamed: 0,"Country,Other",TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,ActiveCases,"Serious,Critical",Tot Cases/1M pop
0,China,80701,50.0,3098.0,28.0,57332.0,20271,5264.0,56.1
1,S. Korea,7313,272.0,50.0,2.0,130.0,7133,36.0,142.6
2,Iran,6566,743.0,194.0,49.0,2134.0,4238,,78.2
3,Italy,5883,,233.0,,589.0,5061,567.0,97.3
4,Germany,951,151.0,,,18.0,933,9.0,11.4


# Overall statistics
First, it is always useful to have a summary of the data. Let's create a dataset with the total number of infected people.

In [77]:
# Reshape to long form (one row per country / metric pair) and keep only the
# worldwide 'Total:' summary row.
total = pd.melt(data, id_vars = 'Country,Other')
total = total[total['Country,Other'] == 'Total:']

# Display order of the three outcome categories and the colour of each one.
# `names` and `colors` are reused by the charts further down.
names = ['TotalRecovered', 'ActiveCases', 'TotalDeaths']
colors = ['green', '#52C9E0', 'red']

# Only the three outcome categories are drawn.
overall = total[total['variable'].isin(names)]

# Upper bound of the x axis: the largest plotted value, so the axis covers
# every bar whichever category is currently the biggest. Reducing with
# .max() also avoids calling int() on a Series, which pandas deprecates.
x_upper = int(pd.to_numeric(overall['value']).max())

pic_left = alt.Chart(overall).mark_bar().encode(
    alt.Y('variable:N', sort = names,
          axis = alt.Axis(title = None, labelFontSize = 15),
          scale = alt.Scale(paddingInner = 0.1)),

    alt.X('value:Q',
          axis = alt.Axis(title = 'The overall statistics', titleFontSize = 15),
          scale = alt.Scale(domain = [0, x_upper])),

    alt.Color('variable:N',
              legend = alt.Legend(title = 'Cases', labelFontSize = 15, titleFontSize = 15),
              scale = alt.Scale(domain = names, range = colors))
).properties(
    height = 150,
    width = 400
)

pic_left

# Country-ranking by severity level
Currently, the country with the most serious situation is China. But how about other countries? Could we construct a figure that shows a sorted list of those countries?

In [78]:
# Ten countries with the most total cases.
# The worldwide 'Total:' summary row is excluded by label instead of by its
# position after sorting, and the whole selection is one chain that returns a
# fresh frame, so nothing is assigned into a slice of `data`.
data_1 = (
    data[data['Country,Other'] != 'Total:']
    .sort_values(by = 'TotalCases', ascending = False)
    .head(10)
    .fillna(value = {'TotalDeaths' : 0})  # a missing death count means none reported
)

# Rank index starting at 1; sized from the frame so it also works when fewer
# than ten rows are available.
data_1.index = pd.RangeIndex(1, len(data_1) + 1, name = 'index')
data_1

Unnamed: 0_level_0,"Country,Other",TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,ActiveCases,"Serious,Critical",Tot Cases/1M pop
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,China,80701,50.0,3098.0,28.0,57332.0,20271,5264.0,56.1
2,S. Korea,7313,272.0,50.0,2.0,130.0,7133,36.0,142.6
3,Iran,6566,743.0,194.0,49.0,2134.0,4238,,78.2
4,Italy,5883,,233.0,,589.0,5061,567.0,97.3
5,Germany,951,151.0,0.0,,18.0,933,9.0,11.4
6,France,949,,16.0,,12.0,921,45.0,14.5
7,Diamond Princess,696,,7.0,,245.0,444,32.0,
8,Spain,613,88.0,17.0,7.0,30.0,566,9.0,13.1
9,Japan,461,,6.0,,76.0,379,28.0,3.6
10,USA,447,12.0,19.0,,15.0,413,8.0,1.4


In [79]:
# Country names in ranking order (data_1 is already sorted by total cases).
# All three panels sort their y axis by this list so that their rows line up.
names_c = data_1['Country,Other'].to_list()

# Centre panel: the country names drawn as text.
# It must be sorted by names_c (not by the category list `names`) so the
# labels stay aligned with the bars on either side.
middle = alt.Chart(data_1).encode(
    alt.Y('Country,Other:N', sort = names_c, axis = None),
    text = alt.Text('Country,Other:N'),
).mark_text(fontSize = 20).properties(height = 400)

# Left panel: total cases per country. The x axis is reversed
# (sort = 'descending') so the bars grow towards the left, away from the labels.
left = alt.Chart(data_1).encode(
    alt.Y('Country,Other:N', sort = names_c, title = None, axis = None),
    alt.X('TotalCases:Q', sort = 'descending', axis = alt.Axis(title = None, orient = 'top'))
).mark_bar(color = '#E08594').properties(
    width = 300,
    height = 400,
    title = '# of Total Cases'
)

# Right panel data in long form: one row per country / outcome category.
right_data = data_1[['Country,Other', 'TotalRecovered', 'ActiveCases', 'TotalDeaths']]
right_data = pd.melt(right_data, id_vars = 'Country,Other')

# Right panel: the same countries, bars coloured by outcome category.
right = alt.Chart(right_data).mark_bar().encode(
    alt.Y('Country,Other:N', axis = None, sort = names_c),
    alt.X('value:Q', axis = alt.Axis(title = None, orient = 'top')),
    alt.Color('variable:N', scale = alt.Scale(domain = names, range = colors),
              legend = alt.Legend(title = 'Type of Cases', titleFontSize = 15, labelFontSize = 15))
).properties(
    width = 300,
    height = 400,
    title = 'Among Cases'
)

# Concatenate the three panels side by side and hide the view borders.
(left | middle | right).configure_view(strokeWidth = 0)

As shown above, China currently has by far the largest number of detected cases and deaths. Including China compresses the bars of every other country so much that their details become unreadable. Therefore, it is more helpful to exclude China when we want to look closely at the statistics of the other affected countries.

# Cases outside China

In [80]:
# Ten most affected countries outside China. The worldwide 'Total:' row and
# China are removed by label rather than by their position after sorting.
top_outside = (
    data[~data['Country,Other'].isin(['Total:', 'China'])]
    .sort_values(by = 'TotalCases', ascending = False)
    .head(10)
    .fillna(value = {'TotalDeaths' : 0})  # a missing death count means none reported
)

# Ranking order for the x axis. Taken before melting, while every country
# still appears exactly once.
names_c2 = top_outside['Country,Other'].to_list()

# Long form: one row per country / outcome category.
data_2 = pd.melt(
    top_outside[['Country,Other', 'TotalRecovered', 'ActiveCases', 'TotalDeaths']],
    id_vars = ['Country,Other']
)

# Case counts for the subtitle, looked up by row label.
world_cases = int(data.loc[data['Country,Other'] == 'Total:', 'TotalCases'].iloc[0])
china_cases = int(data.loc[data['Country,Other'] == 'China', 'TotalCases'].iloc[0])

rest = alt.Chart(data_2).mark_bar().encode(
    # Countries ordered by this cell's own ranking list.
    alt.X('Country,Other:N', sort = names_c2, axis = alt.Axis(title = None, labelFontSize = 15)),
    # No sort list here: a list of category names does not order a quantitative axis.
    alt.Y('value:Q', axis = alt.Axis(title = None, labelFontSize = 15)),
    alt.Color('variable:N', scale = alt.Scale(domain = names, range = colors),
              legend = alt.Legend(title = 'Type of Cases', titleFontSize = 15, labelFontSize = 15))
).properties(
    width = 700,
    height = 400,
    title = { 'text' : 'Cases outside China', 'fontSize' : 20,
              'subtitle' : 'There are totally {0} cases outside China'.format(world_cases - china_cases),
              'subtitleFontSize' : 15}
)


rest

# Time Series Analysis 
Let's use some data to analyze the development of the coronavirus outbreak in Mainland China. First, we load the data published by the government data centre of Hong Kong.

In [81]:
# Data file downloaded from data.gov.hk: reported cases per area of Mainland China.
daily = pd.read_csv(filepath_or_buffer = 'areas_in_mainland_china_have_reported_cases_eng.csv')
daily

Unnamed: 0,As of date,As of time,Mainland China,Number of reported/confirmed cases,Number of deaths,Remark
0,11/01/2020,23:59,Hubei,41,,
1,12/01/2020,23:59,Hubei,41,,
2,13/01/2020,23:59,Hubei,41,,
3,15/01/2020,23:59,Hubei,41,,
4,16/01/2020,23:59,Hubei,45,,
...,...,...,...,...,...,...
2622,07/03/2020,9:00,Ningxia Hui Autonomous Region,75,0.0,
2623,07/03/2020,9:00,Jiangsu,631,0.0,
2624,07/03/2020,9:00,Xinjiang Uygur Autonomous Region,76,3.0,
2625,07/03/2020,9:00,Qinghai,18,0.0,


In [82]:
# Parse the day-first 'As of date' strings and use them as the index.
# The guard makes the cell safe to re-run: once the column has been
# converted it no longer exists, and an unguarded second run would fail
# with a KeyError.
if 'As of date' in daily.columns:
    daily = (
        daily
        .assign(date = pd.to_datetime(daily['As of date'], format = '%d/%m/%Y'))
        .drop(columns = ['As of date'])
        .set_index('date')
    )

# Show every report filed for one sample day.
daily[daily.index == '2020-02-26']

Unnamed: 0_level_0,As of time,Mainland China,Number of reported/confirmed cases,Number of deaths,Remark
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-02-26,9:00,Beijing,400,4.0,
2020-02-26,9:00,Liaoning,121,1.0,
2020-02-26,9:00,Anhui,989,6.0,
2020-02-26,9:00,Chongqing,576,6.0,
2020-02-26,9:00,Shaanxi,245,1.0,
...,...,...,...,...,...
2020-02-26,18:00,Jiangsu,631,0.0,
2020-02-26,18:00,Xinjiang Uygur Autonomous Region,76,2.0,
2020-02-26,18:00,Qinghai,18,0.0,
2020-02-26,18:00,Tibet Autonomous Region,1,0.0,


From the table above, we can see that each province or direct-administered municipality chooses a different time to update its report, and some of them even report several times a day. This makes it difficult to compute the number of daily new cases. Fortunately, the data of Hubei province is always updated at 23:59. Let's first focus on the data of Hubei.

In [83]:
# Hubei's end-of-day reports. The province is filtered explicitly as well as
# the 23:59 timestamp, so a 23:59 row from any other area cannot slip into
# the Hubei series.
is_hubei_eod = (daily['As of time'] == '23:59') & (daily['Mainland China'] == 'Hubei')
daily_hubei = (
    daily[is_hubei_eod]
    .fillna(value = {'Number of deaths' : 0})
    .drop(columns = ['Remark', 'As of time', 'Mainland China'])
    .rename(columns = {'Number of reported/confirmed cases' : 'Reported cases'})
)

# New cases = change in the cumulative count between consecutive reports.
# Some calendar days are missing from the file, so a step can cover more
# than one day.
increments = daily_hubei['Reported cases'].diff()

# Drop the first report (it has no predecessor) and the last one.
# NOTE(review): leaving out the last report reproduces the previous output of
# this cell; confirm whether the final day is meant to be excluded.
increments = increments.iloc[1:-1]

hubei_new = pd.DataFrame({'date': increments.index, 'daily new case': increments.to_list()})

In [84]:
# X-axis title built from the dates actually present in hubei_new, so the
# label always matches the plotted range.
first_day = hubei_new['date'].min().strftime('%d.%m')
last_day = hubei_new['date'].max().strftime('%d.%m')
date_span_title = 'From {0} to {1}'.format(first_day, last_day)

# Daily new cases as a line.
hubei_fig = alt.Chart(hubei_new).mark_line(color = '#BF4055').encode(
    alt.X('monthdate(date):T', axis = alt.Axis(title = date_span_title, titleFontSize = 15,
                                               labelFontSize = 12)),
    alt.Y('daily new case:Q', axis = alt.Axis(title = None, titleFontSize = 15))
).properties(
    height = 400,
    width = 700
)

# Text notes as [date, count, note]. The date / count pair is the position at
# which the left edge of the note is drawn (align = 'left' below).
annotations = [['2020-01-13', 12000, 'No evidence of H-to-H transmission,stated by Wuhan'],
               ['2020-01-24', 10000, 'Zhong:"Evidence of H-to-H transmission"'],
               ['2020-01-29', 6000, 'Wuhan Quarantine'],
               ['2020-01-24', 15000, 'Clinical Cases included'],
               ['2020-02-25', 4000, "WHO raises risk to very high"]]
text = pd.DataFrame(annotations, columns = ['date', 'count', 'note'])
text['date'] = pd.to_datetime(text['date'], format = '%Y-%m-%d')

figure_text = alt.Chart(text).encode(
    alt.X('monthdate(date):T'),
    alt.Y('count:Q', axis = None),
    text = alt.Text('note:N')
).mark_text(align = 'left', baseline = 'middle', dy = 0, fontSize = 13)

# Pointer segments: the two rows sharing a class letter are the end points
# of one straight line (detail = 'class' keeps the segments separate).
pointer = pd.DataFrame({
    'x': pd.to_datetime(['2020-01-19', '2020-01-19', '2020-01-31', '2020-01-20',
                         '2020-02-01', '2020-01-23', '2020-02-04', '2020-02-11',
                         '2020-02-29', '2020-02-29'], format = '%Y-%m-%d'),
    'y': [11500, 100, 9500, 100, 5500, 200, 14900, 14700, 3500, 650],
    'class': ['A', 'A', 'B', 'B', 'C', 'C', 'D', 'D', 'E', 'E']
})

line = alt.Chart(pointer).mark_line().encode(
    x = 'monthdate(x):T',
    y = 'y',
    detail = 'class'
)

# Layer the curve, the notes and the pointer lines into one figure.
(hubei_fig + figure_text + line).properties(
    height = 600,
    width = 900,
    title = {'text': 'Daily new cases in Hubei', 'fontSize' : 20}
)

# Create a dot map 

In [85]:
# Outline of China, loaded by URL from a remote GeoJSON file.
url_json = 'https://raw.githubusercontent.com/yezongyang/china-geojson/master/china.json'
data_geojson_remote = alt.Data(url = url_json)

# Base layer: the geographic shapes with a black outline, Mercator projection.
outline_style = {'stroke': 'black', 'strokeWidth': 1}
background = (
    alt.Chart(data_geojson_remote)
    .mark_geoshape(**outline_style)
    .encode()
    .project('mercator')
)

# Reports dated 2020-03-05 and filed at 18:00 or 23:59, with shorter column names.
report_times = ['18:00', '23:59']
df = daily[daily.index == '2020-03-05']
df = df[df['As of time'].isin(report_times)]
df = df.drop(columns = ['As of time', 'Remark']).rename(columns = {
    'Number of reported/confirmed cases' : 'Total Cases',
    'Number of deaths' : 'Death',
    'Mainland China' : 'Province',
})

# [longitude, latitude] at which the dot of each area is drawn.
coordinate = {
    'Liaoning' : [123.429092, 41.796768],
    'Jilin' : [125.324501, 43.886841],
    'Heilongjiang' : [126.642464, 45.756966],
    'Beijing' : [116.405289, 39.904987],
    'Tianjin' : [117.190186, 39.125595],
    'Inner Mongolia Autonomous Region' : [111.751990, 40.841490],
    'Ningxia Hui Autonomous Region' : [106.232480, 38.486440],
    'Shanxi' : [112.549248, 37.857014],
    'Hebei' : [114.502464, 38.045475],
    'Shandong' : [117.000923, 36.675808],
    'Henan' : [113.665413, 34.757977],
    'Shaanxi' : [108.948021, 34.263161],
    'Hubei' : [114.298569, 30.584354],
    'Jiangsu' : [118.76741, 32.041546],
    'Anhui' : [117.283043, 31.861191],
    'Shanghai' : [121.472641, 31.231707],
    'Hunan' : [112.982277, 28.19409],
    'Jiangxi' : [115.892151, 28.676493],
    'Zhejiang' : [120.15358, 30.287458],
    'Fujian' : [119.306236, 26.075302],
    'Guangdong' : [113.28064, 23.125177],
    'Hainan' : [110.199890, 20.044220],
    'Guangxi Zhuang Autonomous Region' : [108.320007, 22.82402],
    'Chongqing' : [106.504959, 29.533155],
    'Yunnan' : [102.71225, 25.040609],
    'Guizhou' : [106.713478, 26.578342],
    'Sichuan' : [104.065735, 30.659462],
    'Gansu' : [103.834170, 36.061380],
    'Qinghai' : [101.777820, 36.617290],
    'Tibet Autonomous Region' : [91.11450, 29.644150],
    'Xinjiang Uygur Autonomous Region' : [87.616880, 43.826630],
}

# Look up the position of every reported area, in row order.
# An area missing from `coordinate` raises KeyError.
located = [coordinate[area] for area in df['Province'].to_list()]
longitude = [pair[0] for pair in located]
latitude = [pair[1] for pair in located]

df = df.assign(longitude = longitude, latitude = latitude)

# Dot layer: one red circle per area; the tooltip shows its figures.
points = alt.Chart(df).mark_circle(
    size = 70,
    color = 'red'
).encode(
    longitude = 'longitude:Q',
    latitude = 'latitude:Q',
    tooltip = ['Province', 'Total Cases', 'Death']
)

In [86]:
# Title block of the map, kept apart from the chart expression for readability.
map_title = {
    'text' : 'The National Report of Mainland China on 05.03.2020',
    'fontSize' : 20,
    'orient' : 'top',
    'subtitle': 'Move to red dots to see the details',
}

# Draw the dots on top of the outline and hide the view border.
alt.layer(background, points).properties(
    width = 500,
    height = 500,
    title = map_title
).configure_view(strokeWidth = 0)