In [10]:
import pandas as pd
import numpy as np

In [11]:
from datetime import date 
from datetime import timedelta
from urllib.error import HTTPError

def get_latest_daily_report():
    
    today = date.today()
    day_delta = timedelta(days=1)
    data_date = today

    while True: 
        try:
            data_date_str = date.strftime(data_date, '%m-%d-%Y')
            csv_url = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/{}.csv".format(data_date_str)
            daily_report = pd.read_csv(csv_url)
            print("Imported daily_report of {}".format(data_date_str))
            break
        except HTTPError:
            data_date -= day_delta
    print(today)
    print(data_date_str)
    return daily_report
    

daily_report = get_latest_daily_report()
type(daily_report)

Imported daily_report of 04-23-2021
2021-04-25
04-23-2021


pandas.core.frame.DataFrame

In [12]:
daily_report.head()

Unnamed: 0,FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key,Incident_Rate,Case_Fatality_Ratio
0,,,,Afghanistan,2021-04-24 04:20:38,33.93911,67.709953,58542,2565,52363.0,3614.0,Afghanistan,150.384009,4.38147
1,,,,Albania,2021-04-24 04:20:38,41.1533,20.1683,130270,2367,104278.0,23625.0,Albania,4526.721801,1.816995
2,,,,Algeria,2021-04-24 04:20:38,28.0339,1.6596,120562,3190,84038.0,33334.0,Algeria,274.935308,2.645942
3,,,,Andorra,2021-04-24 04:20:38,42.5063,1.5218,13007,123,12423.0,461.0,Andorra,16834.271662,0.945645
4,,,,Angola,2021-04-24 04:20:38,-11.2027,17.8739,25279,574,23089.0,1616.0,Angola,76.914726,2.270659


In [13]:
active_patient = daily_report['Confirmed']- daily_report['Deaths']- daily_report['Recovered'] #Simple operation
active_patient

0        3614.0
1       23625.0
2       33334.0
3         461.0
4        1616.0
         ...   
3978      305.0
3979    26877.0
3980     2437.0
3981      750.0
3982     1395.0
Length: 3983, dtype: float64

In [18]:
cut_bins = [0, 1000, 10000, 100000, np.Inf]
cut_labels = ['Less than 1000', 'Between 1000 and 10000', 'Between 10000 and 100000', 'Above 100000']
confirmed_bins = pd.cut(daily_report['Confirmed'], bins=cut_bins, labels=cut_labels)
daily_report['Confirmed']
confirmed_bins.value_counts()   #count the frequency for each category 
                                #change continuous data to discrete data

Between 1000 and 10000      1980
Less than 1000               876
Between 10000 and 100000     854
Above 100000                 233
Name: Confirmed, dtype: int64

## Derive Category from Category by Map

In [26]:
tw = daily_report[daily_report['Country_Region']=='Taiwan*']
mapping_dict = {
    'Taiwan*': 'Taiwan'
}
tw['Country_Region'].map(mapping_dict)      #新的類別對應舊的類別 pass in dictionary

643    Taiwan
Name: Country_Region, dtype: object

### Check if the data is US or NonUS

In [30]:
def country_is_us(x):
    if x == 'US':
        return 'US'
    else:
        return "Not US"
daily_report['Country_Region'].map(country_is_us).value_counts()   #pass in the function
daily_report['Country_Region'].map(lambda x:'US' if x=='US' else 'Not US') #or use lambda function

0       Not US
1       Not US
2       Not US
3       Not US
4       Not US
         ...  
3978    Not US
3979    Not US
3980    Not US
3981    Not US
3982    Not US
Name: Country_Region, Length: 3983, dtype: object

## Summarizing DataFrame with Aggregate Methods

In [35]:
ttl_confirmed = daily_report["Confirmed"].sum()
print("{:,}".format(ttl_confirmed))   # give a comma for every thousand place

145,640,414


In [36]:
daily_report['Confirmed'].mean()   #Data base is not normalized because the data aren't collected equally 

36565.50690434346

In [39]:
ttl_confirmed_by_country = daily_report.groupby('Country_Region')['Confirmed'].sum()
print(ttl_confirmed_by_country["Brazil"])
#Data base is now normalized to show the average confirmed cases 

ttl_confirmed_by_country.mean()

758543.8229166666