In [7]:
from datetime import datetime
from pathlib import Path

import datefinder
import pandas as pd
import tqdm.notebook

In [2]:
data_dir = Path('data/.')
data_list = [f for f in data_dir.glob('*.csv')]
len(data_list)

1025

In [3]:
def parse_date(data_path) -> str:
    date_time = [t for t in datefinder.find_dates(str(data_path))]
    date_str = datetime.strftime(date_time[0], '%Y%m%d')
    return date_str

In [4]:
# test for date string sort: OK
date_sorted = sorted([parse_date(dt) for dt in data_list])
print(min(date_sorted), max(date_sorted))

20200122 20221111


In [5]:
def extract_data_from_path(data_path):
    date_str = parse_date(data_path)
    df = pd.read_csv(data_path)

    if 'Country_Region' not in df.columns:
        df.rename(columns={'Country/Region': 'Country_Region'}, inplace=True)

    df.loc[df['Country_Region'] == 'Mainland China', 'Country_Region'] = 'China'
    df.loc[df['Country_Region'] == 'Korea, South', 'Country_Region'] = 'South Korea'
    df.loc[df['Country_Region'] == 'Taiwan*', 'Country_Region'] = 'Taiwan'
    
    df = (df
        .loc[:, ['Country_Region', 'Confirmed']]
        .rename(columns={'Confirmed': date_str})
        .groupby('Country_Region', as_index=False).sum(date_str)
    )
    return df

In [10]:
df_all = extract_data_from_path(data_list[0])
for filepath in tqdm.tqdm(data_list[1:]):
    df = extract_data_from_path(filepath)
    df_all = pd.merge(df_all, df, on='Country_Region', how='outer')

100%|██████████| 1024/1024 [01:16<00:00, 13.38it/s]


In [35]:
df_all_fillna = df_all.fillna(0)
df_all_fillna.head()

Unnamed: 0,Country_Region,20220121,20200226,20220120,20200227,20210704,20210705,20220212,20220213,20211231,...,20211012,20210729,20210930,20210728,20220303,20220302,20210526,20210527,20210615,20210614
0,Afghanistan,159516.0,5.0,159303.0,5.0,124748.0,125937.0,170152.0,170604.0,158084.0,...,155599.0,145996.0,155174.0,145552.0,174214.0,174073.0,67743.0,68366.0,93272.0,91458.0
1,Albania,244182.0,0.0,241512.0,0.0,132535.0,132537.0,267551.0,268008.0,210224.0,...,175664.0,132999.0,170131.0,132952.0,271825.0,271825.0,132244.0,132264.0,132469.0,132461.0
2,Algeria,232325.0,1.0,230470.0,1.0,141471.0,141966.0,261226.0,261752.0,218432.0,...,204790.0,168668.0,203359.0,167131.0,265130.0,265079.0,127646.0,127926.0,134115.0,133742.0
3,Andorra,33025.0,0.0,32201.0,0.0,13918.0,13918.0,37140.0,37140.0,23740.0,...,15307.0,14655.0,15222.0,14586.0,38342.0,38249.0,13671.0,13682.0,13828.0,13826.0
4,Angola,95676.0,0.0,95220.0,0.0,39230.0,39300.0,98514.0,98514.0,81593.0,...,61794.0,42486.0,56583.0,42288.0,98746.0,98746.0,33338.0,33607.0,36921.0,36790.0


In [36]:
df_all_sorted = df_all_fillna[1:].reindex(sorted(df_all_fillna.columns[1:]), axis=1)
df_all_sorted.insert(loc=0, column='Country_Region', value=df_all_fillna['Country_Region'])
df_all_sorted = df_all_sorted.sort_values('20221111', ascending=False)
df_all_sorted.head()

Unnamed: 0,Country_Region,20200122,20200123,20200124,20200125,20200126,20200127,20200128,20200129,20200130,...,20221102,20221103,20221104,20221105,20221106,20221107,20221108,20221109,20221110,20221111
186,US,1.0,1.0,2.0,2.0,5.0,5.0,5.0,5.0,5.0,...,97627774.0,97698174.0,97736547.0,97741608.0,97749152.0,97787763.0,97817525.0,97913411.0,97978279.0,97990681.0
80,India,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,44655926.0,44658365.0,44659447.0,44660579.0,44661504.0,44660293.0,44660293.0,44660293.0,44664810.0,44665643.0
63,France,0.0,0.0,2.0,3.0,3.0,3.0,4.0,5.0,5.0,...,37068741.0,37110800.0,37140238.0,37140238.0,37140238.0,37191901.0,37232493.0,37261364.0,37288432.0,37288432.0
67,Germany,0.0,0.0,0.0,0.0,0.0,0.0,4.0,4.0,4.0,...,35728277.0,35784912.0,35823771.0,35823771.0,35823771.0,35884834.0,35932654.0,35971322.0,36005025.0,36033394.0
24,Brazil,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,34837035.0,34846308.0,34849063.0,34849063.0,34849063.0,34855492.0,34855492.0,34855492.0,34889576.0,34908198.0


In [37]:
df_all_sorted.to_csv('result.csv')

---
## Retrieve country flags

In [56]:
base_url = 'https://cdn.countryflags.com/thumbs/{}/flag-800.png'

In [57]:
df_flags = df_all_sorted.copy()

In [58]:
def make_flag_url(country_name):
    country_name = country_name.lower().replace(' ', '-')
    if country_name == "us":
        country_name = 'united-states-of-america'
    flag_url = base_url.format(country_name)
    return(flag_url)

In [59]:
df_flags['flag_url'] = df_flags['Country_Region'].apply(make_flag_url)

In [60]:
df_flags['flag_url'][186]

'https://cdn.countryflags.com/thumbs/united-states-of-america/flag-800.png'

In [61]:
df_flags['flag_url'].to_csv('flags_url.csv')