In [1]:
from datetime import datetime
from pathlib import Path

import datefinder
import pandas as pd

In [2]:
# Collect every CSV file that sits directly inside the data directory.
data_dir = Path('data/.')
data_list = list(data_dir.glob('*.csv'))
len(data_list)

1025

In [3]:
def parse_date(data_path) -> str:
    """Return the first date found in ``data_path`` formatted as ``YYYYMMDD``.

    The path is converted to a string and scanned with
    ``datefinder.find_dates``; only the first date it yields is used.

    Parameters
    ----------
    data_path : path-like or str
        Path whose text contains a date.

    Returns
    -------
    str
        The first detected date rendered with the ``'%Y%m%d'`` format.

    Raises
    ------
    IndexError
        If no date is found in the path.
    """
    found_dates = list(datefinder.find_dates(str(data_path)))
    if not found_dates:
        # Same exception type a bare ``[0]`` lookup would raise, but the
        # message names the offending path instead of "list index out of range".
        raise IndexError(f'no date found in path: {data_path}')
    return found_dates[0].strftime('%Y%m%d')

In [4]:
# Sanity check of the parsed date strings: sort them and print the
# smallest and largest value.
date_sorted = sorted(map(parse_date, data_list))
print(min(date_sorted), max(date_sorted))

20200122 20221111


In [5]:
def extract_data_from_path(data_path):
    """Load one CSV report and total its ``Confirmed`` column per country.

    Parameters
    ----------
    data_path : path-like or str
        CSV file to read. The date parsed from the path by ``parse_date``
        becomes the name of the value column in the result.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``Country_Region`` and a column named after the
        ``YYYYMMDD`` date string, holding the summed ``Confirmed`` values
        for each country.

    Raises
    ------
    KeyError
        If the file has neither a usable country column nor a
        ``Confirmed`` column.
    """
    date_str = parse_date(data_path)

    # Alternative spellings that are folded into a single country name.
    country_aliases = {
        'Mainland China': 'China',
        'Korea, South': 'South Korea',
        'Taiwan*': 'Taiwan',
    }

    df = pd.read_csv(data_path)

    # Files without 'Country_Region' are expected to carry the same
    # information under 'Country/Region'.
    if 'Country_Region' not in df.columns:
        df = df.rename(columns={'Country/Region': 'Country_Region'})

    return (df
        .loc[:, ['Country_Region', 'Confirmed']]
        .assign(Country_Region=lambda d: d['Country_Region'].replace(country_aliases))
        .rename(columns={'Confirmed': date_str})
        .groupby('Country_Region', as_index=False)
        # The first positional parameter of GroupBy.sum is `numeric_only`,
        # not a column name, so it is passed explicitly here.
        .sum(numeric_only=True)
    )

In [6]:
# Pick the two snapshots by their parsed date instead of by position:
# the glob result is not sorted, so data_list[0] / data_list[1] are not
# guaranteed to be these dates on another machine or after files change.
path_by_date = {parse_date(path): path for path in data_list}
df_20220121 = extract_data_from_path(path_by_date['20220121'])
df_20200226 = extract_data_from_path(path_by_date['20200226'])

In [7]:
# Outer-join the two snapshots on country so a country present in only one
# of them is kept; its missing count is filled with 0. Rows are ordered by
# the 20200226 column, largest first.
df = (
    df_20220121
    .merge(df_20200226, on='Country_Region', how='outer')
    .fillna(0)
    .sort_values('20200226', ascending=False)
)
df

Unnamed: 0,Country_Region,20220121,20200226
37,China,132264.0,78065.0
165,South Korea,726274.0,1261.0
203,Others,0.0,705.0
86,Italy,9603856.0,453.0
88,Japan,2075528.0,189.0
...,...,...,...
91,Kenya,319838.0,0.0
92,Kiribati,39.0,0.0
93,"Korea, North",0.0,0.0
94,Kosovo,175283.0,0.0
