In [2]:
import logging
from github import Github, ContentFile
from github.Repository import Repository
import pandas as pd
import numpy as np
import glob
import os


from typing import List

In [3]:
GITHUB_TOKEN = os.getenv('GITHUB_PUBLIC_READ_TOKEN')
assert GITHUB_TOKEN is not None

AssertionError: 

In [4]:
def get_csvs():
    g = Github(login_or_token=GITHUB_TOKEN)
    repo = g.get_repo('CSSEGISandData/COVID-19')
    repo: Repository
    c: List[ContentFile] = repo.get_contents('csse_covid_19_data/csse_covid_19_daily_reports')

    local_csvs = glob.glob('*.csv')

    need_dowload = [f for f in c if f.path.endswith('.csv') and os.path.basename(f.path) not in local_csvs]
    for cf in need_dowload:
        fn = os.path.basename(cf.path)
        logging.info(f'writing {fn}')
        with open(fn, 'wb') as f:
            f.write(cf.decoded_content)

In [4]:
get_csvs()

In [5]:
def path_to_date(path):
    path = os.path.basename(path)
    path = os.path.splitext(path)[0]
    date = pd.Timestamp(f'{path[-4:]}-{path[:2]}-{path[3:5]}')
    return date

def df_from_csv(path):
    column_info = {
        'FIPS': '',
        'Admin2': '',
        'Province_State': 'Province/State',
        'Country_Region': 'Country/Region',
        'Last_Update': 'Last Update',
        'Lat': 'Latitude',
        'Long_': 'Longitude',
        'Confirmed': 'Confirmed',
        'Deaths': 'Deaths',
        'Recovered': 'Recovered',
        'Active': '',
        'Combined_Key': '',
    }
    def combined_key(row):
        keys = ['Admin2', 'Province_State', 'Country_Region']
        values = [row[key] for key in keys]
        r = ', '.join((v for v in values if v and pd.notna(v)))
        return r

    df = pd.read_csv(path)
    df['Date'] = path_to_date(path)
    column_names = list(df.columns)
    for column_name, old_column_name in column_info.items():
        if column_name in column_names:
            continue
        if old_column_name:
            if old_column_name in column_names:
                df.rename(columns={old_column_name:column_name}, inplace=True)
                continue
            else:
                df[column_name] = np.nan
                continue
        if column_name in ['FIPS', 'Admin2']:
            df[column_name] = np.nan
        elif column_name == 'Active':
            df['Active'] = df['Confirmed'] - df['Deaths'] - df['Recovered']
        elif column_name == 'Combined_Key':
            df['Combined_Key'] = df.apply(combined_key, axis=1)
        else:
            raise NotImplementedError(f'Unexpected column name: {column_name}')

    return df


In [7]:
csv_files = glob.glob('*.csv')
csv_files.sort(reverse=True)
df = pd.concat(df_from_csv(csv_file) for csv_file in csv_files)

In [8]:
df.head()

Unnamed: 0,FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key,Date
0,45001.0,Abbeville,South Carolina,US,2020-04-06 23:22:15,34.223334,-82.461707,6.0,0.0,0.0,0.0,"Abbeville, South Carolina, US",2020-04-06
1,22001.0,Acadia,Louisiana,US,2020-04-06 23:22:15,30.295065,-92.414197,79.0,2.0,0.0,0.0,"Acadia, Louisiana, US",2020-04-06
2,51001.0,Accomack,Virginia,US,2020-04-06 23:22:15,37.767072,-75.632346,11.0,0.0,0.0,0.0,"Accomack, Virginia, US",2020-04-06
3,16001.0,Ada,Idaho,US,2020-04-06 23:22:15,43.452658,-116.241552,402.0,3.0,0.0,0.0,"Ada, Idaho, US",2020-04-06
4,19001.0,Adair,Iowa,US,2020-04-06 23:22:15,41.330756,-94.471059,1.0,0.0,0.0,0.0,"Adair, Iowa, US",2020-04-06


In [24]:
# only data for Germany
germany=df[df['Country_Region']=='Germany']
germany.head()

Unnamed: 0,FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key,Date
2690,,,,Germany,2020-04-06 23:21:55,51.165691,10.451526,103374.0,1810.0,28700.0,72864.0,Germany,2020-04-06
2646,,,,Germany,2020-04-05 23:06:26,51.165691,10.451526,100123.0,1584.0,28700.0,69839.0,Germany,2020-04-05
2563,,,,Germany,2020-04-04 23:34:04,51.165691,10.451526,96092.0,1444.0,26400.0,68248.0,Germany,2020-04-04
2509,,,,Germany,2020-04-03 22:46:20,51.165691,10.451526,91159.0,1275.0,24575.0,65309.0,Germany,2020-04-03
2453,,,,Germany,2020-04-02 23:25:14,51.165691,10.451526,84794.0,1107.0,22440.0,61247.0,Germany,2020-04-02


In [25]:
# get a df which is sorted by date
germany = germany.sort_values(axis=0, by='Date')
germany.head()

Unnamed: 0,FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key,Date
51,,,Bavaria,Germany,1/28/20 23:00,,,4.0,,,,"Bavaria, Germany",2020-01-28
51,,,Bavaria,Germany,1/29/20 19:30,,,4.0,,,,"Bavaria, Germany",2020-01-29
53,,,Bavaria,Germany,1/30/20 16:00,,,4.0,,,,"Bavaria, Germany",2020-01-30
40,,,Bavaria,Germany,1/31/2020 23:59,,,5.0,,,,"Bavaria, Germany",2020-01-31
37,,,,Germany,2/1/2020 18:33,,,8.0,0.0,0.0,8.0,Germany,2020-02-01


In [26]:
# only a few rows seem to have province information
germany_has_state = germany[pd.notna(germany['Province_State'])]
germany_has_state.head(100)

Unnamed: 0,FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key,Date
51,,,Bavaria,Germany,1/28/20 23:00,,,4.0,,,,"Bavaria, Germany",2020-01-28
51,,,Bavaria,Germany,1/29/20 19:30,,,4.0,,,,"Bavaria, Germany",2020-01-29
53,,,Bavaria,Germany,1/30/20 16:00,,,4.0,,,,"Bavaria, Germany",2020-01-30
40,,,Bavaria,Germany,1/31/2020 23:59,,,5.0,,,,"Bavaria, Germany",2020-01-31
