# Imports

In [1]:
from pandas import DataFrame
import pandas as pd
import numpy as np
from datetime import date, datetime, timedelta
from urllib.error import HTTPError
import re
import os

# Downloading Data

In [2]:
# Preparing Links for daily files
def get_list_of_links(start=pd.Timestamp(2020,1,22), end=None):
    """
    Build the download links of the daily report files between two dates.

    Parameters
    ----------
    start : timestamp-like, default pd.Timestamp(2020, 1, 22)
        First day to include; anything accepted by ``pd.Timestamp``.
    end : timestamp-like or None, default None
        Last day to include. ``None`` means "now" and is resolved on every
        call; a ``pd.Timestamp.today()`` default would be evaluated only once,
        when the function is defined, and go stale in a long-lived kernel.

    Returns
    -------
    generator of str
        One URL per calendar day from ``start`` to ``end``, each ending in
        ``MM-DD-YYYY.csv``. When the range is invalid (``start`` after ``end``,
        or ``end`` in the future) a message is printed and the generator is
        empty.
    """
    base_url = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/"
    today = pd.Timestamp.today()
    start = pd.Timestamp(start)
    end = today if end is None else pd.Timestamp(end)

    if start <= end <= today:
        dates = pd.date_range(start, end, freq='D')
    else:
        print("Dates are not valid")
        # Nothing to iterate: previously `dates` stayed unbound here and the
        # return statement raised UnboundLocalError.
        dates = []

    return (base_url + day.strftime('%m-%d-%Y') + '.csv' for day in dates)

In [3]:
# Each stub maps 1-to-1 onto a unified column name (a stub is chosen so that it occurs in that column's name in all raw files)
def fix_column_name(raw_col_name):
    """
    Translate a raw column header into its unified name.

    The lower-cased header is tested against every stub in insertion order and
    the first stub it contains decides the result. A header containing none of
    the stubs is returned unchanged.
    """
    unified_name_by_stub = dict([
        ('vinc', 'Province'),
        ('untr', 'Country'),
        ('ips', 'FIPS'),
        ('pda', 'Last_Update'),
        ('lat', 'Latitude'),
        ('long', 'Longitude'),
        ('firm', 'Confirmed'),
        ('eath', 'Deaths'),
        ('cov', 'Recovered'),
        ('tive', 'Active'),
        ('bin', 'Combined_Key'),
    ])
    lowered = raw_col_name.lower()
    # Lazily yields the unified names of all matching stubs; only the first is used.
    candidates = (
        unified
        for fragment, unified in unified_name_by_stub.items()
        if fragment in lowered
    )
    return next(candidates, raw_col_name)

# Fetching Data to One CSV File

In [4]:
# Daily reports are cached in a local folder so each file is downloaded at most once.
daily_dir = 'Daily_files'
os.makedirs(daily_dir, exist_ok=True)  # saving used to fail when the folder was missing

Main_Data = []
for link in get_list_of_links():
    filename = link.split('/')[-1]
    # os.path.join gives one portable path for both the cache check and the save
    # (the hard-coded backslash never matched an existing file on POSIX systems)
    local_path = os.path.join(daily_dir, filename)

    if os.path.exists(local_path):
        # already downloaded (and renamed) on a previous run
        df = pd.read_csv(local_path)
    else:
        try:
            df = pd.read_csv(link)
        except HTTPError:
            print("No Covid data file for {}".format(filename.split('.')[0]))
            # Skip this day: falling through would append the previous day's
            # frame a second time (or hit an undefined `df` on the first day).
            continue
        # unify the column names before caching the file
        df = df.rename(columns=fix_column_name)
        df.to_csv(local_path, index=False)
        print(f'Successfully saved: {filename}')

    # one big list of dataframes
    Main_Data.append(df)

# one dataframe for all days
covid = pd.concat(Main_Data, ignore_index=True, sort=False)
# one csv file for all days
covid.to_csv('alldays_data.csv', index=False)
print('Successfully updated alldays_data.csv')
del Main_Data

In [None]:
# Preview the first rows of the combined all-days frame built in the previous cell
covid.head()

Unnamed: 0,Province,Country,Last_Update,Confirmed,Deaths,Recovered,Latitude,Longitude,FIPS,Active,Combined_Key,Admin2,Lat,Long_,Incident_Rate,Case_Fatality_Ratio
0,Anhui,Mainland China,1/22/2020 17:00,1.0,,,,,,,,,,,,
1,Beijing,Mainland China,1/22/2020 17:00,14.0,,,,,,,,,,,,
2,Chongqing,Mainland China,1/22/2020 17:00,6.0,,,,,,,,,,,,
3,Fujian,Mainland China,1/22/2020 17:00,1.0,,,,,,,,,,,,
4,Gansu,Mainland China,1/22/2020 17:00,,,,,,,,,,,,,


# Data for USA (scraping Worldometer table)

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

link = "https://www.worldometers.info/coronavirus/country/us/"

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() reports an HTTP error here instead of as an opaque
# IndexError later, when no table is found in an error page.
response = requests.get(link, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

# table header
thead = soup.find_all('thead')[0] # many tables might exist; the first one is used
colnames = [tag.get_text() for tag in thead.find_all(['th'])[:-2]] # last two header cells are irrelevant

# table rows
tbody = soup.find_all('tbody')

# Maps the text of a row's first cell to the list of its remaining cells.
# NOTE(review): rows whose first cell has the same text (e.g. blank) overwrite
# each other in this dict - confirm the first column is unique for every row needed.
rows_data = {}
tr_tags = tbody[0].find_all(['tr'])   # tbody[0] as other tables exist
for tr in tr_tags:                  # loop over table rows
    tr_data = [td.get_text() for td in tr.find_all('td')[:-2]] # cell texts of the row; last two cells are irrelevant
    if not tr_data:
        # a row with two or fewer cells has nothing left to index; skip it
        # instead of raising IndexError on tr_data[0]
        continue
    rows_data[tr_data[0].strip()] = tr_data[1:]                # row key (1st cell) -> row data (other cells)

In [None]:
# a dataframe of the scraped table: one row per key of rows_data
df = pd.DataFrame(rows_data).T
df.columns = [col.replace('\n', ' ') for col in colnames[1:]] # first header is used as the index name
df.index.name = colnames[0]

# Strip surrounding whitespace and drop ',' and '+' from every cell.
# Column-wise .str methods replace DataFrame.applymap, which is deprecated
# since pandas 2.1; regex=False makes '+' a literal character on every version.
df = df.apply(
    lambda col: col.str.strip()
                   .str.replace(',', '', regex=False)
                   .str.replace('+', '', regex=False)
)
# save to csv
df.to_csv('usa_worldometer.csv')
print('Successsfully saved: usa_worldometer.csv')

Successsfully saved: usa_worldometer.csv


# Worldometer main table

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

link = "https://www.worldometers.info/coronavirus/"

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() reports an HTTP error here instead of as an opaque
# IndexError later, when no table is found in an error page.
response = requests.get(link, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

# table header
thead = soup.find_all('thead')[0] # many tables might exist; the first one is used
colnames = [tag.get_text() for tag in thead.find_all(['th'])[:-2]] # last two header cells are irrelevant

# table rows
tbody = soup.find_all('tbody')

# Maps the text of a row's first cell to the list of its remaining cells.
# NOTE(review): rows whose first cell has the same text (e.g. blank) overwrite
# each other in this dict - confirm the first column is unique for every row needed.
rows_data = {}
tr_tags = tbody[0].find_all(['tr'])   # tbody[0] as other tables exist
for tr in tr_tags:                  # loop over table rows
    tr_data = [td.get_text() for td in tr.find_all('td')[:-2]] # cell texts of the row; last two cells are irrelevant
    if not tr_data:
        # a row with two or fewer cells has nothing left to index; skip it
        # instead of raising IndexError on tr_data[0]
        continue
    rows_data[tr_data[0].strip()] = tr_data[1:]                # row key (1st cell) -> row data (other cells)

In [None]:
# a dataframe of the scraped table: one row per key of rows_data
df = pd.DataFrame(rows_data).T
df.columns = [col.replace('\n', ' ').replace(',', ' ') for col in colnames[1:]] # first header is used as the index name
df.index.name = colnames[0]

# Strip surrounding whitespace and drop ',' and '+' from every cell.
# Column-wise .str methods replace DataFrame.applymap, which is deprecated
# since pandas 2.1; regex=False makes '+' a literal character on every version.
df = df.apply(
    lambda col: col.str.strip()
                   .str.replace(',', '', regex=False)
                   .str.replace('+', '', regex=False)
)
# save to csv; index=False leaves the index (first table column) out of the file
df.to_csv('world_worldometer.csv', index=False)
print('Successsfully saved: world_worldometer.csv')

Successsfully saved: world_worldometer.csv


In [None]:
# Preview the first rows of `df` (when cells run in order, the frame built in the previous cell)
df.head()

Unnamed: 0_level_0,Country Other,TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,NewRecovered,ActiveCases,Serious Critical,Tot Cases/1M pop,Deaths/1M pop,TotalTests,Tests/ 1M pop,Population,Continent,1 Caseevery X ppl
#,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
,World,118630045,8800.0,2631996,729.0,94238652,6802.0,21759397,89790,15219,337.7,,,,All,
1.0,USA,29862124,,542191,,20640270,,8679663,12607,89854,1631.0,373104726.0,1122655.0,332341490.0,North America,11.0
2.0,India,11284311,,158213,,10935803,,190295,8944,8122,114.0,223479877.0,160853.0,1389345702.0,Asia,123.0
3.0,Brazil,11205972,,270917,,9913739,,1021316,8318,52462,1268.0,28600000.0,133894.0,213601044.0,South America,19.0
4.0,Russia,4351553,,90275,,3945527,,315751,2300,29810,618.0,113800000.0,779571.0,145977728.0,Europe,34.0
