# Web Scraping JHU Daily Reports on COVID-19

In [1]:
# Import libraries
from datetime import datetime, timedelta
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

## Get the list of links to the daily-report CSV files

In [2]:
def get_csv_links():
    '''
    Fetches all links to csv files of daily reports
    and returns it as a list

    The GitHub directory listing page is downloaded and parsed; every
    table row carrying the "js-navigation-item" class is inspected and the
    href of its first <span><a> is kept when it ends in '.csv'.

    Returns:
        list of str: absolute https://github.com/... URLs, in the order the
        rows appear on the page.

    Raises:
        requests.exceptions.HTTPError: the page answered with an error status.
        requests.exceptions.Timeout: no response within the timeout.
        AttributeError: the page contains no <table>/<tbody> element.
    '''
    # URL containing list of csv files of daily reports
    url = "https://github.com/CSSEGISandData/COVID-19/tree/master/csse_covid_19_data/csse_covid_19_daily_reports"

    # requests waits forever by default; bound the wait so a stalled
    # connection cannot hang the notebook
    res = requests.get(url, timeout=30)
    res.raise_for_status()

    soup = BeautifulSoup(res.text, 'lxml')

    # All directory-listing rows; non-csv entries are filtered out below
    # instead of being dropped by position (previously rows[1:-1])
    table_rows = soup.table.tbody.find_all('tr', class_="js-navigation-item")

    # The hrefs on the page are site-relative
    href_prefix = 'https://github.com'

    csv_links = []
    for row in table_rows:
        # Some rows may not hold a <span><a> link at all; skip those
        anchor = row.span.a if row.span is not None else None
        href = anchor.get('href') if anchor is not None else None
        if href and href.endswith('.csv'):
            csv_links.append(href_prefix + href)

    return csv_links

In [3]:
# Preview the scraped links (the cell's last expression is shown as its output)
get_csv_links()

['https://github.com/CSSEGISandData/COVID-19/blob/master/csse_covid_19_data/csse_covid_19_daily_reports/01-22-2020.csv',
 'https://github.com/CSSEGISandData/COVID-19/blob/master/csse_covid_19_data/csse_covid_19_daily_reports/01-23-2020.csv',
 'https://github.com/CSSEGISandData/COVID-19/blob/master/csse_covid_19_data/csse_covid_19_daily_reports/01-24-2020.csv',
 'https://github.com/CSSEGISandData/COVID-19/blob/master/csse_covid_19_data/csse_covid_19_daily_reports/01-25-2020.csv',
 'https://github.com/CSSEGISandData/COVID-19/blob/master/csse_covid_19_data/csse_covid_19_daily_reports/01-26-2020.csv',
 'https://github.com/CSSEGISandData/COVID-19/blob/master/csse_covid_19_data/csse_covid_19_daily_reports/01-27-2020.csv',
 'https://github.com/CSSEGISandData/COVID-19/blob/master/csse_covid_19_data/csse_covid_19_daily_reports/01-28-2020.csv',
 'https://github.com/CSSEGISandData/COVID-19/blob/master/csse_covid_19_data/csse_covid_19_daily_reports/01-29-2020.csv',
 'https://github.com/CSSEGISandD

## Scrape the table from a CSV file page

In [3]:
def get_day_report_df(csvpage_url):
    '''
    Scrapes the table content of a single csv page link
    to transform and return as a pandas dataframe

    Args:
        csvpage_url (str): URL of one daily-report page. The ten characters
            before the trailing '.csv' are taken as the report date, so the
            URL is expected to end in 'MM-DD-YYYY.csv'.

    Returns:
        pandas.DataFrame: one row per (ObservationDate, Country/Region) with
        the columns 'ObservationDate', 'Country/Region', 'Confirmed',
        'Deaths' and 'Recovered'; the three counts are summed over all rows
        of that country, with missing values counted as 0.

    Raises:
        requests.exceptions.HTTPError: the page answered with an error status.
        requests.exceptions.Timeout: no response within the timeout.
        ValueError: the page contains no <table> element.
        KeyError: the scraped table lacks one of the expected columns.
    '''
    print("GET request from",csvpage_url)

    # Bound the wait so one stalled download cannot hang the whole sweep
    res = requests.get(csvpage_url, timeout=30)
    res.raise_for_status()

    soup = BeautifulSoup(res.text,'lxml')
    if soup.table is None:
        # Fail with a clear message instead of handing the text 'None' to pandas
        raise ValueError("No <table> found at " + csvpage_url)

    # Wrap the markup in StringIO: recent pandas versions deprecate passing
    # literal HTML strings to read_html
    df = pd.read_html(StringIO(str(soup.table)))[0]

    # 'MM-DD-YYYY' from the file name -> 'MM/DD/YYYY'
    observation_date = csvpage_url[-14:-4].replace('-', '/')

    df = (
        df.assign(ObservationDate=observation_date)
          # Some reports name the column 'Country_Region'; unify the name
          .rename(columns={'Country_Region': 'Country/Region'})
          # Missing counts are treated as 0
          .fillna(value=0)
    )

    # Sum only the numeric columns that are returned. Summing every column
    # would also hit text columns, which newer pandas versions no longer
    # drop silently.
    value_columns = ['Confirmed', 'Deaths', 'Recovered']
    df_bycountry = (
        df.groupby(['ObservationDate', 'Country/Region'], as_index=False)[value_columns]
          .sum()
    )

    return df_bycountry

## Concatenate all daily reports into 1 DataFrame

In [11]:
def concat_daily_reports():
    '''
    Concatenates all daily reports by using these functions:
        get_csv_links, get_day_report_df

    Nothing is written to disk. To persist the result, call
    df.to_csv('covid_19_data.csv', index=False) on the returned frame.
    NOTE(review): the original docstring refers to a cleaning function
    revise_covid19_data(), which is not defined in the visible part of
    this notebook - confirm it exists.

    Returns:
        pandas.DataFrame: the per-day frames stacked row-wise in link
        order, with a fresh 0..n-1 index.

    Raises:
        ValueError: raised by pandas.concat when no csv links were found.
    '''
    csv_links = get_csv_links()

    # One HTTP request per daily report
    df_list = [get_day_report_df(link) for link in csv_links]

    # ignore_index avoids repeating the labels 0..k once per day
    df = pd.concat(df_list, axis=0, ignore_index=True)

    return df

In [14]:
# Scrape every daily report into one DataFrame; this issues one HTTP request
# per csv link and prints each URL as it is fetched
df = concat_daily_reports()

GET request from https://github.com/CSSEGISandData/COVID-19/blob/master/csse_covid_19_data/csse_covid_19_daily_reports/01-22-2020.csv
GET request from https://github.com/CSSEGISandData/COVID-19/blob/master/csse_covid_19_data/csse_covid_19_daily_reports/01-23-2020.csv
GET request from https://github.com/CSSEGISandData/COVID-19/blob/master/csse_covid_19_data/csse_covid_19_daily_reports/01-24-2020.csv
GET request from https://github.com/CSSEGISandData/COVID-19/blob/master/csse_covid_19_data/csse_covid_19_daily_reports/01-25-2020.csv
GET request from https://github.com/CSSEGISandData/COVID-19/blob/master/csse_covid_19_data/csse_covid_19_daily_reports/01-26-2020.csv
GET request from https://github.com/CSSEGISandData/COVID-19/blob/master/csse_covid_19_data/csse_covid_19_daily_reports/01-27-2020.csv
GET request from https://github.com/CSSEGISandData/COVID-19/blob/master/csse_covid_19_data/csse_covid_19_daily_reports/01-28-2020.csv
GET request from https://github.com/CSSEGISandData/COVID-19/bl

GET request from https://github.com/CSSEGISandData/COVID-19/blob/master/csse_covid_19_data/csse_covid_19_daily_reports/03-24-2020.csv
GET request from https://github.com/CSSEGISandData/COVID-19/blob/master/csse_covid_19_data/csse_covid_19_daily_reports/03-25-2020.csv
GET request from https://github.com/CSSEGISandData/COVID-19/blob/master/csse_covid_19_data/csse_covid_19_daily_reports/03-26-2020.csv
GET request from https://github.com/CSSEGISandData/COVID-19/blob/master/csse_covid_19_data/csse_covid_19_daily_reports/03-27-2020.csv
GET request from https://github.com/CSSEGISandData/COVID-19/blob/master/csse_covid_19_data/csse_covid_19_daily_reports/03-28-2020.csv
GET request from https://github.com/CSSEGISandData/COVID-19/blob/master/csse_covid_19_data/csse_covid_19_daily_reports/03-29-2020.csv
GET request from https://github.com/CSSEGISandData/COVID-19/blob/master/csse_covid_19_data/csse_covid_19_daily_reports/03-30-2020.csv
GET request from https://github.com/CSSEGISandData/COVID-19/bl

In [17]:
# Distinct 'Country/Region' labels found across all scraped reports
df['Country/Region'].unique()

array(['Hong Kong', 'Japan', 'Macau', 'Mainland China', 'South Korea',
       'Taiwan', 'Thailand', 'US', 'Australia', 'Brazil', 'Colombia',
       'Malaysia', 'Mexico', 'Philippines', 'Singapore', 'Vietnam',
       'France', 'Nepal', 'Canada', 'Cambodia', 'Ivory Coast',
       'Sri Lanka', 'Germany', 'Finland', 'United Arab Emirates', 'India',
       'Italy', 'Russia', 'Sweden', 'UK', 'Spain', 'Belgium', 'Others',
       'Egypt', 'Iran', 'Israel', 'Lebanon', 'Iraq', 'Afghanistan',
       'Bahrain', 'Kuwait', 'Oman', 'Algeria', 'Austria', 'Croatia',
       'Switzerland', 'Georgia', 'Greece', 'North Macedonia', 'Norway',
       'Pakistan', 'Romania', 'Denmark', 'Estonia', 'Netherlands',
       'San Marino', 'Azerbaijan', 'Belarus', 'Iceland', 'Lithuania',
       'New Zealand', 'Nigeria', 'North Ireland', 'Ireland', 'Luxembourg',
       'Monaco', 'Qatar', 'Armenia', 'Czech Republic',
       'Dominican Republic', 'Ecuador', 'Andorra', 'Indonesia', 'Latvia',
       'Morocco', 'Portugal', '