# Exploratory Data Analysis on COVID-19 Global Data

In [1]:
# Import libraries
from io import StringIO

import pandas as pd

# Load the COVID-19 dataset from the CSV file into `df`
df = pd.read_csv(filepath_or_buffer='revised_covid_19_data.csv')

# Preview the first five rows
df.head(n=5)

Unnamed: 0,date,country,confirmed,deaths,recovered,active,death_rate_float,recovery_rate_float,death_rate,recovery_rate
0,01/22/2020,China,547,17,28,502,0.031079,0.051188,3.11 %,5.12 %
1,01/22/2020,Hong Kong,0,0,0,0,0.0,0.0,nan %,nan %
2,01/22/2020,Japan,2,0,0,2,0.0,0.0,0.0 %,0.0 %
3,01/22/2020,Macau,1,0,0,1,0.0,0.0,0.0 %,0.0 %
4,01/22/2020,South Korea,1,0,0,1,0.0,0.0,0.0 %,0.0 %


## Filter Latest Data
Keep only the rows for the most recent date in the dataset and store them in a new dataframe.

In [2]:
# Filter latest data
# Pick the most recent date chronologically rather than trusting the row order
# of the file. The 'date' column holds MM/DD/YYYY strings (e.g. '04/14/2020'),
# so it is parsed before comparing; `latest_date` itself stays a string in the
# file's own format so it can be matched against the column directly.
parsed_dates = pd.to_datetime(df['date'], format='%m/%d/%Y')
latest_date = df.loc[parsed_dates.idxmax(), 'date']

# Keep only the rows for that date and renumber them from 0
# (drop=True discards the old row labels instead of adding an 'index' column).
df_latest = df[df['date'] == latest_date].reset_index(drop=True)
df_latest.head()

Unnamed: 0,date,country,confirmed,deaths,recovered,active,death_rate_float,recovery_rate_float,death_rate,recovery_rate
0,04/14/2020,China,82249,3341,77753,1155,0.040621,0.945337,4.06 %,94.53 %
1,04/14/2020,Hong Kong,1012,4,434,574,0.003953,0.428854,0.4 %,42.89 %
2,04/14/2020,Japan,7645,143,799,6703,0.018705,0.104513,1.87 %,10.45 %
3,04/14/2020,Macau,45,0,13,32,0.0,0.288889,0.0 %,28.89 %
4,04/14/2020,South Korea,10564,222,7534,2808,0.021015,0.713177,2.1 %,71.32 %


In [3]:
# Re-key the latest snapshot by country so rows can be looked up by name
df_country_latest = df_latest.set_index(keys='country').copy()
df_country_latest.head(n=5)

Unnamed: 0_level_0,date,confirmed,deaths,recovered,active,death_rate_float,recovery_rate_float,death_rate,recovery_rate
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
China,04/14/2020,82249,3341,77753,1155,0.040621,0.945337,4.06 %,94.53 %
Hong Kong,04/14/2020,1012,4,434,574,0.003953,0.428854,0.4 %,42.89 %
Japan,04/14/2020,7645,143,799,6703,0.018705,0.104513,1.87 %,10.45 %
Macau,04/14/2020,45,0,13,32,0.0,0.288889,0.0 %,28.89 %
South Korea,04/14/2020,10564,222,7534,2808,0.021015,0.713177,2.1 %,71.32 %


## Focus on ASEAN Nations
Countries: Indonesia, Thailand, Singapore, Malaysia, Philippines, Vietnam, Cambodia, Laos, Brunei, Myanmar (Burma)

In [4]:
# Country names as they are spelled in the COVID-19 dataset
asean_countries = ['Indonesia','Thailand','Singapore','Malaysia','Philippines',
                   'Vietnam','Cambodia','Laos','Brunei','Burma']

# Select the ASEAN rows in the order listed above and turn 'country' back into
# a regular column (reset_index already returns a new frame, so no copy is needed).
asean_latest = df_country_latest.loc[asean_countries].reset_index()

# Label rows 1..N. The range is sized from the actual result instead of a
# hard-coded 10, so editing the list above cannot cause a length mismatch.
asean_latest.index = range(1, len(asean_latest) + 1)
asean_latest

Unnamed: 0,country,date,confirmed,deaths,recovered,active,death_rate_float,recovery_rate_float,death_rate,recovery_rate
1,Indonesia,04/14/2020,4839,459,426,3954,0.094854,0.088035,9.49 %,8.8 %
2,Thailand,04/14/2020,2613,41,1405,1167,0.015691,0.537696,1.57 %,53.77 %
3,Singapore,04/14/2020,3252,10,611,2631,0.003075,0.187884,0.31 %,18.79 %
4,Malaysia,04/14/2020,4987,82,2478,2427,0.016443,0.496892,1.64 %,49.69 %
5,Philippines,04/14/2020,5223,335,295,4593,0.064139,0.056481,6.41 %,5.65 %
6,Vietnam,04/14/2020,266,0,169,97,0.0,0.635338,0.0 %,63.53 %
7,Cambodia,04/14/2020,122,0,91,31,0.0,0.745902,0.0 %,74.59 %
8,Laos,04/14/2020,19,0,1,18,0.0,0.052632,0.0 %,5.26 %
9,Brunei,04/14/2020,136,1,107,28,0.007353,0.786765,0.74 %,78.68 %
10,Burma,04/14/2020,63,4,2,57,0.063492,0.031746,6.35 %,3.17 %


## ASEAN Rankings
To rank by a different metric, change the second argument of the `.nlargest` call to one of the column names listed below.

**List of arguments:**
1. `'confirmed'`
2. `'deaths'`
3. `'recovered'`
4. `'active'`
5. `'death_rate_float'`
6. `'recovery_rate_float'`

In [5]:
# Rank the ASEAN countries by the chosen metric (largest first), label the
# rows 1-10, and show only the columns meant for display.
(
    asean_latest
    .nlargest(10, 'confirmed')
    .set_index(pd.Series(range(1, 11)))
    .loc[:, ['date', 'country',
             'confirmed', 'deaths',
             'recovered', 'active',
             'death_rate', 'recovery_rate']]
)

Unnamed: 0,date,country,confirmed,deaths,recovered,active,death_rate,recovery_rate
1,04/14/2020,Philippines,5223,335,295,4593,6.41 %,5.65 %
2,04/14/2020,Malaysia,4987,82,2478,2427,1.64 %,49.69 %
3,04/14/2020,Indonesia,4839,459,426,3954,9.49 %,8.8 %
4,04/14/2020,Singapore,3252,10,611,2631,0.31 %,18.79 %
5,04/14/2020,Thailand,2613,41,1405,1167,1.57 %,53.77 %
6,04/14/2020,Vietnam,266,0,169,97,0.0 %,63.53 %
7,04/14/2020,Brunei,136,1,107,28,0.74 %,78.68 %
8,04/14/2020,Cambodia,122,0,91,31,0.0 %,74.59 %
9,04/14/2020,Burma,63,4,2,57,6.35 %,3.17 %
10,04/14/2020,Laos,19,0,1,18,0.0 %,5.26 %


# Scraping 2020 Population Estimates
Source URL: "https://www.worldometers.info/world-population/population-by-country/"

In [6]:
# Import Additional Libraries
import requests
from bs4 import BeautifulSoup

# Create requests object: 'res'
url = "https://www.worldometers.info/world-population/population-by-country/"

# Fetch the page. The timeout keeps the cell from hanging indefinitely, and any
# request failure (connection error, timeout, HTTP 4xx/5xx) is reported and then
# re-raised so that later cells never try to parse an error page or a missing
# response.
try:
    res = requests.get(url, timeout=30)
    res.raise_for_status()
except requests.exceptions.RequestException:
    print("There's an error in requesting this URL.")
    raise

# Create BeautifulSoup object: 'soup'
soup = BeautifulSoup(res.text,'lxml')

In [7]:
# Turn soup into a pandas dataframe
# The first <table> element on the page is used as the population table.
html_table = str(soup.find_all('table')[0])

# Wrap the markup in StringIO: passing a literal HTML string to read_html is
# deprecated in recent pandas releases, while a file-like object is accepted
# by old and new versions alike.
pop2020 = pd.read_html(StringIO(html_table))[0]
pop2020.head()

Unnamed: 0,#,Country (or dependency),Population (2020),Yearly Change,Net Change,Density (P/Km²),Land Area (Km²),Migrants (net),Fert. Rate,Med. Age,Urban Pop %,World Share
0,1,China,1439323776,0.39 %,5540090,153,9388211,-348399.0,1.7,38,61 %,18.47 %
1,2,India,1380004385,0.99 %,13586631,464,2973190,-532687.0,2.2,28,35 %,17.70 %
2,3,United States,331002651,0.59 %,1937734,36,9147420,954806.0,1.8,38,83 %,4.25 %
3,4,Indonesia,273523615,1.07 %,2898047,151,1811570,-98955.0,2.3,30,56 %,3.51 %
4,5,Pakistan,220892340,2.00 %,4327022,287,770880,-233379.0,3.6,23,35 %,2.83 %


In [8]:
# Turn soup into a pandas dataframe
# The table is re-parsed from `soup` here so this cell always starts from the
# raw scrape and can be re-run on its own without stacking earlier edits.
html_table = str(soup.find_all('table')[0])
pop2020_raw = pd.read_html(StringIO(html_table))[0]

country_col = 'Country (or dependency)'

# Fix Country Names
# Maps Worldometer names to the spellings used in the COVID-19 dataset.
# 'Caribbean Netherlands' and 'British Virgin Islands' are folded into
# 'Netherlands' and 'United Kingdom'; the groupby below then sums their rows.
country_renames = {'Caribbean Netherlands':'Netherlands',
                   'British Virgin Islands':'United Kingdom',
                   'Myanmar':'Burma',
                   'Congo':'Congo (Brazzaville)',
                   'DR Congo':'Congo (Kinshasa)',
                   'Czech Republic (Czechia)':'Czech Republic',
                   "Côte d'Ivoire":'Ivory Coast',
                   'Curaçao':'Curacao',
                   'Timor-Leste':'East Timor',
                   'Faeroe Islands':'Faroe Islands',
                   'Macao':'Macau',
                   'State of Palestine':'Palestine',
                   'Réunion':'Reunion',
                   'Saint Kitts & Nevis':'Saint Kitts and Nevis',
                   'St. Vincent & Grenadines':'Saint Vincent and the Grenadines',
                   'Sao Tome & Principe':'Sao Tome and Principe'
                  }

# Drop Countries
# presumably the entries with no counterpart in the COVID-19 dataset — TODO confirm
countries_to_drop = ['American Samoa','Anguilla','Bermuda','Comoros','Cook Islands',
                     'Falkland Islands','French Polynesia','Isle of Man','Marshall Islands',
                     'Micronesia','Montserrat','Nauru','New Caledonia','Niue',
                     'North Korea','Northern Mariana Islands','Palau','Saint Helena',
                     'Saint Pierre & Miquelon','Samoa','Sint Maarten','Solomon Islands',
                     'Tajikistan','Turkmenistan','Turks and Caicos','Tuvalu','U.S. Virgin Islands',
                     'Wallis & Futuna','Vanuatu'
                    ]

pop2020 = (
    pop2020_raw
    # Rename only within the country column, not across every column of the frame
    .assign(**{country_col: pop2020_raw[country_col].replace(country_renames)})
    .groupby(country_col)
    # numeric_only=True states explicitly that text columns (e.g. 'Yearly Change')
    # are left out, instead of relying on pandas-version-specific defaults.
    # NOTE: for merged territories this also adds up 'Density (P/Km²)', which is
    # not a true combined density.
    .sum(numeric_only=True)
    # '#' is the source table's rank column; its sum carries no meaning
    .drop(columns=['#'])
    # Raises KeyError if a listed name is not in the scraped table
    .drop(index=countries_to_drop)
)
pop2020.tail()

Unnamed: 0_level_0,Population (2020),Net Change,Density (P/Km²),Land Area (Km²),Migrants (net)
Country (or dependency),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Vietnam,97338579,876473,314,310070,-80000.0
Western Sahara,597339,14876,2,266000,5582.0
Yemen,29825964,664042,56,527970,-30000.0
Zambia,18383955,522925,25,743390,-8000.0
Zimbabwe,14862924,217456,38,386850,-116858.0


In [9]:
# Inspect the column labels of `pop2020`
pop2020.columns

Index(['Population (2020)', 'Net Change', 'Density (P/Km²)', 'Land Area (Km²)',
       'Migrants (net)'],
      dtype='object')

## Check country matches between these two dataframes:
- `df`
- `pop2020`

In [10]:
# Collect the country names present in each dataset
covid_countries = df['country'].unique()
pop_countries = pop2020.index

print('No. of Countries from COVID-19 Dataset:',len(covid_countries))
print('No. of Countries from Worldometer Dataset:',len(pop_countries))

# Counts alone cannot show which names fail to match, so list the names that
# appear in one dataset but not the other, in both directions.
missing_from_pop = pd.Index(covid_countries).difference(pop_countries)
missing_from_covid = pop_countries.difference(pd.Index(covid_countries))
print('In COVID-19 Dataset but not in Worldometer Dataset:',list(missing_from_pop))
print('In Worldometer Dataset but not in COVID-19 Dataset:',list(missing_from_covid))

No. of Countries from COVID-19 Dataset: 206
No. of Countries from Worldometer Dataset: 204


## Data to be added
The following entries appear in the COVID-19 dataset but are not in `pop_countries`:
- 'Diamond Princess', 3711, 0, 0, 0, 0, 0, 0, 0, 0, 0
- 'Kosovo', 1810463, 0, 166, 10887, 0, 0, 0, 0, 0, 0
- 'MS Zaandam', 1829, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [13]:
# Rows to add to the population table, in the same column layout as `pop2020`
columns = ['Country (or dependency)','Population (2020)','Net Change',
           'Density (P/Km²)','Land Area (Km²)','Migrants (net)']
data = [['Diamond Princess', 3711, 0, 0, 0, 0],
        ['Kosovo', 1810463, 0, 166, 10887, 0],
        ['MS Zaandam', 1829, 0, 0, 0, 0]]

# Index by country name so the new rows line up with `pop2020`
add_data = pd.DataFrame(data,columns=columns).set_index('Country (or dependency)')
# Save the population figures of the added entries to their own CSV file
add_data['Population (2020)'].to_csv('add_pop.csv',index=True)

# Append additional data
# Combining frames returns a NEW dataframe, so the result has to be assigned
# back to `pop2020`; otherwise the added rows are silently lost.
# pd.concat is used because DataFrame.append no longer exists in pandas 2.x.
# Any rows with the same names are dropped first so that re-running this cell
# does not create duplicates.
pop2020 = pd.concat([pop2020.drop(index=add_data.index, errors='ignore'), add_data])
pop2020.tail()

## Generate a csv file out of the `pop2020` dataframe
Only the country names and the population column should be included.

In [23]:
# Write `pop2020` to a CSV file; index=True writes the dataframe's index as the first column.
# NOTE(review): the section text asks for only the country and population columns,
# but this call writes every column of `pop2020` — confirm which is intended.
pop2020.to_csv('pop2020_estimates.csv',index=True)