# First look at the data


In [18]:
import pandas as pd
import numpy as np

# Before loading, we looked at the data file itself: it is a CSV file that takes 28 KB on disk.

# Read the file.
# parse_dates names the columns that contain dates; pandas can then infer standard
# date formats such as ISO 8601 (YYYY-MM-DD) for those columns automatically.
covid_data = pd.read_csv("data/covidtotals.csv", parse_dates=["lastdate"])

In [19]:
# Inspect the shape (number of rows and columns) of the dataframe.
# DataFrame.shape is an attribute of the dataframe (not a function on the pd module,
# so it takes no call parentheses); it holds a tuple (n_rows, n_columns) representing
# the dimensionality of the DataFrame.
# Here it shows 231 rows and 17 columns.
covid_data.shape

(231, 17)

In [20]:
# List the column labels of the dataframe.
column_labels = covid_data.columns
print(f"This file include the following columns: {column_labels}")

This file include the following columns: Index(['iso_code', 'lastdate', 'location', 'total_cases', 'total_deaths',
       'total_cases_pm', 'total_deaths_pm', 'population', 'pop_density',
       'median_age', 'gdp_per_capita', 'hosp_beds', 'vac_per_hund',
       'aged_65_older', 'life_expectancy', 'hum_dev_ind', 'region'],
      dtype='object')


In [21]:
# Look at a few rows of the data; head(), tail() and sample() each pick a different slice.
# The caption has no placeholders, so a plain string literal is used.
print("The head of the dataset:\n")
first_rows = covid_data.head()  # with no argument, head() returns the first 5 rows
print(first_rows)

The head of the dataset:

  iso_code   lastdate        location  total_cases  total_deaths  \
0      AFG 2024-02-04     Afghanistan     231539.0        7982.0   
1      ALB 2024-01-28         Albania     334863.0        3605.0   
2      DZA 2023-12-03         Algeria     272010.0        6881.0   
3      ASM 2023-09-17  American Samoa       8359.0          34.0   
4      AND 2023-05-07         Andorra      48015.0         159.0   

   total_cases_pm  total_deaths_pm  population  pop_density  median_age  \
0        5629.611          194.073    41128772       54.422        18.6   
1      117813.348         1268.331     2842318      104.871        38.0   
2        6057.694          153.241    44903228       17.348        29.1   
3      188712.044          767.581       44295      278.205         NaN   
4      601367.684         1991.408       79843      163.755         NaN   

   gdp_per_capita  hosp_beds  vac_per_hund  aged_65_older  life_expectancy  \
0        1803.987       0.50        

In [15]:
print("The tail of the dataset:\n")
# tail() accepts the number of trailing rows to inspect; here the last 3.
print(covid_data.tail(3))

The tail of the dataset:

    iso_code    lastdate  location  total_cases  total_deaths  total_cases_pm  \
228      YEM  2022-11-06     Yemen      11945.0        2159.0         354.487   
229      ZMB  2023-12-03    Zambia     349304.0        4069.0       17449.783   
230      ZWE  2024-01-28  Zimbabwe     266265.0        5737.0       16314.719   

     total_deaths_pm  population  pop_density  median_age  gdp_per_capita  \
228           64.072    33696612       53.508        20.3        1479.147   
229          203.270    20017670       22.995        17.7        3689.251   
230          351.520    16320539       42.729        19.6        1899.775   

     hosp_beds  vac_per_hund  aged_65_older  life_expectancy  hum_dev_ind  \
228        0.7           NaN          2.922            66.12        0.470   
229        2.0           NaN          2.480            63.89        0.584   
230        1.7           NaN          2.822            61.49        0.571   

              region  
228     

In [16]:
print("A random sample of the dataset:\n")
# Draw a single random row. No random_state is passed, so the row picked
# can differ from one run to the next.
print(covid_data.sample(n=1))

A random sample of the dataset:

   iso_code    lastdate location  total_cases  total_deaths  total_cases_pm  \
21      BEN  2023-09-24    Benin      28036.0         163.0        2099.624   

    total_deaths_pm  population  pop_density  median_age  gdp_per_capita  \
21           12.207    13352864        99.11        18.8        2064.236   

    hosp_beds  vac_per_hund  aged_65_older  life_expectancy  hum_dev_ind  \
21        0.5           NaN          3.244            61.77        0.545   

         region  
21  West Africa  


In [17]:
# Draw a reproducible random sample: passing random_state fixes the seed, so the
# same rows are returned on every run.
# The sample size and seed are named once and interpolated into the caption, so the
# printed message cannot drift out of sync with the arguments passed to sample().
SAMPLE_SIZE = 7
SAMPLE_SEED = 42
print(f"A random sample of the datasets with {SAMPLE_SIZE} rows, with random seed {SAMPLE_SEED}")
print(covid_data.sample(n=SAMPLE_SIZE, random_state=SAMPLE_SEED))

A random sample of the datasets with 7 rows, with random seed 42
    iso_code    lastdate                         location  total_cases  \
218      GBR  2024-01-28                   United Kingdom   24892903.0   
66       ETH  2023-12-03                         Ethiopia     501117.0   
9        ARM  2023-12-17                          Armenia     451426.0   
170      RUS  2024-01-28                           Russia   23774451.0   
15       BHR  2022-12-04                          Bahrain     696614.0   
200      SWE  2024-01-28                           Sweden    2749793.0   
25       BES  2023-07-09  Bonaire Sint Eustatius and Saba      11922.0   

     total_deaths  total_cases_pm  total_deaths_pm  population  pop_density  \
218      232112.0      368734.933         3438.241    67508936      272.898   
66         7574.0        4061.576           61.388   123379928      104.957   
9          8775.0      162355.888         3155.939     2780472      102.931   
170      401884.0      164

In [None]:
# NOTE(review): this cell contained only a bare `%%sql` cell magic with no query and
# had never been executed. No SQL extension is loaded in the visible part of this
# notebook, so running it would most likely fail under Restart & Run All.
# Add `%load_ext sql` and a real query here if SQL access is actually needed.


In [24]:
# Print a concise summary of the dataframe: the index range plus, for every column,
# its non-null count and dtype. The non-null counts make missing data easy to spot,
# e.g. vac_per_hund has only 13 non-null values across the 231 rows.
covid_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 231 entries, 0 to 230
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   iso_code         231 non-null    object        
 1   lastdate         231 non-null    datetime64[ns]
 2   location         231 non-null    object        
 3   total_cases      231 non-null    float64       
 4   total_deaths     231 non-null    float64       
 5   total_cases_pm   231 non-null    float64       
 6   total_deaths_pm  231 non-null    float64       
 7   population       231 non-null    int64         
 8   pop_density      209 non-null    float64       
 9   median_age       194 non-null    float64       
 10  gdp_per_capita   191 non-null    float64       
 11  hosp_beds        170 non-null    float64       
 12  vac_per_hund     13 non-null     float64       
 13  aged_65_older    188 non-null    float64       
 14  life_expectancy  227 non-null    float64  

In [26]:
# Show the dtype of each column. The text columns (iso_code, location, region) are
# stored as object, lastdate as datetime64[ns], population as int64, and all the
# remaining columns as float64.
covid_data.dtypes

iso_code                   object
lastdate           datetime64[ns]
location                   object
total_cases               float64
total_deaths              float64
total_cases_pm            float64
total_deaths_pm           float64
population                  int64
pop_density               float64
median_age                float64
gdp_per_capita            float64
hosp_beds                 float64
vac_per_hund              float64
aged_65_older             float64
life_expectancy           float64
hum_dev_ind               float64
region                     object
dtype: object

## Summary statistics (numerical)


In [33]:
# Generate descriptive statistics (count, mean, min, quartiles, max, std) for the
# numerical columns. As the output shows, the datetime column lastdate is summarised
# too, with its std entry left empty.
covid_data.describe()

Unnamed: 0,lastdate,total_cases,total_deaths,total_cases_pm,total_deaths_pm,population,pop_density,median_age,gdp_per_capita,hosp_beds,vac_per_hund,aged_65_older,life_expectancy,hum_dev_ind
count,231,231.0,231.0,231.0,231.0,231.0,209.0,194.0,191.0,170.0,13.0,188.0,227.0,187.0
mean,2023-09-11 22:32:43.636363520,3351599.0,30214.2,206177.794623,1261.77839,34246090.0,323.603359,30.341753,18614.95899,3.014576,196.476923,8.675835,73.64652,0.721251
min,2020-10-18 00:00:00,4.0,0.0,354.487,0.0,47.0,0.137,15.1,661.24,0.1,38.47,1.144,53.28,0.394
25%,2023-06-07 12:00:00,25671.5,177.5,21821.863,141.177,370207.5,37.728,22.05,3821.198,1.3,155.72,3.49675,69.545,0.6015
50%,2023-12-03 00:00:00,191496.0,1937.0,133946.251,827.046,5434324.0,88.125,29.6,12236.706,2.3985,214.07,6.2585,75.05,0.74
75%,2024-01-28 00:00:00,1294286.0,14150.0,345689.831,1997.513,21978700.0,222.873,38.7,27012.3045,3.96525,227.81,13.9905,79.285,0.8285
max,2024-02-04 00:00:00,103436800.0,1127152.0,763475.441,6507.656,1425887000.0,19347.5,48.2,116935.6,13.8,406.39,27.049,86.75,0.957
std,,11483210.0,104778.9,203858.096252,1314.981529,137653000.0,1468.42734,9.117782,19470.167828,2.434476,96.305392,6.150235,7.42389,0.149669


In [34]:
# Request custom percentiles (the 5th and 95th) instead of the default quartiles to
# look at the tails of each distribution. In the recorded output the 50% (median)
# row is reported as well.
covid_data.describe(percentiles=[0.05, 0.95])

Unnamed: 0,lastdate,total_cases,total_deaths,total_cases_pm,total_deaths_pm,population,pop_density,median_age,gdp_per_capita,hosp_beds,vac_per_hund,aged_65_older,life_expectancy,hum_dev_ind
count,231,231.0,231.0,231.0,231.0,231.0,209.0,194.0,191.0,170.0,13.0,188.0,227.0,187.0
mean,2023-09-11 22:32:43.636363520,3351599.0,30214.2,206177.794623,1261.77839,34246090.0,323.603359,30.341753,18614.95899,3.014576,196.476923,8.675835,73.64652,0.721251
min,2020-10-18 00:00:00,4.0,0.0,354.487,0.0,47.0,0.137,15.1,661.24,0.1,38.47,1.144,53.28,0.394
5%,2022-10-23 00:00:00,5239.0,8.0,1638.859,13.193,14284.0,5.2458,17.665,1423.1265,0.5,57.196,2.4821,59.505,0.4623
50%,2023-12-03 00:00:00,191496.0,1937.0,133946.251,827.046,5434324.0,88.125,29.6,12236.706,2.3985,214.07,6.2585,75.05,0.74
95%,2024-02-04 00:00:00,15492510.0,144763.0,600293.0335,3749.868,119469500.0,675.2552,44.14,55543.458,7.2125,324.538,19.7219,82.984,0.938
max,2024-02-04 00:00:00,103436800.0,1127152.0,763475.441,6507.656,1425887000.0,19347.5,48.2,116935.6,13.8,406.39,27.049,86.75,0.957
std,,11483210.0,104778.9,203858.096252,1314.981529,137653000.0,1468.42734,9.117782,19470.167828,2.434476,96.305392,6.150235,7.42389,0.149669


## Summary statistics (categorical)