# AGGREGATES_IN_SQL - lecture 6b

Coronavirus disease 2019 (COVID-19) time series listing confirmed cases, reported deaths and reported recoveries.
https://github.com/datasets/covid-19

In [None]:
!wget "https://raw.githubusercontent.com/datasets/covid-19/master/data/countries-aggregated.csv"
!mv "countries-aggregated.csv" "covid.csv"

In [None]:
# Import required modules
import csv
import sqlite3

# Connecting to the covid database
connection = sqlite3.connect('covid.db')

# Creating a cursor object to execute
# SQL queries on a database table
cursor = connection.cursor()
# It is an object that is used to make the connection for executing SQL queries.
# It acts as middleware between SQLite database connection and SQL query.
# It is created after giving connection to SQLite database.

# IF NOT EXISTS option to create a new table if it does not exist.
# Attempting to create a table  that already exists without using
#the IF NOT EXISTS option will result in an error.
create_table = '''CREATE TABLE IF NOT EXISTS covid (
    date date,
    country text,
    confirmed int,
    recovered int,
    deaths int,
    PRIMARY KEY (date, country)
);'''



# Creating the table into our
# database
cursor.execute(create_table)
# Opening the covid.csv file
file = open('covid.csv')

# Reading the contents of the
# covid.csv file
contents = csv.reader(file)

# SQL query to insert data into the
# person table
insert_records = "INSERT INTO covid (date, country, confirmed, recovered, deaths) VALUES(?, ?, ?, ?, ?)"

# Importing the contents of the file
# into our covid table
cursor.executemany(insert_records, contents)

# cursor.executemany(insert_records, contents) is a
# method provided by the sqlite3 library in Python that allows
# for executing multiple SQL statements with a single call.
# The executemany() method takes the SQL INSERT statement and the
# list of data as its arguments, and iterates over the list,
# inserting each row of data into the table using the provided SQL statement.
# This allows for efficient insertion of multiple rows of data into the table,
# as opposed to executing an individual INSERT statement for each row.
# which is used to insert data into the covid table.
# The contents variable is a list of lists, where each inner list represents a single row of data from the covid.csv file.


# Committing the changes
connection.commit()

# closing the database connection
connection.close()

In [None]:
%load_ext sql


The sql extension is already loaded. To reload it, use:
  %reload_ext sql


In [None]:
%%sql
sqlite:///covid.db
SELECT * FROM covid LIMIT 5;

Done.


date,country,confirmed,recovered,deaths
Date,Country,Confirmed,Recovered,Deaths
2020-01-22,Afghanistan,0,0,0
2020-01-23,Afghanistan,0,0,0
2020-01-24,Afghanistan,0,0,0
2020-01-25,Afghanistan,0,0,0


In [None]:
#How many confirmed cases are there in each country?

%%sql

SELECT country, sum(confirmed) as total_confirmed
FROM covid
GROUP BY country
ORDER BY total_confirmed DESC
LIMIT 10;

 * sqlite:///covid.db
Done.


country,total_confirmed
US,22963151317
India,14681429726
Brazil,9991736059
France,4588778075
United Kingdom,4376490715
Russia,4055403879
Turkey,3505368946
Germany,3007332317
Italy,2914673720
Spain,2754459962


In [None]:
%%sql

SELECT country, date, confirmed
FROM covid
WHERE country='US' AND confirmed>0
ORDER BY date
LIMIT 50

 * sqlite:///covid.db
Done.


country,date,confirmed
US,2020-01-22,1
US,2020-01-23,1
US,2020-01-24,2
US,2020-01-25,2
US,2020-01-26,5
US,2020-01-27,5
US,2020-01-28,5
US,2020-01-29,6
US,2020-01-30,6
US,2020-01-31,8


In [None]:
%%sql

SELECT country, max(confirmed) as total_confirmed
FROM covid
GROUP BY country
ORDER BY total_confirmed DESC
LIMIT 10;

 * sqlite:///covid.db
Done.


country,total_confirmed
Country,Confirmed
US,80625120
India,43042097
Brazil,30250077
France,27874269
Germany,23416663
United Kingdom,21916961
Russia,17801103
"Korea, South",16305752
Italy,15659835


In [None]:
#Retrive the top 3 countries with the highest number of death
%%sql

SELECT country, max(deaths) as total_deaths
FROM covid
GROUP BY country
ORDER BY total_deaths DESC
LIMIT 10;

 * sqlite:///covid.db
Done.


country,total_deaths
Country,Deaths
US,988609
Brazil,662185
India,521751
Russia,365774
Mexico,323938
Peru,212619
United Kingdom,172014
Italy,161602
Indonesia,155844


In [None]:
#How many confirmed cases are there in each country for the month of March of 2020?
%%sql

SELECT country, max(confirmed) as total_confirmed
FROM covid
WHERE date >= '2020-03-01' and date < '2020-04-01'
GROUP BY country
ORDER BY total_confirmed DESC
LIMIT 10

 * sqlite:///covid.db
Done.


country,total_confirmed
US,192079
Italy,105792
Spain,95923
China,83786
Germany,61913
France,52281
Iran,44605
United Kingdom,38788
Switzerland,16605
Netherlands,13686


One way to analyze the impact of COVID-19 on different countries is by combining data from the COVID-19 dataset with data on countries, such as population density and GDP. By joining these two datasets, we can gain a better understanding of how different factors might be associated with the spread and severity of COVID-19 in different countries.

For example, we might investigate whether countries with higher population densities have had higher rates of COVID-19 transmission, or whether countries with lower GDPs have had higher death rates from COVID-19. By combining these two datasets, we can gain a more comprehensive understanding of the ongoing COVID-19 pandemic and inform public health policy decisions.

In [None]:
# follow this instruction: https://www.kaggle.com/general/74235
#
!pip install kaggle
from google.colab import files
files.upload()
!mkdir ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download -d fernandol/countries-of-the-world
!unzip countries-of-the-world.zip
!mv "countries of the world.csv" "countries.csv"

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


Saving kaggle.json to kaggle.json
Downloading countries-of-the-world.zip to /content
  0% 0.00/13.4k [00:00<?, ?B/s]
100% 13.4k/13.4k [00:00<00:00, 10.3MB/s]
Archive:  countries-of-the-world.zip
  inflating: countries of the world.csv  


In [None]:
# Import required modules
import csv
import sqlite3

# Connecting to the geeks database
connection = sqlite3.connect('covid.db')

# Creating a cursor object to execute
# SQL queries on a database table
cursor = connection.cursor()


# Table Definition
create_table = '''CREATE  TABLE IF NOT EXISTS countries(
    Country text PRIMARY KEY,
    Region text,
    Population integer,
    Area integer,
    Pop_density integer,
    Coastline_area_ratio integer,
    Net_migration integer,
    Infant_mortality integer,
    GDP integer,
    Literacy integer,
    Phones integer,
    Arable integer,
    Crops integer,
    Other integer,
    Climate integer,
    Birthrate integer,
    Deathrate integer,
    Agriculture integer,
    Industry integer,
    Service integer
)'''

# Creating the table into our
# database
cursor.execute(create_table)
# Opening the covid.csv file
file = open('countries.csv')

# Reading the contents of the
# covid.csv file
contents = csv.reader(file)

# SQL query to insert data into the
# person table
insert_records = "INSERT INTO countries (Country, Region, Population, Area, Pop_density, Coastline_area_ratio, Net_migration, Infant_mortality, GDP, Literacy, Phones, Arable, Crops, Other, Climate, Birthrate, Deathrate, Agriculture, Industry, Service) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)"

# Importing the contents of the file
# into our covid table
cursor.executemany(insert_records, contents)

# Committing the changes
connection.commit()

# closing the database connection
connection.close()

In [None]:
%%sql

SELECT * FROM countries LIMIT 10;

 * sqlite:///covid.db
Done.


Country,Region,Population,Area,Pop_density,Coastline_area_ratio,Net_migration,Infant_mortality,GDP,Literacy,Phones,Arable,Crops,Other,Climate,Birthrate,Deathrate,Agriculture,Industry,Service
Country,Region,Population,Area (sq. mi.),Pop. Density (per sq. mi.),Coastline (coast/area ratio),Net migration,Infant mortality (per 1000 births),GDP ($ per capita),Literacy (%),Phones (per 1000),Arable (%),Crops (%),Other (%),Climate,Birthrate,Deathrate,Agriculture,Industry,Service
Afghanistan,ASIA (EX. NEAR EAST),31056997,647500,480,000,2306,16307,700,360,32,1213,022,8765,1,466,2034,038,024,038
Albania,EASTERN EUROPE,3581655,28748,1246,126,-493,2152,4500,865,712,2109,442,7449,3,1511,522,0232,0188,0579
Algeria,NORTHERN AFRICA,32930091,2381740,138,004,-039,31,6000,700,781,322,025,9653,1,1714,461,0101,06,0298
American Samoa,OCEANIA,57794,199,2904,5829,-2071,927,8000,970,2595,10,15,75,2,2246,327,,,
Andorra,WESTERN EUROPE,71201,468,1521,000,66,405,19000,1000,4972,222,0,9778,3,871,625,,,
Angola,SUB-SAHARAN AFRICA,12127071,1246700,97,013,0,19119,1900,420,78,241,024,9735,,4511,242,0096,0658,0246
Anguilla,LATIN AMER. & CARIB,13477,102,1321,5980,1076,2103,8600,950,4600,0,0,100,2,1417,534,004,018,078
Antigua & Barbuda,LATIN AMER. & CARIB,69108,443,1560,3454,-615,1946,11000,890,5499,1818,455,7727,2,1693,537,0038,022,0743
Argentina,LATIN AMER. & CARIB,39921833,2766890,144,018,061,1518,11200,971,2204,1231,048,8721,3,1673,755,0095,0358,0547


In [None]:
%%sql
SELECT * FROM covid where Country='Afghanistan' LIMIT 2;

 * sqlite:///covid.db
Done.


date,country,confirmed,recovered,deaths
2020-01-22,Afghanistan,0,0,0
2020-01-23,Afghanistan,0,0,0


In [None]:
%%sql

SELECT * FROM countries where Country='Afghanistan ' LIMIT 2;

 * sqlite:///covid.db
Done.


Country,Region,Population,Area,Pop_density,Coastline_area_ratio,Net_migration,Infant_mortality,GDP,Literacy,Phones,Arable,Crops,Other,Climate,Birthrate,Deathrate,Agriculture,Industry,Service


In [None]:
%%sql

SELECT c1.Country,  c2.Country
FROM countries c1
JOIN covid c2
ON c1.Country = c2.Country limit 4;


 * sqlite:///covid.db
Done.


Country,country
Afghanistan,Afghanistan
Albania,Albania
Algeria,Algeria
Andorra,Andorra


In [None]:

# The SQLite trim() function removes specified characters at
# the beginning and the end of a string. If you don’t specify the characters to remove,
# the trim() function will remove space characters by default.

%%sql

UPDATE countries
SET Country = TRIM(Country)


 * sqlite:///covid.db
(sqlite3.OperationalError) attempt to write a readonly database
[SQL: UPDATE countries
SET Country = TRIM(Country)]
(Background on this error at: https://sqlalche.me/e/14/e3q8)


In [None]:
%%sql
SELECT c1.Country,  c2.Country
FROM countries c1
JOIN covid c2
ON c1.Country = c2.Country LIMIT 10;


 * sqlite:///covid.db
Done.


Country,country
Afghanistan,Afghanistan
Albania,Albania
Algeria,Algeria
Andorra,Andorra
Angola,Angola
Argentina,Argentina
Armenia,Armenia
Australia,Australia
Austria,Austria
Azerbaijan,Azerbaijan


In [None]:
%%sql

SELECT covid.country, (sum(covid.deaths) *1.0 / countries.population) * 100000 AS death_rate_per_100
FROM covid
JOIN countries ON covid.country = countries.Country
GROUP BY covid.country
HAVING sum(covid.deaths)>0
ORDER BY (sum(covid.deaths) *1.0 / countries.population) DESC
LIMIT 10


 * sqlite:///covid.db
Done.


country,death_rate_per_100
Peru,353685.5744328534
San Marino,187976.4794366005
Hungary,146818.9823123843
Belgium,144898.92010524645
Bulgaria,143189.71555509698
Brazil,140790.00808530592
Colombia,124977.12077170127
United Kingdom,124309.9140487906
Mexico,124262.9346197668
Argentina,121828.61693750384


In [None]:
%%sql

SELECT covid.country, (max(covid.deaths) *1.0 / countries.population) * 100000 AS death_rate_per_100
FROM covid
JOIN countries ON covid.country = countries.Country
GROUP BY covid.country
HAVING MAX(covid.deaths)>0
ORDER BY (max(covid.deaths) *1.0 / countries.population) DESC
LIMIT 10


 * sqlite:///covid.db
Done.


country,death_rate_per_100
Peru,751.2347892524232
Bulgaria,498.0388923123252
Hungary,459.5077171047477
San Marino,389.7302656319442
Slovakia,362.5551710394143
Georgia,360.16512377096257
Chile,354.71813045304515
Brazil,352.0795631490082
Croatia,349.852683653748
Slovenia,326.1128551439129


In [None]:
#Investigate the relationship between population density and COVID-19
%%sql

SELECT covid.country, (max(covid.deaths) *1.0 / countries.population) * 100000 AS death_rate_per_100
FROM covid
JOIN countries ON covid.country = countries.Country
GROUP BY covid.country
ORDER BY (max(covid.deaths) *1.0 / countries.population) DESC
LIMIT 10


 * sqlite:///covid.db
Done.


country,death_rate_per_100
Peru,751.2347892524232
Bulgaria,498.0388923123252
Hungary,459.5077171047477
San Marino,389.7302656319442
Slovakia,362.5551710394143
Georgia,360.16512377096257
Chile,354.71813045304515
Brazil,352.0795631490082
Croatia,349.852683653748
Slovenia,326.1128551439129


In [None]:
#Investigate the relationship between population density and COVID-19 transmission
%%sql

SELECT covid.country,
(max(covid.confirmed) *1.0 / countries.population) * 100000 AS confirmed_rate_per_100
FROM covid
JOIN countries ON covid.country = countries.Country
GROUP BY covid.country
ORDER BY (max(covid.confirmed) *1.0 / countries.population) DESC
LIMIT 10


 * sqlite:///covid.db
Done.


country,confirmed_rate_per_100
Bahrain,80556.98304429668
Israel,63428.71203411397
Iceland,61450.02471708952
Cyprus,59207.62564372607
Denmark,57674.54626145343
Andorra,57174.75878147779
San Marino,54268.23014597791
Seychelles,51090.86226560872
Liechtenstein,49945.56742283815
Netherlands,49692.05578571844


In [None]:
%%sql

DROP TABLE IF EXISTS covid_agg;
CREATE Table covid_agg AS
SELECT country,  MAX(confirmed) as confirmed, MAX(recovered) as recovered, MAX(deaths) as deaths
FROM covid
GROUP BY country
HAVING MAX(confirmed)>0


 * sqlite:///covid.db
(sqlite3.OperationalError) attempt to write a readonly database
[SQL: DROP TABLE IF EXISTS covid_agg;]
(Background on this error at: https://sqlalche.me/e/14/e3q8)


In [None]:
%%sql
SELECT * FROM covid_agg limit 10;

 * sqlite:///covid.db
Done.


country,confirmed,recovered,deaths
Afghanistan,178387,82586,7676
Albania,274462,130314,3496
Algeria,265739,118409,6874
Andorra,40709,14380,155
Angola,99194,39582,1900
Antarctica,11,0,0
Antigua and Barbuda,7535,1239,135
Argentina,9060495,4615834,128344
Armenia,422747,220438,8621
Australia,5384615,24203,6779


In [None]:
# Investigate the relationship between GDP and death rates from COVID-19

#The SQLite CASE expression evaluates a list of conditions
#and returns an  expression based on the result of the evaluation.

#The CASE expression is similar to the
#IF-THEN-ELSE statement in other programming languages.
%%sql
SELECT GDP,
    CASE
        WHEN GDP BETWEEN 0 AND 10000 THEN '0-10k'
        WHEN GDP BETWEEN 10001 AND 20000 THEN '10k-20k'
        WHEN GDP BETWEEN 20001 AND 30000 THEN '20k-30k'
        ELSE '30k+'
    END AS GDP_range
FROM countries LIMIT 10;

 * sqlite:///covid.db
Done.


GDP,GDP_range
GDP ($ per capita),30k+
700,0-10k
4500,0-10k
6000,0-10k
8000,0-10k
19000,10k-20k
1900,0-10k
8600,0-10k
11000,10k-20k
11200,10k-20k


In [None]:
%%sql

SELECT
    CASE
        WHEN GDP BETWEEN 0 AND 10000 THEN '0-10k'
        WHEN GDP BETWEEN 10001 AND 20000 THEN '10k-20k'
        WHEN GDP BETWEEN 20001 AND 30000 THEN '20k-30k'
        ELSE '30k+'
    END AS GDP_range,
    confirmed as total_confirmed
FROM covid_agg
JOIN countries ON covid_agg.country = countries.Country
WHERE GDP>0
GROUP BY GDP_range
ORDER BY total_confirmed DESC


 * sqlite:///covid.db
Done.


GDP_range,total_confirmed
30k+,Confirmed
20k-30k,5384615
0-10k,178387
10k-20k,40709


In [None]:
%%sql

SELECT
    CASE
        WHEN GDP BETWEEN 0 AND 10000 THEN '0-10k'
        WHEN GDP BETWEEN 10001 AND 20000 THEN '10k-20k'
        WHEN GDP BETWEEN 20001 AND 30000 THEN '20k-30k'
        ELSE '30k+'
    END AS GDP_range,
    deaths as total_deaths
FROM covid_agg
JOIN countries ON covid_agg.country = countries.Country
WHERE GDP>0
GROUP BY GDP_range
ORDER BY total_deaths DESC


 * sqlite:///covid.db
Done.


GDP_range,total_deaths
30k+,Deaths
0-10k,7676
20k-30k,6779
10k-20k,155


In [None]:
%%sql

SELECT countries.Region,
    CASE
        WHEN GDP BETWEEN 0 AND 10000 THEN '0-10k'
        WHEN GDP BETWEEN 10001 AND 20000 THEN '10k-20k'
        WHEN GDP BETWEEN 20001 AND 30000 THEN '20k-30k'
        ELSE '30k+'
    END AS GDP_range,
    deaths as total_deaths
FROM covid_agg
JOIN countries ON covid_agg.country = countries.Country
WHERE GDP>0
GROUP BY GDP_range, countries.Region
ORDER BY countries.Region,total_deaths DESC


 * sqlite:///covid.db
Done.


Region,GDP_range,total_deaths
ASIA (EX. NEAR EAST),20k-30k,28998
ASIA (EX. NEAR EAST),0-10k,7676
ASIA (EX. NEAR EAST),10k-20k,217
BALTICS,10k-20k,2511
C.W. OF IND. STATES,0-10k,8621
EASTERN EUROPE,10k-20k,15725
EASTERN EUROPE,0-10k,3496
LATIN AMER. & CARIB,10k-20k,128344
LATIN AMER. & CARIB,0-10k,672
NEAR EAST,0-10k,25198
