There are two tables. Facts and cities. Cities has a common column with Facts, facts.id, which represents the country id in facts.

In [4]:
import sqlite3
import pandas as pd

**INNER JOIN**

In [5]:
#Joining both tables: FACT and City based on Country ID
#City considers country id as column facts.id, and facts considers it as ID
conn = sqlite3.connect("factbook.db")
all_join = "SELECT * FROM facts INNER JOIN cities on cities.facts_id = facts.id"
pd.read_sql_query(all_join, conn).head(10)

Unnamed: 0,id,code,name,area,area_land,area_water,population,population_growth,birth_rate,death_rate,migration_rate,id.1,name.1,population.1,capital,facts_id
0,216,aa,Aruba,180.0,180.0,0.0,112162,1.33,12.56,8.18,8.92,1,Oranjestad,37000,1,216
1,6,ac,Antigua and Barbuda,442.0,442.0,0.0,92436,1.24,15.85,5.69,2.21,2,Saint John'S,27000,1,6
2,184,ae,United Arab Emirates,83600.0,83600.0,0.0,5779760,2.58,15.43,1.97,12.36,3,Abu Dhabi,942000,1,184
3,184,ae,United Arab Emirates,83600.0,83600.0,0.0,5779760,2.58,15.43,1.97,12.36,4,Dubai,1978000,0,184
4,184,ae,United Arab Emirates,83600.0,83600.0,0.0,5779760,2.58,15.43,1.97,12.36,5,Sharjah,983000,0,184
5,1,af,Afghanistan,652230.0,652230.0,0.0,32564342,2.32,38.57,13.89,1.51,6,Kabul,3097000,1,1
6,3,ag,Algeria,2381741.0,2381741.0,0.0,39542166,1.84,23.67,4.31,0.92,7,Algiers,2916000,1,3
7,3,ag,Algeria,2381741.0,2381741.0,0.0,39542166,1.84,23.67,4.31,0.92,8,Oran,783000,0,3
8,11,aj,Azerbaijan,86600.0,82629.0,3971.0,9780780,0.96,16.64,7.07,0.0,9,Baku,2123000,1,11
9,2,al,Albania,28748.0,27398.0,1350.0,3029278,0.3,12.92,6.58,3.3,10,Tirana,419000,1,2


**INNER JOIN with Aliases and city column**

In [7]:
#Query returns all city columns and facts name column as "country_name"
#Introduces aliases c for city and f for facts
inner_alias_name = "SELECT c.*, f.name country_name FROM facts f INNER JOIN cities c on c.facts_id = f.id LIMIT 5"
pd.read_sql_query(inner_alias_name, conn)

Unnamed: 0,id,name,population,capital,facts_id,country_name
0,1,Oranjestad,37000,1,216,Aruba
1,2,Saint John'S,27000,1,6,Antigua and Barbuda
2,3,Abu Dhabi,942000,1,184,United Arab Emirates
3,4,Dubai,1978000,0,184,United Arab Emirates
4,5,Sharjah,983000,0,184,United Arab Emirates


**INNER JOIN with country and capital only if capital is TRUE**

In [10]:
#Query selects country and capital if capital (capital =1 on cities table)
inner_country_capital = "SELECT f.name country, c.name capital_city FROM cities c INNER JOIN facts f ON f.id = c.facts_id WHERE c.capital = 1"
pd.read_sql_query(inner_country_capital, conn).head()

Unnamed: 0,country,capital_city
0,Aruba,Oranjestad
1,Antigua and Barbuda,Saint John'S
2,United Arab Emirates,Abu Dhabi
3,Afghanistan,Kabul
4,Algeria,Algiers


## LEFT JOIN

In [19]:
facts_values = "SELECT COUNT(DISTINCT(name)) fact_count FROM facts;"
cities_values = "SELECT COUNT(DISTINCT(facts_id)) cities_count FROM cities"

In [20]:
pd.read_sql_query(facts_values, conn)

Unnamed: 0,fact_count
0,261


In [21]:
pd.read_sql_query(cities_values, conn)

Unnamed: 0,cities_count
0,210


Since there is a discrepancy in number of values for both tables: fact and cities; cities is missing some values, and therefore we want to keep all values, and we will do a left join on fact to keep all fact values

In [28]:
#Selecting fact country, country population
#LEFT join based on fact
left_fact ="SELECT f.name country, f.population FROM facts f LEFT JOIN cities c ON c.facts_id = f.id"  
pd.read_sql_query(left_fact, conn).head()

Unnamed: 0,country,population
0,Afghanistan,32564342.0
1,Albania,3029278.0
2,Algeria,39542166.0
3,Algeria,39542166.0
4,Andorra,85580.0


In [32]:
#Selecting fact country, country population
#LEFT join based on fact
#Only shows unique fact country values
left_only_fact ="SELECT f.name country, f.population FROM facts f LEFT JOIN cities c ON c.facts_id = f.id WHERE c.name IS NULL"  
pd.read_sql_query(left_only_fact, conn)

Unnamed: 0,country,population
0,Kosovo,1870981.0
1,Monaco,30535.0
2,Nauru,9540.0
3,San Marino,33020.0
4,Singapore,5674472.0
5,Holy See (Vatican City),842.0
6,Taiwan,23415130.0
7,European Union,513949400.0
8,Ashmore and Cartier Islands,
9,Christmas Island,1530.0


These are the countries in fact that do not have a corresponding ID that matches in cities table.
The table matches the previous shown missing data = 51 rows missing on cities table.
* These countries/states/small territories do not show a city merely because these are small territories.

## Rank population by capital city

In [38]:
rank_pop_city = "SELECT c.name capital_city, f.name country, c.population population FROM facts f INNER JOIN cities c ON c.facts_id = f.id WHERE c.capital = 1 ORDER BY 3 DESC LIMIT 10"
pd.read_sql_query(rank_pop_city, conn)

Unnamed: 0,capital_city,country,population
0,Tokyo,Japan,37217000
1,New Delhi,India,22654000
2,Mexico City,Mexico,20446000
3,Beijing,China,15594000
4,Dhaka,Bangladesh,15391000
5,Buenos Aires,Argentina,13528000
6,Manila,Philippines,11862000
7,Moscow,Russia,11621000
8,Cairo,Egypt,11169000
9,Jakarta,Indonesia,9769000


Above table shows the ranking of population based on capital cities

In [42]:
#Query does same as previous query, but this time it uses subquery
#Subquery is used to do the inner join on the selected subquery: capital in cities table
join_sub_cap = "SELECT c.name capital_city, f.name country, c.population FROM facts f INNER JOIN (SELECT * FROM cities WHERE capital =1) c ON c.facts_id = f.id LIMIT 10"
pd.read_sql_query(join_sub_cap, conn)

Unnamed: 0,capital_city,country,population
0,Oranjestad,Aruba,37000
1,Saint John'S,Antigua and Barbuda,27000
2,Abu Dhabi,United Arab Emirates,942000
3,Kabul,Afghanistan,3097000
4,Algiers,Algeria,2916000
5,Baku,Azerbaijan,2123000
6,Tirana,Albania,419000
7,Yerevan,Armenia,1116000
8,Andorra La Vella,Andorra,23000
9,Luanda,Angola,5068000


In [45]:
#Query has a subquery with two requirements for the inner join
#city id is a capital and population is over 10million
#Ordered by population
cap_pop_desc = "SELECT c.name capital_city, f.name country, c.population FROM facts f INNER JOIN (SELECT * FROM cities WHERE capital = 1 AND population > 10000000) c ON c.facts_id = f.id ORDER BY 3 DESC"
pd.read_sql_query(cap_pop_desc, conn)

Unnamed: 0,capital_city,country,population
0,Tokyo,Japan,37217000
1,New Delhi,India,22654000
2,Mexico City,Mexico,20446000
3,Beijing,China,15594000
4,Dhaka,Bangladesh,15391000
5,Buenos Aires,Argentina,13528000
6,Manila,Philippines,11862000
7,Moscow,Russia,11621000
8,Cairo,Egypt,11169000


In [54]:
urban_pct = "SELECT f.name country, urban_pop, f.population total_pop, (urban_pop / CAST(population AS FLOAT)) urban_pct FROM facts f INNER JOIN (SELECT facts_id, SUM(population) urban_pop FROM cities GROUP BY 1) c ON c.facts_id = f.id WHERE urban_pct > .5 ORDER BY 4 ASC"
pd.read_sql_query(urban_pct, conn)

Unnamed: 0,country,urban_pop,total_pop,urban_pct
0,Uruguay,1672000,3341893,0.500315
1,"Congo, Republic of the",2445000,4755097,0.514185
2,Brunei,241000,429646,0.560927
3,New Caledonia,157000,271615,0.578024
4,Virgin Islands,60000,103574,0.579296
5,Falkland Islands (Islas Malvinas),2000,3361,0.595061
6,Djibouti,496000,828324,0.5988
7,Australia,13789000,22751014,0.606083
8,Iceland,206000,331918,0.620635
9,Israel,5226000,8049314,0.649248
