In [1]:
import numpy as np
import pandas as pd
import requests
import io

# Fetching

In [2]:
# Endpoint that the SPARQL query below is sent to.
uri = "https://query.wikidata.org/sparql"

In [3]:
# SPARQL query text sent to the endpoint.
#
# It returns one row per (?id, ?name) pair with the largest ?areas value
# found for that item (MAX + GROUP BY collapses items that carry several
# area statements).
#
# Selection logic, as written in the query:
#   * the item has a P31 statement whose value is Q3624078 and that statement
#     carries no P582 qualifier (bound to ?end_time);
#   * the item has no P31 statement with value Q3024240;
#   * the item has at least one P2046 value (items without one are dropped).
# NOTE(review): P31 / P582 / P2046 are presumably "instance of" / "end time" /
# "area", and Q3624078 / Q3024240 presumably "sovereign state" /
# "historical country" -- confirm on Wikidata before relying on this.
#
# ?id is the entity URI with the "http://www.wikidata.org/entity/" prefix
# stripped; ?name is the item's English label from the label service.
query = """
SELECT ?id ?name (MAX(?areas) AS ?area) WHERE {
  ?item p:P31 ?stat.
  ?stat ps:P31 wd:Q3624078.
  FILTER(NOT EXISTS { ?stat pq:P582 ?end_time. })
  FILTER(NOT EXISTS { 
    ?item p:P31 [ ps:P31 wd:Q3024240 ]
  })
  ?item wdt:P2046 ?areas.
  BIND(STRAFTER(STR(?item), "http://www.wikidata.org/entity/") AS ?id)
  SERVICE wikibase:label {
    bd:serviceParam wikibase:language "en".
    ?item rdfs:label ?name.
  }
}
GROUP BY ?id ?name
"""

In [4]:
# Query-string parameters: the SPARQL text travels in the `query` parameter.
params = dict(query=query)
# Ask the endpoint for CSV so the response can be fed to pandas directly.
headers = {'Accept': 'text/csv'}

In [5]:
# Run the query against the endpoint.
# An explicit timeout (seconds) is passed because `requests` waits forever by
# default, which would leave the kernel hanging if the endpoint stalls.
r = requests.get(uri, params=params, headers=headers, timeout=60)
# Fail loudly on any 4xx/5xx answer instead of parsing an error page as CSV.
r.raise_for_status()

In [6]:
# Decode the raw response bytes as UTF-8, then parse the CSV text into a frame.
csv_text = r.content.decode('utf-8')
df = pd.read_csv(io.StringIO(csv_text))
df.head()

Unnamed: 0,id,name,area
0,Q16,Canada,9984670.0
1,Q17,Japan,377972.28
2,Q20,Norway,385207.0
3,Q27,Ireland,70273.0
4,Q28,Hungary,93011.4


# Cleaning

## Missing Values

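### Kingdom of Denmark

The query result does not contain the Kingdom of Denmark (Q756617), so its row is added manually below.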

In [7]:
# Guard: the fetched data must not already contain the Q756617 row.
assert not (df.id == 'Q756617').any()

In [8]:
# Hand-entered area value for the missing row.
# NOTE(review): unit and source are not stated here -- presumably km^2 taken
# from the Wikipedia page listed under References; confirm.
denmark_area = 42993.0

# Single record with the same three columns as the fetched frame.
denmark_record = dict(
    id='Q756617',
    name='Kingdom of Denmark',
    area=denmark_area,
)
denmark = pd.DataFrame([denmark_record])
denmark

Unnamed: 0,area,id,name
0,42993.0,Q756617,Kingdom of Denmark


In [9]:
# Append the manual Denmark row to the fetched data.
# Any existing Q756617 row is filtered out first so that re-running this cell
# does not append a duplicate (on the first run the filter removes nothing).
df = pd.concat(
    [df[df.id != 'Q756617'], denmark],
    ignore_index=True,
    sort=False,
)

# Testing

In [10]:
# Sanity check: fetched rows plus the manual addition must total 195.
assert df.shape[0] == 195

# Writing

In [11]:
# Key the table by (id, name) and order the rows by that index before export.
indexed = df.set_index(['id', 'name'])
df = indexed.sort_index()
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,area
id,name,Unnamed: 2_level_1
Q1000,Gabon,267667.0
Q1005,Gambia,11300.0
Q1006,Guinea,245857.0
Q1007,Guinea-Bissau,36125.0
Q1008,Ivory Coast,322463.0


In [12]:
# Write the indexed table (id, name, area) to a CSV file in the working directory.
df.to_csv(path_or_buf='country_areas.csv')

# References

- https://query.wikidata.org
- https://en.wikipedia.org/wiki/Denmark