In [1]:
import numpy as np
import pandas as pd
import requests
import io

# Fetching

In [2]:
# SPARQL endpoint that the query in this notebook is sent to.
uri = "https://query.wikidata.org/sparql"

In [3]:
# SPARQL text sent to the endpoint in `uri`. It returns the columns
# ?id, ?name, ?continent_id, ?continent_name, one distinct row per
# (item, continent) pair, so an item with several P30 values yields several rows.
#   - an item qualifies when it has a P31 statement whose value is wd:Q3624078
#     and that statement carries no P582 qualifier
#   - items having any P31 statement valued wd:Q3024240 are excluded
#   - ?id / ?continent_id are the entity URIs with the
#     "http://www.wikidata.org/entity/" prefix stripped (e.g. Q1000, Q15)
#   - labels are requested in English ("en") through the label service
# NOTE(review): presumably P31 = "instance of", Q3624078 = "sovereign state",
# Q3024240 = "historical country", P582 = "end time" — confirm on Wikidata.
query = """
SELECT DISTINCT ?id ?name ?continent_id ?continent_name WHERE {
  ?item p:P31 ?stat.
  ?stat ps:P31 wd:Q3624078.
  FILTER(NOT EXISTS { ?stat pq:P582 ?end_time. })
  FILTER(NOT EXISTS { 
    ?item p:P31 [ ps:P31 wd:Q3024240 ]
  })
  ?item wdt:P30 ?continent.
  BIND(STRAFTER(STR(?item), "http://www.wikidata.org/entity/") AS ?id)
  BIND(STRAFTER(STR(?continent), "http://www.wikidata.org/entity/") AS ?continent_id)
  SERVICE wikibase:label {
    bd:serviceParam wikibase:language "en".
    ?item rdfs:label ?name.
    ?continent rdfs:label ?continent_name.
  }
}
"""

In [4]:
# URL query-string parameters: the SPARQL text travels in `query`.
params = dict(query=query)
# Ask the endpoint to return the result set as CSV.
# NOTE(review): no User-Agent is set here; Wikimedia's User-Agent policy asks
# clients to send a descriptive one — consider adding it.
headers = {'Accept': 'text/csv'}

In [5]:
# Run the query. Without a timeout, requests waits indefinitely on a stalled
# connection, which would hang a "Restart & Run All"; 60 s bounds both the
# connect and the read phase.
r = requests.get(uri, params=params, headers=headers, timeout=60)
# Fail here on a 4xx/5xx response rather than parsing an error body as CSV.
r.raise_for_status()

In [6]:
# Decode the CSV response body, parse it, and key the rows by (id, name),
# sorted by that index.
raw = (
    pd.read_csv(io.StringIO(r.content.decode('utf-8')))
    .set_index(['id', 'name'])
    .sort_index()
)
raw.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,continent_id,continent_name
id,name,Unnamed: 2_level_1,Unnamed: 3_level_1
Q1000,Gabon,Q15,Africa
Q1005,Gambia,Q15,Africa
Q1006,Guinea,Q15,Africa
Q1007,Guinea-Bissau,Q15,Africa
Q1008,Ivory Coast,Q15,Africa


# Cleaning

## Add `continent` code

In [7]:
# Lookup table: Wikidata continent item id -> two-letter continent code.
# Q538 and Q3960 both map to 'OC'.
CONTINENT_CODES = {
    'Q15': 'AF',
    'Q18': 'SA',
    'Q46': 'EU',
    'Q48': 'AS',
    'Q49': 'NA',
    'Q51': 'AN',
    'Q538': 'OC',
    'Q3960': 'OC',
}

# Work on a copy so `raw` keeps the untouched query result and this cell can
# be re-run on its own.
clean = raw.copy()
# One vectorized lookup instead of one boolean-mask assignment per continent.
# Ids missing from the table become NaN in the new `continent` column.
clean['continent'] = clean['continent_id'].map(CONTINENT_CODES)
clean.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,continent_id,continent_name,continent
id,name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Q1000,Gabon,Q15,Africa,AF
Q1005,Gambia,Q15,Africa,AF
Q1006,Guinea,Q15,Africa,AF
Q1007,Guinea-Bissau,Q15,Africa,AF
Q1008,Ivory Coast,Q15,Africa,AF


## Drop `continent_id` and `continent_name` columns

In [8]:
# Keep only the derived `continent` code. errors='ignore' makes the cell
# re-runnable: on a second run the columns are already gone and the drop is a
# no-op instead of raising KeyError.
clean = clean.drop(columns=['continent_id', 'continent_name'], errors='ignore')
clean.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,continent
id,name,Unnamed: 2_level_1
Q1000,Gabon,AF
Q1005,Gambia,AF
Q1006,Guinea,AF
Q1007,Guinea-Bissau,AF
Q1008,Ivory Coast,AF


# Testing

In [9]:
# Expect exactly this many distinct (id, name) index entries in `clean`.
EXPECTED_ENTRIES = 195
assert len(clean.index.unique()) == EXPECTED_ENTRIES

In [10]:
# Every row must have a non-null continent code.
assert clean['continent'].notnull().all()

# Writing

In [11]:
# Independent copy used for export, so `clean` itself is left untouched.
out = clean.copy(deep=True)
out.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,continent
id,name,Unnamed: 2_level_1
Q1000,Gabon,AF
Q1005,Gambia,AF
Q1006,Guinea,AF
Q1007,Guinea-Bissau,AF
Q1008,Ivory Coast,AF


In [12]:
# Write the (id, name) index plus the continent column to a CSV file.
# NOTE: the continent code 'NA' is parsed as a missing value by the defaults
# of pandas.read_csv; read this file back with keep_default_na=False.
out.to_csv(path_or_buf='country_continents.csv')