# Querying Wikidata
In this notebook we:
- query Wikidata via its SPARQL endpoint using SPARQLWrapper,
- transform the result set into a pandas DataFrame,
- and visualize it using Plotly.



In [11]:
!pip install SPARQLWrapper

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [12]:
import pandas as pd
from SPARQLWrapper import SPARQLWrapper, JSON
import plotly.express as px

def sparql_select_to_dataframe(sparql_wrapper, query):
    """Run a SPARQL SELECT query and return its result set as a DataFrame.

    Parameters
    ----------
    sparql_wrapper
        Client object used to execute the query. It must provide
        ``setQuery``, ``setReturnFormat`` and ``query().convert()``.
    query : str
        The SPARQL SELECT query text.

    Returns
    -------
    pandas.DataFrame
        One row per solution and one column per projected variable, in the
        order given by ``head.vars``. Each cell holds the ``value`` field of
        the corresponding binding exactly as delivered by the endpoint (no
        type conversion is applied). A variable that is unbound in a
        solution yields ``None`` in that cell.
    """
    sparql_wrapper.setQuery(query)
    sparql_wrapper.setReturnFormat(JSON)
    results = sparql_wrapper.query().convert()

    columns = results['head']['vars']
    # A solution only contains the variables that are bound in it (e.g. an
    # OPTIONAL pattern that did not match leaves its variable out), so look
    # each column up defensively instead of indexing directly.
    rows = [
        {column: binding.get(column, {}).get('value') for column in columns}
        for binding in results['results']['bindings']
    ]

    # Passing `columns` keeps the column order and still produces the right
    # header when the result set is empty.
    return pd.DataFrame(rows, columns=columns)

# Client bound to the Wikidata SPARQL endpoint; reused by the query cells below.
WIKIDATA_SPARQL_URL = "https://query.wikidata.org/sparql"
wikidata_sparql_endpoint = SPARQLWrapper(WIKIDATA_SPARQL_URL)

In [13]:
# For every ?country matching `wdt:P31 wd:Q6256`, fetch its English label,
# area (P2046), population (P1082) and continent (P30, with English label),
# ordered by continent label and then country label.
df = sparql_select_to_dataframe(wikidata_sparql_endpoint,"""
  PREFIX wd: <http://www.wikidata.org/entity/>
  PREFIX wdt: <http://www.wikidata.org/prop/direct/>
  PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

  SELECT ?country ?countryLabel ?area ?population ?continent ?continentLabel
  WHERE {
    ?country wdt:P31 wd:Q6256 ;
        rdfs:label ?countryLabel ; 
        wdt:P2046 ?area ;
        wdt:P1082 ?population ;
        wdt:P30 ?continent .
    ?continent rdfs:label ?continentLabel .
    
    FILTER (lang(?countryLabel) = "en" && lang(?continentLabel) = "en")
  }
  ORDER BY ?continentLabel ?countryLabel
""")

# The helper copies each binding's `value` field verbatim, so the quantities
# arrive as text. Convert them to real numbers so sorting, arithmetic and
# numeric plot axes behave correctly; values that cannot be parsed become NaN
# instead of aborting the notebook.
df = df.assign(
    area=pd.to_numeric(df['area'], errors='coerce'),
    population=pd.to_numeric(df['population'], errors='coerce'),
)

df # display the dataframe

Unnamed: 0,country,countryLabel,area,population,continent,continentLabel
0,http://www.wikidata.org/entity/Q262,Algeria,2381741,43900000,http://www.wikidata.org/entity/Q15,Africa
1,http://www.wikidata.org/entity/Q916,Angola,1246700,32866270,http://www.wikidata.org/entity/Q15,Africa
2,http://www.wikidata.org/entity/Q962,Benin,114763,11175692,http://www.wikidata.org/entity/Q15,Africa
3,http://www.wikidata.org/entity/Q963,Botswana,581737,2291661,http://www.wikidata.org/entity/Q15,Africa
4,http://www.wikidata.org/entity/Q965,Burkina Faso,274200,20488000,http://www.wikidata.org/entity/Q15,Africa
...,...,...,...,...,...,...
195,http://www.wikidata.org/entity/Q734,Guyana,214970,777859,http://www.wikidata.org/entity/Q18,South America
196,http://www.wikidata.org/entity/Q733,Paraguay,406756,6811297,http://www.wikidata.org/entity/Q18,South America
197,http://www.wikidata.org/entity/Q419,Peru,1285216,29381884,http://www.wikidata.org/entity/Q18,South America
198,http://www.wikidata.org/entity/Q77,Uruguay,176215,3456750,http://www.wikidata.org/entity/Q18,South America


In [14]:
# Log-log scatter of population against area: one marker per row of `df`,
# coloured by continent, with the country label shown on hover.
axis_labels = {
    'population': 'Population',
    'area': 'Area (sq km)',
    'continentLabel': 'Continent',
}

fig = px.scatter(
    df,
    x='population',
    y='area',
    color='continentLabel',
    hover_name='countryLabel',
    labels=axis_labels,
    title='Country Population vs Area by Continent',
    log_x=True,
    log_y=True,
    width=800,
    height=600,
)

fig.show()